author    Linus Torvalds <torvalds@linux-foundation.org>  2014-06-04 11:47:12 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-06-04 11:47:12 -0400
commit    b05d59dfceaea72565b1648af929b037b0f96d7f (patch)
tree      bbe92714be468ed8783bce6ac2c305c0aedf8eb5 /arch/x86
parent    daf342af2f7856fd2f5c66b9fb39a8f24986ca53 (diff)
parent    820b3fcdeb80d30410f4427d2cbf9161c35fdeef (diff)

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm into next
Pull KVM updates from Paolo Bonzini:
 "At over 200 commits, covering almost all supported architectures, this
  was a pretty active cycle for KVM.  Changes include:

   - a lot of s390 changes: optimizations, support for migration, GDB
     support and more

   - ARM changes are pretty small: support for the PSCI 0.2 hypercall
     interface on both the guest and the host (the latter acked by
     Catalin)

   - initial POWER8 and little-endian host support

   - support for running u-boot on embedded POWER targets

   - pretty large changes to MIPS too, completing the userspace
     interface and improving the handling of virtualized timer hardware

   - for x86, a larger set of changes is scheduled for 3.17.  Still, we
     have a few emulator bugfixes and support for running nested
     fully-virtualized Xen guests (para-virtualized Xen guests have
     always worked).  And some optimizations too.

  The only missing architecture here is ia64.  It's not a coincidence
  that support for KVM on ia64 is scheduled for removal in 3.17"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (203 commits)
  KVM: add missing cleanup_srcu_struct
  KVM: PPC: Book3S PR: Rework SLB switching code
  KVM: PPC: Book3S PR: Use SLB entry 0
  KVM: PPC: Book3S HV: Fix machine check delivery to guest
  KVM: PPC: Book3S HV: Work around POWER8 performance monitor bugs
  KVM: PPC: Book3S HV: Make sure we don't miss dirty pages
  KVM: PPC: Book3S HV: Fix dirty map for hugepages
  KVM: PPC: Book3S HV: Put huge-page HPTEs in rmap chain for base address
  KVM: PPC: Book3S HV: Fix check for running inside guest in global_invalidates()
  KVM: PPC: Book3S: Move KVM_REG_PPC_WORT to an unused register number
  KVM: PPC: Book3S: Add ONE_REG register names that were missed
  KVM: PPC: Add CAP to indicate hcall fixes
  KVM: PPC: MPIC: Reset IRQ source private members
  KVM: PPC: Graciously fail broken LE hypercalls
  PPC: ePAPR: Fix hypercall on LE guest
  KVM: PPC: BOOK3S: Remove open coded make_dsisr in alignment handler
  KVM: PPC: BOOK3S: Always use the saved DAR value
  PPC: KVM: Make NX bit available with magic page
  KVM: PPC: Disable NX for old magic page using guests
  KVM: PPC: BOOK3S: HV: Add mixed page-size support for guest
  ...

Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h |   1
-rw-r--r--  arch/x86/include/asm/kvm_host.h    |   7
-rw-r--r--  arch/x86/include/asm/traps.h       |   5
-rw-r--r--  arch/x86/kernel/kvm.c              |   2
-rw-r--r--  arch/x86/kvm/cpuid.c               |  11
-rw-r--r--  arch/x86/kvm/cpuid.h               |   7
-rw-r--r--  arch/x86/kvm/emulate.c             |  93
-rw-r--r--  arch/x86/kvm/irq.c                 |   1
-rw-r--r--  arch/x86/kvm/lapic.c               |  62
-rw-r--r--  arch/x86/kvm/mmu.c                 |  84
-rw-r--r--  arch/x86/kvm/mmu.h                 |  33
-rw-r--r--  arch/x86/kvm/paging_tmpl.h         |   7
-rw-r--r--  arch/x86/kvm/pmu.c                 |   7
-rw-r--r--  arch/x86/kvm/svm.c                 |  63
-rw-r--r--  arch/x86/kvm/trace.h               |  20
-rw-r--r--  arch/x86/kvm/vmx.c                 | 333
-rw-r--r--  arch/x86/kvm/x86.c                 |  61
17 files changed, 505 insertions, 292 deletions
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 24ec1216596e..a04fe4eb237d 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -189,7 +189,6 @@ struct x86_emulate_ops {
189 void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); 189 void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
190 ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); 190 ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
191 int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); 191 int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
192 void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val);
193 int (*cpl)(struct x86_emulate_ctxt *ctxt); 192 int (*cpl)(struct x86_emulate_ctxt *ctxt);
194 int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); 193 int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
195 int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); 194 int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7de069afb382..49314155b66c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -50,11 +50,7 @@
50 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 50 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
51 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) 51 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
52 52
53#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) 53#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
54#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
55#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL
56#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
57 0xFFFFFF0000000000ULL)
58#define CR4_RESERVED_BITS \ 54#define CR4_RESERVED_BITS \
59 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 55 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
60 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 56 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
@@ -134,7 +130,6 @@ enum kvm_reg_ex {
134 VCPU_EXREG_PDPTR = NR_VCPU_REGS, 130 VCPU_EXREG_PDPTR = NR_VCPU_REGS,
135 VCPU_EXREG_CR3, 131 VCPU_EXREG_CR3,
136 VCPU_EXREG_RFLAGS, 132 VCPU_EXREG_RFLAGS,
137 VCPU_EXREG_CPL,
138 VCPU_EXREG_SEGMENTS, 133 VCPU_EXREG_SEGMENTS,
139}; 134};
140 135
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 58d66fe06b61..8ba18842c48e 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -74,6 +74,11 @@ dotraplinkage void do_general_protection(struct pt_regs *, long);
74dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); 74dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
75#ifdef CONFIG_TRACING 75#ifdef CONFIG_TRACING
76dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long); 76dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long);
77#else
78static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error)
79{
80 do_page_fault(regs, error);
81}
77#endif 82#endif
78dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); 83dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
79dotraplinkage void do_coprocessor_error(struct pt_regs *, long); 84dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 0331cb389d68..7e97371387fd 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -259,7 +259,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
259 259
260 switch (kvm_read_and_reset_pf_reason()) { 260 switch (kvm_read_and_reset_pf_reason()) {
261 default: 261 default:
262 do_page_fault(regs, error_code); 262 trace_do_page_fault(regs, error_code);
263 break; 263 break;
264 case KVM_PV_REASON_PAGE_NOT_PRESENT: 264 case KVM_PV_REASON_PAGE_NOT_PRESENT:
265 /* page is swapped out by the host. */ 265 /* page is swapped out by the host. */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index f47a104a749c..38a0afe83c6b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -283,6 +283,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
283 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 283 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
284 /* cpuid 1.ecx */ 284 /* cpuid 1.ecx */
285 const u32 kvm_supported_word4_x86_features = 285 const u32 kvm_supported_word4_x86_features =
286 /* NOTE: MONITOR (and MWAIT) are emulated as NOP,
287 * but *not* advertised to guests via CPUID ! */
286 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 288 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
287 0 /* DS-CPL, VMX, SMX, EST */ | 289 0 /* DS-CPL, VMX, SMX, EST */ |
288 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 290 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
@@ -495,6 +497,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
495 entry->ecx &= kvm_supported_word6_x86_features; 497 entry->ecx &= kvm_supported_word6_x86_features;
496 cpuid_mask(&entry->ecx, 6); 498 cpuid_mask(&entry->ecx, 6);
497 break; 499 break;
500 case 0x80000007: /* Advanced power management */
501 /* invariant TSC is CPUID.80000007H:EDX[8] */
502 entry->edx &= (1 << 8);
503 /* mask against host */
504 entry->edx &= boot_cpu_data.x86_power;
505 entry->eax = entry->ebx = entry->ecx = 0;
506 break;
498 case 0x80000008: { 507 case 0x80000008: {
499 unsigned g_phys_as = (entry->eax >> 16) & 0xff; 508 unsigned g_phys_as = (entry->eax >> 16) & 0xff;
500 unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); 509 unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
@@ -525,7 +534,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
525 case 3: /* Processor serial number */ 534 case 3: /* Processor serial number */
526 case 5: /* MONITOR/MWAIT */ 535 case 5: /* MONITOR/MWAIT */
527 case 6: /* Thermal management */ 536 case 6: /* Thermal management */
528 case 0x80000007: /* Advanced power management */
529 case 0xC0000002: 537 case 0xC0000002:
530 case 0xC0000003: 538 case 0xC0000003:
531 case 0xC0000004: 539 case 0xC0000004:
@@ -726,6 +734,7 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
726not_found: 734not_found:
727 return 36; 735 return 36;
728} 736}
737EXPORT_SYMBOL_GPL(cpuid_maxphyaddr);
729 738
730/* 739/*
731 * If no match is found, check whether we exceed the vCPU's limit 740 * If no match is found, check whether we exceed the vCPU's limit
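
For reference, the new 0x80000007 handling above passes through only the invariant-TSC bit, CPUID.80000007H:EDX[8], and only when the host reports it. A minimal guest-side userspace sketch of checking that bit (not part of this patch, and assuming GCC's <cpuid.h> helper):

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* Leaf 0x80000007: advanced power management information. */
            if (!__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx))
                    return 1;

            /* EDX bit 8 is the invariant-TSC flag masked in above. */
            printf("invariant TSC: %s\n", (edx & (1u << 8)) ? "yes" : "no");
            return 0;
    }
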
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index eeecbed26ac7..f9087315e0cd 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -88,4 +88,11 @@ static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu)
88 return best && (best->ecx & bit(X86_FEATURE_X2APIC)); 88 return best && (best->ecx & bit(X86_FEATURE_X2APIC));
89} 89}
90 90
91static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu)
92{
93 struct kvm_cpuid_entry2 *best;
94
95 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
96 return best && (best->edx & bit(X86_FEATURE_GBPAGES));
97}
91#endif 98#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 205b17eed93c..e4e833d3d7d7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -161,6 +161,7 @@
161#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ 161#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
162#define NoWrite ((u64)1 << 45) /* No writeback */ 162#define NoWrite ((u64)1 << 45) /* No writeback */
163#define SrcWrite ((u64)1 << 46) /* Write back src operand */ 163#define SrcWrite ((u64)1 << 46) /* Write back src operand */
164#define NoMod ((u64)1 << 47) /* Mod field is ignored */
164 165
165#define DstXacc (DstAccLo | SrcAccHi | SrcWrite) 166#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
166 167
@@ -1077,7 +1078,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
1077 ctxt->modrm_rm |= (ctxt->modrm & 0x07); 1078 ctxt->modrm_rm |= (ctxt->modrm & 0x07);
1078 ctxt->modrm_seg = VCPU_SREG_DS; 1079 ctxt->modrm_seg = VCPU_SREG_DS;
1079 1080
1080 if (ctxt->modrm_mod == 3) { 1081 if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
1081 op->type = OP_REG; 1082 op->type = OP_REG;
1082 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 1083 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
1083 op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1084 op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
@@ -1324,7 +1325,8 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1324 rc->end = n * size; 1325 rc->end = n * size;
1325 } 1326 }
1326 1327
1327 if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) { 1328 if (ctxt->rep_prefix && (ctxt->d & String) &&
1329 !(ctxt->eflags & EFLG_DF)) {
1328 ctxt->dst.data = rc->data + rc->pos; 1330 ctxt->dst.data = rc->data + rc->pos;
1329 ctxt->dst.type = OP_MEM_STR; 1331 ctxt->dst.type = OP_MEM_STR;
1330 ctxt->dst.count = (rc->end - rc->pos) / size; 1332 ctxt->dst.count = (rc->end - rc->pos) / size;
@@ -1409,11 +1411,11 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1409} 1411}
1410 1412
1411/* Does not support long mode */ 1413/* Does not support long mode */
1412static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1414static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1413 u16 selector, int seg) 1415 u16 selector, int seg, u8 cpl, bool in_task_switch)
1414{ 1416{
1415 struct desc_struct seg_desc, old_desc; 1417 struct desc_struct seg_desc, old_desc;
1416 u8 dpl, rpl, cpl; 1418 u8 dpl, rpl;
1417 unsigned err_vec = GP_VECTOR; 1419 unsigned err_vec = GP_VECTOR;
1418 u32 err_code = 0; 1420 u32 err_code = 0;
1419 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ 1421 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
@@ -1441,7 +1443,6 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1441 } 1443 }
1442 1444
1443 rpl = selector & 3; 1445 rpl = selector & 3;
1444 cpl = ctxt->ops->cpl(ctxt);
1445 1446
1446 /* NULL selector is not valid for TR, CS and SS (except for long mode) */ 1447 /* NULL selector is not valid for TR, CS and SS (except for long mode) */
1447 if ((seg == VCPU_SREG_CS 1448 if ((seg == VCPU_SREG_CS
@@ -1486,6 +1487,9 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1486 goto exception; 1487 goto exception;
1487 break; 1488 break;
1488 case VCPU_SREG_CS: 1489 case VCPU_SREG_CS:
1490 if (in_task_switch && rpl != dpl)
1491 goto exception;
1492
1489 if (!(seg_desc.type & 8)) 1493 if (!(seg_desc.type & 8))
1490 goto exception; 1494 goto exception;
1491 1495
@@ -1543,6 +1547,13 @@ exception:
1543 return X86EMUL_PROPAGATE_FAULT; 1547 return X86EMUL_PROPAGATE_FAULT;
1544} 1548}
1545 1549
1550static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1551 u16 selector, int seg)
1552{
1553 u8 cpl = ctxt->ops->cpl(ctxt);
1554 return __load_segment_descriptor(ctxt, selector, seg, cpl, false);
1555}
1556
1546static void write_register_operand(struct operand *op) 1557static void write_register_operand(struct operand *op)
1547{ 1558{
1548 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ 1559 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
@@ -2404,6 +2415,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2404 struct tss_segment_16 *tss) 2415 struct tss_segment_16 *tss)
2405{ 2416{
2406 int ret; 2417 int ret;
2418 u8 cpl;
2407 2419
2408 ctxt->_eip = tss->ip; 2420 ctxt->_eip = tss->ip;
2409 ctxt->eflags = tss->flag | 2; 2421 ctxt->eflags = tss->flag | 2;
@@ -2426,23 +2438,25 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2426 set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); 2438 set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
2427 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); 2439 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
2428 2440
2441 cpl = tss->cs & 3;
2442
2429 /* 2443 /*
2430 * Now load segment descriptors. If fault happens at this stage 2444 * Now load segment descriptors. If fault happens at this stage
2431 * it is handled in a context of new task 2445 * it is handled in a context of new task
2432 */ 2446 */
2433 ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); 2447 ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true);
2434 if (ret != X86EMUL_CONTINUE) 2448 if (ret != X86EMUL_CONTINUE)
2435 return ret; 2449 return ret;
2436 ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); 2450 ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true);
2437 if (ret != X86EMUL_CONTINUE) 2451 if (ret != X86EMUL_CONTINUE)
2438 return ret; 2452 return ret;
2439 ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); 2453 ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true);
2440 if (ret != X86EMUL_CONTINUE) 2454 if (ret != X86EMUL_CONTINUE)
2441 return ret; 2455 return ret;
2442 ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); 2456 ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true);
2443 if (ret != X86EMUL_CONTINUE) 2457 if (ret != X86EMUL_CONTINUE)
2444 return ret; 2458 return ret;
2445 ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); 2459 ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true);
2446 if (ret != X86EMUL_CONTINUE) 2460 if (ret != X86EMUL_CONTINUE)
2447 return ret; 2461 return ret;
2448 2462
@@ -2496,7 +2510,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2496static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, 2510static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2497 struct tss_segment_32 *tss) 2511 struct tss_segment_32 *tss)
2498{ 2512{
2499 tss->cr3 = ctxt->ops->get_cr(ctxt, 3); 2513 /* CR3 and ldt selector are not saved intentionally */
2500 tss->eip = ctxt->_eip; 2514 tss->eip = ctxt->_eip;
2501 tss->eflags = ctxt->eflags; 2515 tss->eflags = ctxt->eflags;
2502 tss->eax = reg_read(ctxt, VCPU_REGS_RAX); 2516 tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
@@ -2514,13 +2528,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2514 tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); 2528 tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
2515 tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS); 2529 tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
2516 tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS); 2530 tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
2517 tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR);
2518} 2531}
2519 2532
2520static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, 2533static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2521 struct tss_segment_32 *tss) 2534 struct tss_segment_32 *tss)
2522{ 2535{
2523 int ret; 2536 int ret;
2537 u8 cpl;
2524 2538
2525 if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) 2539 if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
2526 return emulate_gp(ctxt, 0); 2540 return emulate_gp(ctxt, 0);
@@ -2539,7 +2553,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2539 2553
2540 /* 2554 /*
2541 * SDM says that segment selectors are loaded before segment 2555 * SDM says that segment selectors are loaded before segment
2542 * descriptors 2556 * descriptors. This is important because CPL checks will
2557 * use CS.RPL.
2543 */ 2558 */
2544 set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); 2559 set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
2545 set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); 2560 set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
@@ -2553,43 +2568,38 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2553 * If we're switching between Protected Mode and VM86, we need to make 2568 * If we're switching between Protected Mode and VM86, we need to make
2554 * sure to update the mode before loading the segment descriptors so 2569 * sure to update the mode before loading the segment descriptors so
2555 * that the selectors are interpreted correctly. 2570 * that the selectors are interpreted correctly.
2556 *
2557 * Need to get rflags to the vcpu struct immediately because it
2558 * influences the CPL which is checked at least when loading the segment
2559 * descriptors and when pushing an error code to the new kernel stack.
2560 *
2561 * TODO Introduce a separate ctxt->ops->set_cpl callback
2562 */ 2571 */
2563 if (ctxt->eflags & X86_EFLAGS_VM) 2572 if (ctxt->eflags & X86_EFLAGS_VM) {
2564 ctxt->mode = X86EMUL_MODE_VM86; 2573 ctxt->mode = X86EMUL_MODE_VM86;
2565 else 2574 cpl = 3;
2575 } else {
2566 ctxt->mode = X86EMUL_MODE_PROT32; 2576 ctxt->mode = X86EMUL_MODE_PROT32;
2567 2577 cpl = tss->cs & 3;
2568 ctxt->ops->set_rflags(ctxt, ctxt->eflags); 2578 }
2569 2579
2570 /* 2580 /*
2571 * Now load segment descriptors. If fault happenes at this stage 2581 * Now load segment descriptors. If fault happenes at this stage
2572 * it is handled in a context of new task 2582 * it is handled in a context of new task
2573 */ 2583 */
2574 ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); 2584 ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true);
2575 if (ret != X86EMUL_CONTINUE) 2585 if (ret != X86EMUL_CONTINUE)
2576 return ret; 2586 return ret;
2577 ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); 2587 ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true);
2578 if (ret != X86EMUL_CONTINUE) 2588 if (ret != X86EMUL_CONTINUE)
2579 return ret; 2589 return ret;
2580 ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); 2590 ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true);
2581 if (ret != X86EMUL_CONTINUE) 2591 if (ret != X86EMUL_CONTINUE)
2582 return ret; 2592 return ret;
2583 ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); 2593 ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true);
2584 if (ret != X86EMUL_CONTINUE) 2594 if (ret != X86EMUL_CONTINUE)
2585 return ret; 2595 return ret;
2586 ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); 2596 ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true);
2587 if (ret != X86EMUL_CONTINUE) 2597 if (ret != X86EMUL_CONTINUE)
2588 return ret; 2598 return ret;
2589 ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS); 2599 ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true);
2590 if (ret != X86EMUL_CONTINUE) 2600 if (ret != X86EMUL_CONTINUE)
2591 return ret; 2601 return ret;
2592 ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS); 2602 ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true);
2593 if (ret != X86EMUL_CONTINUE) 2603 if (ret != X86EMUL_CONTINUE)
2594 return ret; 2604 return ret;
2595 2605
@@ -2604,6 +2614,8 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2604 struct tss_segment_32 tss_seg; 2614 struct tss_segment_32 tss_seg;
2605 int ret; 2615 int ret;
2606 u32 new_tss_base = get_desc_base(new_desc); 2616 u32 new_tss_base = get_desc_base(new_desc);
2617 u32 eip_offset = offsetof(struct tss_segment_32, eip);
2618 u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector);
2607 2619
2608 ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, 2620 ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2609 &ctxt->exception); 2621 &ctxt->exception);
@@ -2613,8 +2625,9 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2613 2625
2614 save_state_to_tss32(ctxt, &tss_seg); 2626 save_state_to_tss32(ctxt, &tss_seg);
2615 2627
2616 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, 2628 /* Only GP registers and segment selectors are saved */
2617 &ctxt->exception); 2629 ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
2630 ldt_sel_offset - eip_offset, &ctxt->exception);
2618 if (ret != X86EMUL_CONTINUE) 2631 if (ret != X86EMUL_CONTINUE)
2619 /* FIXME: need to provide precise fault address */ 2632 /* FIXME: need to provide precise fault address */
2620 return ret; 2633 return ret;
@@ -3386,10 +3399,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
3386 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); 3399 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
3387 if (efer & EFER_LMA) 3400 if (efer & EFER_LMA)
3388 rsvd = CR3_L_MODE_RESERVED_BITS; 3401 rsvd = CR3_L_MODE_RESERVED_BITS;
3389 else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE)
3390 rsvd = CR3_PAE_RESERVED_BITS;
3391 else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG)
3392 rsvd = CR3_NONPAE_RESERVED_BITS;
3393 3402
3394 if (new_val & rsvd) 3403 if (new_val & rsvd)
3395 return emulate_gp(ctxt, 0); 3404 return emulate_gp(ctxt, 0);
@@ -3869,10 +3878,12 @@ static const struct opcode twobyte_table[256] = {
3869 N, N, N, N, N, N, N, N, 3878 N, N, N, N, N, N, N, N,
3870 D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM), 3879 D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM),
3871 /* 0x20 - 0x2F */ 3880 /* 0x20 - 0x2F */
3872 DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), 3881 DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read),
3873 DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), 3882 DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
3874 IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), 3883 IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write,
3875 IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), 3884 check_cr_write),
3885 IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
3886 check_dr_write),
3876 N, N, N, N, 3887 N, N, N, N,
3877 GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), 3888 GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
3878 GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), 3889 GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 484bc874688b..bd0da433e6d7 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -113,6 +113,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
113 113
114 return kvm_get_apic_interrupt(v); /* APIC */ 114 return kvm_get_apic_interrupt(v); /* APIC */
115} 115}
116EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
116 117
117void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 118void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
118{ 119{
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9736529ade08..006911858174 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -360,6 +360,8 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
360 360
361static inline void apic_set_isr(int vec, struct kvm_lapic *apic) 361static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
362{ 362{
363 /* Note that we never get here with APIC virtualization enabled. */
364
363 if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) 365 if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
364 ++apic->isr_count; 366 ++apic->isr_count;
365 BUG_ON(apic->isr_count > MAX_APIC_VECTOR); 367 BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -371,12 +373,48 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
371 apic->highest_isr_cache = vec; 373 apic->highest_isr_cache = vec;
372} 374}
373 375
376static inline int apic_find_highest_isr(struct kvm_lapic *apic)
377{
378 int result;
379
380 /*
381 * Note that isr_count is always 1, and highest_isr_cache
382 * is always -1, with APIC virtualization enabled.
383 */
384 if (!apic->isr_count)
385 return -1;
386 if (likely(apic->highest_isr_cache != -1))
387 return apic->highest_isr_cache;
388
389 result = find_highest_vector(apic->regs + APIC_ISR);
390 ASSERT(result == -1 || result >= 16);
391
392 return result;
393}
394
374static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) 395static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
375{ 396{
376 if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) 397 struct kvm_vcpu *vcpu;
398 if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
399 return;
400
401 vcpu = apic->vcpu;
402
403 /*
404 * We do get here for APIC virtualization enabled if the guest
405 * uses the Hyper-V APIC enlightenment. In this case we may need
406 * to trigger a new interrupt delivery by writing the SVI field;
407 * on the other hand isr_count and highest_isr_cache are unused
408 * and must be left alone.
409 */
410 if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
411 kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
412 apic_find_highest_isr(apic));
413 else {
377 --apic->isr_count; 414 --apic->isr_count;
378 BUG_ON(apic->isr_count < 0); 415 BUG_ON(apic->isr_count < 0);
379 apic->highest_isr_cache = -1; 416 apic->highest_isr_cache = -1;
417 }
380} 418}
381 419
382int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 420int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
@@ -456,22 +494,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
456 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); 494 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
457} 495}
458 496
459static inline int apic_find_highest_isr(struct kvm_lapic *apic)
460{
461 int result;
462
463 /* Note that isr_count is always 1 with vid enabled */
464 if (!apic->isr_count)
465 return -1;
466 if (likely(apic->highest_isr_cache != -1))
467 return apic->highest_isr_cache;
468
469 result = find_highest_vector(apic->regs + APIC_ISR);
470 ASSERT(result == -1 || result >= 16);
471
472 return result;
473}
474
475void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) 497void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
476{ 498{
477 struct kvm_lapic *apic = vcpu->arch.apic; 499 struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1605,6 +1627,8 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1605 int vector = kvm_apic_has_interrupt(vcpu); 1627 int vector = kvm_apic_has_interrupt(vcpu);
1606 struct kvm_lapic *apic = vcpu->arch.apic; 1628 struct kvm_lapic *apic = vcpu->arch.apic;
1607 1629
1630 /* Note that we never get here with APIC virtualization enabled. */
1631
1608 if (vector == -1) 1632 if (vector == -1)
1609 return -1; 1633 return -1;
1610 1634
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 813d31038b93..931467881da7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -22,6 +22,7 @@
22#include "mmu.h" 22#include "mmu.h"
23#include "x86.h" 23#include "x86.h"
24#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include "cpuid.h"
25 26
26#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
27#include <linux/types.h> 28#include <linux/types.h>
@@ -595,7 +596,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
595 * we always atomicly update it, see the comments in 596 * we always atomicly update it, see the comments in
596 * spte_has_volatile_bits(). 597 * spte_has_volatile_bits().
597 */ 598 */
598 if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) 599 if (spte_is_locklessly_modifiable(old_spte) &&
600 !is_writable_pte(new_spte))
599 ret = true; 601 ret = true;
600 602
601 if (!shadow_accessed_mask) 603 if (!shadow_accessed_mask)
@@ -1176,8 +1178,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1176 1178
1177/* 1179/*
1178 * Write-protect on the specified @sptep, @pt_protect indicates whether 1180 * Write-protect on the specified @sptep, @pt_protect indicates whether
1179 * spte writ-protection is caused by protecting shadow page table. 1181 * spte write-protection is caused by protecting shadow page table.
1180 * @flush indicates whether tlb need be flushed.
1181 * 1182 *
1182 * Note: write protection is difference between drity logging and spte 1183 * Note: write protection is difference between drity logging and spte
1183 * protection: 1184 * protection:
@@ -1186,10 +1187,9 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1186 * - for spte protection, the spte can be writable only after unsync-ing 1187 * - for spte protection, the spte can be writable only after unsync-ing
1187 * shadow page. 1188 * shadow page.
1188 * 1189 *
1189 * Return true if the spte is dropped. 1190 * Return true if tlb need be flushed.
1190 */ 1191 */
1191static bool 1192static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
1192spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
1193{ 1193{
1194 u64 spte = *sptep; 1194 u64 spte = *sptep;
1195 1195
@@ -1199,17 +1199,11 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
1199 1199
1200 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); 1200 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1201 1201
1202 if (__drop_large_spte(kvm, sptep)) {
1203 *flush |= true;
1204 return true;
1205 }
1206
1207 if (pt_protect) 1202 if (pt_protect)
1208 spte &= ~SPTE_MMU_WRITEABLE; 1203 spte &= ~SPTE_MMU_WRITEABLE;
1209 spte = spte & ~PT_WRITABLE_MASK; 1204 spte = spte & ~PT_WRITABLE_MASK;
1210 1205
1211 *flush |= mmu_spte_update(sptep, spte); 1206 return mmu_spte_update(sptep, spte);
1212 return false;
1213} 1207}
1214 1208
1215static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, 1209static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
@@ -1221,11 +1215,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1221 1215
1222 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { 1216 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1223 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1217 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1224 if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
1225 sptep = rmap_get_first(*rmapp, &iter);
1226 continue;
1227 }
1228 1218
1219 flush |= spte_write_protect(kvm, sptep, pt_protect);
1229 sptep = rmap_get_next(&iter); 1220 sptep = rmap_get_next(&iter);
1230 } 1221 }
1231 1222
@@ -2802,9 +2793,9 @@ static bool page_fault_can_be_fast(u32 error_code)
2802} 2793}
2803 2794
2804static bool 2795static bool
2805fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte) 2796fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2797 u64 *sptep, u64 spte)
2806{ 2798{
2807 struct kvm_mmu_page *sp = page_header(__pa(sptep));
2808 gfn_t gfn; 2799 gfn_t gfn;
2809 2800
2810 WARN_ON(!sp->role.direct); 2801 WARN_ON(!sp->role.direct);
@@ -2830,6 +2821,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2830 u32 error_code) 2821 u32 error_code)
2831{ 2822{
2832 struct kvm_shadow_walk_iterator iterator; 2823 struct kvm_shadow_walk_iterator iterator;
2824 struct kvm_mmu_page *sp;
2833 bool ret = false; 2825 bool ret = false;
2834 u64 spte = 0ull; 2826 u64 spte = 0ull;
2835 2827
@@ -2853,7 +2845,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2853 goto exit; 2845 goto exit;
2854 } 2846 }
2855 2847
2856 if (!is_last_spte(spte, level)) 2848 sp = page_header(__pa(iterator.sptep));
2849 if (!is_last_spte(spte, sp->role.level))
2857 goto exit; 2850 goto exit;
2858 2851
2859 /* 2852 /*
@@ -2875,11 +2868,24 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2875 goto exit; 2868 goto exit;
2876 2869
2877 /* 2870 /*
2871 * Do not fix write-permission on the large spte since we only dirty
2872 * the first page into the dirty-bitmap in fast_pf_fix_direct_spte()
2873 * that means other pages are missed if its slot is dirty-logged.
2874 *
2875 * Instead, we let the slow page fault path create a normal spte to
2876 * fix the access.
2877 *
2878 * See the comments in kvm_arch_commit_memory_region().
2879 */
2880 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
2881 goto exit;
2882
2883 /*
2878 * Currently, fast page fault only works for direct mapping since 2884 * Currently, fast page fault only works for direct mapping since
2879 * the gfn is not stable for indirect shadow page. 2885 * the gfn is not stable for indirect shadow page.
2880 * See Documentation/virtual/kvm/locking.txt to get more detail. 2886 * See Documentation/virtual/kvm/locking.txt to get more detail.
2881 */ 2887 */
2882 ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); 2888 ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
2883exit: 2889exit:
2884 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, 2890 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
2885 spte, ret); 2891 spte, ret);
@@ -3511,11 +3517,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3511{ 3517{
3512 int maxphyaddr = cpuid_maxphyaddr(vcpu); 3518 int maxphyaddr = cpuid_maxphyaddr(vcpu);
3513 u64 exb_bit_rsvd = 0; 3519 u64 exb_bit_rsvd = 0;
3520 u64 gbpages_bit_rsvd = 0;
3514 3521
3515 context->bad_mt_xwr = 0; 3522 context->bad_mt_xwr = 0;
3516 3523
3517 if (!context->nx) 3524 if (!context->nx)
3518 exb_bit_rsvd = rsvd_bits(63, 63); 3525 exb_bit_rsvd = rsvd_bits(63, 63);
3526 if (!guest_cpuid_has_gbpages(vcpu))
3527 gbpages_bit_rsvd = rsvd_bits(7, 7);
3519 switch (context->root_level) { 3528 switch (context->root_level) {
3520 case PT32_ROOT_LEVEL: 3529 case PT32_ROOT_LEVEL:
3521 /* no rsvd bits for 2 level 4K page table entries */ 3530 /* no rsvd bits for 2 level 4K page table entries */
@@ -3538,7 +3547,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3538 case PT32E_ROOT_LEVEL: 3547 case PT32E_ROOT_LEVEL:
3539 context->rsvd_bits_mask[0][2] = 3548 context->rsvd_bits_mask[0][2] =
3540 rsvd_bits(maxphyaddr, 63) | 3549 rsvd_bits(maxphyaddr, 63) |
3541 rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ 3550 rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */
3542 context->rsvd_bits_mask[0][1] = exb_bit_rsvd | 3551 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
3543 rsvd_bits(maxphyaddr, 62); /* PDE */ 3552 rsvd_bits(maxphyaddr, 62); /* PDE */
3544 context->rsvd_bits_mask[0][0] = exb_bit_rsvd | 3553 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
@@ -3550,16 +3559,16 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3550 break; 3559 break;
3551 case PT64_ROOT_LEVEL: 3560 case PT64_ROOT_LEVEL:
3552 context->rsvd_bits_mask[0][3] = exb_bit_rsvd | 3561 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
3553 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); 3562 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7);
3554 context->rsvd_bits_mask[0][2] = exb_bit_rsvd | 3563 context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
3555 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); 3564 gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
3556 context->rsvd_bits_mask[0][1] = exb_bit_rsvd | 3565 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
3557 rsvd_bits(maxphyaddr, 51); 3566 rsvd_bits(maxphyaddr, 51);
3558 context->rsvd_bits_mask[0][0] = exb_bit_rsvd | 3567 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
3559 rsvd_bits(maxphyaddr, 51); 3568 rsvd_bits(maxphyaddr, 51);
3560 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; 3569 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
3561 context->rsvd_bits_mask[1][2] = exb_bit_rsvd | 3570 context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
3562 rsvd_bits(maxphyaddr, 51) | 3571 gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
3563 rsvd_bits(13, 29); 3572 rsvd_bits(13, 29);
3564 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 3573 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
3565 rsvd_bits(maxphyaddr, 51) | 3574 rsvd_bits(maxphyaddr, 51) |
@@ -4304,15 +4313,32 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
4304 if (*rmapp) 4313 if (*rmapp)
4305 __rmap_write_protect(kvm, rmapp, false); 4314 __rmap_write_protect(kvm, rmapp, false);
4306 4315
4307 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { 4316 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
4308 kvm_flush_remote_tlbs(kvm);
4309 cond_resched_lock(&kvm->mmu_lock); 4317 cond_resched_lock(&kvm->mmu_lock);
4310 }
4311 } 4318 }
4312 } 4319 }
4313 4320
4314 kvm_flush_remote_tlbs(kvm);
4315 spin_unlock(&kvm->mmu_lock); 4321 spin_unlock(&kvm->mmu_lock);
4322
4323 /*
4324 * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
4325 * which do tlb flush out of mmu-lock should be serialized by
4326 * kvm->slots_lock otherwise tlb flush would be missed.
4327 */
4328 lockdep_assert_held(&kvm->slots_lock);
4329
4330 /*
4331 * We can flush all the TLBs out of the mmu lock without TLB
4332 * corruption since we just change the spte from writable to
4333 * readonly so that we only need to care the case of changing
4334 * spte from present to present (changing the spte from present
4335 * to nonpresent will flush all the TLBs immediately), in other
4336 * words, the only case we care is mmu_spte_update() where we
4337 * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
4338 * instead of PT_WRITABLE_MASK, that means it does not depend
4339 * on PT_WRITABLE_MASK anymore.
4340 */
4341 kvm_flush_remote_tlbs(kvm);
4316} 4342}
4317 4343
4318#define BATCH_ZAP_PAGES 10 4344#define BATCH_ZAP_PAGES 10
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3842e70bdb7c..b982112d2ca5 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -104,6 +104,39 @@ static inline int is_present_gpte(unsigned long pte)
104 return pte & PT_PRESENT_MASK; 104 return pte & PT_PRESENT_MASK;
105} 105}
106 106
107/*
108 * Currently, we have two sorts of write-protection, a) the first one
109 * write-protects guest page to sync the guest modification, b) another one is
110 * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
111 * between these two sorts are:
112 * 1) the first case clears SPTE_MMU_WRITEABLE bit.
113 * 2) the first case requires flushing tlb immediately avoiding corrupting
114 * shadow page table between all vcpus so it should be in the protection of
115 * mmu-lock. And the another case does not need to flush tlb until returning
116 * the dirty bitmap to userspace since it only write-protects the page
117 * logged in the bitmap, that means the page in the dirty bitmap is not
118 * missed, so it can flush tlb out of mmu-lock.
119 *
120 * So, there is the problem: the first case can meet the corrupted tlb caused
121 * by another case which write-protects pages but without flush tlb
122 * immediately. In order to making the first case be aware this problem we let
123 * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit
124 * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit.
125 *
126 * Anyway, whenever a spte is updated (only permission and status bits are
127 * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
128 * readonly, if that happens, we need to flush tlb. Fortunately,
129 * mmu_spte_update() has already handled it perfectly.
130 *
131 * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
132 * - if we want to see if it has writable tlb entry or if the spte can be
133 * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most
134 * case, otherwise
135 * - if we fix page fault on the spte or do write-protection by dirty logging,
136 * check PT_WRITABLE_MASK.
137 *
138 * TODO: introduce APIs to split these two cases.
139 */
107static inline int is_writable_pte(unsigned long pte) 140static inline int is_writable_pte(unsigned long pte)
108{ 141{
109 return pte & PT_WRITABLE_MASK; 142 return pte & PT_WRITABLE_MASK;
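
The long comment added to mmu.h above reduces to a single rule, which mmu_spte_update() now implements: a TLB flush is needed whenever an spte that is still lockless-writable (SPTE_MMU_WRITEABLE set) loses its hardware-writable bit. A self-contained sketch of that rule, using illustrative bit positions rather than the kernel's actual layout:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative bit positions only -- not the kernel's real values. */
    #define PT_WRITABLE_MASK      (1ULL << 1)
    #define SPTE_MMU_WRITEABLE    (1ULL << 60)

    /* Flush if a lockless-writable spte is no longer hardware-writable. */
    static bool update_needs_flush(uint64_t old_spte, uint64_t new_spte)
    {
            return (old_spte & SPTE_MMU_WRITEABLE) &&
                   !(new_spte & PT_WRITABLE_MASK);
    }

    int main(void)
    {
            uint64_t spte = PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;

            /* Dirty logging clears only PT_WRITABLE_MASK; the flush that
             * is deferred until the dirty bitmap is returned relies on
             * this check firing. */
            printf("%d\n", update_needs_flush(spte, spte & ~PT_WRITABLE_MASK));
            return 0;
    }
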
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 123efd3ec29f..410776528265 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -913,8 +913,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
913 * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't 913 * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
914 * used by guest then tlbs are not flushed, so guest is allowed to access the 914 * used by guest then tlbs are not flushed, so guest is allowed to access the
915 * freed pages. 915 * freed pages.
916 * We set tlbs_dirty to let the notifier know this change and delay the flush 916 * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
917 * until such a case actually happens.
918 */ 917 */
919static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 918static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
920{ 919{
@@ -943,7 +942,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
943 return -EINVAL; 942 return -EINVAL;
944 943
945 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { 944 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
946 vcpu->kvm->tlbs_dirty = true; 945 vcpu->kvm->tlbs_dirty++;
947 continue; 946 continue;
948 } 947 }
949 948
@@ -958,7 +957,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
958 957
959 if (gfn != sp->gfns[i]) { 958 if (gfn != sp->gfns[i]) {
960 drop_spte(vcpu->kvm, &sp->spt[i]); 959 drop_spte(vcpu->kvm, &sp->spt[i]);
961 vcpu->kvm->tlbs_dirty = true; 960 vcpu->kvm->tlbs_dirty++;
962 continue; 961 continue;
963 } 962 }
964 963
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 5c4f63151b4d..cbecaa90399c 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -108,7 +108,10 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
108{ 108{
109 struct kvm_pmc *pmc = perf_event->overflow_handler_context; 109 struct kvm_pmc *pmc = perf_event->overflow_handler_context;
110 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; 110 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
111 __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); 111 if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
112 __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
113 kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
114 }
112} 115}
113 116
114static void kvm_perf_overflow_intr(struct perf_event *perf_event, 117static void kvm_perf_overflow_intr(struct perf_event *perf_event,
@@ -117,7 +120,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event,
117 struct kvm_pmc *pmc = perf_event->overflow_handler_context; 120 struct kvm_pmc *pmc = perf_event->overflow_handler_context;
118 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; 121 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
119 if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { 122 if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
120 kvm_perf_overflow(perf_event, data, regs); 123 __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
121 kvm_make_request(KVM_REQ_PMU, pmc->vcpu); 124 kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
122 /* 125 /*
123 * Inject PMI. If vcpu was in a guest mode during NMI PMI 126 * Inject PMI. If vcpu was in a guest mode during NMI PMI
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7f4f9c2badae..ec8366c5cfea 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1338,21 +1338,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1338 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1338 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1339} 1339}
1340 1340
1341static void svm_update_cpl(struct kvm_vcpu *vcpu)
1342{
1343 struct vcpu_svm *svm = to_svm(vcpu);
1344 int cpl;
1345
1346 if (!is_protmode(vcpu))
1347 cpl = 0;
1348 else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
1349 cpl = 3;
1350 else
1351 cpl = svm->vmcb->save.cs.selector & 0x3;
1352
1353 svm->vmcb->save.cpl = cpl;
1354}
1355
1356static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 1341static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1357{ 1342{
1358 return to_svm(vcpu)->vmcb->save.rflags; 1343 return to_svm(vcpu)->vmcb->save.rflags;
@@ -1360,11 +1345,12 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1360 1345
1361static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1346static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1362{ 1347{
1363 unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags; 1348 /*
1364 1349 * Any change of EFLAGS.VM is accompained by a reload of SS
1350 * (caused by either a task switch or an inter-privilege IRET),
1351 * so we do not need to update the CPL here.
1352 */
1365 to_svm(vcpu)->vmcb->save.rflags = rflags; 1353 to_svm(vcpu)->vmcb->save.rflags = rflags;
1366 if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
1367 svm_update_cpl(vcpu);
1368} 1354}
1369 1355
1370static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 1356static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
@@ -1631,8 +1617,15 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
1631 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; 1617 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1632 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; 1618 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1633 } 1619 }
1634 if (seg == VCPU_SREG_CS) 1620
1635 svm_update_cpl(vcpu); 1621 /*
1622 * This is always accurate, except if SYSRET returned to a segment
1623 * with SS.DPL != 3. Intel does not have this quirk, and always
1624 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1625 * would entail passing the CPL to userspace and back.
1626 */
1627 if (seg == VCPU_SREG_SS)
1628 svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1636 1629
1637 mark_dirty(svm->vmcb, VMCB_SEG); 1630 mark_dirty(svm->vmcb, VMCB_SEG);
1638} 1631}
@@ -2770,12 +2763,6 @@ static int xsetbv_interception(struct vcpu_svm *svm)
2770 return 1; 2763 return 1;
2771} 2764}
2772 2765
2773static int invalid_op_interception(struct vcpu_svm *svm)
2774{
2775 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2776 return 1;
2777}
2778
2779static int task_switch_interception(struct vcpu_svm *svm) 2766static int task_switch_interception(struct vcpu_svm *svm)
2780{ 2767{
2781 u16 tss_selector; 2768 u16 tss_selector;
@@ -3287,6 +3274,24 @@ static int pause_interception(struct vcpu_svm *svm)
3287 return 1; 3274 return 1;
3288} 3275}
3289 3276
3277static int nop_interception(struct vcpu_svm *svm)
3278{
3279 skip_emulated_instruction(&(svm->vcpu));
3280 return 1;
3281}
3282
3283static int monitor_interception(struct vcpu_svm *svm)
3284{
3285 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
3286 return nop_interception(svm);
3287}
3288
3289static int mwait_interception(struct vcpu_svm *svm)
3290{
3291 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
3292 return nop_interception(svm);
3293}
3294
3290static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { 3295static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3291 [SVM_EXIT_READ_CR0] = cr_interception, 3296 [SVM_EXIT_READ_CR0] = cr_interception,
3292 [SVM_EXIT_READ_CR3] = cr_interception, 3297 [SVM_EXIT_READ_CR3] = cr_interception,
@@ -3344,8 +3349,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3344 [SVM_EXIT_CLGI] = clgi_interception, 3349 [SVM_EXIT_CLGI] = clgi_interception,
3345 [SVM_EXIT_SKINIT] = skinit_interception, 3350 [SVM_EXIT_SKINIT] = skinit_interception,
3346 [SVM_EXIT_WBINVD] = emulate_on_interception, 3351 [SVM_EXIT_WBINVD] = emulate_on_interception,
3347 [SVM_EXIT_MONITOR] = invalid_op_interception, 3352 [SVM_EXIT_MONITOR] = monitor_interception,
3348 [SVM_EXIT_MWAIT] = invalid_op_interception, 3353 [SVM_EXIT_MWAIT] = mwait_interception,
3349 [SVM_EXIT_XSETBV] = xsetbv_interception, 3354 [SVM_EXIT_XSETBV] = xsetbv_interception,
3350 [SVM_EXIT_NPF] = pf_interception, 3355 [SVM_EXIT_NPF] = pf_interception,
3351}; 3356};
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 545245d7cc63..33574c95220d 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -91,16 +91,21 @@ TRACE_EVENT(kvm_hv_hypercall,
91/* 91/*
92 * Tracepoint for PIO. 92 * Tracepoint for PIO.
93 */ 93 */
94
95#define KVM_PIO_IN 0
96#define KVM_PIO_OUT 1
97
94TRACE_EVENT(kvm_pio, 98TRACE_EVENT(kvm_pio,
95 TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, 99 TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
96 unsigned int count), 100 unsigned int count, void *data),
97 TP_ARGS(rw, port, size, count), 101 TP_ARGS(rw, port, size, count, data),
98 102
99 TP_STRUCT__entry( 103 TP_STRUCT__entry(
100 __field( unsigned int, rw ) 104 __field( unsigned int, rw )
101 __field( unsigned int, port ) 105 __field( unsigned int, port )
102 __field( unsigned int, size ) 106 __field( unsigned int, size )
103 __field( unsigned int, count ) 107 __field( unsigned int, count )
108 __field( unsigned int, val )
104 ), 109 ),
105 110
106 TP_fast_assign( 111 TP_fast_assign(
@@ -108,11 +113,18 @@ TRACE_EVENT(kvm_pio,
108 __entry->port = port; 113 __entry->port = port;
109 __entry->size = size; 114 __entry->size = size;
110 __entry->count = count; 115 __entry->count = count;
116 if (size == 1)
117 __entry->val = *(unsigned char *)data;
118 else if (size == 2)
119 __entry->val = *(unsigned short *)data;
120 else
121 __entry->val = *(unsigned int *)data;
111 ), 122 ),
112 123
113 TP_printk("pio_%s at 0x%x size %d count %d", 124 TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s",
114 __entry->rw ? "write" : "read", 125 __entry->rw ? "write" : "read",
115 __entry->port, __entry->size, __entry->count) 126 __entry->port, __entry->size, __entry->count, __entry->val,
127 __entry->count > 1 ? "(...)" : "")
116); 128);
117 129
118/* 130/*
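
With the extra "val" argument, each kvm_pio event now records the first value transferred, so a one-byte write of 0x8f to port 0x70 would show up as something like "pio_write at 0x70 size 1 count 1 val 0x8f". A small host-side sketch for watching these events, assuming tracefs is mounted at the usual /sys/kernel/debug/tracing and the program runs as root:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            const char *enable =
                    "/sys/kernel/debug/tracing/events/kvm/kvm_pio/enable";
            FILE *f = fopen(enable, "w");
            char line[512];

            if (!f) { perror(enable); return 1; }
            fputs("1\n", f);
            fclose(f);

            /* Stream the global trace buffer, keeping only kvm_pio lines. */
            f = fopen("/sys/kernel/debug/tracing/trace_pipe", "r");
            if (!f) { perror("trace_pipe"); return 1; }
            while (fgets(line, sizeof(line), f))
                    if (strstr(line, "kvm_pio"))
                            fputs(line, stdout);
            fclose(f);
            return 0;
    }
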
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 138ceffc6377..801332edefc3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -354,6 +354,7 @@ struct vmcs02_list {
354struct nested_vmx { 354struct nested_vmx {
355 /* Has the level1 guest done vmxon? */ 355 /* Has the level1 guest done vmxon? */
356 bool vmxon; 356 bool vmxon;
357 gpa_t vmxon_ptr;
357 358
358 /* The guest-physical address of the current VMCS L1 keeps for L2 */ 359 /* The guest-physical address of the current VMCS L1 keeps for L2 */
359 gpa_t current_vmptr; 360 gpa_t current_vmptr;
@@ -413,7 +414,6 @@ struct vcpu_vmx {
413 struct kvm_vcpu vcpu; 414 struct kvm_vcpu vcpu;
414 unsigned long host_rsp; 415 unsigned long host_rsp;
415 u8 fail; 416 u8 fail;
416 u8 cpl;
417 bool nmi_known_unmasked; 417 bool nmi_known_unmasked;
418 u32 exit_intr_info; 418 u32 exit_intr_info;
419 u32 idt_vectoring_info; 419 u32 idt_vectoring_info;
@@ -2283,7 +2283,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2283 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 2283 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2284 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); 2284 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
2285 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2285 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2286 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ 2286
2287 nested_vmx_exit_ctls_high &= 2287 nested_vmx_exit_ctls_high &=
2288#ifdef CONFIG_X86_64 2288#ifdef CONFIG_X86_64
2289 VM_EXIT_HOST_ADDR_SPACE_SIZE | 2289 VM_EXIT_HOST_ADDR_SPACE_SIZE |
@@ -2291,7 +2291,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2291 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; 2291 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2292 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 2292 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2293 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 2293 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
2294 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; 2294 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
2295
2295 if (vmx_mpx_supported()) 2296 if (vmx_mpx_supported())
2296 nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; 2297 nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
2297 2298
@@ -2353,12 +2354,11 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2353 VMX_EPT_INVEPT_BIT; 2354 VMX_EPT_INVEPT_BIT;
2354 nested_vmx_ept_caps &= vmx_capability.ept; 2355 nested_vmx_ept_caps &= vmx_capability.ept;
2355 /* 2356 /*
2356 * Since invept is completely emulated we support both global 2357 * For nested guests, we don't do anything specific
2357 * and context invalidation independent of what host cpu 2358 * for single context invalidation. Hence, only advertise
2358 * supports 2359 * support for global context invalidation.
2359 */ 2360 */
2360 nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 2361 nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
2361 VMX_EPT_EXTENT_CONTEXT_BIT;
2362 } else 2362 } else
2363 nested_vmx_ept_caps = 0; 2363 nested_vmx_ept_caps = 0;
2364 2364
@@ -3186,10 +3186,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
3186 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3186 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3187 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3187 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3188 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3188 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3189
3190 /* CPL is always 0 when CPU enters protected mode */
3191 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3192 vmx->cpl = 0;
3193} 3189}
3194 3190
3195static void fix_rmode_seg(int seg, struct kvm_segment *save) 3191static void fix_rmode_seg(int seg, struct kvm_segment *save)
@@ -3591,22 +3587,14 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3591{ 3587{
3592 struct vcpu_vmx *vmx = to_vmx(vcpu); 3588 struct vcpu_vmx *vmx = to_vmx(vcpu);
3593 3589
3594 if (!is_protmode(vcpu)) 3590 if (unlikely(vmx->rmode.vm86_active))
3595 return 0; 3591 return 0;
3596 3592 else {
3597 if (!is_long_mode(vcpu) 3593 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3598 && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ 3594 return AR_DPL(ar);
3599 return 3;
3600
3601 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
3602 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3603 vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3;
3604 } 3595 }
3605
3606 return vmx->cpl;
3607} 3596}
3608 3597
3609
3610static u32 vmx_segment_access_rights(struct kvm_segment *var) 3598static u32 vmx_segment_access_rights(struct kvm_segment *var)
3611{ 3599{
3612 u32 ar; 3600 u32 ar;
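The rewritten vmx_get_cpl() above derives the CPL from the DPL field of the SS access rights (bits 6:5 of the AR value), which the hardware keeps equal to the current privilege level, rather than caching the low two bits of the CS selector. A standalone sketch of that bit extraction, for illustration only:

    #include <assert.h>
    #include <stdint.h>

    /* DPL occupies bits 6:5 of the VMX segment access-rights field. */
    static int ar_dpl(uint32_t ar)
    {
            return (ar >> 5) & 3;
    }

    int main(void)
    {
            assert(ar_dpl(0x0093) == 0);            /* ring-0 data/stack segment */
            assert(ar_dpl(0x0093 | (3 << 5)) == 3); /* same segment with DPL = 3 */
            return 0;
    }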
@@ -3634,8 +3622,6 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3634 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3622 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3635 3623
3636 vmx_segment_cache_clear(vmx); 3624 vmx_segment_cache_clear(vmx);
3637 if (seg == VCPU_SREG_CS)
3638 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3639 3625
3640 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3626 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3641 vmx->rmode.segs[seg] = *var; 3627 vmx->rmode.segs[seg] = *var;
@@ -4564,6 +4550,16 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
4564 PIN_BASED_EXT_INTR_MASK; 4550 PIN_BASED_EXT_INTR_MASK;
4565} 4551}
4566 4552
4553/*
4554 * In nested virtualization, check if L1 has set
4555 * VM_EXIT_ACK_INTR_ON_EXIT
4556 */
4557static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
4558{
4559 return get_vmcs12(vcpu)->vm_exit_controls &
4560 VM_EXIT_ACK_INTR_ON_EXIT;
4561}
4562
4567static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) 4563static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
4568{ 4564{
4569 return get_vmcs12(vcpu)->pin_based_vm_exec_control & 4565 return get_vmcs12(vcpu)->pin_based_vm_exec_control &
@@ -4878,6 +4874,9 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4878 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 4874 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
4879 vcpu->arch.dr6 &= ~15; 4875 vcpu->arch.dr6 &= ~15;
4880 vcpu->arch.dr6 |= dr6; 4876 vcpu->arch.dr6 |= dr6;
4877 if (!(dr6 & ~DR6_RESERVED)) /* icebp */
4878 skip_emulated_instruction(vcpu);
4879
4881 kvm_queue_exception(vcpu, DB_VECTOR); 4880 kvm_queue_exception(vcpu, DB_VECTOR);
4882 return 1; 4881 return 1;
4883 } 4882 }
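The two lines added above make handle_exception() skip the instruction when the #DB carries no debug-condition bit in DR6, which is the signature of icebp (INT1); the kernel expresses this with its DR6_RESERVED mask. A user-space sketch of the same test using only the architectural DR6 bit positions:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define DR6_B0_B3 0x0000000fu   /* breakpoint condition detected */
    #define DR6_BD    (1u << 13)    /* debug-register access detected */
    #define DR6_BS    (1u << 14)    /* single step */
    #define DR6_BT    (1u << 15)    /* task switch */

    static bool db_from_icebp(uint32_t dr6)
    {
            /* No condition bit set: the #DB was generated by icebp/INT1. */
            return (dr6 & (DR6_B0_B3 | DR6_BD | DR6_BS | DR6_BT)) == 0;
    }

    int main(void)
    {
            assert(db_from_icebp(0));
            assert(!db_from_icebp(DR6_BS));   /* single-step trap is a real condition */
            return 0;
    }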
@@ -5166,7 +5165,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
5166 return 1; 5165 return 1;
5167 kvm_register_write(vcpu, reg, val); 5166 kvm_register_write(vcpu, reg, val);
5168 } else 5167 } else
5169 if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg])) 5168 if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)))
5170 return 1; 5169 return 1;
5171 5170
5172 skip_emulated_instruction(vcpu); 5171 skip_emulated_instruction(vcpu);
@@ -5439,7 +5438,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
5439 } 5438 }
5440 5439
5441 /* clear all local breakpoint enable flags */ 5440 /* clear all local breakpoint enable flags */
5442 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); 5441 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55);
5443 5442
5444 /* 5443 /*
5445 * TODO: What about debug traps on tss switch? 5444 * TODO: What about debug traps on tss switch?
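The one-character fix above matters because the local breakpoint enables L0-L3 live in DR7 bits 0, 2, 4 and 6 (mask 0x55); the old decimal constant 55 (0x37) cleared the wrong bits, including two global enables. A quick standalone check of the two masks, for illustration:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t dr7 = 0xff;                    /* L0-L3 and G0-G3 all set */

            assert((dr7 & ~0x55u & 0xff) == 0xaa);  /* clears exactly L0-L3 */
            assert((dr7 & ~55u & 0xff) == 0xc8);    /* 55 == 0x37: clears L0, G0, L1, L2, G2 */
            return 0;
    }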
@@ -5565,6 +5564,10 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5565 gpa_t gpa; 5564 gpa_t gpa;
5566 5565
5567 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5566 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5567 if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5568 skip_emulated_instruction(vcpu);
5569 return 1;
5570 }
5568 5571
5569 ret = handle_mmio_page_fault_common(vcpu, gpa, true); 5572 ret = handle_mmio_page_fault_common(vcpu, gpa, true);
5570 if (likely(ret == RET_MMIO_PF_EMULATE)) 5573 if (likely(ret == RET_MMIO_PF_EMULATE))
@@ -5669,12 +5672,24 @@ static int handle_pause(struct kvm_vcpu *vcpu)
5669 return 1; 5672 return 1;
5670} 5673}
5671 5674
5672static int handle_invalid_op(struct kvm_vcpu *vcpu) 5675static int handle_nop(struct kvm_vcpu *vcpu)
5673{ 5676{
5674 kvm_queue_exception(vcpu, UD_VECTOR); 5677 skip_emulated_instruction(vcpu);
5675 return 1; 5678 return 1;
5676} 5679}
5677 5680
5681static int handle_mwait(struct kvm_vcpu *vcpu)
5682{
5683 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
5684 return handle_nop(vcpu);
5685}
5686
5687static int handle_monitor(struct kvm_vcpu *vcpu)
5688{
5689 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
5690 return handle_nop(vcpu);
5691}
5692
5678/* 5693/*
5679 * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. 5694 * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
5680 * We could reuse a single VMCS for all the L2 guests, but we also want the 5695 * We could reuse a single VMCS for all the L2 guests, but we also want the
@@ -5812,6 +5827,154 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
5812} 5827}
5813 5828
5814/* 5829/*
5830 * Decode the memory-address operand of a vmx instruction, as recorded on an
5831 * exit caused by such an instruction (run by a guest hypervisor).
5832 * On success, returns 0. When the operand is invalid, returns 1 and throws
5833 * #UD or #GP.
5834 */
5835static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
5836 unsigned long exit_qualification,
5837 u32 vmx_instruction_info, gva_t *ret)
5838{
5839 /*
5840 * According to Vol. 3B, "Information for VM Exits Due to Instruction
5841 * Execution", on an exit, vmx_instruction_info holds most of the
5842 * addressing components of the operand. Only the displacement part
5843 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
5844 * For how an actual address is calculated from all these components,
5845 * refer to Vol. 1, "Operand Addressing".
5846 */
5847 int scaling = vmx_instruction_info & 3;
5848 int addr_size = (vmx_instruction_info >> 7) & 7;
5849 bool is_reg = vmx_instruction_info & (1u << 10);
5850 int seg_reg = (vmx_instruction_info >> 15) & 7;
5851 int index_reg = (vmx_instruction_info >> 18) & 0xf;
5852 bool index_is_valid = !(vmx_instruction_info & (1u << 22));
5853 int base_reg = (vmx_instruction_info >> 23) & 0xf;
5854 bool base_is_valid = !(vmx_instruction_info & (1u << 27));
5855
5856 if (is_reg) {
5857 kvm_queue_exception(vcpu, UD_VECTOR);
5858 return 1;
5859 }
5860
5861 /* Addr = segment_base + offset */
5862 /* offset = base + [index * scale] + displacement */
5863 *ret = vmx_get_segment_base(vcpu, seg_reg);
5864 if (base_is_valid)
5865 *ret += kvm_register_read(vcpu, base_reg);
5866 if (index_is_valid)
5867 *ret += kvm_register_read(vcpu, index_reg)<<scaling;
5868 *ret += exit_qualification; /* holds the displacement */
5869
5870 if (addr_size == 1) /* 32 bit */
5871 *ret &= 0xffffffff;
5872
5873 /*
5874 * TODO: throw #GP (and return 1) in various cases that the VM*
5875 * instructions require it - e.g., offset beyond segment limit,
5876 * unusable or unreadable/unwritable segment, non-canonical 64-bit
5877 * address, and so on. Currently these are not checked.
5878 */
5879 return 0;
5880}
5881
5882/*
5883 * This function checks the vmpointer operand of VMXON/VMCLEAR/VMPTRLD:
5884 * - it must be 4KB aligned
5885 * - no bits beyond the physical address width may be set
5886 * Returns 0 on success, 1 otherwise.
5887 * (Intel SDM Section 30.3)
5888 */
5889static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
5890 gpa_t *vmpointer)
5891{
5892 gva_t gva;
5893 gpa_t vmptr;
5894 struct x86_exception e;
5895 struct page *page;
5896 struct vcpu_vmx *vmx = to_vmx(vcpu);
5897 int maxphyaddr = cpuid_maxphyaddr(vcpu);
5898
5899 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5900 vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
5901 return 1;
5902
5903 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
5904 sizeof(vmptr), &e)) {
5905 kvm_inject_page_fault(vcpu, &e);
5906 return 1;
5907 }
5908
5909 switch (exit_reason) {
5910 case EXIT_REASON_VMON:
5911 /*
5912 * SDM 3: 24.11.5
5913 * The first 4 bytes of VMXON region contain the supported
5914 * VMCS revision identifier
5915 *
5916 * Note: IA32_VMX_BASIC[48] is never 1 for the nested case,
5917 * so the full physical address width applies (that bit, when set,
5918 * would limit VMXON-region addresses to 32 bits).
5919 *
5920 */
5921 if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
5922 nested_vmx_failInvalid(vcpu);
5923 skip_emulated_instruction(vcpu);
5924 return 1;
5925 }
5926
5927 page = nested_get_page(vcpu, vmptr);
5928 if (page == NULL ||
5929 *(u32 *)kmap(page) != VMCS12_REVISION) {
5930 nested_vmx_failInvalid(vcpu);
5931 kunmap(page);
5932 skip_emulated_instruction(vcpu);
5933 return 1;
5934 }
5935 kunmap(page);
5936 vmx->nested.vmxon_ptr = vmptr;
5937 break;
5938 case EXIT_REASON_VMCLEAR:
5939 if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
5940 nested_vmx_failValid(vcpu,
5941 VMXERR_VMCLEAR_INVALID_ADDRESS);
5942 skip_emulated_instruction(vcpu);
5943 return 1;
5944 }
5945
5946 if (vmptr == vmx->nested.vmxon_ptr) {
5947 nested_vmx_failValid(vcpu,
5948 VMXERR_VMCLEAR_VMXON_POINTER);
5949 skip_emulated_instruction(vcpu);
5950 return 1;
5951 }
5952 break;
5953 case EXIT_REASON_VMPTRLD:
5954 if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
5955 nested_vmx_failValid(vcpu,
5956 VMXERR_VMPTRLD_INVALID_ADDRESS);
5957 skip_emulated_instruction(vcpu);
5958 return 1;
5959 }
5960
5961 if (vmptr == vmx->nested.vmxon_ptr) {
5962 nested_vmx_failValid(vcpu,
5963 VMXERR_VMCLEAR_VMXON_POINTER);
5964 skip_emulated_instruction(vcpu);
5965 return 1;
5966 }
5967 break;
5968 default:
5969 return 1; /* shouldn't happen */
5970 }
5971
5972 if (vmpointer)
5973 *vmpointer = vmptr;
5974 return 0;
5975}
5976
5977/*
5815 * Emulate the VMXON instruction. 5978 * Emulate the VMXON instruction.
5816 * Currently, we just remember that VMX is active, and do not save or even 5979 * Currently, we just remember that VMX is active, and do not save or even
5817 * inspect the argument to VMXON (the so-called "VMXON pointer") because we 5980 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
@@ -5849,6 +6012,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5849 kvm_inject_gp(vcpu, 0); 6012 kvm_inject_gp(vcpu, 0);
5850 return 1; 6013 return 1;
5851 } 6014 }
6015
6016 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
6017 return 1;
6018
5852 if (vmx->nested.vmxon) { 6019 if (vmx->nested.vmxon) {
5853 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 6020 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
5854 skip_emulated_instruction(vcpu); 6021 skip_emulated_instruction(vcpu);
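The get_vmx_mem_address() helper moved into the hunk above unpacks the operand description that the CPU records in the VM-exit instruction-information field. A self-contained sketch of the same bitfield layout (per the SDM's VM-exit instruction-information encoding), useful for eyeballing the shifts used in the patch; the decoded value is fabricated:

    #include <stdint.h>
    #include <stdio.h>

    /* Field layout of the VM-exit instruction-information word. */
    struct vmx_insn_info {
            int scaling;        /* bits 1:0 */
            int addr_size;      /* bits 9:7, 0=16-bit 1=32-bit 2=64-bit */
            int is_reg;         /* bit 10, operand is a register */
            int seg_reg;        /* bits 17:15 */
            int index_reg;      /* bits 21:18, valid when bit 22 is clear */
            int index_valid;
            int base_reg;       /* bits 26:23, valid when bit 27 is clear */
            int base_valid;
    };

    static struct vmx_insn_info decode(uint32_t info)
    {
            struct vmx_insn_info d = {
                    .scaling     = info & 3,
                    .addr_size   = (info >> 7) & 7,
                    .is_reg      = (info >> 10) & 1,
                    .seg_reg     = (info >> 15) & 7,
                    .index_reg   = (info >> 18) & 0xf,
                    .index_valid = !((info >> 22) & 1),
                    .base_reg    = (info >> 23) & 0xf,
                    .base_valid  = !((info >> 27) & 1),
            };
            return d;
    }

    int main(void)
    {
            /* Fabricated encoding: 64-bit address size, DS segment, base reg 5. */
            struct vmx_insn_info d = decode(0x02c18100);

            printf("addr_size=%d seg=%d base=%d base_valid=%d index_valid=%d\n",
                   d.addr_size, d.seg_reg, d.base_reg, d.base_valid, d.index_valid);
            return 0;
    }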
@@ -5971,87 +6138,19 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
5971 return 1; 6138 return 1;
5972} 6139}
5973 6140
5974/*
5975 * Decode the memory-address operand of a vmx instruction, as recorded on an
5976 * exit caused by such an instruction (run by a guest hypervisor).
5977 * On success, returns 0. When the operand is invalid, returns 1 and throws
5978 * #UD or #GP.
5979 */
5980static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
5981 unsigned long exit_qualification,
5982 u32 vmx_instruction_info, gva_t *ret)
5983{
5984 /*
5985 * According to Vol. 3B, "Information for VM Exits Due to Instruction
5986 * Execution", on an exit, vmx_instruction_info holds most of the
5987 * addressing components of the operand. Only the displacement part
5988 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
5989 * For how an actual address is calculated from all these components,
5990 * refer to Vol. 1, "Operand Addressing".
5991 */
5992 int scaling = vmx_instruction_info & 3;
5993 int addr_size = (vmx_instruction_info >> 7) & 7;
5994 bool is_reg = vmx_instruction_info & (1u << 10);
5995 int seg_reg = (vmx_instruction_info >> 15) & 7;
5996 int index_reg = (vmx_instruction_info >> 18) & 0xf;
5997 bool index_is_valid = !(vmx_instruction_info & (1u << 22));
5998 int base_reg = (vmx_instruction_info >> 23) & 0xf;
5999 bool base_is_valid = !(vmx_instruction_info & (1u << 27));
6000
6001 if (is_reg) {
6002 kvm_queue_exception(vcpu, UD_VECTOR);
6003 return 1;
6004 }
6005
6006 /* Addr = segment_base + offset */
6007 /* offset = base + [index * scale] + displacement */
6008 *ret = vmx_get_segment_base(vcpu, seg_reg);
6009 if (base_is_valid)
6010 *ret += kvm_register_read(vcpu, base_reg);
6011 if (index_is_valid)
6012 *ret += kvm_register_read(vcpu, index_reg)<<scaling;
6013 *ret += exit_qualification; /* holds the displacement */
6014
6015 if (addr_size == 1) /* 32 bit */
6016 *ret &= 0xffffffff;
6017
6018 /*
6019 * TODO: throw #GP (and return 1) in various cases that the VM*
6020 * instructions require it - e.g., offset beyond segment limit,
6021 * unusable or unreadable/unwritable segment, non-canonical 64-bit
6022 * address, and so on. Currently these are not checked.
6023 */
6024 return 0;
6025}
6026
6027/* Emulate the VMCLEAR instruction */ 6141/* Emulate the VMCLEAR instruction */
6028static int handle_vmclear(struct kvm_vcpu *vcpu) 6142static int handle_vmclear(struct kvm_vcpu *vcpu)
6029{ 6143{
6030 struct vcpu_vmx *vmx = to_vmx(vcpu); 6144 struct vcpu_vmx *vmx = to_vmx(vcpu);
6031 gva_t gva;
6032 gpa_t vmptr; 6145 gpa_t vmptr;
6033 struct vmcs12 *vmcs12; 6146 struct vmcs12 *vmcs12;
6034 struct page *page; 6147 struct page *page;
6035 struct x86_exception e;
6036 6148
6037 if (!nested_vmx_check_permission(vcpu)) 6149 if (!nested_vmx_check_permission(vcpu))
6038 return 1; 6150 return 1;
6039 6151
6040 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 6152 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
6041 vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
6042 return 1;
6043
6044 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
6045 sizeof(vmptr), &e)) {
6046 kvm_inject_page_fault(vcpu, &e);
6047 return 1;
6048 }
6049
6050 if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
6051 nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
6052 skip_emulated_instruction(vcpu);
6053 return 1; 6153 return 1;
6054 }
6055 6154
6056 if (vmptr == vmx->nested.current_vmptr) { 6155 if (vmptr == vmx->nested.current_vmptr) {
6057 nested_release_vmcs12(vmx); 6156 nested_release_vmcs12(vmx);
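handle_vmclear() now leaves pointer validation to nested_vmx_check_vmptr() added earlier in this patch; the essential test is that the operand is 4 KiB aligned and has no bits above the guest's physical-address width. A standalone sketch of that test, with maxphyaddr taken as an assumed input:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    static bool vmptr_ok(uint64_t vmptr, int maxphyaddr)
    {
            bool aligned  = (vmptr & 0xfffull) == 0;        /* 4 KiB aligned */
            bool in_range = (vmptr >> maxphyaddr) == 0;     /* fits the PA width */

            return aligned && in_range;
    }

    int main(void)
    {
            assert(vmptr_ok(0x123456000ull, 40));
            assert(!vmptr_ok(0x123456800ull, 40));          /* misaligned */
            assert(!vmptr_ok(1ull << 45, 40));              /* beyond MAXPHYADDR */
            return 0;
    }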
@@ -6372,29 +6471,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
6372static int handle_vmptrld(struct kvm_vcpu *vcpu) 6471static int handle_vmptrld(struct kvm_vcpu *vcpu)
6373{ 6472{
6374 struct vcpu_vmx *vmx = to_vmx(vcpu); 6473 struct vcpu_vmx *vmx = to_vmx(vcpu);
6375 gva_t gva;
6376 gpa_t vmptr; 6474 gpa_t vmptr;
6377 struct x86_exception e;
6378 u32 exec_control; 6475 u32 exec_control;
6379 6476
6380 if (!nested_vmx_check_permission(vcpu)) 6477 if (!nested_vmx_check_permission(vcpu))
6381 return 1; 6478 return 1;
6382 6479
6383 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 6480 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr))
6384 vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
6385 return 1;
6386
6387 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
6388 sizeof(vmptr), &e)) {
6389 kvm_inject_page_fault(vcpu, &e);
6390 return 1;
6391 }
6392
6393 if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
6394 nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
6395 skip_emulated_instruction(vcpu);
6396 return 1; 6481 return 1;
6397 }
6398 6482
6399 if (vmx->nested.current_vmptr != vmptr) { 6483 if (vmx->nested.current_vmptr != vmptr) {
6400 struct vmcs12 *new_vmcs12; 6484 struct vmcs12 *new_vmcs12;
@@ -6471,7 +6555,6 @@ static int handle_invept(struct kvm_vcpu *vcpu)
6471 struct { 6555 struct {
6472 u64 eptp, gpa; 6556 u64 eptp, gpa;
6473 } operand; 6557 } operand;
6474 u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
6475 6558
6476 if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || 6559 if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
6477 !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { 6560 !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
@@ -6511,16 +6594,13 @@ static int handle_invept(struct kvm_vcpu *vcpu)
6511 } 6594 }
6512 6595
6513 switch (type) { 6596 switch (type) {
6514 case VMX_EPT_EXTENT_CONTEXT:
6515 if ((operand.eptp & eptp_mask) !=
6516 (nested_ept_get_cr3(vcpu) & eptp_mask))
6517 break;
6518 case VMX_EPT_EXTENT_GLOBAL: 6597 case VMX_EPT_EXTENT_GLOBAL:
6519 kvm_mmu_sync_roots(vcpu); 6598 kvm_mmu_sync_roots(vcpu);
6520 kvm_mmu_flush_tlb(vcpu); 6599 kvm_mmu_flush_tlb(vcpu);
6521 nested_vmx_succeed(vcpu); 6600 nested_vmx_succeed(vcpu);
6522 break; 6601 break;
6523 default: 6602 default:
6603 /* Trap single context invalidation invept calls */
6524 BUG_ON(1); 6604 BUG_ON(1);
6525 break; 6605 break;
6526 } 6606 }
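Because single-context invalidation is no longer advertised to L1 (see the capability change earlier in the patch), a well-behaved guest hypervisor should only issue the global type, and the comment added above documents that anything else trips the BUG_ON. For background, over-invalidating (treating a single-context request as a global one) is always architecturally safe, which is why advertising only the global type loses no correctness; a small hedged sketch of that idea, not the kernel's code:

    #include <stdio.h>

    enum invept_type { INVEPT_SINGLE_CONTEXT = 1, INVEPT_GLOBAL = 2 };

    static void flush_all_ept_mappings(void)
    {
            puts("flush everything");   /* stand-in for syncing roots and flushing the TLB */
    }

    static void emulate_invept(enum invept_type type)
    {
            /* Over-invalidating on behalf of either type is architecturally safe. */
            switch (type) {
            case INVEPT_SINGLE_CONTEXT:
            case INVEPT_GLOBAL:
                    flush_all_ept_mappings();
                    break;
            }
    }

    int main(void)
    {
            emulate_invept(INVEPT_GLOBAL);
            return 0;
    }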
@@ -6571,8 +6651,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6571 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 6651 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
6572 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 6652 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
6573 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6653 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
6574 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, 6654 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
6575 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, 6655 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
6576 [EXIT_REASON_INVEPT] = handle_invept, 6656 [EXIT_REASON_INVEPT] = handle_invept,
6577}; 6657};
6578 6658
@@ -7413,7 +7493,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
7413 7493
7414 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 7494 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
7415 | (1 << VCPU_EXREG_RFLAGS) 7495 | (1 << VCPU_EXREG_RFLAGS)
7416 | (1 << VCPU_EXREG_CPL)
7417 | (1 << VCPU_EXREG_PDPTR) 7496 | (1 << VCPU_EXREG_PDPTR)
7418 | (1 << VCPU_EXREG_SEGMENTS) 7497 | (1 << VCPU_EXREG_SEGMENTS)
7419 | (1 << VCPU_EXREG_CR3)); 7498 | (1 << VCPU_EXREG_CR3));
@@ -8601,6 +8680,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
8601 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 8680 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
8602 exit_qualification); 8681 exit_qualification);
8603 8682
8683 if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
8684 && nested_exit_intr_ack_set(vcpu)) {
8685 int irq = kvm_cpu_get_interrupt(vcpu);
8686 WARN_ON(irq < 0);
8687 vmcs12->vm_exit_intr_info = irq |
8688 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
8689 }
8690
8604 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 8691 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
8605 vmcs12->exit_qualification, 8692 vmcs12->exit_qualification,
8606 vmcs12->idt_vectoring_info_field, 8693 vmcs12->idt_vectoring_info_field,
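When L1 asked for acknowledge-on-exit, the hunk above fetches the pending vector and stores it in the vmcs12 exit-interruption-information field. A sketch of how such a field is composed from the architectural encodings (valid bit 31, type in bits 10:8, vector in bits 7:0); the vector value is just an example:

    #include <assert.h>
    #include <stdint.h>

    #define INTR_INFO_VALID_MASK  0x80000000u
    #define INTR_TYPE_EXT_INTR    (0u << 8)     /* external interrupt */

    static uint32_t make_ext_intr_info(uint8_t vector)
    {
            return vector | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK;
    }

    int main(void)
    {
            assert(make_ext_intr_info(0x20) == 0x80000020u);
            return 0;
    }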
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 20316c67b824..f32a02578c0d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -704,25 +704,11 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
704 } 704 }
705 705
706 if (is_long_mode(vcpu)) { 706 if (is_long_mode(vcpu)) {
707 if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { 707 if (cr3 & CR3_L_MODE_RESERVED_BITS)
708 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) 708 return 1;
709 return 1; 709 } else if (is_pae(vcpu) && is_paging(vcpu) &&
710 } else 710 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
711 if (cr3 & CR3_L_MODE_RESERVED_BITS) 711 return 1;
712 return 1;
713 } else {
714 if (is_pae(vcpu)) {
715 if (cr3 & CR3_PAE_RESERVED_BITS)
716 return 1;
717 if (is_paging(vcpu) &&
718 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
719 return 1;
720 }
721 /*
722 * We don't check reserved bits in nonpae mode, because
723 * this isn't enforced, and VMware depends on this.
724 */
725 }
726 712
727 vcpu->arch.cr3 = cr3; 713 vcpu->arch.cr3 = cr3;
728 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 714 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
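The simplified kvm_set_cr3() above keeps a single reserved-bit test for long mode and, as before, skips reserved-bit checking in non-PAE mode. As a rough illustration only, assuming the reserved mask is simply the bits above the physical-address width (which is not exactly the kernel's CR3_L_MODE_RESERVED_BITS definition):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    static bool cr3_reserved_bits_set(uint64_t cr3, int maxphyaddr)
    {
            uint64_t reserved = ~((1ull << maxphyaddr) - 1);   /* assumed mask */

            return (cr3 & reserved) != 0;
    }

    int main(void)
    {
            assert(!cr3_reserved_bits_set(0x1000, 40));
            assert(cr3_reserved_bits_set(1ull << 52, 40));
            return 0;
    }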
@@ -1935,6 +1921,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1935 1921
1936 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { 1922 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1937 vcpu->arch.hv_vapic = data; 1923 vcpu->arch.hv_vapic = data;
1924 if (kvm_lapic_enable_pv_eoi(vcpu, 0))
1925 return 1;
1938 break; 1926 break;
1939 } 1927 }
1940 gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; 1928 gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
@@ -1945,6 +1933,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1945 return 1; 1933 return 1;
1946 vcpu->arch.hv_vapic = data; 1934 vcpu->arch.hv_vapic = data;
1947 mark_page_dirty(vcpu->kvm, gfn); 1935 mark_page_dirty(vcpu->kvm, gfn);
1936 if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
1937 return 1;
1948 break; 1938 break;
1949 } 1939 }
1950 case HV_X64_MSR_EOI: 1940 case HV_X64_MSR_EOI:
@@ -2647,6 +2637,7 @@ int kvm_dev_ioctl_check_extension(long ext)
2647 case KVM_CAP_IRQ_INJECT_STATUS: 2637 case KVM_CAP_IRQ_INJECT_STATUS:
2648 case KVM_CAP_IRQFD: 2638 case KVM_CAP_IRQFD:
2649 case KVM_CAP_IOEVENTFD: 2639 case KVM_CAP_IOEVENTFD:
2640 case KVM_CAP_IOEVENTFD_NO_LENGTH:
2650 case KVM_CAP_PIT2: 2641 case KVM_CAP_PIT2:
2651 case KVM_CAP_PIT_STATE2: 2642 case KVM_CAP_PIT_STATE2:
2652 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 2643 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
@@ -3649,11 +3640,19 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3649 offset = i * BITS_PER_LONG; 3640 offset = i * BITS_PER_LONG;
3650 kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); 3641 kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
3651 } 3642 }
3652 if (is_dirty)
3653 kvm_flush_remote_tlbs(kvm);
3654 3643
3655 spin_unlock(&kvm->mmu_lock); 3644 spin_unlock(&kvm->mmu_lock);
3656 3645
3646 /* See the comments in kvm_mmu_slot_remove_write_access(). */
3647 lockdep_assert_held(&kvm->slots_lock);
3648
3649 /*
3650 * All the TLBs can be flushed out of mmu lock, see the comments in
3651 * kvm_mmu_slot_remove_write_access().
3652 */
3653 if (is_dirty)
3654 kvm_flush_remote_tlbs(kvm);
3655
3657 r = -EFAULT; 3656 r = -EFAULT;
3658 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 3657 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
3659 goto out; 3658 goto out;
@@ -4489,8 +4488,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
4489 unsigned short port, void *val, 4488 unsigned short port, void *val,
4490 unsigned int count, bool in) 4489 unsigned int count, bool in)
4491{ 4490{
4492 trace_kvm_pio(!in, port, size, count);
4493
4494 vcpu->arch.pio.port = port; 4491 vcpu->arch.pio.port = port;
4495 vcpu->arch.pio.in = in; 4492 vcpu->arch.pio.in = in;
4496 vcpu->arch.pio.count = count; 4493 vcpu->arch.pio.count = count;
@@ -4525,6 +4522,7 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
4525 if (ret) { 4522 if (ret) {
4526data_avail: 4523data_avail:
4527 memcpy(val, vcpu->arch.pio_data, size * count); 4524 memcpy(val, vcpu->arch.pio_data, size * count);
4525 trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
4528 vcpu->arch.pio.count = 0; 4526 vcpu->arch.pio.count = 0;
4529 return 1; 4527 return 1;
4530 } 4528 }
@@ -4539,6 +4537,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
4539 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4537 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4540 4538
4541 memcpy(vcpu->arch.pio_data, val, size * count); 4539 memcpy(vcpu->arch.pio_data, val, size * count);
4540 trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
4542 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); 4541 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
4543} 4542}
4544 4543
@@ -4650,11 +4649,6 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4650 return res; 4649 return res;
4651} 4650}
4652 4651
4653static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
4654{
4655 kvm_set_rflags(emul_to_vcpu(ctxt), val);
4656}
4657
4658static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) 4652static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
4659{ 4653{
4660 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); 4654 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4839,7 +4833,6 @@ static const struct x86_emulate_ops emulate_ops = {
4839 .set_idt = emulator_set_idt, 4833 .set_idt = emulator_set_idt,
4840 .get_cr = emulator_get_cr, 4834 .get_cr = emulator_get_cr,
4841 .set_cr = emulator_set_cr, 4835 .set_cr = emulator_set_cr,
4842 .set_rflags = emulator_set_rflags,
4843 .cpl = emulator_get_cpl, 4836 .cpl = emulator_get_cpl,
4844 .get_dr = emulator_get_dr, 4837 .get_dr = emulator_get_dr,
4845 .set_dr = emulator_set_dr, 4838 .set_dr = emulator_set_dr,
@@ -4905,7 +4898,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4905 ctxt->eip = kvm_rip_read(vcpu); 4898 ctxt->eip = kvm_rip_read(vcpu);
4906 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 4899 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4907 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : 4900 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
4908 cs_l ? X86EMUL_MODE_PROT64 : 4901 (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
4909 cs_db ? X86EMUL_MODE_PROT32 : 4902 cs_db ? X86EMUL_MODE_PROT32 :
4910 X86EMUL_MODE_PROT16; 4903 X86EMUL_MODE_PROT16;
4911 ctxt->guest_mode = is_guest_mode(vcpu); 4904 ctxt->guest_mode = is_guest_mode(vcpu);
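The init_emulate_ctxt() change above only selects 64-bit emulation when CS.L is set and the vCPU is actually in long mode, since CS.L is meaningless outside IA-32e mode. A compact sketch of that mode-selection order; the names here are ad hoc, not the emulator's:

    #include <assert.h>
    #include <stdbool.h>

    enum emul_mode { MODE_REAL, MODE_VM86, MODE_PROT16, MODE_PROT32, MODE_PROT64 };

    static enum emul_mode pick_mode(bool protected_mode, bool eflags_vm,
                                    bool cs_l, bool long_mode, bool cs_db)
    {
            if (!protected_mode)
                    return MODE_REAL;
            if (eflags_vm)
                    return MODE_VM86;
            if (cs_l && long_mode)          /* CS.L only counts in IA-32e mode */
                    return MODE_PROT64;
            return cs_db ? MODE_PROT32 : MODE_PROT16;
    }

    int main(void)
    {
            assert(pick_mode(true, false, true, false, true) == MODE_PROT32);
            assert(pick_mode(true, false, true, true, true) == MODE_PROT64);
            return 0;
    }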
@@ -7333,8 +7326,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7333 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 7326 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
7334 /* 7327 /*
7335 * Write protect all pages for dirty logging. 7328 * Write protect all pages for dirty logging.
7336 * Existing largepage mappings are destroyed here and new ones will 7329 *
7337 * not be created until the end of the logging. 7330 * All the sptes including the large sptes which point to this
7331 * slot are set to readonly. We can not create any new large
7332 * spte on this slot until the end of the logging.
7333 *
7334 * See the comments in fast_page_fault().
7338 */ 7335 */
7339 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) 7336 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
7340 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 7337 kvm_mmu_slot_remove_write_access(kvm, mem->slot);