Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/emulate.c      |  52
-rw-r--r--  arch/x86/kvm/i8259.c        |  25
-rw-r--r--  arch/x86/kvm/lapic.c        |  13
-rw-r--r--  arch/x86/kvm/lapic.h        |   1
-rw-r--r--  arch/x86/kvm/mmu.c          | 150
-rw-r--r--  arch/x86/kvm/paging_tmpl.h  |  17
-rw-r--r--  arch/x86/kvm/svm.c          |  27
-rw-r--r--  arch/x86/kvm/vmx.c          | 128
-rw-r--r--  arch/x86/kvm/x86.c          | 153
9 files changed, 298 insertions(+), 268 deletions(-)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index caf966781d25..0ad47b819a8b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -76,6 +76,7 @@
 #define Group       (1<<14) /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15) /* Alternate decoding of mod == 3 */
 /* Misc flags */
+#define VendorSpecific (1<<22) /* Vendor specific instruction */
 #define NoAccess    (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
 #define Op3264      (1<<24) /* Operand is 64b in long mode, 32b otherwise */
 #define Undefined   (1<<25) /* No Such Instruction */
@@ -877,7 +878,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 	if (selector & 1 << 2) {
 		struct desc_struct desc;
 		memset (dt, 0, sizeof *dt);
-		if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
+		if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR,
+						ctxt->vcpu))
 			return;
 
 		dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
@@ -929,6 +931,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	return ret;
 }
 
+/* Does not support long mode */
 static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				   struct x86_emulate_ops *ops,
 				   u16 selector, int seg)
@@ -1040,7 +1043,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	}
 load:
 	ops->set_segment_selector(selector, seg, ctxt->vcpu);
-	ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
+	ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu);
 	return X86EMUL_CONTINUE;
 exception:
 	emulate_exception(ctxt, err_vec, err_code, true);
@@ -1560,7 +1563,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 			struct desc_struct *ss)
 {
 	memset(cs, 0, sizeof(struct desc_struct));
-	ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu);
 	memset(ss, 0, sizeof(struct desc_struct));
 
 	cs->l = 0;	/* will be adjusted later */
@@ -1607,9 +1610,9 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		cs.d = 0;
 		cs.l = 1;
 	}
-	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
 	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
-	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
 	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
 
 	c->regs[VCPU_REGS_RCX] = c->eip;
@@ -1679,9 +1682,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		cs.l = 1;
 	}
 
-	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
 	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
-	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
 	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
 
 	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
@@ -1736,9 +1739,9 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	cs_sel |= SELECTOR_RPL_MASK;
 	ss_sel |= SELECTOR_RPL_MASK;
 
-	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
 	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
-	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
 	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
 
 	c->eip = c->regs[VCPU_REGS_RDX];
@@ -1764,24 +1767,28 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
 					    u16 port, u16 len)
 {
 	struct desc_struct tr_seg;
+	u32 base3;
 	int r;
-	u16 io_bitmap_ptr;
-	u8 perm, bit_idx = port & 0x7;
+	u16 io_bitmap_ptr, perm, bit_idx = port & 0x7;
 	unsigned mask = (1 << len) - 1;
+	unsigned long base;
 
-	ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
+	ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu);
 	if (!tr_seg.p)
 		return false;
 	if (desc_limit_scaled(&tr_seg) < 103)
 		return false;
-	r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
-			  ctxt->vcpu, NULL);
+	base = get_desc_base(&tr_seg);
+#ifdef CONFIG_X86_64
+	base |= ((u64)base3) << 32;
+#endif
+	r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL);
 	if (r != X86EMUL_CONTINUE)
 		return false;
 	if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
 		return false;
-	r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
-			  &perm, 1, ctxt->vcpu, NULL);
+	r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu,
+			  NULL);
 	if (r != X86EMUL_CONTINUE)
 		return false;
 	if ((perm >> bit_idx) & mask)
@@ -2126,7 +2133,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	}
 
 	ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
-	ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
+	ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu);
 	ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
 
 	if (has_error_code) {
@@ -2365,7 +2372,8 @@ static struct group_dual group7 = { {
 	D(SrcMem16 | ModRM | Mov | Priv),
 	D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
 }, {
-	D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
+	D(SrcNone | ModRM | Priv | VendorSpecific), N,
+	N, D(SrcNone | ModRM | Priv | VendorSpecific),
 	D(SrcNone | ModRM | DstMem | Mov), N,
 	D(SrcMem16 | ModRM | Mov | Priv), N,
 } };
@@ -2489,7 +2497,7 @@ static struct opcode opcode_table[256] = {
 static struct opcode twobyte_table[256] = {
 	/* 0x00 - 0x0F */
 	N, GD(0, &group7), N, N,
-	N, D(ImplicitOps), D(ImplicitOps | Priv), N,
+	N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N,
 	D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
 	N, D(ImplicitOps | ModRM), N, N,
 	/* 0x10 - 0x1F */
@@ -2502,7 +2510,8 @@ static struct opcode twobyte_table[256] = {
 	/* 0x30 - 0x3F */
 	D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
 	D(ImplicitOps | Priv), N,
-	D(ImplicitOps), D(ImplicitOps | Priv), N, N,
+	D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
+	N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x40 - 0x4F */
 	X16(D(DstReg | SrcMem | ModRM | Mov)),
@@ -2741,6 +2750,9 @@ done_prefixes:
 	if (c->d == 0 || (c->d & Undefined))
 		return -1;
 
+	if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
+		return -1;
+
 	if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
 		c->op_bytes = 8;
 
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 3cece05e4ac4..19fe855e7953 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -62,9 +62,6 @@ static void pic_unlock(struct kvm_pic *s)
 		}
 
 		if (!found)
-			found = s->kvm->bsp_vcpu;
-
-		if (!found)
 			return;
 
 		kvm_make_request(KVM_REQ_EVENT, found);
@@ -75,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s)
 static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 {
 	s->isr &= ~(1 << irq);
-	s->isr_ack |= (1 << irq);
 	if (s != &s->pics_state->pics[0])
 		irq += 8;
 	/*
@@ -89,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 	pic_lock(s->pics_state);
 }
 
-void kvm_pic_clear_isr_ack(struct kvm *kvm)
-{
-	struct kvm_pic *s = pic_irqchip(kvm);
-
-	pic_lock(s);
-	s->pics[0].isr_ack = 0xff;
-	s->pics[1].isr_ack = 0xff;
-	pic_unlock(s);
-}
-
 /*
  * set irq level. If an edge is detected, then the IRR is set to 1
  */
@@ -281,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
 	s->irr = 0;
 	s->imr = 0;
 	s->isr = 0;
-	s->isr_ack = 0xff;
 	s->priority_add = 0;
 	s->irq_base = 0;
 	s->read_reg_select = 0;
@@ -545,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this,
  */
 static void pic_irq_request(struct kvm *kvm, int level)
 {
-	struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
 	struct kvm_pic *s = pic_irqchip(kvm);
-	int irq = pic_get_irq(&s->pics[0]);
 
-	s->output = level;
-	if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
-		s->pics[0].isr_ack &= ~(1 << irq);
+	if (!s->output)
 		s->wakeup_needed = true;
-	}
+	s->output = level;
 }
 
 static const struct kvm_io_device_ops picdev_ops = {
@@ -575,8 +556,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 	s->pics[1].elcr_mask = 0xde;
 	s->pics[0].pics_state = s;
 	s->pics[1].pics_state = s;
-	s->pics[0].isr_ack = 0xff;
-	s->pics[1].isr_ack = 0xff;
 
 	/*
 	 * Initialize PIO device
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93cf9d0d3653..2b2255b1f04b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -417,10 +417,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	case APIC_DM_INIT:
 		if (level) {
 			result = 1;
-			if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
-				printk(KERN_DEBUG
-				       "INIT on a runnable vcpu %d\n",
-				       vcpu->vcpu_id);
 			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 			kvm_make_request(KVM_REQ_EVENT, vcpu);
 			kvm_vcpu_kick(vcpu);
@@ -875,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
 
 	hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
 
-	if (vcpu->arch.apic->regs_page)
-		__free_page(vcpu->arch.apic->regs_page);
+	if (vcpu->arch.apic->regs)
+		free_page((unsigned long)vcpu->arch.apic->regs);
 
 	kfree(vcpu->arch.apic);
 }
@@ -1065,13 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.apic = apic;
 
-	apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
-	if (apic->regs_page == NULL) {
+	apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!apic->regs) {
 		printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
 		       vcpu->vcpu_id);
 		goto nomem_free_apic;
 	}
-	apic->regs = page_address(apic->regs_page);
 	apic->vcpu = vcpu;
 
 	hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f5fe32c5edad..52c9e6b9e725 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,7 +13,6 @@ struct kvm_lapic {
 	u32 divide_count;
 	struct kvm_vcpu *vcpu;
 	bool irr_pending;
-	struct page *regs_page;
 	void *regs;
 	gpa_t vapic_addr;
 	struct page *vapic_page;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f02b8edc3d44..22fae7593ee7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -111,9 +111,6 @@ module_param(oos_shadow, bool, 0644);
 #define PT64_LEVEL_SHIFT(level) \
 		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
 
-#define PT64_LEVEL_MASK(level) \
-		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
-
 #define PT64_INDEX(address, level)\
 	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
 
@@ -123,8 +120,6 @@ module_param(oos_shadow, bool, 0644);
 #define PT32_LEVEL_SHIFT(level) \
 		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
 
-#define PT32_LEVEL_MASK(level) \
-		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
 #define PT32_LVL_OFFSET_MASK(level) \
 	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
 						* PT32_LEVEL_BITS))) - 1))
@@ -379,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
 				       int min)
 {
-	struct page *page;
+	void *page;
 
 	if (cache->nobjs >= min)
 		return 0;
 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-		page = alloc_page(GFP_KERNEL);
+		page = (void *)__get_free_page(GFP_KERNEL);
 		if (!page)
 			return -ENOMEM;
-		cache->objects[cache->nobjs++] = page_address(page);
+		cache->objects[cache->nobjs++] = page;
 	}
 	return 0;
 }
@@ -554,13 +549,23 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 	return ret;
 }
 
-static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static struct kvm_memory_slot *
+gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
+			    bool no_dirty_log)
 {
 	struct kvm_memory_slot *slot;
-	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
-	if (slot && slot->dirty_bitmap)
-		return true;
-	return false;
+
+	slot = gfn_to_memslot(vcpu->kvm, gfn);
+	if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
+	      (no_dirty_log && slot->dirty_bitmap))
+		slot = NULL;
+
+	return slot;
+}
+
+static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+	return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
 }
 
 static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -1032,9 +1037,9 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	ASSERT(is_empty_shadow_page(sp->spt));
 	hlist_del(&sp->hash_link);
 	list_del(&sp->link);
-	__free_page(virt_to_page(sp->spt));
+	free_page((unsigned long)sp->spt);
 	if (!sp->role.direct)
-		__free_page(virt_to_page(sp->gfns));
+		free_page((unsigned long)sp->gfns);
 	kmem_cache_free(mmu_page_header_cache, sp);
 	kvm_mod_used_mmu_pages(kvm, -1);
 }
@@ -1199,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
 }
 
+static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
+				 struct kvm_mmu_page *sp, u64 *spte,
+				 const void *pte, unsigned long mmu_seq)
+{
+	WARN_ON(1);
+}
+
 #define KVM_PAGE_ARRAY_NR 16
 
 struct kvm_mmu_pages {
@@ -2150,26 +2162,13 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
 
-static struct kvm_memory_slot *
-pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
-{
-	struct kvm_memory_slot *slot;
-
-	slot = gfn_to_memslot(vcpu->kvm, gfn);
-	if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
-	      (no_dirty_log && slot->dirty_bitmap))
-		slot = NULL;
-
-	return slot;
-}
-
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 				     bool no_dirty_log)
 {
 	struct kvm_memory_slot *slot;
 	unsigned long hva;
 
-	slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
+	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
 	if (!slot) {
 		get_page(bad_page);
 		return page_to_pfn(bad_page);
@@ -2190,7 +2189,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 	gfn_t gfn;
 
 	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
-	if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
+	if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
 		return -1;
 
 	ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
@@ -2804,6 +2803,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
 	context->prefetch_page = nonpaging_prefetch_page;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
+	context->update_pte = nonpaging_update_pte;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -2933,6 +2933,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 	context->prefetch_page = paging64_prefetch_page;
 	context->sync_page = paging64_sync_page;
 	context->invlpg = paging64_invlpg;
+	context->update_pte = paging64_update_pte;
 	context->free = paging_free;
 	context->root_level = level;
 	context->shadow_root_level = level;
@@ -2961,6 +2962,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 	context->prefetch_page = paging32_prefetch_page;
 	context->sync_page = paging32_sync_page;
 	context->invlpg = paging32_invlpg;
+	context->update_pte = paging32_update_pte;
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -2985,6 +2987,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->prefetch_page = nonpaging_prefetch_page;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
+	context->update_pte = nonpaging_update_pte;
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = true;
@@ -3089,8 +3092,6 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 
 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.update_pte.pfn = bad_pfn;
-
 	if (mmu_is_nested(vcpu))
 		return init_kvm_nested_mmu(vcpu);
 	else if (tdp_enabled)
@@ -3164,7 +3165,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *sp,
 				  u64 *spte,
-				  const void *new)
+				  const void *new, unsigned long mmu_seq)
 {
 	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
 		++vcpu->kvm->stat.mmu_pde_zapped;
@@ -3172,10 +3173,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 	}
 
 	++vcpu->kvm->stat.mmu_pte_updated;
-	if (!sp->role.cr4_pae)
-		paging32_update_pte(vcpu, sp, spte, new);
-	else
-		paging64_update_pte(vcpu, sp, spte, new);
+	vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq);
 }
 
 static bool need_remote_flush(u64 old, u64 new)
@@ -3210,28 +3208,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
 	return !!(spte && (*spte & shadow_accessed_mask));
 }
 
-static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-					  u64 gpte)
-{
-	gfn_t gfn;
-	pfn_t pfn;
-
-	if (!is_present_gpte(gpte))
-		return;
-	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
-
-	vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	smp_rmb();
-	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-
-	if (is_error_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
-		return;
-	}
-	vcpu->arch.update_pte.gfn = gfn;
-	vcpu->arch.update_pte.pfn = pfn;
-}
-
 static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
 	u64 *spte = vcpu->arch.last_pte_updated;
@@ -3253,21 +3229,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node;
 	LIST_HEAD(invalid_list);
-	u64 entry, gentry;
-	u64 *spte;
-	unsigned offset = offset_in_page(gpa);
-	unsigned pte_size;
-	unsigned page_offset;
-	unsigned misaligned;
-	unsigned quadrant;
-	int level;
-	int flooded = 0;
-	int npte;
-	int r;
-	int invlpg_counter;
+	unsigned long mmu_seq;
+	u64 entry, gentry, *spte;
+	unsigned pte_size, page_offset, misaligned, quadrant, offset;
+	int level, npte, invlpg_counter, r, flooded = 0;
 	bool remote_flush, local_flush, zap_page;
 
 	zap_page = remote_flush = local_flush = false;
+	offset = offset_in_page(gpa);
 
 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 
@@ -3275,9 +3244,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 	/*
 	 * Assume that the pte write on a page table of the same type
-	 * as the current vcpu paging mode. This is nearly always true
-	 * (might be false while changing modes). Note it is verified later
-	 * by update_pte().
+	 * as the current vcpu paging mode since we update the sptes only
+	 * when they have the same mode.
 	 */
 	if ((is_pae(vcpu) && bytes == 4) || !new) {
 		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
@@ -3303,15 +3271,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		break;
 	}
 
-	mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
+	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	smp_rmb();
+
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
 		gentry = 0;
-	kvm_mmu_access_page(vcpu, gfn);
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 	if (guest_initiated) {
+		kvm_mmu_access_page(vcpu, gfn);
 		if (gfn == vcpu->arch.last_pt_write_gfn
 		    && !last_updated_pte_accessed(vcpu)) {
 			++vcpu->arch.last_pt_write_count;
@@ -3375,7 +3345,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			if (gentry &&
 			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
 			      & mask.word))
-				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
+				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry,
+						      mmu_seq);
 			if (!remote_flush && need_remote_flush(entry, *spte))
 				remote_flush = true;
 			++spte;
@@ -3385,10 +3356,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
 	spin_unlock(&vcpu->kvm->mmu_lock);
-	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
-		kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
-		vcpu->arch.update_pte.pfn = bad_pfn;
-	}
 }
 
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -3538,14 +3505,23 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 		if (!test_bit(slot, sp->slot_bitmap))
 			continue;
 
-		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
-			continue;
-
 		pt = sp->spt;
-		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+			if (!is_shadow_present_pte(pt[i]) ||
+			      !is_last_spte(pt[i], sp->role.level))
+				continue;
+
+			if (is_large_pte(pt[i])) {
+				drop_spte(kvm, &pt[i],
+					  shadow_trap_nonpresent_pte);
+				--kvm->stat.lpages;
+				continue;
+			}
+
 			/* avoid RMW */
 			if (is_writable_pte(pt[i]))
 				update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
+		}
 	}
 	kvm_flush_remote_tlbs(kvm);
 }
@@ -3583,7 +3559,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 	if (nr_to_scan == 0)
 		goto out;
 
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		int idx, freed_pages;
@@ -3606,7 +3582,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 		if (kvm_freed)
 			list_move_tail(&kvm_freed->vm_list, &vm_list);
 
-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);
 
 out:
 	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bccc24c4181..751405097d8c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -31,7 +31,6 @@
 	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
 	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
 	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
-	#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
 	#define PT_LEVEL_BITS PT64_LEVEL_BITS
 	#ifdef CONFIG_X86_64
 	#define PT_MAX_FULL_LEVELS 4
@@ -48,7 +47,6 @@
 	#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
 	#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
 	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
-	#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
 	#define PT_LEVEL_BITS PT32_LEVEL_BITS
 	#define PT_MAX_FULL_LEVELS 2
 	#define CMPXCHG cmpxchg
@@ -327,7 +325,7 @@ no_present:
 }
 
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			      u64 *spte, const void *pte)
+			      u64 *spte, const void *pte, unsigned long mmu_seq)
 {
 	pt_element_t gpte;
 	unsigned pte_access;
@@ -339,14 +337,14 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
 	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
-	if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
+	pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
+	if (is_error_pfn(pfn)) {
+		kvm_release_pfn_clean(pfn);
 		return;
-	pfn = vcpu->arch.update_pte.pfn;
-	if (is_error_pfn(pfn))
-		return;
-	if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
+	}
+	if (mmu_notifier_retry(vcpu, mmu_seq))
 		return;
-	kvm_get_pfn(pfn);
+
 	/*
 	 * we call mmu_set_spte() with host_writable = true beacuse that
 	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
@@ -829,7 +827,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 #undef FNAME
 #undef PT_BASE_ADDR_MASK
 #undef PT_INDEX
-#undef PT_LEVEL_MASK
 #undef PT_LVL_ADDR_MASK
 #undef PT_LVL_OFFSET_MASK
 #undef PT_LEVEL_BITS
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 63fec1531e89..6bb15d583e47 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -135,6 +135,8 @@ struct vcpu_svm {
 
 	u32 *msrpm;
 
+	ulong nmi_iret_rip;
+
 	struct nested_state nested;
 
 	bool nmi_singlestep;
@@ -1153,8 +1155,10 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
 	load_gs_index(svm->host.gs);
 #else
+#ifdef CONFIG_X86_32_LAZY_GS
 	loadsegment(gs, svm->host.gs);
 #endif
+#endif
 	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 }
@@ -2653,6 +2657,7 @@ static int iret_interception(struct vcpu_svm *svm)
 	++svm->vcpu.stat.nmi_window_exits;
 	clr_intercept(svm, INTERCEPT_IRET);
 	svm->vcpu.arch.hflags |= HF_IRET_MASK;
+	svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
 	return 1;
 }
 
@@ -3474,7 +3479,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 
 	svm->int3_injected = 0;
 
-	if (svm->vcpu.arch.hflags & HF_IRET_MASK) {
+	/*
+	 * If we've made progress since setting HF_IRET_MASK, we've
+	 * executed an IRET and can allow NMI injection.
+	 */
+	if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
+	    && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
 		svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
 		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 	}
@@ -3641,19 +3651,30 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	wrmsrl(MSR_GS_BASE, svm->host.gs_base);
 #else
 	loadsegment(fs, svm->host.fs);
+#ifndef CONFIG_X86_32_LAZY_GS
+	loadsegment(gs, svm->host.gs);
+#endif
 #endif
 
 	reload_tss(vcpu);
 
 	local_irq_disable();
 
-	stgi();
-
 	vcpu->arch.cr2 = svm->vmcb->save.cr2;
 	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
 	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
 	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
 
+	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+		kvm_before_handle_nmi(&svm->vcpu);
+
+	stgi();
+
+	/* Any pending NMI will happen here */
+
+	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+		kvm_after_handle_nmi(&svm->vcpu);
+
 	sync_cr8_to_lapic(vcpu);
 
 	svm->next_rip = 0;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bf89ec2cfb82..5b4cdcbd154c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -93,14 +93,14 @@ module_param(yield_on_hlt, bool, S_IRUGO);
  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
  * ple_gap: upper bound on the amount of time between two successive
  * executions of PAUSE in a loop. Also indicate if ple enabled.
- * According to test, this time is usually small than 41 cycles.
+ * According to test, this time is usually smaller than 128 cycles.
  * ple_window: upper bound on the amount of time a guest is allowed to execute
  * in a PAUSE loop. Tests indicate that most spinlocks are held for
  * less than 2^12 cycles
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer SDM volume 3b section 21.6.13 & 22.1.3.
 */
-#define KVM_VMX_DEFAULT_PLE_GAP 41
+#define KVM_VMX_DEFAULT_PLE_GAP 128
 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096
 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
 module_param(ple_gap, int, S_IRUGO);
@@ -176,11 +176,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
-static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1333,19 +1333,25 @@ static __init int vmx_disabled_by_bios(void)
 
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
 	if (msr & FEATURE_CONTROL_LOCKED) {
+		/* launched w/ TXT and VMX disabled */
 		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
 			&& tboot_enabled())
 			return 1;
+		/* launched w/o TXT and VMX only enabled w/ TXT */
 		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
+			&& (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
 			&& !tboot_enabled()) {
 			printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
-				" activate TXT before enabling KVM\n");
+				"activate TXT before enabling KVM\n");
 			return 1;
 		}
+		/* launched w/o TXT and VMX disabled */
+		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
+			&& !tboot_enabled())
+			return 1;
 	}
 
 	return 0;
-	/* locked but not enabled */
 }
 
 static void kvm_cpu_vmxon(u64 addr)
@@ -1683,6 +1689,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 	vmx->emulation_required = 1;
 	vmx->rmode.vm86_active = 0;
 
+	vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
 	vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
 	vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
 	vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
@@ -1756,6 +1763,19 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	vmx->emulation_required = 1;
 	vmx->rmode.vm86_active = 1;
 
+	/*
+	 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
+	 * vcpu. Call it here with phys address pointing 16M below 4G.
+	 */
+	if (!vcpu->kvm->arch.tss_addr) {
+		printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
+			     "called before entering vcpu\n");
+		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+		vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
+		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+	}
+
+	vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
 	vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
 	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
 
@@ -1794,7 +1814,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 
 continue_rmode:
 	kvm_mmu_reset_context(vcpu);
-	init_rmode(vcpu->kvm);
 }
 
 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -2030,23 +2049,40 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	vmcs_writel(GUEST_CR4, hw_cr4);
 }
 
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
-	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
-	return vmcs_readl(sf->base);
-}
-
 static void vmx_get_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+	struct kvm_save_segment *save;
 	u32 ar;
 
+	if (vmx->rmode.vm86_active
+	    && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
+		|| seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
+		|| seg == VCPU_SREG_GS)
+	    && !emulate_invalid_guest_state) {
+		switch (seg) {
+		case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
+		case VCPU_SREG_ES: save = &vmx->rmode.es; break;
+		case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
+		case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
+		case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
+		default: BUG();
+		}
+		var->selector = save->selector;
+		var->base = save->base;
+		var->limit = save->limit;
+		ar = save->ar;
+		if (seg == VCPU_SREG_TR
+		    || var->selector == vmcs_read16(sf->selector))
+			goto use_saved_rmode_seg;
+	}
 	var->base = vmcs_readl(sf->base);
 	var->limit = vmcs_read32(sf->limit);
 	var->selector = vmcs_read16(sf->selector);
 	ar = vmcs_read32(sf->ar_bytes);
+use_saved_rmode_seg:
 	if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
 		ar = 0;
 	var->type = ar & 15;
@@ -2060,6 +2096,18 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
 	var->unusable = (ar >> 16) & 1;
 }
 
+static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+	struct kvm_segment s;
+
+	if (to_vmx(vcpu)->rmode.vm86_active) {
+		vmx_get_segment(vcpu, &s, seg);
+		return s.base;
+	}
+	return vmcs_readl(sf->base);
+}
+
 static int vmx_get_cpl(struct kvm_vcpu *vcpu)
 {
 	if (!is_protmode(vcpu))
@@ -2101,6 +2149,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 	u32 ar;
 
 	if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
+		vmcs_write16(sf->selector, var->selector);
 		vmx->rmode.tr.selector = var->selector;
 		vmx->rmode.tr.base = var->base;
 		vmx->rmode.tr.limit = var->limit;
@@ -2361,11 +2410,12 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
 
 static int init_rmode_tss(struct kvm *kvm)
 {
-	gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
+	gfn_t fn;
 	u16 data = 0;
-	int ret = 0;
-	int r;
+	int r, idx, ret = 0;
 
+	idx = srcu_read_lock(&kvm->srcu);
+	fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
 	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
 	if (r < 0)
 		goto out;
@@ -2389,12 +2439,13 @@ static int init_rmode_tss(struct kvm *kvm)
 
 	ret = 1;
 out:
+	srcu_read_unlock(&kvm->srcu, idx);
 	return ret;
 }
 
 static int init_rmode_identity_map(struct kvm *kvm)
 {
-	int i, r, ret;
+	int i, idx, r, ret;
 	pfn_t identity_map_pfn;
 	u32 tmp;
 
@@ -2409,6 +2460,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
 		return 1;
 	ret = 0;
 	identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
+	idx = srcu_read_lock(&kvm->srcu);
 	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
 	if (r < 0)
 		goto out;
@@ -2424,6 +2476,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
 	kvm->arch.ept_identity_pagetable_done = true;
 	ret = 1;
 out:
+	srcu_read_unlock(&kvm->srcu, idx);
 	return ret;
 }
 
@@ -2699,22 +2752,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	return 0;
 }
 
-static int init_rmode(struct kvm *kvm)
-{
-	int idx, ret = 0;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	if (!init_rmode_tss(kvm))
-		goto exit;
-	if (!init_rmode_identity_map(kvm))
-		goto exit;
-
-	ret = 1;
-exit:
-	srcu_read_unlock(&kvm->srcu, idx);
-	return ret;
-}
-
 static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2722,10 +2759,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	int ret;
 
 	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
-	if (!init_rmode(vmx->vcpu.kvm)) {
-		ret = -ENOMEM;
-		goto out;
-	}
 
 	vmx->rmode.vm86_active = 0;
 
@@ -2805,7 +2838,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
 		if (vm_need_tpr_shadow(vmx->vcpu.kvm))
 			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
-				     page_to_phys(vmx->vcpu.arch.apic->regs_page));
+				     __pa(vmx->vcpu.arch.apic->regs));
 		vmcs_write32(TPR_THRESHOLD, 0);
 	}
 
@@ -2971,6 +3004,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 	if (ret)
 		return ret;
 	kvm->arch.tss_addr = addr;
+	if (!init_rmode_tss(kvm))
+		return -ENOMEM;
+
 	return 0;
 }
 
@@ -3962,7 +3998,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 #define Q "l"
 #endif
 
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -3991,6 +4027,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	asm(
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
+		"push %%"R"cx \n\t" /* placeholder for guest rcx */
 		"push %%"R"cx \n\t"
 		"cmp %%"R"sp, %c[host_rsp](%0) \n\t"
 		"je 1f \n\t"
@@ -4032,10 +4069,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
 		".Lkvm_vmx_return: "
 		/* Save guest registers, load host registers, keep flags */
-		"xchg %0, (%%"R"sp) \n\t"
+		"mov %0, %c[wordsize](%%"R"sp) \n\t"
+		"pop %0 \n\t"
 		"mov %%"R"ax, %c[rax](%0) \n\t"
 		"mov %%"R"bx, %c[rbx](%0) \n\t"
-		"push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
+		"pop"Q" %c[rcx](%0) \n\t"
 		"mov %%"R"dx, %c[rdx](%0) \n\t"
 		"mov %%"R"si, %c[rsi](%0) \n\t"
 		"mov %%"R"di, %c[rdi](%0) \n\t"
@@ -4053,7 +4091,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		"mov %%cr2, %%"R"ax \n\t"
 		"mov %%"R"ax, %c[cr2](%0) \n\t"
 
-		"pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t"
+		"pop %%"R"bp; pop %%"R"dx \n\t"
 		"setbe %c[fail](%0) \n\t"
 	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
 		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
@@ -4076,7 +4114,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
 		[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
 #endif
-		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
+		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
+		[wordsize]"i"(sizeof(ulong))
 	      : "cc", "memory"
 		, R"ax", R"bx", R"di", R"si"
 #ifdef CONFIG_X86_64
@@ -4183,8 +4222,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		if (!kvm->arch.ept_identity_map_addr)
 			kvm->arch.ept_identity_map_addr =
 				VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+		err = -ENOMEM;
 		if (alloc_identity_pagetable(kvm) != 0)
 			goto free_vmcs;
+		if (!init_rmode_identity_map(kvm))
+			goto free_vmcs;
 	}
 
 	return &vmx->vcpu;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bcc0efce85bf..f1e4025f1ae2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -81,9 +81,10 @@
81 * - enable LME and LMA per default on 64 bit KVM 81 * - enable LME and LMA per default on 64 bit KVM
82 */ 82 */
83#ifdef CONFIG_X86_64 83#ifdef CONFIG_X86_64
84static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; 84static
85u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
85#else 86#else
86static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; 87static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
87#endif 88#endif
88 89
89#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 90#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
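The hunk above replaces the magic efer_reserved_bits constants with symbolic masks. As a quick sanity check that the two forms agree, here is a stand-alone sketch; the EFER bit positions (SCE = bit 0, LME = bit 8, LMA = bit 10) are assumed from the usual msr-index.h definitions rather than taken from this patch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed EFER bit positions (normally provided by <asm/msr-index.h>). */
#define EFER_SCE (1ULL << 0)   /* SYSCALL/SYSRET enable */
#define EFER_LME (1ULL << 8)   /* long mode enable */
#define EFER_LMA (1ULL << 10)  /* long mode active */

int main(void)
{
	uint64_t mask64 = ~(EFER_SCE | EFER_LME | EFER_LMA);
	uint64_t mask32 = ~EFER_SCE;

	/* The symbolic masks match the magic numbers they replace. */
	assert(mask64 == 0xfffffffffffffafeULL);
	assert(mask32 == 0xfffffffffffffffeULL);
	printf("64-bit reserved bits: %#llx\n", (unsigned long long)mask64);
	printf("32-bit reserved bits: %#llx\n", (unsigned long long)mask32);
	return 0;
}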
@@ -360,8 +361,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
360 361
361void kvm_inject_nmi(struct kvm_vcpu *vcpu) 362void kvm_inject_nmi(struct kvm_vcpu *vcpu)
362{ 363{
364 kvm_make_request(KVM_REQ_NMI, vcpu);
363 kvm_make_request(KVM_REQ_EVENT, vcpu); 365 kvm_make_request(KVM_REQ_EVENT, vcpu);
364 vcpu->arch.nmi_pending = 1;
365} 366}
366EXPORT_SYMBOL_GPL(kvm_inject_nmi); 367EXPORT_SYMBOL_GPL(kvm_inject_nmi);
367 368
@@ -525,8 +526,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
525 526
526 kvm_x86_ops->set_cr0(vcpu, cr0); 527 kvm_x86_ops->set_cr0(vcpu, cr0);
527 528
528 if ((cr0 ^ old_cr0) & X86_CR0_PG) 529 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
529 kvm_clear_async_pf_completion_queue(vcpu); 530 kvm_clear_async_pf_completion_queue(vcpu);
531 kvm_async_pf_hash_reset(vcpu);
532 }
530 533
531 if ((cr0 ^ old_cr0) & update_bits) 534 if ((cr0 ^ old_cr0) & update_bits)
532 kvm_mmu_reset_context(vcpu); 535 kvm_mmu_reset_context(vcpu);
@@ -1017,7 +1020,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1017 unsigned long flags; 1020 unsigned long flags;
1018 s64 sdiff; 1021 s64 sdiff;
1019 1022
1020 spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1023 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1021 offset = data - native_read_tsc(); 1024 offset = data - native_read_tsc();
1022 ns = get_kernel_ns(); 1025 ns = get_kernel_ns();
1023 elapsed = ns - kvm->arch.last_tsc_nsec; 1026 elapsed = ns - kvm->arch.last_tsc_nsec;
@@ -1050,7 +1053,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1050 kvm->arch.last_tsc_write = data; 1053 kvm->arch.last_tsc_write = data;
1051 kvm->arch.last_tsc_offset = offset; 1054 kvm->arch.last_tsc_offset = offset;
1052 kvm_x86_ops->write_tsc_offset(vcpu, offset); 1055 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1053 spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1056 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1054 1057
1055 /* Reset of TSC must disable overshoot protection below */ 1058 /* Reset of TSC must disable overshoot protection below */
1056 vcpu->arch.hv_clock.tsc_timestamp = 0; 1059 vcpu->arch.hv_clock.tsc_timestamp = 0;
@@ -1453,6 +1456,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1453 return 0; 1456 return 0;
1454} 1457}
1455 1458
1459static void kvmclock_reset(struct kvm_vcpu *vcpu)
1460{
1461 if (vcpu->arch.time_page) {
1462 kvm_release_page_dirty(vcpu->arch.time_page);
1463 vcpu->arch.time_page = NULL;
1464 }
1465}
1466
1456int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1467int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1457{ 1468{
1458 switch (msr) { 1469 switch (msr) {
@@ -1510,10 +1521,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1510 break; 1521 break;
1511 case MSR_KVM_SYSTEM_TIME_NEW: 1522 case MSR_KVM_SYSTEM_TIME_NEW:
1512 case MSR_KVM_SYSTEM_TIME: { 1523 case MSR_KVM_SYSTEM_TIME: {
1513 if (vcpu->arch.time_page) { 1524 kvmclock_reset(vcpu);
1514 kvm_release_page_dirty(vcpu->arch.time_page);
1515 vcpu->arch.time_page = NULL;
1516 }
1517 1525
1518 vcpu->arch.time = data; 1526 vcpu->arch.time = data;
1519 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 1527 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -1592,6 +1600,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1592 } else 1600 } else
1593 return set_msr_hyperv(vcpu, msr, data); 1601 return set_msr_hyperv(vcpu, msr, data);
1594 break; 1602 break;
1603 case MSR_IA32_BBL_CR_CTL3:
1604 /* Drop writes to this legacy MSR -- see rdmsr
1605 * counterpart for further detail.
1606 */
1607 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1608 break;
1595 default: 1609 default:
1596 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1610 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1597 return xen_hvm_config(vcpu, data); 1611 return xen_hvm_config(vcpu, data);
@@ -1846,6 +1860,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1846 } else 1860 } else
1847 return get_msr_hyperv(vcpu, msr, pdata); 1861 return get_msr_hyperv(vcpu, msr, pdata);
1848 break; 1862 break;
1863 case MSR_IA32_BBL_CR_CTL3:
 1864 /* This legacy MSR exists but isn't fully documented in current
 1865 * silicon. It is, however, accessed by Windows XP in very narrow
 1866 * scenarios, where it sets bit #19, which is itself documented
 1867 * as a "reserved" bit. This is a best-effort attempt to return
 1868 * coherent read data in case the guest interprets the rest of
 1869 * the register:
 1870 *
 1871 * L2 cache control register 3: 64GB range, 256KB size,
 1872 * enabled, latency 0x1, configured
 1873 */
1874 data = 0xbe702111;
1875 break;
1849 default: 1876 default:
1850 if (!ignore_msrs) { 1877 if (!ignore_msrs) {
1851 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1878 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
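For the read side above, one way to observe the synthesized value from inside a guest is the msr character device (modprobe msr, root required). A minimal sketch, assuming the conventional index 0x11e for MSR_IA32_BBL_CR_CTL3:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	const off_t msr = 0x11e;  /* assumed index of MSR_IA32_BBL_CR_CTL3 */
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}
	/* The msr device reads 8 bytes at the offset given by the MSR index. */
	if (pread(fd, &val, sizeof(val), msr) != (ssize_t)sizeof(val)) {
		perror("rdmsr");
		close(fd);
		return 1;
	}
	printf("MSR %#llx = %#llx\n",
	       (unsigned long long)msr, (unsigned long long)val);
	close(fd);
	return 0;
}

In a KVM guest carrying this patch the read should return the hard-coded 0xbe702111; on bare metal the result, if any, depends on the CPU.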
@@ -2100,8 +2127,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2100 if (check_tsc_unstable()) { 2127 if (check_tsc_unstable()) {
2101 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); 2128 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
2102 vcpu->arch.tsc_catchup = 1; 2129 vcpu->arch.tsc_catchup = 1;
2103 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2104 } 2130 }
2131 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2105 if (vcpu->cpu != cpu) 2132 if (vcpu->cpu != cpu)
2106 kvm_migrate_timers(vcpu); 2133 kvm_migrate_timers(vcpu);
2107 vcpu->cpu = cpu; 2134 vcpu->cpu = cpu;
@@ -2575,9 +2602,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2575 if (mce->status & MCI_STATUS_UC) { 2602 if (mce->status & MCI_STATUS_UC) {
2576 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2603 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
2577 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { 2604 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
2578 printk(KERN_DEBUG "kvm: set_mce: "
2579 "injects mce exception while "
2580 "previous one is in progress!\n");
2581 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2605 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2582 return 0; 2606 return 0;
2583 } 2607 }
@@ -2648,8 +2672,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2648 vcpu->arch.interrupt.pending = events->interrupt.injected; 2672 vcpu->arch.interrupt.pending = events->interrupt.injected;
2649 vcpu->arch.interrupt.nr = events->interrupt.nr; 2673 vcpu->arch.interrupt.nr = events->interrupt.nr;
2650 vcpu->arch.interrupt.soft = events->interrupt.soft; 2674 vcpu->arch.interrupt.soft = events->interrupt.soft;
2651 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2652 kvm_pic_clear_isr_ack(vcpu->kvm);
2653 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) 2675 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2654 kvm_x86_ops->set_interrupt_shadow(vcpu, 2676 kvm_x86_ops->set_interrupt_shadow(vcpu,
2655 events->interrupt.shadow); 2677 events->interrupt.shadow);
@@ -4140,8 +4162,8 @@ static unsigned long emulator_get_cached_segment_base(int seg,
4140 return get_segment_base(vcpu, seg); 4162 return get_segment_base(vcpu, seg);
4141} 4163}
4142 4164
4143static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, 4165static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3,
4144 struct kvm_vcpu *vcpu) 4166 int seg, struct kvm_vcpu *vcpu)
4145{ 4167{
4146 struct kvm_segment var; 4168 struct kvm_segment var;
4147 4169
@@ -4154,6 +4176,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
4154 var.limit >>= 12; 4176 var.limit >>= 12;
4155 set_desc_limit(desc, var.limit); 4177 set_desc_limit(desc, var.limit);
4156 set_desc_base(desc, (unsigned long)var.base); 4178 set_desc_base(desc, (unsigned long)var.base);
4179#ifdef CONFIG_X86_64
4180 if (base3)
4181 *base3 = var.base >> 32;
4182#endif
4157 desc->type = var.type; 4183 desc->type = var.type;
4158 desc->s = var.s; 4184 desc->s = var.s;
4159 desc->dpl = var.dpl; 4185 desc->dpl = var.dpl;
@@ -4166,8 +4192,8 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
4166 return true; 4192 return true;
4167} 4193}
4168 4194
4169static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, 4195static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3,
4170 struct kvm_vcpu *vcpu) 4196 int seg, struct kvm_vcpu *vcpu)
4171{ 4197{
4172 struct kvm_segment var; 4198 struct kvm_segment var;
4173 4199
@@ -4175,6 +4201,9 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
4175 kvm_get_segment(vcpu, &var, seg); 4201 kvm_get_segment(vcpu, &var, seg);
4176 4202
4177 var.base = get_desc_base(desc); 4203 var.base = get_desc_base(desc);
4204#ifdef CONFIG_X86_64
4205 var.base |= ((u64)base3) << 32;
4206#endif
4178 var.limit = get_desc_limit(desc); 4207 var.limit = get_desc_limit(desc);
4179 if (desc->g) 4208 if (desc->g)
4180 var.limit = (var.limit << 12) | 0xfff; 4209 var.limit = (var.limit << 12) | 0xfff;
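The new base3 argument carries bits 63:32 of a segment base, which the 32-bit fields of desc_struct cannot hold; in long mode, system-segment descriptors such as TR are 16 bytes wide for exactly this reason. A minimal sketch of the split/recombine arithmetic, using plain integer types rather than the kernel structures:

#include <assert.h>
#include <stdint.h>

/* Split a 64-bit segment base the way the emulator callbacks now do:
 * the low 32 bits go into the legacy descriptor, the high 32 bits into base3. */
static void split_base(uint64_t base, uint32_t *desc_base, uint32_t *base3)
{
	*desc_base = (uint32_t)base;        /* bits 31:0 */
	*base3 = (uint32_t)(base >> 32);    /* bits 63:32 */
}

static uint64_t join_base(uint32_t desc_base, uint32_t base3)
{
	return (uint64_t)desc_base | ((uint64_t)base3 << 32);
}

int main(void)
{
	uint64_t base = 0xffff880012345000ULL;  /* example 64-bit TR base */
	uint32_t lo, hi;

	split_base(base, &lo, &hi);
	assert(join_base(lo, hi) == base);
	return 0;
}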
@@ -4390,41 +4419,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4390 vcpu->arch.emulate_ctxt.have_exception = false; 4419 vcpu->arch.emulate_ctxt.have_exception = false;
4391 vcpu->arch.emulate_ctxt.perm_ok = false; 4420 vcpu->arch.emulate_ctxt.perm_ok = false;
4392 4421
4422 vcpu->arch.emulate_ctxt.only_vendor_specific_insn
4423 = emulation_type & EMULTYPE_TRAP_UD;
4424
4393 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); 4425 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
4394 if (r == X86EMUL_PROPAGATE_FAULT)
4395 goto done;
4396 4426
4397 trace_kvm_emulate_insn_start(vcpu); 4427 trace_kvm_emulate_insn_start(vcpu);
4398
4399 /* Only allow emulation of specific instructions on #UD
4400 * (namely VMMCALL, sysenter, sysexit, syscall)*/
4401 if (emulation_type & EMULTYPE_TRAP_UD) {
4402 if (!c->twobyte)
4403 return EMULATE_FAIL;
4404 switch (c->b) {
4405 case 0x01: /* VMMCALL */
4406 if (c->modrm_mod != 3 || c->modrm_rm != 1)
4407 return EMULATE_FAIL;
4408 break;
4409 case 0x34: /* sysenter */
4410 case 0x35: /* sysexit */
4411 if (c->modrm_mod != 0 || c->modrm_rm != 0)
4412 return EMULATE_FAIL;
4413 break;
4414 case 0x05: /* syscall */
4415 if (c->modrm_mod != 0 || c->modrm_rm != 0)
4416 return EMULATE_FAIL;
4417 break;
4418 default:
4419 return EMULATE_FAIL;
4420 }
4421
4422 if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
4423 return EMULATE_FAIL;
4424 }
4425
4426 ++vcpu->stat.insn_emulation; 4428 ++vcpu->stat.insn_emulation;
4427 if (r) { 4429 if (r) {
4430 if (emulation_type & EMULTYPE_TRAP_UD)
4431 return EMULATE_FAIL;
4428 if (reexecute_instruction(vcpu, cr2)) 4432 if (reexecute_instruction(vcpu, cr2))
4429 return EMULATE_DONE; 4433 return EMULATE_DONE;
4430 if (emulation_type & EMULTYPE_SKIP) 4434 if (emulation_type & EMULTYPE_SKIP)
@@ -4452,7 +4456,6 @@ restart:
4452 return handle_emulation_failure(vcpu); 4456 return handle_emulation_failure(vcpu);
4453 } 4457 }
4454 4458
4455done:
4456 if (vcpu->arch.emulate_ctxt.have_exception) { 4459 if (vcpu->arch.emulate_ctxt.have_exception) {
4457 inject_emulated_exception(vcpu); 4460 inject_emulated_exception(vcpu);
4458 r = EMULATE_DONE; 4461 r = EMULATE_DONE;
@@ -4562,7 +4565,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4562 4565
4563 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); 4566 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4564 4567
4565 spin_lock(&kvm_lock); 4568 raw_spin_lock(&kvm_lock);
4566 list_for_each_entry(kvm, &vm_list, vm_list) { 4569 list_for_each_entry(kvm, &vm_list, vm_list) {
4567 kvm_for_each_vcpu(i, vcpu, kvm) { 4570 kvm_for_each_vcpu(i, vcpu, kvm) {
4568 if (vcpu->cpu != freq->cpu) 4571 if (vcpu->cpu != freq->cpu)
@@ -4572,7 +4575,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4572 send_ipi = 1; 4575 send_ipi = 1;
4573 } 4576 }
4574 } 4577 }
4575 spin_unlock(&kvm_lock); 4578 raw_spin_unlock(&kvm_lock);
4576 4579
4577 if (freq->old < freq->new && send_ipi) { 4580 if (freq->old < freq->new && send_ipi) {
4578 /* 4581 /*
@@ -5185,6 +5188,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5185 r = 1; 5188 r = 1;
5186 goto out; 5189 goto out;
5187 } 5190 }
5191 if (kvm_check_request(KVM_REQ_NMI, vcpu))
5192 vcpu->arch.nmi_pending = true;
5188 } 5193 }
5189 5194
5190 r = kvm_mmu_reload(vcpu); 5195 r = kvm_mmu_reload(vcpu);
@@ -5213,14 +5218,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5213 kvm_load_guest_fpu(vcpu); 5218 kvm_load_guest_fpu(vcpu);
5214 kvm_load_guest_xcr0(vcpu); 5219 kvm_load_guest_xcr0(vcpu);
5215 5220
5216 atomic_set(&vcpu->guest_mode, 1); 5221 vcpu->mode = IN_GUEST_MODE;
5217 smp_wmb(); 5222
 5223 /* We should set ->mode before checking ->requests;
 5224 * see the comment in make_all_cpus_request.
 5225 */
5226 smp_mb();
5218 5227
5219 local_irq_disable(); 5228 local_irq_disable();
5220 5229
5221 if (!atomic_read(&vcpu->guest_mode) || vcpu->requests 5230 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
5222 || need_resched() || signal_pending(current)) { 5231 || need_resched() || signal_pending(current)) {
5223 atomic_set(&vcpu->guest_mode, 0); 5232 vcpu->mode = OUTSIDE_GUEST_MODE;
5224 smp_wmb(); 5233 smp_wmb();
5225 local_irq_enable(); 5234 local_irq_enable();
5226 preempt_enable(); 5235 preempt_enable();
@@ -5256,7 +5265,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5256 5265
5257 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); 5266 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
5258 5267
5259 atomic_set(&vcpu->guest_mode, 0); 5268 vcpu->mode = OUTSIDE_GUEST_MODE;
5260 smp_wmb(); 5269 smp_wmb();
5261 local_irq_enable(); 5270 local_irq_enable();
5262 5271
@@ -5574,7 +5583,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5574 struct kvm_sregs *sregs) 5583 struct kvm_sregs *sregs)
5575{ 5584{
5576 int mmu_reset_needed = 0; 5585 int mmu_reset_needed = 0;
5577 int pending_vec, max_bits; 5586 int pending_vec, max_bits, idx;
5578 struct desc_ptr dt; 5587 struct desc_ptr dt;
5579 5588
5580 dt.size = sregs->idt.limit; 5589 dt.size = sregs->idt.limit;
@@ -5603,10 +5612,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5603 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5612 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
5604 if (sregs->cr4 & X86_CR4_OSXSAVE) 5613 if (sregs->cr4 & X86_CR4_OSXSAVE)
5605 update_cpuid(vcpu); 5614 update_cpuid(vcpu);
5615
5616 idx = srcu_read_lock(&vcpu->kvm->srcu);
5606 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5617 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5607 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 5618 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
5608 mmu_reset_needed = 1; 5619 mmu_reset_needed = 1;
5609 } 5620 }
5621 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5610 5622
5611 if (mmu_reset_needed) 5623 if (mmu_reset_needed)
5612 kvm_mmu_reset_context(vcpu); 5624 kvm_mmu_reset_context(vcpu);
@@ -5617,8 +5629,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5617 if (pending_vec < max_bits) { 5629 if (pending_vec < max_bits) {
5618 kvm_queue_interrupt(vcpu, pending_vec, false); 5630 kvm_queue_interrupt(vcpu, pending_vec, false);
5619 pr_debug("Set back pending irq %d\n", pending_vec); 5631 pr_debug("Set back pending irq %d\n", pending_vec);
5620 if (irqchip_in_kernel(vcpu->kvm))
5621 kvm_pic_clear_isr_ack(vcpu->kvm);
5622 } 5632 }
5623 5633
5624 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5634 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -5814,10 +5824,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
5814 5824
5815void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 5825void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5816{ 5826{
5817 if (vcpu->arch.time_page) { 5827 kvmclock_reset(vcpu);
5818 kvm_release_page_dirty(vcpu->arch.time_page);
5819 vcpu->arch.time_page = NULL;
5820 }
5821 5828
5822 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 5829 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
5823 fx_free(vcpu); 5830 fx_free(vcpu);
@@ -5878,6 +5885,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5878 kvm_make_request(KVM_REQ_EVENT, vcpu); 5885 kvm_make_request(KVM_REQ_EVENT, vcpu);
5879 vcpu->arch.apf.msr_val = 0; 5886 vcpu->arch.apf.msr_val = 0;
5880 5887
5888 kvmclock_reset(vcpu);
5889
5881 kvm_clear_async_pf_completion_queue(vcpu); 5890 kvm_clear_async_pf_completion_queue(vcpu);
5882 kvm_async_pf_hash_reset(vcpu); 5891 kvm_async_pf_hash_reset(vcpu);
5883 vcpu->arch.apf.halted = false; 5892 vcpu->arch.apf.halted = false;
@@ -6005,7 +6014,7 @@ int kvm_arch_init_vm(struct kvm *kvm)
6005 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 6014 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
6006 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 6015 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
6007 6016
6008 spin_lock_init(&kvm->arch.tsc_write_lock); 6017 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
6009 6018
6010 return 0; 6019 return 0;
6011} 6020}
@@ -6103,7 +6112,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6103 int user_alloc) 6112 int user_alloc)
6104{ 6113{
6105 6114
6106 int npages = mem->memory_size >> PAGE_SHIFT; 6115 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
6107 6116
6108 if (!user_alloc && !old.user_alloc && old.rmap && !npages) { 6117 if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
6109 int ret; 6118 int ret;
@@ -6118,12 +6127,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6118 "failed to munmap memory\n"); 6127 "failed to munmap memory\n");
6119 } 6128 }
6120 6129
6130 if (!kvm->arch.n_requested_mmu_pages)
6131 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6132
6121 spin_lock(&kvm->mmu_lock); 6133 spin_lock(&kvm->mmu_lock);
6122 if (!kvm->arch.n_requested_mmu_pages) { 6134 if (nr_mmu_pages)
6123 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6124 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 6135 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
6125 }
6126
6127 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 6136 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
6128 spin_unlock(&kvm->mmu_lock); 6137 spin_unlock(&kvm->mmu_lock);
6129} 6138}
@@ -6157,7 +6166,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
6157 6166
6158 me = get_cpu(); 6167 me = get_cpu();
6159 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 6168 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
6160 if (atomic_xchg(&vcpu->guest_mode, 0)) 6169 if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
6161 smp_send_reschedule(cpu); 6170 smp_send_reschedule(cpu);
6162 put_cpu(); 6171 put_cpu();
6163} 6172}
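The ordering that the new comment in vcpu_enter_guest() refers to is the classic two-flag pattern: the vcpu path stores ->mode and then re-reads ->requests, while a kicker stores a request and then reads ->mode, so at least one side must observe the other. A user-space model of that argument, with C11 seq_cst atomics standing in for the kernel's smp_mb(); the names here are illustrative, not the kernel's:

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Stand-ins for the vcpu->mode states introduced by this patch. */
enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, EXITING_GUEST_MODE };

static atomic_int mode = OUTSIDE_GUEST_MODE;
static atomic_int requests;             /* models vcpu->requests */
static int entry_saw_request;
static int kicker_saw_guest_mode;

/* vcpu entry path: publish mode, then (fully ordered) re-check requests. */
static void *vcpu_entry(void *arg)
{
	atomic_store(&mode, IN_GUEST_MODE);   /* seq_cst plays the role of smp_mb() */
	entry_saw_request = atomic_load(&requests);
	return NULL;
}

/* kicker path: post a request, then kick only if the vcpu is in guest mode. */
static void *kicker(void *arg)
{
	atomic_store(&requests, 1);
	kicker_saw_guest_mode = (atomic_load(&mode) == IN_GUEST_MODE);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, vcpu_entry, NULL);
	pthread_create(&b, NULL, kicker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* With sequentially consistent ordering the request cannot be lost:
	 * either the entry path sees it, or the kicker sees IN_GUEST_MODE
	 * and would send the reschedule IPI. */
	assert(entry_saw_request || kicker_saw_guest_mode);
	printf("entry saw request: %d, kicker saw guest mode: %d\n",
	       entry_saw_request, kicker_saw_guest_mode);
	return 0;
}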