author		Linus Torvalds <torvalds@linux-foundation.org>	2008-10-16 18:36:00 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-10-16 18:36:00 -0400
commit		08d19f51f05a68ce89a289320ce4ed96e757df72 (patch)
tree		31c5d718d0aeaff5083fe533cd6e1f9fbbe846bb /arch/x86/kvm
parent		1c95e1b69073cff5ff179e592fa1a1e182c78a17 (diff)
parent		2381ad241d0bea1253a37f314b270848067640bb (diff)
Merge branch 'kvm-updates/2.6.28' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
* 'kvm-updates/2.6.28' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (134 commits)
  KVM: ia64: Add intel iommu support for guests.
  KVM: ia64: add directed mmio range support for kvm guests
  KVM: ia64: Make pmt table be able to hold physical mmio entries.
  KVM: Move irqchip_in_kernel() from ioapic.h to irq.h
  KVM: Separate irq ack notification out of arch/x86/kvm/irq.c
  KVM: Change is_mmio_pfn to kvm_is_mmio_pfn, and make it common for all archs
  KVM: Move device assignment logic to common code
  KVM: Device Assignment: Move vtd.c from arch/x86/kvm/ to virt/kvm/
  KVM: VMX: enable invlpg exiting if EPT is disabled
  KVM: x86: Silence various LAPIC-related host kernel messages
  KVM: Device Assignment: Map mmio pages into VT-d page table
  KVM: PIC: enhance IPI avoidance
  KVM: MMU: add "oos_shadow" parameter to disable oos
  KVM: MMU: speed up mmu_unsync_walk
  KVM: MMU: out of sync shadow core
  KVM: MMU: mmu_convert_notrap helper
  KVM: MMU: awareness of new kvm_mmu_zap_page behaviour
  KVM: MMU: mmu_parent_walk
  KVM: x86: trap invlpg
  KVM: MMU: sync roots on mmu reload
  ...
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--	arch/x86/kvm/Makefile		  5
-rw-r--r--	arch/x86/kvm/i8254.c		 81
-rw-r--r--	arch/x86/kvm/i8254.h		  7
-rw-r--r--	arch/x86/kvm/i8259.c		 53
-rw-r--r--	arch/x86/kvm/irq.c		  3
-rw-r--r--	arch/x86/kvm/irq.h		  6
-rw-r--r--	arch/x86/kvm/kvm_cache_regs.h	 32
-rw-r--r--	arch/x86/kvm/lapic.c		 43
-rw-r--r--	arch/x86/kvm/mmu.c		680
-rw-r--r--	arch/x86/kvm/paging_tmpl.h	249
-rw-r--r--	arch/x86/kvm/svm.c		156
-rw-r--r--	arch/x86/kvm/vmx.c		712
-rw-r--r--	arch/x86/kvm/vmx.h		  3
-rw-r--r--	arch/x86/kvm/x86.c		552
-rw-r--r--	arch/x86/kvm/x86.h		 22
-rw-r--r--	arch/x86/kvm/x86_emulate.c	170
16 files changed, 1862 insertions(+), 912 deletions(-)
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d0e940bb6f40..c02343594b4d 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,10 +3,13 @@
 #
 
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-	coalesced_mmio.o)
+	coalesced_mmio.o irq_comm.o)
 ifeq ($(CONFIG_KVM_TRACE),y)
 common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 endif
+ifeq ($(CONFIG_DMAR),y)
+common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+endif
 
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c0f7872a9124..634132a9a512 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,13 +200,14 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
 
 	if (!atomic_inc_and_test(&pt->pending))
 		set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
-	if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
-		vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+	if (vcpu0 && waitqueue_active(&vcpu0->wq))
 		wake_up_interruptible(&vcpu0->wq);
-	}
 
 	pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
 	pt->scheduled = ktime_to_ns(pt->timer.expires);
+	if (pt->period)
+		ps->channels[0].count_load_time = pt->timer.expires;
 
 	return (pt->period == 0 ? 0 : 1);
 }
@@ -215,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
 
-	if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
+	if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
 		return atomic_read(&pit->pit_state.pit_timer.pending);
-
 	return 0;
 }
 
+static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
+						 irq_ack_notifier);
+	spin_lock(&ps->inject_lock);
+	if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+		atomic_inc(&ps->pit_timer.pending);
+	ps->irq_ack = 1;
+	spin_unlock(&ps->inject_lock);
+}
+
 static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 {
 	struct kvm_kpit_state *ps;
@@ -255,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt)
 	hrtimer_cancel(&pt->timer);
 }
 
-static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
+static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 {
+	struct kvm_kpit_timer *pt = &ps->pit_timer;
 	s64 interval;
 
 	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -268,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
 	pt->period = (is_period == 0) ? 0 : interval;
 	pt->timer.function = pit_timer_fn;
 	atomic_set(&pt->pending, 0);
+	ps->irq_ack = 1;
 
 	hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
 		      HRTIMER_MODE_ABS);
@@ -302,11 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	case 1:
 	/* FIXME: enhance mode 4 precision */
 	case 4:
-		create_pit_timer(&ps->pit_timer, val, 0);
+		create_pit_timer(ps, val, 0);
 		break;
 	case 2:
 	case 3:
-		create_pit_timer(&ps->pit_timer, val, 1);
+		create_pit_timer(ps, val, 1);
 		break;
 	default:
 		destroy_pit_timer(&ps->pit_timer);
@@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
 	mutex_unlock(&pit->pit_state.lock);
 
 	atomic_set(&pit->pit_state.pit_timer.pending, 0);
-	pit->pit_state.inject_pending = 1;
+	pit->pit_state.irq_ack = 1;
 }
 
 struct kvm_pit *kvm_create_pit(struct kvm *kvm)
@@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
 
 	mutex_init(&pit->pit_state.lock);
 	mutex_lock(&pit->pit_state.lock);
+	spin_lock_init(&pit->pit_state.inject_lock);
 
 	/* Initialize PIO device */
 	pit->dev.read = pit_ioport_read;
@@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
 	pit_state->pit = pit;
 	hrtimer_init(&pit_state->pit_timer.timer,
 		     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	pit_state->irq_ack_notifier.gsi = 0;
+	pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+	kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
 	mutex_unlock(&pit->pit_state.lock);
 
 	kvm_pit_reset(pit);
@@ -578,10 +595,8 @@ void kvm_free_pit(struct kvm *kvm)
 static void __inject_pit_timer_intr(struct kvm *kvm)
 {
 	mutex_lock(&kvm->lock);
-	kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
-	kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
-	kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
-	kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
+	kvm_set_irq(kvm, 0, 1);
+	kvm_set_irq(kvm, 0, 0);
 	mutex_unlock(&kvm->lock);
 }
 
@@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
 	struct kvm_kpit_state *ps;
 
 	if (vcpu && pit) {
+		int inject = 0;
 		ps = &pit->pit_state;
 
-		/* Try to inject pending interrupts when:
-		 * 1. Pending exists
-		 * 2. Last interrupt was accepted or waited for too long time*/
-		if (atomic_read(&ps->pit_timer.pending) &&
-		    (ps->inject_pending ||
-		     (jiffies - ps->last_injected_time
-		      >= KVM_MAX_PIT_INTR_INTERVAL))) {
-			ps->inject_pending = 0;
-			__inject_pit_timer_intr(kvm);
-			ps->last_injected_time = jiffies;
-		}
-	}
-}
-
-void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
-{
-	struct kvm_arch *arch = &vcpu->kvm->arch;
-	struct kvm_kpit_state *ps;
-
-	if (vcpu && arch->vpit) {
-		ps = &arch->vpit->pit_state;
-		if (atomic_read(&ps->pit_timer.pending) &&
-		    (((arch->vpic->pics[0].imr & 1) == 0 &&
-		       arch->vpic->pics[0].irq_base == vec) ||
-		     (arch->vioapic->redirtbl[0].fields.vector == vec &&
-		      arch->vioapic->redirtbl[0].fields.mask != 1))) {
-			ps->inject_pending = 1;
-			atomic_dec(&ps->pit_timer.pending);
-			ps->channels[0].count_load_time = ktime_get();
-		}
+		/* Try to inject pending interrupts when
+		 * last one has been acked.
+		 */
+		spin_lock(&ps->inject_lock);
+		if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
+			ps->irq_ack = 0;
+			inject = 1;
+		}
+		spin_unlock(&ps->inject_lock);
+		if (inject)
+			__inject_pit_timer_intr(kvm);
 	}
 }
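Note on the i8254.c changes above: injection is now gated on an ack notifier instead of the old inject_pending/last_injected_time heuristic — the PIT re-asserts its interrupt only after the previous one has been acknowledged. A minimal sketch of the same pattern for a hypothetical device (only the kvm_irq_ack_notifier fields and kvm_register_irq_ack_notifier() call are taken from this diff; the surrounding state and helpers are illustrative, not from the tree):

/*
 * Sketch only: mirrors the irq-ack pattern the PIT adopts above.
 * struct my_timer_state and my_timer_register() are hypothetical.
 */
struct my_timer_state {
	spinlock_t inject_lock;
	atomic_t pending;			/* ticks not yet injected */
	unsigned long irq_ack;			/* 1 = last irq was acked */
	struct kvm_irq_ack_notifier irq_ack_notifier;
};

static void my_timer_ack_irq(struct kvm_irq_ack_notifier *kian)
{
	struct my_timer_state *s = container_of(kian, struct my_timer_state,
						irq_ack_notifier);

	spin_lock(&s->inject_lock);
	/* consume one pending tick, but never let the count go negative */
	if (atomic_dec_return(&s->pending) < 0)
		atomic_inc(&s->pending);
	s->irq_ack = 1;				/* next injection may proceed */
	spin_unlock(&s->inject_lock);
}

static void my_timer_register(struct kvm *kvm, struct my_timer_state *s)
{
	s->irq_ack_notifier.gsi = 0;		/* GSI this device drives */
	s->irq_ack_notifier.irq_acked = my_timer_ack_irq;
	kvm_register_irq_ack_notifier(kvm, &s->irq_ack_notifier);
}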
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index db25c2a6c8c4..e436d4983aa1 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -8,7 +8,6 @@ struct kvm_kpit_timer {
 	int irq;
 	s64 period; /* unit: ns */
 	s64 scheduled;
-	ktime_t last_update;
 	atomic_t pending;
 };
 
@@ -34,8 +33,9 @@ struct kvm_kpit_state {
 	u32 speaker_data_on;
 	struct mutex lock;
 	struct kvm_pit *pit;
-	bool inject_pending; /* if inject pending interrupts */
-	unsigned long last_injected_time;
+	spinlock_t inject_lock;
+	unsigned long irq_ack;
+	struct kvm_irq_ack_notifier irq_ack_notifier;
 };
 
 struct kvm_pit {
@@ -54,7 +54,6 @@ struct kvm_pit {
 #define KVM_PIT_CHANNEL_MASK	0x3
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
 struct kvm_pit *kvm_create_pit(struct kvm *kvm);
 void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index c31164e8aa46..17e41e165f1a 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -30,6 +30,19 @@
 
 #include <linux/kvm_host.h>
 
+static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
+{
+	s->isr &= ~(1 << irq);
+	s->isr_ack |= (1 << irq);
+}
+
+void kvm_pic_clear_isr_ack(struct kvm *kvm)
+{
+	struct kvm_pic *s = pic_irqchip(kvm);
+	s->pics[0].isr_ack = 0xff;
+	s->pics[1].isr_ack = 0xff;
+}
+
 /*
  * set irq level. If an edge is detected, then the IRR is set to 1
  */
@@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
  */
static inline void pic_intack(struct kvm_kpic_state *s, int irq)
 {
+	s->isr |= 1 << irq;
 	if (s->auto_eoi) {
 		if (s->rotate_on_auto_eoi)
 			s->priority_add = (irq + 1) & 7;
-	} else
-		s->isr |= (1 << irq);
+		pic_clear_isr(s, irq);
+	}
 	/*
 	 * We don't clear a level sensitive interrupt here
 	 */
@@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
 		s->irr &= ~(1 << irq);
 }
 
-int kvm_pic_read_irq(struct kvm_pic *s)
+int kvm_pic_read_irq(struct kvm *kvm)
 {
 	int irq, irq2, intno;
+	struct kvm_pic *s = pic_irqchip(kvm);
 
 	irq = pic_get_irq(&s->pics[0]);
 	if (irq >= 0) {
@@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s)
 		intno = s->pics[0].irq_base + irq;
 	}
 	pic_update_irq(s);
+	kvm_notify_acked_irq(kvm, irq);
 
 	return intno;
 }
 
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
+	int irq, irqbase;
+	struct kvm *kvm = s->pics_state->irq_request_opaque;
+	struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
+
+	if (s == &s->pics_state->pics[0])
+		irqbase = 0;
+	else
+		irqbase = 8;
+
+	for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
+		if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
+			if (s->irr & (1 << irq) || s->isr & (1 << irq))
+				kvm_notify_acked_irq(kvm, irq+irqbase);
+	}
 	s->last_irr = 0;
 	s->irr = 0;
 	s->imr = 0;
 	s->isr = 0;
+	s->isr_ack = 0xff;
 	s->priority_add = 0;
 	s->irq_base = 0;
 	s->read_reg_select = 0;
@@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 			priority = get_priority(s, s->isr);
 			if (priority != 8) {
 				irq = (priority + s->priority_add) & 7;
-				s->isr &= ~(1 << irq);
+				pic_clear_isr(s, irq);
 				if (cmd == 5)
 					s->priority_add = (irq + 1) & 7;
 				pic_update_irq(s->pics_state);
@@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 			break;
 		case 3:
 			irq = val & 7;
-			s->isr &= ~(1 << irq);
+			pic_clear_isr(s, irq);
 			pic_update_irq(s->pics_state);
 			break;
 		case 6:
@@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 			break;
 		case 7:
 			irq = val & 7;
-			s->isr &= ~(1 << irq);
 			s->priority_add = (irq + 1) & 7;
+			pic_clear_isr(s, irq);
 			pic_update_irq(s->pics_state);
 			break;
 		default:
@@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
 			s->pics_state->pics[0].irr &= ~(1 << 2);
 		}
 		s->irr &= ~(1 << ret);
-		s->isr &= ~(1 << ret);
+		pic_clear_isr(s, ret);
 		if (addr1 >> 7 || ret != 2)
 			pic_update_irq(s->pics_state);
 	} else {
@@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level)
 {
 	struct kvm *kvm = opaque;
 	struct kvm_vcpu *vcpu = kvm->vcpus[0];
+	struct kvm_pic *s = pic_irqchip(kvm);
+	int irq = pic_get_irq(&s->pics[0]);
 
-	pic_irqchip(kvm)->output = level;
-	if (vcpu)
+	s->output = level;
+	if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
+		s->pics[0].isr_ack &= ~(1 << irq);
 		kvm_vcpu_kick(vcpu);
+	}
 }
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm)
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 76d736b5f664..c019b8edcdb7 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 		if (kvm_apic_accept_pic_intr(v)) {
 			s = pic_irqchip(v->kvm);
 			s->output = 0;	/* PIC */
-			vector = kvm_pic_read_irq(s);
+			vector = kvm_pic_read_irq(v->kvm);
 		}
 	}
 	return vector;
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
 {
 	kvm_apic_timer_intr_post(vcpu, vec);
-	kvm_pit_timer_intr_post(vcpu, vec);
 	/* TODO: PIT, RTC etc. */
 }
 EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7ca47cbb48bb..f17c8f5bbf31 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -42,6 +42,7 @@ struct kvm_kpic_state {
 	u8 irr;		/* interrupt request register */
 	u8 imr;		/* interrupt mask register */
 	u8 isr;		/* interrupt service register */
+	u8 isr_ack;	/* interrupt ack detection */
 	u8 priority_add;	/* highest irq priority */
 	u8 irq_base;
 	u8 read_reg_select;
@@ -63,12 +64,13 @@ struct kvm_pic {
 	void *irq_request_opaque;
 	int output;		/* intr from master PIC */
 	struct kvm_io_device dev;
+	void (*ack_notifier)(void *opaque, int irq);
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_pic_set_irq(void *opaque, int irq, int level);
-int kvm_pic_read_irq(struct kvm_pic *s);
+int kvm_pic_read_irq(struct kvm *kvm);
 void kvm_pic_update_irq(struct kvm_pic *s);
+void kvm_pic_clear_isr_ack(struct kvm *kvm);
 
 static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
new file mode 100644
index 000000000000..1ff819dce7d3
--- /dev/null
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -0,0 +1,32 @@
+#ifndef ASM_KVM_CACHE_REGS_H
+#define ASM_KVM_CACHE_REGS_H
+
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
+					      enum kvm_reg reg)
+{
+	if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+		kvm_x86_ops->cache_reg(vcpu, reg);
+
+	return vcpu->arch.regs[reg];
+}
+
+static inline void kvm_register_write(struct kvm_vcpu *vcpu,
+				      enum kvm_reg reg,
+				      unsigned long val)
+{
+	vcpu->arch.regs[reg] = val;
+	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
+{
+	return kvm_register_read(vcpu, VCPU_REGS_RIP);
+}
+
+static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+}
+
+#endif
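The new kvm_cache_regs.h above gives callers a lazy, cached view of guest registers: reads fault a register in through kvm_x86_ops->cache_reg() only when it is not yet marked available, and writes mark it dirty so it is flushed back to hardware state before the next guest entry. This is what lets lapic.c below replace the eager kvm_x86_ops->cache_regs() call with kvm_rip_read(). A short usage sketch (the function name and insn_len parameter are illustrative, not part of this diff):

/* Sketch only: advances the cached RIP past an emulated instruction. */
static void skip_emulated_insn_example(struct kvm_vcpu *vcpu, int insn_len)
{
	/* pulls RIP in via cache_reg() only if not already available */
	unsigned long rip = kvm_rip_read(vcpu);

	/* marks RIP dirty and available; written back before guest entry */
	kvm_rip_write(vcpu, rip + insn_len);
}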
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73f43de69f67..6571926bfd33 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,6 +32,7 @@
 #include <asm/current.h>
 #include <asm/apicdef.h>
 #include <asm/atomic.h>
+#include "kvm_cache_regs.h"
 #include "irq.h"
 
 #define PRId64 "d"
@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		} else
 			apic_clear_vector(vector, apic->regs + APIC_TMR);
 
-		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
-			kvm_vcpu_kick(vcpu);
-		else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
-			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-			if (waitqueue_active(&vcpu->wq))
-				wake_up_interruptible(&vcpu->wq);
-		}
+		kvm_vcpu_kick(vcpu);
 
 		result = (orig_irr == 0);
 		break;
@@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 			kvm_vcpu_kick(vcpu);
 		} else {
-			printk(KERN_DEBUG
-			       "Ignoring de-assert INIT to vcpu %d\n",
-			       vcpu->vcpu_id);
+			apic_debug("Ignoring de-assert INIT to vcpu %d\n",
+				   vcpu->vcpu_id);
 		}
-
 		break;
 
 	case APIC_DM_STARTUP:
-		printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
-		       vcpu->vcpu_id, vector);
+		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
+			   vcpu->vcpu_id, vector);
 		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
 			vcpu->arch.sipi_vector = vector;
 			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
-			if (waitqueue_active(&vcpu->wq))
-				wake_up_interruptible(&vcpu->wq);
+			kvm_vcpu_kick(vcpu);
 		}
 		break;
 
@@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
 static void apic_set_eoi(struct kvm_lapic *apic)
 {
 	int vector = apic_find_highest_isr(apic);
-
+	int trigger_mode;
 	/*
 	 * Not every write EOI will has corresponding ISR,
 	 * one example is when Kernel check timer on setup_IO_APIC
@@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic)
 	apic_update_ppr(apic);
 
 	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
-		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
+		trigger_mode = IOAPIC_LEVEL_TRIG;
+	else
+		trigger_mode = IOAPIC_EDGE_TRIG;
+	kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
 }
 
 static void apic_send_ipi(struct kvm_lapic *apic)
@@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
 	struct kvm_run *run = vcpu->run;
 
 	set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
-	kvm_x86_ops->cache_regs(vcpu);
-	run->tpr_access.rip = vcpu->arch.rip;
+	run->tpr_access.rip = kvm_rip_read(vcpu);
 	run->tpr_access.is_write = write;
 }
 
@@ -683,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
 	 * Refer SDM 8.4.1
 	 */
 	if (len != 4 || alignment) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "apic write: bad size=%d %lx\n",
-			       len, (long)address);
+		/* Don't shout loud, $infamous_os would cause only noise. */
+		apic_debug("apic write: bad size=%d %lx\n",
+			   len, (long)address);
 		return;
 	}
 
@@ -947,10 +941,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
 
 	if(!atomic_inc_and_test(&apic->timer.pending))
 		set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
-	if (waitqueue_active(q)) {
-		apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+	if (waitqueue_active(q))
 		wake_up_interruptible(q);
-	}
+
 	if (apic_lvtt_period(apic)) {
 		result = 1;
 		apic->timer.dev.expires = ktime_add_ns(
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3da2508eb22a..99c239c5c0ac 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,6 +70,9 @@ static int dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
 
+static int oos_shadow = 1;
+module_param(oos_shadow, bool, 0644);
+
 #ifndef MMU_DEBUG
 #define ASSERT(x) do { } while (0)
 #else
@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644);
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
-struct kvm_pv_mmu_op_buffer {
-	void *ptr;
-	unsigned len;
-	unsigned processed;
-	char buf[512] __aligned(sizeof(long));
-};
+#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
 struct kvm_rmap_desc {
 	u64 *shadow_ptes[RMAP_EXT];
 	struct kvm_rmap_desc *more;
 };
 
+struct kvm_shadow_walk {
+	int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
+		     u64 addr, u64 *spte, int level);
+};
+
+struct kvm_unsync_walk {
+	int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
+};
+
+typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
 {
 	struct vm_area_struct *vma;
 	unsigned long addr;
+	int ret = 0;
 
 	addr = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(addr))
-		return 0;
+		return ret;
 
+	down_read(&current->mm->mmap_sem);
 	vma = find_vma(current->mm, addr);
 	if (vma && is_vm_hugetlb_page(vma))
-		return 1;
+		ret = 1;
+	up_read(&current->mm->mmap_sem);
 
-	return 0;
+	return ret;
 }
 
 static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 
 	if (write_protected)
 		kvm_flush_remote_tlbs(kvm);
-
-	account_shadowed(kvm, gfn);
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -859,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 	BUG();
 }
 
+
+static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			    mmu_parent_walk_fn fn)
+{
+	struct kvm_pte_chain *pte_chain;
+	struct hlist_node *node;
+	struct kvm_mmu_page *parent_sp;
+	int i;
+
+	if (!sp->multimapped && sp->parent_pte) {
+		parent_sp = page_header(__pa(sp->parent_pte));
+		fn(vcpu, parent_sp);
+		mmu_parent_walk(vcpu, parent_sp, fn);
+		return;
+	}
+	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+			if (!pte_chain->parent_ptes[i])
+				break;
+			parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
+			fn(vcpu, parent_sp);
+			mmu_parent_walk(vcpu, parent_sp, fn);
+		}
+}
+
+static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+{
+	unsigned int index;
+	struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+	index = spte - sp->spt;
+	__set_bit(index, sp->unsync_child_bitmap);
+	sp->unsync_children = 1;
+}
+
+static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+{
+	struct kvm_pte_chain *pte_chain;
+	struct hlist_node *node;
+	int i;
+
+	if (!sp->parent_pte)
+		return;
+
+	if (!sp->multimapped) {
+		kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+		return;
+	}
+
+	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+			if (!pte_chain->parent_ptes[i])
+				break;
+			kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
+		}
+}
+
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	sp->unsync_children = 1;
+	kvm_mmu_update_parents_unsync(sp);
+	return 1;
+}
+
+static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
+					struct kvm_mmu_page *sp)
+{
+	mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+	kvm_mmu_update_parents_unsync(sp);
+}
+
 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp)
 {
@@ -868,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 		sp->spt[i] = shadow_trap_nonpresent_pte;
 }
 
+static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
+			       struct kvm_mmu_page *sp)
+{
+	return 1;
+}
+
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+}
+
+#define for_each_unsync_children(bitmap, idx)		\
+	for (idx = find_first_bit(bitmap, 512);		\
+	     idx < 512;					\
+	     idx = find_next_bit(bitmap, 512, idx+1))
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+			   struct kvm_unsync_walk *walker)
+{
+	int i, ret;
+
+	if (!sp->unsync_children)
+		return 0;
+
+	for_each_unsync_children(sp->unsync_child_bitmap, i) {
+		u64 ent = sp->spt[i];
+
+		if (is_shadow_present_pte(ent)) {
+			struct kvm_mmu_page *child;
+			child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+			if (child->unsync_children) {
+				ret = mmu_unsync_walk(child, walker);
+				if (ret)
+					return ret;
+				__clear_bit(i, sp->unsync_child_bitmap);
+			}
+
+			if (child->unsync) {
+				ret = walker->entry(child, walker);
+				__clear_bit(i, sp->unsync_child_bitmap);
+				if (ret)
+					return ret;
+			}
+		}
+	}
+
+	if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
+		sp->unsync_children = 0;
+
+	return 0;
+}
+
 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 {
 	unsigned index;
@@ -888,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 	return NULL;
 }
 
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	WARN_ON(!sp->unsync);
+	sp->unsync = 0;
+	--kvm->stat.mmu_unsync;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+		kvm_mmu_zap_page(vcpu->kvm, sp);
+		return 1;
+	}
+
+	rmap_write_protect(vcpu->kvm, sp->gfn);
+	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+		kvm_mmu_zap_page(vcpu->kvm, sp);
+		return 1;
+	}
+
+	kvm_mmu_flush_tlb(vcpu);
+	kvm_unlink_unsync_page(vcpu->kvm, sp);
+	return 0;
+}
+
+struct sync_walker {
+	struct kvm_vcpu *vcpu;
+	struct kvm_unsync_walk walker;
+};
+
+static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+	struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
+						     walker);
+	struct kvm_vcpu *vcpu = sync_walk->vcpu;
+
+	kvm_sync_page(vcpu, sp);
+	return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	struct sync_walker walker = {
+		.walker = { .entry = mmu_sync_fn, },
+		.vcpu = vcpu,
+	};
+
+	while (mmu_unsync_walk(sp, &walker.walker))
+		cond_resched_lock(&vcpu->kvm->mmu_lock);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
@@ -901,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	unsigned quadrant;
 	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
+	struct hlist_node *node, *tmp;
 
 	role.word = 0;
 	role.glevels = vcpu->arch.mmu.root_level;
@@ -917,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		 gfn, role.word);
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-	hlist_for_each_entry(sp, node, bucket, hash_link)
-		if (sp->gfn == gfn && sp->role.word == role.word) {
+	hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
+		if (sp->gfn == gfn) {
+			if (sp->unsync)
+				if (kvm_sync_page(vcpu, sp))
+					continue;
+
+			if (sp->role.word != role.word)
+				continue;
+
 			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+			if (sp->unsync_children) {
+				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+				kvm_mmu_mark_parents_unsync(vcpu, sp);
+			}
 			pgprintk("%s: found\n", __func__);
 			return sp;
 		}
@@ -931,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	sp->gfn = gfn;
 	sp->role = role;
 	hlist_add_head(&sp->hash_link, bucket);
-	if (!metaphysical)
+	if (!metaphysical) {
 		rmap_write_protect(vcpu->kvm, gfn);
+		account_shadowed(vcpu->kvm, gfn);
+	}
 	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
 		vcpu->arch.mmu.prefetch_page(vcpu, sp);
 	else
@@ -940,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	return sp;
 }
 
+static int walk_shadow(struct kvm_shadow_walk *walker,
+		       struct kvm_vcpu *vcpu, u64 addr)
+{
+	hpa_t shadow_addr;
+	int level;
+	int r;
+	u64 *sptep;
+	unsigned index;
+
+	shadow_addr = vcpu->arch.mmu.root_hpa;
+	level = vcpu->arch.mmu.shadow_root_level;
+	if (level == PT32E_ROOT_LEVEL) {
+		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+		shadow_addr &= PT64_BASE_ADDR_MASK;
+		--level;
+	}
+
+	while (level >= PT_PAGE_TABLE_LEVEL) {
+		index = SHADOW_PT_INDEX(addr, level);
+		sptep = ((u64 *)__va(shadow_addr)) + index;
+		r = walker->entry(walker, vcpu, addr, sptep, level);
+		if (r)
+			return r;
+		shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
+		--level;
+	}
+	return 0;
+}
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 					 struct kvm_mmu_page *sp)
 {
@@ -955,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 				rmap_remove(kvm, &pt[i]);
 			pt[i] = shadow_trap_nonpresent_pte;
 		}
-		kvm_flush_remote_tlbs(kvm);
 		return;
 	}
 
@@ -974,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 		}
 		pt[i] = shadow_trap_nonpresent_pte;
 	}
-	kvm_flush_remote_tlbs(kvm);
 }
 
 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -991,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
 			kvm->vcpus[i]->arch.last_pte_updated = NULL;
 }
 
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	u64 *parent_pte;
 
-	++kvm->stat.mmu_shadow_zapped;
 	while (sp->multimapped || sp->parent_pte) {
 		if (!sp->multimapped)
 			parent_pte = sp->parent_pte;
@@ -1010,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 		kvm_mmu_put_page(sp, parent_pte);
 		set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
 	}
+}
+
+struct zap_walker {
+	struct kvm_unsync_walk walker;
+	struct kvm *kvm;
+	int zapped;
+};
+
+static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+	struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
+						   walker);
+	kvm_mmu_zap_page(zap_walk->kvm, sp);
+	zap_walk->zapped = 1;
+	return 0;
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	struct zap_walker walker = {
+		.walker = { .entry = mmu_zap_fn, },
+		.kvm = kvm,
+		.zapped = 0,
+	};
+
+	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+		return 0;
+	mmu_unsync_walk(sp, &walker.walker);
+	return walker.zapped;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	int ret;
+	++kvm->stat.mmu_shadow_zapped;
+	ret = mmu_zap_unsync_children(kvm, sp);
 	kvm_mmu_page_unlink_children(kvm, sp);
+	kvm_mmu_unlink_parents(kvm, sp);
+	kvm_flush_remote_tlbs(kvm);
+	if (!sp->role.invalid && !sp->role.metaphysical)
+		unaccount_shadowed(kvm, sp->gfn);
+	if (sp->unsync)
+		kvm_unlink_unsync_page(kvm, sp);
 	if (!sp->root_count) {
-		if (!sp->role.metaphysical && !sp->role.invalid)
-			unaccount_shadowed(kvm, sp->gfn);
 		hlist_del(&sp->hash_link);
 		kvm_mmu_free_page(kvm, sp);
 	} else {
-		int invalid = sp->role.invalid;
-		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		sp->role.invalid = 1;
+		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		kvm_reload_remote_mmus(kvm);
-		if (!sp->role.metaphysical && !invalid)
-			unaccount_shadowed(kvm, sp->gfn);
 	}
 	kvm_mmu_reset_last_pte_updated(kvm);
+	return ret;
 }
 
 /*
@@ -1077,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 		if (sp->gfn == gfn && !sp->role.metaphysical) {
 			pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
 				 sp->role.word);
-			kvm_mmu_zap_page(kvm, sp);
 			r = 1;
+			if (kvm_mmu_zap_page(kvm, sp))
+				n = bucket->first;
 		}
 	return r;
 }
@@ -1101,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 	__set_bit(slot, &sp->slot_bitmap);
 }
 
+static void mmu_convert_notrap(struct kvm_mmu_page *sp)
+{
+	int i;
+	u64 *pt = sp->spt;
+
+	if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
+		return;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		if (pt[i] == shadow_notrap_nonpresent_pte)
+			set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
+	}
+}
+
 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 {
 	struct page *page;
@@ -1110,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 	if (gpa == UNMAPPED_GVA)
 		return NULL;
 
-	down_read(&current->mm->mmap_sem);
 	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-	up_read(&current->mm->mmap_sem);
 
 	return page;
 }
 
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
-			 unsigned pt_access, unsigned pte_access,
-			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, int largepage, gfn_t gfn,
-			 pfn_t pfn, bool speculative)
+static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-	u64 spte;
-	int was_rmapped = 0;
-	int was_writeble = is_writeble_pte(*shadow_pte);
+	unsigned index;
+	struct hlist_head *bucket;
+	struct kvm_mmu_page *s;
+	struct hlist_node *node, *n;
 
-	pgprintk("%s: spte %llx access %x write_fault %d"
-		 " user_fault %d gfn %lx\n",
-		 __func__, *shadow_pte, pt_access,
-		 write_fault, user_fault, gfn);
+	index = kvm_page_table_hashfn(sp->gfn);
+	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+	/* don't unsync if pagetable is shadowed with multiple roles */
+	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
+		if (s->gfn != sp->gfn || s->role.metaphysical)
+			continue;
+		if (s->role.word != sp->role.word)
+			return 1;
+	}
+	kvm_mmu_mark_parents_unsync(vcpu, sp);
+	++vcpu->kvm->stat.mmu_unsync;
+	sp->unsync = 1;
+	mmu_convert_notrap(sp);
+	return 0;
+}
 
-	if (is_rmap_pte(*shadow_pte)) {
-		/*
-		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
-		 * the parent of the now unreachable PTE.
-		 */
-		if (largepage && !is_large_pte(*shadow_pte)) {
-			struct kvm_mmu_page *child;
-			u64 pte = *shadow_pte;
+static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+				  bool can_unsync)
+{
+	struct kvm_mmu_page *shadow;
 
-			child = page_header(pte & PT64_BASE_ADDR_MASK);
-			mmu_page_remove_parent_pte(child, shadow_pte);
-		} else if (pfn != spte_to_pfn(*shadow_pte)) {
-			pgprintk("hfn old %lx new %lx\n",
-				 spte_to_pfn(*shadow_pte), pfn);
-			rmap_remove(vcpu->kvm, shadow_pte);
-		} else {
-			if (largepage)
-				was_rmapped = is_large_pte(*shadow_pte);
-			else
-				was_rmapped = 1;
-		}
+	shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+	if (shadow) {
+		if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+			return 1;
+		if (shadow->unsync)
+			return 0;
+		if (can_unsync && oos_shadow)
+			return kvm_unsync_page(vcpu, shadow);
+		return 1;
 	}
+	return 0;
+}
 
+static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+		    unsigned pte_access, int user_fault,
+		    int write_fault, int dirty, int largepage,
+		    gfn_t gfn, pfn_t pfn, bool speculative,
+		    bool can_unsync)
+{
+	u64 spte;
+	int ret = 0;
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
 	 * whether the guest actually used the pte (in order to detect
@@ -1162,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	 */
 	spte = shadow_base_present_pte | shadow_dirty_mask;
 	if (!speculative)
-		pte_access |= PT_ACCESSED_MASK;
+		spte |= shadow_accessed_mask;
 	if (!dirty)
 		pte_access &= ~ACC_WRITE_MASK;
 	if (pte_access & ACC_EXEC_MASK)
@@ -1178,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
 	if ((pte_access & ACC_WRITE_MASK)
 	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
-		struct kvm_mmu_page *shadow;
+
+		if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
+			ret = 1;
+			spte = shadow_trap_nonpresent_pte;
+			goto set_pte;
+		}
 
 		spte |= PT_WRITABLE_MASK;
 
-		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-		if (shadow ||
-		   (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
+		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
 				 __func__, gfn);
+			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
-			if (is_writeble_pte(spte)) {
+			if (is_writeble_pte(spte))
 				spte &= ~PT_WRITABLE_MASK;
-				kvm_x86_ops->tlb_flush(vcpu);
-			}
-			if (write_fault)
-				*ptwrite = 1;
 		}
 	}
 
 	if (pte_access & ACC_WRITE_MASK)
 		mark_page_dirty(vcpu->kvm, gfn);
 
-	pgprintk("%s: setting spte %llx\n", __func__, spte);
-	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
-		 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
-		 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
+set_pte:
 	set_shadow_pte(shadow_pte, spte);
-	if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
-	    && (spte & PT_PRESENT_MASK))
+	return ret;
+}
+
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+			 unsigned pt_access, unsigned pte_access,
+			 int user_fault, int write_fault, int dirty,
+			 int *ptwrite, int largepage, gfn_t gfn,
+			 pfn_t pfn, bool speculative)
+{
+	int was_rmapped = 0;
+	int was_writeble = is_writeble_pte(*shadow_pte);
+
+	pgprintk("%s: spte %llx access %x write_fault %d"
+		 " user_fault %d gfn %lx\n",
+		 __func__, *shadow_pte, pt_access,
+		 write_fault, user_fault, gfn);
+
+	if (is_rmap_pte(*shadow_pte)) {
+		/*
+		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+		 * the parent of the now unreachable PTE.
+		 */
+		if (largepage && !is_large_pte(*shadow_pte)) {
+			struct kvm_mmu_page *child;
+			u64 pte = *shadow_pte;
+
+			child = page_header(pte & PT64_BASE_ADDR_MASK);
+			mmu_page_remove_parent_pte(child, shadow_pte);
+		} else if (pfn != spte_to_pfn(*shadow_pte)) {
+			pgprintk("hfn old %lx new %lx\n",
+				 spte_to_pfn(*shadow_pte), pfn);
+			rmap_remove(vcpu->kvm, shadow_pte);
+		} else {
+			if (largepage)
+				was_rmapped = is_large_pte(*shadow_pte);
+			else
+				was_rmapped = 1;
+		}
+	}
+	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
+		      dirty, largepage, gfn, pfn, speculative, true)) {
+		if (write_fault)
+			*ptwrite = 1;
+		kvm_x86_ops->tlb_flush(vcpu);
+	}
+
+	pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
+	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
+		 is_large_pte(*shadow_pte)? "2MB" : "4kB",
+		 is_present_pte(*shadow_pte)?"RW":"R", gfn,
+		 *shadow_pte, shadow_pte);
+	if (!was_rmapped && is_large_pte(*shadow_pte))
 		++vcpu->kvm->stat.lpages;
 
 	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
@@ -1230,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
 
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-			int largepage, gfn_t gfn, pfn_t pfn,
-			int level)
-{
-	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
-	int pt_write = 0;
-
-	for (; ; level--) {
-		u32 index = PT64_INDEX(v, level);
-		u64 *table;
-
-		ASSERT(VALID_PAGE(table_addr));
-		table = __va(table_addr);
+struct direct_shadow_walk {
+	struct kvm_shadow_walk walker;
+	pfn_t pfn;
+	int write;
+	int largepage;
+	int pt_write;
+};
 
-		if (level == 1) {
-			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 0, gfn, pfn, false);
-			return pt_write;
-		}
+static int direct_map_entry(struct kvm_shadow_walk *_walk,
+			    struct kvm_vcpu *vcpu,
+			    u64 addr, u64 *sptep, int level)
+{
+	struct direct_shadow_walk *walk =
+		container_of(_walk, struct direct_shadow_walk, walker);
+	struct kvm_mmu_page *sp;
+	gfn_t pseudo_gfn;
+	gfn_t gfn = addr >> PAGE_SHIFT;
+
+	if (level == PT_PAGE_TABLE_LEVEL
+	    || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
+		mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
+			     0, walk->write, 1, &walk->pt_write,
+			     walk->largepage, gfn, walk->pfn, false);
+		++vcpu->stat.pf_fixed;
+		return 1;
+	}
 
-		if (largepage && level == 2) {
-			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 1, gfn, pfn, false);
-			return pt_write;
+	if (*sptep == shadow_trap_nonpresent_pte) {
+		pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+		sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
+				      1, ACC_ALL, sptep);
+		if (!sp) {
+			pgprintk("nonpaging_map: ENOMEM\n");
+			kvm_release_pfn_clean(walk->pfn);
+			return -ENOMEM;
 		}
 
-		if (table[index] == shadow_trap_nonpresent_pte) {
-			struct kvm_mmu_page *new_table;
-			gfn_t pseudo_gfn;
-
-			pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
-				>> PAGE_SHIFT;
-			new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
-						     v, level - 1,
-						     1, ACC_ALL, &table[index]);
-			if (!new_table) {
-				pgprintk("nonpaging_map: ENOMEM\n");
-				kvm_release_pfn_clean(pfn);
-				return -ENOMEM;
-			}
-
-			set_shadow_pte(&table[index],
-				       __pa(new_table->spt)
-				       | PT_PRESENT_MASK | PT_WRITABLE_MASK
-				       | shadow_user_mask | shadow_x_mask);
-		}
-		table_addr = table[index] & PT64_BASE_ADDR_MASK;
+		set_shadow_pte(sptep,
+			       __pa(sp->spt)
+			       | PT_PRESENT_MASK | PT_WRITABLE_MASK
+			       | shadow_user_mask | shadow_x_mask);
 	}
+	return 0;
+}
+
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+			int largepage, gfn_t gfn, pfn_t pfn)
+{
+	int r;
+	struct direct_shadow_walk walker = {
+		.walker = { .entry = direct_map_entry, },
+		.pfn = pfn,
+		.largepage = largepage,
+		.write = write,
+		.pt_write = 0,
+	};
+
+	r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
+	if (r < 0)
+		return r;
+	return walker.pt_write;
 }
 
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
@@ -1287,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	pfn_t pfn;
 	unsigned long mmu_seq;
 
-	down_read(&current->mm->mmap_sem);
 	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		largepage = 1;
 	}
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/* implicit mb(), we'll read before PT lock is unlocked */
+	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	up_read(&current->mm->mmap_sem);
 
 	/* mmio */
 	if (is_error_pfn(pfn)) {
@@ -1308,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
-	r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
-			 PT32E_ROOT_LEVEL);
+	r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 
@@ -1405,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 }
 
+static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvm_mmu_page *sp;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return;
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->arch.mmu.root_hpa;
+		sp = page_header(root);
+		mmu_sync_children(vcpu, sp);
+		return;
+	}
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+		if (root) {
+			root &= PT64_BASE_ADDR_MASK;
+			sp = page_header(root);
+			mmu_sync_children(vcpu, sp);
+		}
+	}
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+	spin_lock(&vcpu->kvm->mmu_lock);
+	mmu_sync_roots(vcpu);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
 	return vaddr;
@@ -1446,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	if (r)
 		return r;
 
-	down_read(&current->mm->mmap_sem);
 	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		largepage = 1;
 	}
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/* implicit mb(), we'll read before PT lock is unlocked */
+	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	up_read(&current->mm->mmap_sem);
 	if (is_error_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
 		return 1;
@@ -1464,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-			 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
+			 largepage, gfn, pfn);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
@@ -1489,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
+	context->sync_page = nonpaging_sync_page;
+	context->invlpg = nonpaging_invlpg;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -1536,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	context->page_fault = paging64_page_fault;
 	context->gva_to_gpa = paging64_gva_to_gpa;
 	context->prefetch_page = paging64_prefetch_page;
+	context->sync_page = paging64_sync_page;
+	context->invlpg = paging64_invlpg;
 	context->free = paging_free;
 	context->root_level = level;
 	context->shadow_root_level = level;
@@ -1557,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	context->gva_to_gpa = paging32_gva_to_gpa;
 	context->free = paging_free;
 	context->prefetch_page = paging32_prefetch_page;
+	context->sync_page = paging32_sync_page;
+	context->invlpg = paging32_invlpg;
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -1576,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->page_fault = tdp_page_fault;
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
+	context->sync_page = nonpaging_sync_page;
+	context->invlpg = nonpaging_invlpg;
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 	context->root_hpa = INVALID_PAGE;
 
@@ -1647,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	spin_lock(&vcpu->kvm->mmu_lock);
1648 kvm_mmu_free_some_pages(vcpu); 2029 kvm_mmu_free_some_pages(vcpu);
1649 mmu_alloc_roots(vcpu); 2030 mmu_alloc_roots(vcpu);
2031 mmu_sync_roots(vcpu);
1650 spin_unlock(&vcpu->kvm->mmu_lock); 2032 spin_unlock(&vcpu->kvm->mmu_lock);
1651 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2033 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1652 kvm_mmu_flush_tlb(vcpu); 2034 kvm_mmu_flush_tlb(vcpu);
@@ -1767,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1767 return; 2149 return;
1768 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2150 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1769 2151
1770 down_read(&current->mm->mmap_sem);
1771 if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { 2152 if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
1772 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 2153 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1773 vcpu->arch.update_pte.largepage = 1; 2154 vcpu->arch.update_pte.largepage = 1;
1774 } 2155 }
1775 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; 2156 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
1776 /* implicit mb(), we'll read before PT lock is unlocked */ 2157 smp_rmb();
1777 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2158 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1778 up_read(&current->mm->mmap_sem);
1779 2159
1780 if (is_error_pfn(pfn)) { 2160 if (is_error_pfn(pfn)) {
1781 kvm_release_pfn_clean(pfn); 2161 kvm_release_pfn_clean(pfn);
@@ -1837,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1837 index = kvm_page_table_hashfn(gfn); 2217 index = kvm_page_table_hashfn(gfn);
1838 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2218 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1839 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2219 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1840 if (sp->gfn != gfn || sp->role.metaphysical) 2220 if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
1841 continue; 2221 continue;
1842 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; 2222 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1843 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2223 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
@@ -1855,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1855 */ 2235 */
1856 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2236 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1857 gpa, bytes, sp->role.word); 2237 gpa, bytes, sp->role.word);
1858 kvm_mmu_zap_page(vcpu->kvm, sp); 2238 if (kvm_mmu_zap_page(vcpu->kvm, sp))
2239 n = bucket->first;
1859 ++vcpu->kvm->stat.mmu_flooded; 2240 ++vcpu->kvm->stat.mmu_flooded;
1860 continue; 2241 continue;
1861 } 2242 }
@@ -1969,6 +2350,16 @@ out:
1969} 2350}
1970EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); 2351EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1971 2352
2353void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2354{
2355 spin_lock(&vcpu->kvm->mmu_lock);
2356 vcpu->arch.mmu.invlpg(vcpu, gva);
2357 spin_unlock(&vcpu->kvm->mmu_lock);
2358 kvm_mmu_flush_tlb(vcpu);
2359 ++vcpu->stat.invlpg;
2360}
2361EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
2362
1972void kvm_enable_tdp(void) 2363void kvm_enable_tdp(void)
1973{ 2364{
1974 tdp_enabled = true; 2365 tdp_enabled = true;
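
kvm_mmu_invlpg() added above is only the generic entry point: it takes mmu_lock, dispatches to the per-paging-mode ->invlpg callback installed by the *_init_context() functions earlier in this file, flushes the TLB, and bumps the invlpg statistic. A standalone sketch of that ops-table dispatch, with made-up context and hook names:

    #include <stdio.h>

    typedef unsigned long gva_t;

    struct mmu_ctx {
        void (*invlpg)(struct mmu_ctx *ctx, gva_t gva);  /* per-mode hook */
    };

    static void nonpaging_invlpg(struct mmu_ctx *ctx, gva_t gva)
    {
        (void)ctx; (void)gva;              /* no shadow PTEs to drop */
    }

    static void paging64_invlpg(struct mmu_ctx *ctx, gva_t gva)
    {
        (void)ctx;
        printf("drop shadow pte for %#lx\n", gva);
    }

    static void mmu_invlpg(struct mmu_ctx *ctx, gva_t gva)
    {
        /* lock, dispatch, then flush, mirroring kvm_mmu_invlpg() */
        ctx->invlpg(ctx, gva);
    }

    int main(void)
    {
        struct mmu_ctx ctx = { .invlpg = paging64_invlpg };
        mmu_invlpg(&ctx, 0xdeadb000);
        ctx.invlpg = nonpaging_invlpg;     /* mode switch re-points the hook */
        mmu_invlpg(&ctx, 0xdeadb000);
        return 0;
    }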
@@ -2055,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2055{ 2446{
2056 struct kvm_mmu_page *sp; 2447 struct kvm_mmu_page *sp;
2057 2448
2449 spin_lock(&kvm->mmu_lock);
2058 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 2450 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
2059 int i; 2451 int i;
2060 u64 *pt; 2452 u64 *pt;
@@ -2068,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2068 if (pt[i] & PT_WRITABLE_MASK) 2460 if (pt[i] & PT_WRITABLE_MASK)
2069 pt[i] &= ~PT_WRITABLE_MASK; 2461 pt[i] &= ~PT_WRITABLE_MASK;
2070 } 2462 }
2463 kvm_flush_remote_tlbs(kvm);
2464 spin_unlock(&kvm->mmu_lock);
2071} 2465}
2072 2466
2073void kvm_mmu_zap_all(struct kvm *kvm) 2467void kvm_mmu_zap_all(struct kvm *kvm)
@@ -2076,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm)
2076 2470
2077 spin_lock(&kvm->mmu_lock); 2471 spin_lock(&kvm->mmu_lock);
2078 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 2472 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2079 kvm_mmu_zap_page(kvm, sp); 2473 if (kvm_mmu_zap_page(kvm, sp))
2474 node = container_of(kvm->arch.active_mmu_pages.next,
2475 struct kvm_mmu_page, link);
2080 spin_unlock(&kvm->mmu_lock); 2476 spin_unlock(&kvm->mmu_lock);
2081 2477
2082 kvm_flush_remote_tlbs(kvm); 2478 kvm_flush_remote_tlbs(kvm);
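
kvm_mmu_zap_page() now returns nonzero when it may have zapped pages beyond the one passed in, which can invalidate the next pointer cached by list_for_each_entry_safe(); both kvm_mmu_pte_write() and kvm_mmu_zap_all() therefore restart their traversal from the head of the bucket or list. The same hazard and restart idiom on a plain singly linked list, as a runnable sketch:

    #include <stdio.h>
    #include <stdlib.h>

    struct node { int id; int takes_sibling; struct node *next; };

    static struct node *head;

    /* Returns nonzero when more than the given node was freed,
     * i.e. when any cached "next" pointer may now be stale. */
    static int zap(struct node *n)
    {
        int extra = n->takes_sibling && n->next;

        if (extra) {                       /* models zapping unsynced children */
            struct node *victim = n->next;
            n->next = victim->next;
            free(victim);
        }
        for (struct node **pp = &head; *pp; pp = &(*pp)->next)
            if (*pp == n) { *pp = n->next; break; }
        free(n);
        return extra;
    }

    int main(void)
    {
        for (int i = 3; i >= 0; i--) {
            struct node *n = malloc(sizeof(*n));
            n->id = i; n->takes_sibling = (i == 1); n->next = head; head = n;
        }
        struct node *n = head, *next;
        while (n) {
            next = n->next;
            if (zap(n))
                next = head;               /* saved pointer may be stale: restart */
            n = next;
        }
        return 0;
    }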
@@ -2291,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2291 gpa_t addr, unsigned long *ret) 2687 gpa_t addr, unsigned long *ret)
2292{ 2688{
2293 int r; 2689 int r;
2294 struct kvm_pv_mmu_op_buffer buffer; 2690 struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
2295 2691
2296 buffer.ptr = buffer.buf; 2692 buffer->ptr = buffer->buf;
2297 buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); 2693 buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
2298 buffer.processed = 0; 2694 buffer->processed = 0;
2299 2695
2300 r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); 2696 r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
2301 if (r) 2697 if (r)
2302 goto out; 2698 goto out;
2303 2699
2304 while (buffer.len) { 2700 while (buffer->len) {
2305 r = kvm_pv_mmu_op_one(vcpu, &buffer); 2701 r = kvm_pv_mmu_op_one(vcpu, buffer);
2306 if (r < 0) 2702 if (r < 0)
2307 goto out; 2703 goto out;
2308 if (r == 0) 2704 if (r == 0)
@@ -2311,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2311 2707
2312 r = 1; 2708 r = 1;
2313out: 2709out:
2314 *ret = buffer.processed; 2710 *ret = buffer->processed;
2315 return r; 2711 return r;
2316} 2712}
2317 2713
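
Moving kvm_pv_mmu_op_buffer out of kvm_pv_mmu_op()'s stack frame and into vcpu->arch.mmu_op_buffer trades a little per-vcpu memory for stack headroom; kernel stacks are small, and the move is safe because a vcpu processes one hypercall at a time. The shape of the hoist, as a sketch (the buffer size below is illustrative):

    #include <string.h>

    enum { OP_BUF_SIZE = 512 };

    /* Before: a large scratch buffer on the stack of every call. */
    struct pv_buf { char buf[OP_BUF_SIZE]; unsigned long len; };

    /* After: one long-lived copy embedded in the per-vcpu state. */
    struct vcpu {
        struct pv_buf mmu_op_buffer;   /* one hypercall at a time per vcpu */
    };

    static int mmu_op(struct vcpu *v, const void *src, unsigned long bytes)
    {
        struct pv_buf *b = &v->mmu_op_buffer;  /* pointer, no stack copy */

        b->len = bytes < sizeof(b->buf) ? bytes : sizeof(b->buf);
        memcpy(b->buf, src, b->len);
        return 0;
    }

    int main(void)
    {
        struct vcpu v;
        return mmu_op(&v, "payload", 7);
    }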
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4a814bff21f2..613ec9aa674a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -25,11 +25,11 @@
25#if PTTYPE == 64 25#if PTTYPE == 64
26 #define pt_element_t u64 26 #define pt_element_t u64
27 #define guest_walker guest_walker64 27 #define guest_walker guest_walker64
28 #define shadow_walker shadow_walker64
28 #define FNAME(name) paging##64_##name 29 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK 30 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK 31 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 32 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) 33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS 34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
35 #ifdef CONFIG_X86_64 35 #ifdef CONFIG_X86_64
@@ -42,11 +42,11 @@
42#elif PTTYPE == 32 42#elif PTTYPE == 32
43 #define pt_element_t u32 43 #define pt_element_t u32
44 #define guest_walker guest_walker32 44 #define guest_walker guest_walker32
45 #define shadow_walker shadow_walker32
45 #define FNAME(name) paging##32_##name 46 #define FNAME(name) paging##32_##name
46 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK 47 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
47 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK 48 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
48 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
49 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) 50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
51 #define PT_LEVEL_BITS PT32_LEVEL_BITS 51 #define PT_LEVEL_BITS PT32_LEVEL_BITS
52 #define PT_MAX_FULL_LEVELS 2 52 #define PT_MAX_FULL_LEVELS 2
@@ -73,6 +73,17 @@ struct guest_walker {
73 u32 error_code; 73 u32 error_code;
74}; 74};
75 75
76struct shadow_walker {
77 struct kvm_shadow_walk walker;
78 struct guest_walker *guest_walker;
79 int user_fault;
80 int write_fault;
81 int largepage;
82 int *ptwrite;
83 pfn_t pfn;
84 u64 *sptep;
85};
86
76static gfn_t gpte_to_gfn(pt_element_t gpte) 87static gfn_t gpte_to_gfn(pt_element_t gpte)
77{ 88{
78 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; 89 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
91 pt_element_t *table; 102 pt_element_t *table;
92 struct page *page; 103 struct page *page;
93 104
94 down_read(&current->mm->mmap_sem);
95 page = gfn_to_page(kvm, table_gfn); 105 page = gfn_to_page(kvm, table_gfn);
96 up_read(&current->mm->mmap_sem);
97 106
98 table = kmap_atomic(page, KM_USER0); 107 table = kmap_atomic(page, KM_USER0);
99
100 ret = CMPXCHG(&table[index], orig_pte, new_pte); 108 ret = CMPXCHG(&table[index], orig_pte, new_pte);
101
102 kunmap_atomic(table, KM_USER0); 109 kunmap_atomic(table, KM_USER0);
103 110
104 kvm_release_page_dirty(page); 111 kvm_release_page_dirty(page);
@@ -274,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
274/* 281/*
275 * Fetch a shadow pte for a specific level in the paging hierarchy. 282 * Fetch a shadow pte for a specific level in the paging hierarchy.
276 */ 283 */
277static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 284static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
278 struct guest_walker *walker, 285 struct kvm_vcpu *vcpu, u64 addr,
279 int user_fault, int write_fault, int largepage, 286 u64 *sptep, int level)
280 int *ptwrite, pfn_t pfn)
281{ 287{
282 hpa_t shadow_addr; 288 struct shadow_walker *sw =
283 int level; 289 container_of(_sw, struct shadow_walker, walker);
284 u64 *shadow_ent; 290 struct guest_walker *gw = sw->guest_walker;
285 unsigned access = walker->pt_access; 291 unsigned access = gw->pt_access;
286 292 struct kvm_mmu_page *shadow_page;
287 if (!is_present_pte(walker->ptes[walker->level - 1])) 293 u64 spte;
288 return NULL; 294 int metaphysical;
289 295 gfn_t table_gfn;
290 shadow_addr = vcpu->arch.mmu.root_hpa; 296 int r;
291 level = vcpu->arch.mmu.shadow_root_level; 297 pt_element_t curr_pte;
292 if (level == PT32E_ROOT_LEVEL) { 298
293 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 299 if (level == PT_PAGE_TABLE_LEVEL
294 shadow_addr &= PT64_BASE_ADDR_MASK; 300 || (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
295 --level; 301 mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
302 sw->user_fault, sw->write_fault,
303 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
304 sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
305 false);
306 sw->sptep = sptep;
307 return 1;
296 } 308 }
297 309
298 for (; ; level--) { 310 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
299 u32 index = SHADOW_PT_INDEX(addr, level); 311 return 0;
300 struct kvm_mmu_page *shadow_page;
301 u64 shadow_pte;
302 int metaphysical;
303 gfn_t table_gfn;
304
305 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
306 if (level == PT_PAGE_TABLE_LEVEL)
307 break;
308
309 if (largepage && level == PT_DIRECTORY_LEVEL)
310 break;
311 312
312 if (is_shadow_present_pte(*shadow_ent) 313 if (is_large_pte(*sptep)) {
313 && !is_large_pte(*shadow_ent)) { 314 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
314 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; 315 kvm_flush_remote_tlbs(vcpu->kvm);
315 continue; 316 rmap_remove(vcpu->kvm, sptep);
316 } 317 }
317 318
318 if (is_large_pte(*shadow_ent)) 319 if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
319 rmap_remove(vcpu->kvm, shadow_ent); 320 metaphysical = 1;
320 321 if (!is_dirty_pte(gw->ptes[level - 1]))
321 if (level - 1 == PT_PAGE_TABLE_LEVEL 322 access &= ~ACC_WRITE_MASK;
322 && walker->level == PT_DIRECTORY_LEVEL) { 323 table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
323 metaphysical = 1; 324 } else {
324 if (!is_dirty_pte(walker->ptes[level - 1])) 325 metaphysical = 0;
325 access &= ~ACC_WRITE_MASK; 326 table_gfn = gw->table_gfn[level - 2];
326 table_gfn = gpte_to_gfn(walker->ptes[level - 1]); 327 }
327 } else { 328 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
328 metaphysical = 0; 329 metaphysical, access, sptep);
329 table_gfn = walker->table_gfn[level - 2]; 330 if (!metaphysical) {
330 } 331 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
331 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, 332 &curr_pte, sizeof(curr_pte));
332 metaphysical, access, 333 if (r || curr_pte != gw->ptes[level - 2]) {
333 shadow_ent); 334 kvm_release_pfn_clean(sw->pfn);
334 if (!metaphysical) { 335 sw->sptep = NULL;
335 int r; 336 return 1;
336 pt_element_t curr_pte;
337 r = kvm_read_guest_atomic(vcpu->kvm,
338 walker->pte_gpa[level - 2],
339 &curr_pte, sizeof(curr_pte));
340 if (r || curr_pte != walker->ptes[level - 2]) {
341 kvm_release_pfn_clean(pfn);
342 return NULL;
343 }
344 } 337 }
345 shadow_addr = __pa(shadow_page->spt);
346 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
347 | PT_WRITABLE_MASK | PT_USER_MASK;
348 set_shadow_pte(shadow_ent, shadow_pte);
349 } 338 }
350 339
351 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, 340 spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
352 user_fault, write_fault, 341 | PT_WRITABLE_MASK | PT_USER_MASK;
353 walker->ptes[walker->level-1] & PT_DIRTY_MASK, 342 *sptep = spte;
354 ptwrite, largepage, walker->gfn, pfn, false); 343 return 0;
344}
345
346static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
347 struct guest_walker *guest_walker,
348 int user_fault, int write_fault, int largepage,
349 int *ptwrite, pfn_t pfn)
350{
351 struct shadow_walker walker = {
352 .walker = { .entry = FNAME(shadow_walk_entry), },
353 .guest_walker = guest_walker,
354 .user_fault = user_fault,
355 .write_fault = write_fault,
356 .largepage = largepage,
357 .ptwrite = ptwrite,
358 .pfn = pfn,
359 };
360
361 if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
362 return NULL;
363
364 walk_shadow(&walker.walker, vcpu, addr);
355 365
356 return shadow_ent; 366 return walker.sptep;
357} 367}
358 368
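
The rewritten FNAME(fetch) is built on the kvm_shadow_walk interface: the generic walk_shadow() calls ->entry() once per level of the shadow hierarchy, each caller embeds the generic walker in its own context struct and recovers it with container_of(), and a nonzero return stops the walk (FNAME(shadow_invlpg_entry) below uses the same trick). A standalone model of the embedded-visitor pattern:

    #include <stdio.h>
    #include <stddef.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct walk {                       /* generic walker: just the callback */
        int (*entry)(struct walk *w, int level);
    };

    struct fetch_walk {                 /* user context wraps the walker */
        struct walk walker;
        int stop_level;
        int visited;
    };

    static int fetch_entry(struct walk *w, int level)
    {
        struct fetch_walk *fw = container_of(w, struct fetch_walk, walker);

        fw->visited++;
        return level == fw->stop_level;  /* nonzero terminates the walk */
    }

    static void walk_shadow(struct walk *w, int top_level)
    {
        for (int level = top_level; level >= 1; level--)
            if (w->entry(w, level))
                return;
    }

    int main(void)
    {
        struct fetch_walk fw = { .walker = { .entry = fetch_entry },
                                 .stop_level = 1 };
        walk_shadow(&fw.walker, 4);
        printf("visited %d levels\n", fw.visited);
        return 0;
    }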
359/* 369/*
@@ -407,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
407 return 0; 417 return 0;
408 } 418 }
409 419
410 down_read(&current->mm->mmap_sem);
411 if (walker.level == PT_DIRECTORY_LEVEL) { 420 if (walker.level == PT_DIRECTORY_LEVEL) {
412 gfn_t large_gfn; 421 gfn_t large_gfn;
413 large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); 422 large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
@@ -417,9 +426,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
417 } 426 }
418 } 427 }
419 mmu_seq = vcpu->kvm->mmu_notifier_seq; 428 mmu_seq = vcpu->kvm->mmu_notifier_seq;
420 /* implicit mb(), we'll read before PT lock is unlocked */ 429 smp_rmb();
421 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 430 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
422 up_read(&current->mm->mmap_sem);
423 431
424 /* mmio */ 432 /* mmio */
425 if (is_error_pfn(pfn)) { 433 if (is_error_pfn(pfn)) {
@@ -453,6 +461,31 @@ out_unlock:
453 return 0; 461 return 0;
454} 462}
455 463
464static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
465 struct kvm_vcpu *vcpu, u64 addr,
466 u64 *sptep, int level)
467{
468
469 if (level == PT_PAGE_TABLE_LEVEL) {
470 if (is_shadow_present_pte(*sptep))
471 rmap_remove(vcpu->kvm, sptep);
472 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
473 return 1;
474 }
475 if (!is_shadow_present_pte(*sptep))
476 return 1;
477 return 0;
478}
479
480static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
481{
482 struct shadow_walker walker = {
483 .walker = { .entry = FNAME(shadow_invlpg_entry), },
484 };
485
486 walk_shadow(&walker.walker, vcpu, gva);
487}
488
456static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 489static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
457{ 490{
458 struct guest_walker walker; 491 struct guest_walker walker;
@@ -499,12 +532,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
499 } 532 }
500} 533}
501 534
535/*
536 * Using the cached information from sp->gfns is safe because:
537 * - The spte has a reference to the struct page, so the pfn for a given gfn
538 * can't change unless all sptes pointing to it are nuked first.
539 * - Alias changes zap the entire shadow cache.
540 */
541static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
542{
543 int i, offset, nr_present;
544
545 offset = nr_present = 0;
546
547 if (PTTYPE == 32)
548 offset = sp->role.quadrant << PT64_LEVEL_BITS;
549
550 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
551 unsigned pte_access;
552 pt_element_t gpte;
553 gpa_t pte_gpa;
554 gfn_t gfn = sp->gfns[i];
555
556 if (!is_shadow_present_pte(sp->spt[i]))
557 continue;
558
559 pte_gpa = gfn_to_gpa(sp->gfn);
560 pte_gpa += (i+offset) * sizeof(pt_element_t);
561
562 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
563 sizeof(pt_element_t)))
564 return -EINVAL;
565
566 if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
567 !(gpte & PT_ACCESSED_MASK)) {
568 u64 nonpresent;
569
570 rmap_remove(vcpu->kvm, &sp->spt[i]);
571 if (is_present_pte(gpte))
572 nonpresent = shadow_trap_nonpresent_pte;
573 else
574 nonpresent = shadow_notrap_nonpresent_pte;
575 set_shadow_pte(&sp->spt[i], nonpresent);
576 continue;
577 }
578
579 nr_present++;
580 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
581 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
582 is_dirty_pte(gpte), 0, gfn,
583 spte_to_pfn(sp->spt[i]), true, false);
584 }
585
586 return !nr_present;
587}
588
502#undef pt_element_t 589#undef pt_element_t
503#undef guest_walker 590#undef guest_walker
591#undef shadow_walker
504#undef FNAME 592#undef FNAME
505#undef PT_BASE_ADDR_MASK 593#undef PT_BASE_ADDR_MASK
506#undef PT_INDEX 594#undef PT_INDEX
507#undef SHADOW_PT_INDEX
508#undef PT_LEVEL_MASK 595#undef PT_LEVEL_MASK
509#undef PT_DIR_BASE_ADDR_MASK 596#undef PT_DIR_BASE_ADDR_MASK
510#undef PT_LEVEL_BITS 597#undef PT_LEVEL_BITS
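
FNAME(sync_page), added at the end of the template above, is the heart of the out-of-sync shadow code: it revalidates each present shadow PTE against the guest PTE it was built from, drops entries whose gfn, present bit, or accessed bit no longer match, and returns nonzero when nothing survives so the caller can zap the whole page. A compact standalone model of that reconcile loop (the real code also distinguishes trap from notrap nonpresent encodings and re-derives access bits via set_spte()):

    #include <stdio.h>
    #include <stdbool.h>

    enum { NENT = 4 };

    struct shadow_page {
        unsigned long spt[NENT];    /* shadow entries (0 == not present) */
        unsigned long gfns[NENT];   /* gfn each shadow entry was built for */
    };

    static unsigned long guest_pte[NENT];  /* current guest page table */

    static unsigned long gpte_gfn(unsigned long gpte) { return gpte >> 12; }
    static bool gpte_present(unsigned long gpte)      { return gpte & 1; }

    static int sync_page(struct shadow_page *sp)
    {
        int nr_present = 0;

        for (int i = 0; i < NENT; i++) {
            if (!sp->spt[i])
                continue;               /* nothing shadowed here */
            unsigned long gpte = guest_pte[i];
            if (!gpte_present(gpte) || gpte_gfn(gpte) != sp->gfns[i]) {
                sp->spt[i] = 0;         /* stale: drop the shadow entry */
                continue;
            }
            nr_present++;               /* still valid: re-derive access */
        }
        return !nr_present;             /* nothing left: zap the page */
    }

    int main(void)
    {
        struct shadow_page sp = { .spt = {1, 1, 0, 1}, .gfns = {5, 6, 0, 7} };
        guest_pte[0] = (5UL << 12) | 1;    /* unchanged */
        guest_pte[1] = (9UL << 12) | 1;    /* remapped -> dropped  */
        guest_pte[3] = 0;                  /* unmapped -> dropped  */
        printf("zap whole page: %d\n", sync_page(&sp));
        return 0;
    }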
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8233b86c778c..9c4ce657d963 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -18,6 +18,7 @@
18#include "kvm_svm.h" 18#include "kvm_svm.h"
19#include "irq.h" 19#include "irq.h"
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h"
21 22
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/kernel.h> 24#include <linux/kernel.h>
@@ -35,10 +36,6 @@ MODULE_LICENSE("GPL");
35#define IOPM_ALLOC_ORDER 2 36#define IOPM_ALLOC_ORDER 2
36#define MSRPM_ALLOC_ORDER 1 37#define MSRPM_ALLOC_ORDER 1
37 38
38#define DB_VECTOR 1
39#define UD_VECTOR 6
40#define GP_VECTOR 13
41
42#define DR7_GD_MASK (1 << 13) 39#define DR7_GD_MASK (1 << 13)
43#define DR6_BD_MASK (1 << 13) 40#define DR6_BD_MASK (1 << 13)
44 41
@@ -47,7 +44,7 @@ MODULE_LICENSE("GPL");
47 44
48#define SVM_FEATURE_NPT (1 << 0) 45#define SVM_FEATURE_NPT (1 << 0)
49#define SVM_FEATURE_LBRV (1 << 1) 46#define SVM_FEATURE_LBRV (1 << 1)
50#define SVM_DEATURE_SVML (1 << 2) 47#define SVM_FEATURE_SVML (1 << 2)
51 48
52#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 49#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
53 50
@@ -236,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
236 printk(KERN_DEBUG "%s: NOP\n", __func__); 233 printk(KERN_DEBUG "%s: NOP\n", __func__);
237 return; 234 return;
238 } 235 }
239 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) 236 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
240 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", 237 printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
241 __func__, 238 __func__, kvm_rip_read(vcpu), svm->next_rip);
242 svm->vmcb->save.rip,
243 svm->next_rip);
244 239
245 vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; 240 kvm_rip_write(vcpu, svm->next_rip);
246 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 241 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
247 242
248 vcpu->arch.interrupt_window_open = 1; 243 vcpu->arch.interrupt_window_open = 1;
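
kvm_rip_read()/kvm_rip_write() come from the new kvm_cache_regs.h (listed in the diffstat but not shown in this hunk); judging from the call sites, it presumably maintains regs_avail/regs_dirty bitmasks so a register is fetched from the VMCB or VMCS only on first use and written back only if modified. A runnable sketch under that assumption, with simplified names:

    #include <stdio.h>

    enum reg { REG_RAX, REG_RSP, REG_RIP, NR_REGS };

    struct vcpu {
        unsigned long regs[NR_REGS];
        unsigned long regs_avail;   /* bit set: regs[] holds a fresh value  */
        unsigned long regs_dirty;   /* bit set: regs[] must be written back */
    };

    /* stand-in for the expensive VMCS/VMCB field read */
    static unsigned long hw_read(enum reg r) { return 0xfff0 + r; }

    static unsigned long reg_read(struct vcpu *v, enum reg r)
    {
        if (!(v->regs_avail & (1UL << r))) {   /* cache miss: pull from hw */
            v->regs[r] = hw_read(r);
            v->regs_avail |= 1UL << r;
        }
        return v->regs[r];
    }

    static void reg_write(struct vcpu *v, enum reg r, unsigned long val)
    {
        v->regs[r] = val;
        v->regs_avail |= 1UL << r;             /* cached ...               */
        v->regs_dirty |= 1UL << r;             /* ... and must be flushed  */
    }

    int main(void)
    {
        struct vcpu v = { .regs_avail = 0, .regs_dirty = 0 };
        unsigned long rip = reg_read(&v, REG_RIP);   /* first use: hw read */
        reg_write(&v, REG_RIP, rip + 2);             /* skip an instruction */
        printf("rip=%#lx dirty=%#lx\n", v.regs[REG_RIP], v.regs_dirty);
        return 0;
    }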
@@ -530,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm)
530 (1ULL << INTERCEPT_CPUID) | 525 (1ULL << INTERCEPT_CPUID) |
531 (1ULL << INTERCEPT_INVD) | 526 (1ULL << INTERCEPT_INVD) |
532 (1ULL << INTERCEPT_HLT) | 527 (1ULL << INTERCEPT_HLT) |
528 (1ULL << INTERCEPT_INVLPG) |
533 (1ULL << INTERCEPT_INVLPGA) | 529 (1ULL << INTERCEPT_INVLPGA) |
534 (1ULL << INTERCEPT_IOIO_PROT) | 530 (1ULL << INTERCEPT_IOIO_PROT) |
535 (1ULL << INTERCEPT_MSR_PROT) | 531 (1ULL << INTERCEPT_MSR_PROT) |
@@ -581,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)
581 save->dr7 = 0x400; 577 save->dr7 = 0x400;
582 save->rflags = 2; 578 save->rflags = 2;
583 save->rip = 0x0000fff0; 579 save->rip = 0x0000fff0;
580 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
584 581
585 /* 582 /*
586 * cr0 val on cpu init should be 0x60000010, we enable cpu 583 * cr0 val on cpu init should be 0x60000010, we enable cpu
@@ -593,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm)
593 if (npt_enabled) { 590 if (npt_enabled) {
594 /* Setup VMCB for Nested Paging */ 591 /* Setup VMCB for Nested Paging */
595 control->nested_ctl = 1; 592 control->nested_ctl = 1;
596 control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); 593 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
594 (1ULL << INTERCEPT_INVLPG));
597 control->intercept_exceptions &= ~(1 << PF_VECTOR); 595 control->intercept_exceptions &= ~(1 << PF_VECTOR);
598 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| 596 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
599 INTERCEPT_CR3_MASK); 597 INTERCEPT_CR3_MASK);
@@ -615,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
615 init_vmcb(svm); 613 init_vmcb(svm);
616 614
617 if (vcpu->vcpu_id != 0) { 615 if (vcpu->vcpu_id != 0) {
618 svm->vmcb->save.rip = 0; 616 kvm_rip_write(vcpu, 0);
619 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; 617 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
620 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; 618 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
621 } 619 }
620 vcpu->arch.regs_avail = ~0;
621 vcpu->arch.regs_dirty = ~0;
622 622
623 return 0; 623 return 0;
624} 624}
@@ -721,23 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
721 rdtscll(vcpu->arch.host_tsc); 721 rdtscll(vcpu->arch.host_tsc);
722} 722}
723 723
724static void svm_cache_regs(struct kvm_vcpu *vcpu)
725{
726 struct vcpu_svm *svm = to_svm(vcpu);
727
728 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
729 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
730 vcpu->arch.rip = svm->vmcb->save.rip;
731}
732
733static void svm_decache_regs(struct kvm_vcpu *vcpu)
734{
735 struct vcpu_svm *svm = to_svm(vcpu);
736 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
737 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
738 svm->vmcb->save.rip = vcpu->arch.rip;
739}
740
741static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 724static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
742{ 725{
743 return to_svm(vcpu)->vmcb->save.rflags; 726 return to_svm(vcpu)->vmcb->save.rflags;
@@ -1040,7 +1023,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1040 if (npt_enabled) 1023 if (npt_enabled)
1041 svm_flush_tlb(&svm->vcpu); 1024 svm_flush_tlb(&svm->vcpu);
1042 1025
1043 if (event_injection) 1026 if (!npt_enabled && event_injection)
1044 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1027 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1045 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1028 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1046} 1029}
@@ -1139,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1139 1122
1140static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1123static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1141{ 1124{
1142 svm->next_rip = svm->vmcb->save.rip + 1; 1125 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1143 skip_emulated_instruction(&svm->vcpu); 1126 skip_emulated_instruction(&svm->vcpu);
1144 return kvm_emulate_halt(&svm->vcpu); 1127 return kvm_emulate_halt(&svm->vcpu);
1145} 1128}
1146 1129
1147static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1130static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1148{ 1131{
1149 svm->next_rip = svm->vmcb->save.rip + 3; 1132 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1150 skip_emulated_instruction(&svm->vcpu); 1133 skip_emulated_instruction(&svm->vcpu);
1151 kvm_emulate_hypercall(&svm->vcpu); 1134 kvm_emulate_hypercall(&svm->vcpu);
1152 return 1; 1135 return 1;
@@ -1178,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm,
1178 1161
1179static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1162static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1180{ 1163{
1181 svm->next_rip = svm->vmcb->save.rip + 2; 1164 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1182 kvm_emulate_cpuid(&svm->vcpu); 1165 kvm_emulate_cpuid(&svm->vcpu);
1183 return 1; 1166 return 1;
1184} 1167}
1185 1168
1169static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1170{
1171 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
1172 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
1173 return 1;
1174}
1175
1186static int emulate_on_interception(struct vcpu_svm *svm, 1176static int emulate_on_interception(struct vcpu_svm *svm,
1187 struct kvm_run *kvm_run) 1177 struct kvm_run *kvm_run)
1188{ 1178{
@@ -1273,9 +1263,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1273 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, 1263 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
1274 (u32)(data >> 32), handler); 1264 (u32)(data >> 32), handler);
1275 1265
1276 svm->vmcb->save.rax = data & 0xffffffff; 1266 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
1277 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; 1267 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1278 svm->next_rip = svm->vmcb->save.rip + 2; 1268 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1279 skip_emulated_instruction(&svm->vcpu); 1269 skip_emulated_instruction(&svm->vcpu);
1280 } 1270 }
1281 return 1; 1271 return 1;
@@ -1359,13 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1359static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1349static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1360{ 1350{
1361 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1351 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1362 u64 data = (svm->vmcb->save.rax & -1u) 1352 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
1363 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 1353 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1364 1354
1365 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), 1355 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
1366 handler); 1356 handler);
1367 1357
1368 svm->next_rip = svm->vmcb->save.rip + 2; 1358 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1369 if (svm_set_msr(&svm->vcpu, ecx, data)) 1359 if (svm_set_msr(&svm->vcpu, ecx, data))
1370 kvm_inject_gp(&svm->vcpu, 0); 1360 kvm_inject_gp(&svm->vcpu, 0);
1371 else 1361 else
@@ -1436,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1436 [SVM_EXIT_CPUID] = cpuid_interception, 1426 [SVM_EXIT_CPUID] = cpuid_interception,
1437 [SVM_EXIT_INVD] = emulate_on_interception, 1427 [SVM_EXIT_INVD] = emulate_on_interception,
1438 [SVM_EXIT_HLT] = halt_interception, 1428 [SVM_EXIT_HLT] = halt_interception,
1439 [SVM_EXIT_INVLPG] = emulate_on_interception, 1429 [SVM_EXIT_INVLPG] = invlpg_interception,
1440 [SVM_EXIT_INVLPGA] = invalid_op_interception, 1430 [SVM_EXIT_INVLPGA] = invalid_op_interception,
1441 [SVM_EXIT_IOIO] = io_interception, 1431 [SVM_EXIT_IOIO] = io_interception,
1442 [SVM_EXIT_MSR] = msr_interception, 1432 [SVM_EXIT_MSR] = msr_interception,
@@ -1538,6 +1528,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1538 1528
1539 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); 1529 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
1540 1530
1531 ++svm->vcpu.stat.irq_injections;
1541 control = &svm->vmcb->control; 1532 control = &svm->vmcb->control;
1542 control->int_vector = irq; 1533 control->int_vector = irq;
1543 control->int_ctl &= ~V_INTR_PRIO_MASK; 1534 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -1716,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
1716 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 1707 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
1717} 1708}
1718 1709
1710#ifdef CONFIG_X86_64
1711#define R "r"
1712#else
1713#define R "e"
1714#endif
1715
1719static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1716static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1720{ 1717{
1721 struct vcpu_svm *svm = to_svm(vcpu); 1718 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1723,6 +1720,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1723 u16 gs_selector; 1720 u16 gs_selector;
1724 u16 ldt_selector; 1721 u16 ldt_selector;
1725 1722
1723 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
1724 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
1725 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
1726
1726 pre_svm_run(svm); 1727 pre_svm_run(svm);
1727 1728
1728 sync_lapic_to_cr8(vcpu); 1729 sync_lapic_to_cr8(vcpu);
@@ -1750,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1750 local_irq_enable(); 1751 local_irq_enable();
1751 1752
1752 asm volatile ( 1753 asm volatile (
1754 "push %%"R"bp; \n\t"
1755 "mov %c[rbx](%[svm]), %%"R"bx \n\t"
1756 "mov %c[rcx](%[svm]), %%"R"cx \n\t"
1757 "mov %c[rdx](%[svm]), %%"R"dx \n\t"
1758 "mov %c[rsi](%[svm]), %%"R"si \n\t"
1759 "mov %c[rdi](%[svm]), %%"R"di \n\t"
1760 "mov %c[rbp](%[svm]), %%"R"bp \n\t"
1753#ifdef CONFIG_X86_64 1761#ifdef CONFIG_X86_64
1754 "push %%rbp; \n\t"
1755#else
1756 "push %%ebp; \n\t"
1757#endif
1758
1759#ifdef CONFIG_X86_64
1760 "mov %c[rbx](%[svm]), %%rbx \n\t"
1761 "mov %c[rcx](%[svm]), %%rcx \n\t"
1762 "mov %c[rdx](%[svm]), %%rdx \n\t"
1763 "mov %c[rsi](%[svm]), %%rsi \n\t"
1764 "mov %c[rdi](%[svm]), %%rdi \n\t"
1765 "mov %c[rbp](%[svm]), %%rbp \n\t"
1766 "mov %c[r8](%[svm]), %%r8 \n\t" 1762 "mov %c[r8](%[svm]), %%r8 \n\t"
1767 "mov %c[r9](%[svm]), %%r9 \n\t" 1763 "mov %c[r9](%[svm]), %%r9 \n\t"
1768 "mov %c[r10](%[svm]), %%r10 \n\t" 1764 "mov %c[r10](%[svm]), %%r10 \n\t"
@@ -1771,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1771 "mov %c[r13](%[svm]), %%r13 \n\t" 1767 "mov %c[r13](%[svm]), %%r13 \n\t"
1772 "mov %c[r14](%[svm]), %%r14 \n\t" 1768 "mov %c[r14](%[svm]), %%r14 \n\t"
1773 "mov %c[r15](%[svm]), %%r15 \n\t" 1769 "mov %c[r15](%[svm]), %%r15 \n\t"
1774#else
1775 "mov %c[rbx](%[svm]), %%ebx \n\t"
1776 "mov %c[rcx](%[svm]), %%ecx \n\t"
1777 "mov %c[rdx](%[svm]), %%edx \n\t"
1778 "mov %c[rsi](%[svm]), %%esi \n\t"
1779 "mov %c[rdi](%[svm]), %%edi \n\t"
1780 "mov %c[rbp](%[svm]), %%ebp \n\t"
1781#endif 1770#endif
1782 1771
1783#ifdef CONFIG_X86_64
1784 /* Enter guest mode */
1785 "push %%rax \n\t"
1786 "mov %c[vmcb](%[svm]), %%rax \n\t"
1787 __ex(SVM_VMLOAD) "\n\t"
1788 __ex(SVM_VMRUN) "\n\t"
1789 __ex(SVM_VMSAVE) "\n\t"
1790 "pop %%rax \n\t"
1791#else
1792 /* Enter guest mode */ 1772 /* Enter guest mode */
1793 "push %%eax \n\t" 1773 "push %%"R"ax \n\t"
1794 "mov %c[vmcb](%[svm]), %%eax \n\t" 1774 "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
1795 __ex(SVM_VMLOAD) "\n\t" 1775 __ex(SVM_VMLOAD) "\n\t"
1796 __ex(SVM_VMRUN) "\n\t" 1776 __ex(SVM_VMRUN) "\n\t"
1797 __ex(SVM_VMSAVE) "\n\t" 1777 __ex(SVM_VMSAVE) "\n\t"
1798 "pop %%eax \n\t" 1778 "pop %%"R"ax \n\t"
1799#endif
1800 1779
1801 /* Save guest registers, load host registers */ 1780 /* Save guest registers, load host registers */
1781 "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
1782 "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
1783 "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
1784 "mov %%"R"si, %c[rsi](%[svm]) \n\t"
1785 "mov %%"R"di, %c[rdi](%[svm]) \n\t"
1786 "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
1802#ifdef CONFIG_X86_64 1787#ifdef CONFIG_X86_64
1803 "mov %%rbx, %c[rbx](%[svm]) \n\t"
1804 "mov %%rcx, %c[rcx](%[svm]) \n\t"
1805 "mov %%rdx, %c[rdx](%[svm]) \n\t"
1806 "mov %%rsi, %c[rsi](%[svm]) \n\t"
1807 "mov %%rdi, %c[rdi](%[svm]) \n\t"
1808 "mov %%rbp, %c[rbp](%[svm]) \n\t"
1809 "mov %%r8, %c[r8](%[svm]) \n\t" 1788 "mov %%r8, %c[r8](%[svm]) \n\t"
1810 "mov %%r9, %c[r9](%[svm]) \n\t" 1789 "mov %%r9, %c[r9](%[svm]) \n\t"
1811 "mov %%r10, %c[r10](%[svm]) \n\t" 1790 "mov %%r10, %c[r10](%[svm]) \n\t"
@@ -1814,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1814 "mov %%r13, %c[r13](%[svm]) \n\t" 1793 "mov %%r13, %c[r13](%[svm]) \n\t"
1815 "mov %%r14, %c[r14](%[svm]) \n\t" 1794 "mov %%r14, %c[r14](%[svm]) \n\t"
1816 "mov %%r15, %c[r15](%[svm]) \n\t" 1795 "mov %%r15, %c[r15](%[svm]) \n\t"
1817
1818 "pop %%rbp; \n\t"
1819#else
1820 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1821 "mov %%ecx, %c[rcx](%[svm]) \n\t"
1822 "mov %%edx, %c[rdx](%[svm]) \n\t"
1823 "mov %%esi, %c[rsi](%[svm]) \n\t"
1824 "mov %%edi, %c[rdi](%[svm]) \n\t"
1825 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1826
1827 "pop %%ebp; \n\t"
1828#endif 1796#endif
1797 "pop %%"R"bp"
1829 : 1798 :
1830 : [svm]"a"(svm), 1799 : [svm]"a"(svm),
1831 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), 1800 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
@@ -1846,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1846 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) 1815 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
1847#endif 1816#endif
1848 : "cc", "memory" 1817 : "cc", "memory"
1818 , R"bx", R"cx", R"dx", R"si", R"di"
1849#ifdef CONFIG_X86_64 1819#ifdef CONFIG_X86_64
1850 , "rbx", "rcx", "rdx", "rsi", "rdi"
1851 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" 1820 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
1852#else
1853 , "ebx", "ecx", "edx" , "esi", "edi"
1854#endif 1821#endif
1855 ); 1822 );
1856 1823
@@ -1858,6 +1825,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1858 load_db_regs(svm->host_db_regs); 1825 load_db_regs(svm->host_db_regs);
1859 1826
1860 vcpu->arch.cr2 = svm->vmcb->save.cr2; 1827 vcpu->arch.cr2 = svm->vmcb->save.cr2;
1828 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
1829 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
1830 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
1861 1831
1862 write_dr6(svm->host_dr6); 1832 write_dr6(svm->host_dr6);
1863 write_dr7(svm->host_dr7); 1833 write_dr7(svm->host_dr7);
@@ -1879,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1879 svm->next_rip = 0; 1849 svm->next_rip = 0;
1880} 1850}
1881 1851
1852#undef R
1853
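
The R macro leans on C's adjacent-string-literal concatenation: with R defined as "r" or "e", a template fragment like "mov %c[rbx](%[svm]), %%"R"bx" collapses at compile time into one literal naming %rbx or %ebx, so a single asm body serves both word sizes and only the r8-r15 moves stay under #ifdef CONFIG_X86_64. A tiny demonstration of the string pasting (compile with -DDEMO_32BIT for the 32-bit flavour):

    #include <stdio.h>

    #ifdef DEMO_32BIT
    #define R "e"
    #else
    #define R "r"
    #endif

    int main(void)
    {
        /* Adjacent string literals merge at compile time, so this is the
         * exact text the asm() template would receive. */
        const char *tmpl = "mov %c[rbx](%[svm]), %%" R "bx \n\t";
        printf("%s", tmpl);   /* prints ...%%rbx... or ...%%ebx... */
        return 0;
    }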
1882static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) 1854static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1883{ 1855{
1884 struct vcpu_svm *svm = to_svm(vcpu); 1856 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1977,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = {
1977 .set_gdt = svm_set_gdt, 1949 .set_gdt = svm_set_gdt,
1978 .get_dr = svm_get_dr, 1950 .get_dr = svm_get_dr,
1979 .set_dr = svm_set_dr, 1951 .set_dr = svm_set_dr,
1980 .cache_regs = svm_cache_regs,
1981 .decache_regs = svm_decache_regs,
1982 .get_rflags = svm_get_rflags, 1952 .get_rflags = svm_get_rflags,
1983 .set_rflags = svm_set_rflags, 1953 .set_rflags = svm_set_rflags,
1984 1954
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7041cc52b562..2643b430d83a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -26,6 +26,8 @@
26#include <linux/highmem.h> 26#include <linux/highmem.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/moduleparam.h> 28#include <linux/moduleparam.h>
29#include "kvm_cache_regs.h"
30#include "x86.h"
29 31
30#include <asm/io.h> 32#include <asm/io.h>
31#include <asm/desc.h> 33#include <asm/desc.h>
@@ -47,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0);
47static int enable_ept = 1; 49static int enable_ept = 1;
48module_param(enable_ept, bool, 0); 50module_param(enable_ept, bool, 0);
49 51
52static int emulate_invalid_guest_state = 0;
53module_param(emulate_invalid_guest_state, bool, 0);
54
50struct vmcs { 55struct vmcs {
51 u32 revision_id; 56 u32 revision_id;
52 u32 abort; 57 u32 abort;
@@ -56,6 +61,7 @@ struct vmcs {
56struct vcpu_vmx { 61struct vcpu_vmx {
57 struct kvm_vcpu vcpu; 62 struct kvm_vcpu vcpu;
58 struct list_head local_vcpus_link; 63 struct list_head local_vcpus_link;
64 unsigned long host_rsp;
59 int launched; 65 int launched;
60 u8 fail; 66 u8 fail;
61 u32 idt_vectoring_info; 67 u32 idt_vectoring_info;
@@ -83,6 +89,7 @@ struct vcpu_vmx {
83 } irq; 89 } irq;
84 } rmode; 90 } rmode;
85 int vpid; 91 int vpid;
92 bool emulation_required;
86}; 93};
87 94
88static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 95static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -468,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
468 if (!vcpu->fpu_active) 475 if (!vcpu->fpu_active)
469 eb |= 1u << NM_VECTOR; 476 eb |= 1u << NM_VECTOR;
470 if (vcpu->guest_debug.enabled) 477 if (vcpu->guest_debug.enabled)
471 eb |= 1u << 1; 478 eb |= 1u << DB_VECTOR;
472 if (vcpu->arch.rmode.active) 479 if (vcpu->arch.rmode.active)
473 eb = ~0; 480 eb = ~0;
474 if (vm_need_ept()) 481 if (vm_need_ept())
@@ -715,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
715 unsigned long rip; 722 unsigned long rip;
716 u32 interruptibility; 723 u32 interruptibility;
717 724
718 rip = vmcs_readl(GUEST_RIP); 725 rip = kvm_rip_read(vcpu);
719 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 726 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
720 vmcs_writel(GUEST_RIP, rip); 727 kvm_rip_write(vcpu, rip);
721 728
722 /* 729 /*
723 * We emulated an instruction, so temporary interrupt blocking 730 * We emulated an instruction, so temporary interrupt blocking
@@ -733,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
733static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 740static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
734 bool has_error_code, u32 error_code) 741 bool has_error_code, u32 error_code)
735{ 742{
743 struct vcpu_vmx *vmx = to_vmx(vcpu);
744
745 if (has_error_code)
746 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
747
748 if (vcpu->arch.rmode.active) {
749 vmx->rmode.irq.pending = true;
750 vmx->rmode.irq.vector = nr;
751 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
752 if (nr == BP_VECTOR)
753 vmx->rmode.irq.rip++;
754 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
755 nr | INTR_TYPE_SOFT_INTR
756 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
757 | INTR_INFO_VALID_MASK);
758 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
759 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
760 return;
761 }
762
736 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 763 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
737 nr | INTR_TYPE_EXCEPTION 764 nr | INTR_TYPE_EXCEPTION
738 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) 765 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
739 | INTR_INFO_VALID_MASK); 766 | INTR_INFO_VALID_MASK);
740 if (has_error_code)
741 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
742} 767}
743 768
744static bool vmx_exception_injected(struct kvm_vcpu *vcpu) 769static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
745{ 770{
746 struct vcpu_vmx *vmx = to_vmx(vcpu); 771 return false;
747
748 return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
749} 772}
750 773
751/* 774/*
@@ -947,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
947 return ret; 970 return ret;
948} 971}
949 972
950/* 973static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
951 * Sync the rsp and rip registers into the vcpu structure. This allows
952 * registers to be accessed by indexing vcpu->arch.regs.
953 */
954static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
955{
956 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
957 vcpu->arch.rip = vmcs_readl(GUEST_RIP);
958}
959
960/*
961 * Syncs rsp and rip back into the vmcs. Should be called after possible
962 * modification.
963 */
964static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
965{ 974{
966 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 975 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
967 vmcs_writel(GUEST_RIP, vcpu->arch.rip); 976 switch (reg) {
977 case VCPU_REGS_RSP:
978 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
979 break;
980 case VCPU_REGS_RIP:
981 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
982 break;
983 default:
984 break;
985 }
968} 986}
969 987
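
vmx_cache_reg() replaces the old bulk cache_regs()/decache_regs() hooks (deleted from svm.c above): common code now asks for exactly one register when it needs it, instead of syncing RAX/RSP/RIP around every exit. A runnable sketch of how a consumer plausibly drives the new hook (names simplified, hardware reads stubbed out):

    #include <stdio.h>

    enum kvm_reg { VCPU_REGS_RSP, VCPU_REGS_RIP, NR_VCPU_REGS };

    struct vcpu {
        unsigned long regs[NR_VCPU_REGS];
        unsigned long regs_avail;
    };

    struct x86_ops {
        void (*cache_reg)(struct vcpu *v, enum kvm_reg reg);  /* new hook */
    };

    static void vmx_cache_reg(struct vcpu *v, enum kvm_reg reg)
    {
        v->regs_avail |= 1UL << reg;
        switch (reg) {                    /* each case is one vmcs_readl() */
        case VCPU_REGS_RSP: v->regs[VCPU_REGS_RSP] = 0x7000; break;
        case VCPU_REGS_RIP: v->regs[VCPU_REGS_RIP] = 0xfff0; break;
        default: break;
        }
    }

    static unsigned long register_read(struct x86_ops *ops, struct vcpu *v,
                                       enum kvm_reg reg)
    {
        if (!(v->regs_avail & (1UL << reg)))
            ops->cache_reg(v, reg);       /* only touch hw when asked */
        return v->regs[reg];
    }

    int main(void)
    {
        struct x86_ops ops = { .cache_reg = vmx_cache_reg };
        struct vcpu v = { .regs_avail = 0 };
        printf("rip=%#lx\n", register_read(&ops, &v, VCPU_REGS_RIP));
        return 0;
    }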
970static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) 988static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -1007,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
1007 1025
1008static int vmx_get_irq(struct kvm_vcpu *vcpu) 1026static int vmx_get_irq(struct kvm_vcpu *vcpu)
1009{ 1027{
1010 struct vcpu_vmx *vmx = to_vmx(vcpu); 1028 if (!vcpu->arch.interrupt.pending)
1011 u32 idtv_info_field; 1029 return -1;
1012 1030 return vcpu->arch.interrupt.nr;
1013 idtv_info_field = vmx->idt_vectoring_info;
1014 if (idtv_info_field & INTR_INFO_VALID_MASK) {
1015 if (is_external_interrupt(idtv_info_field))
1016 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
1017 else
1018 printk(KERN_DEBUG "pending exception: not handled yet\n");
1019 }
1020 return -1;
1021} 1031}
1022 1032
1023static __init int cpu_has_kvm_support(void) 1033static __init int cpu_has_kvm_support(void)
@@ -1031,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void)
1031 u64 msr; 1041 u64 msr;
1032 1042
1033 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 1043 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1034 return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | 1044 return (msr & (FEATURE_CONTROL_LOCKED |
1035 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1045 FEATURE_CONTROL_VMXON_ENABLED))
1036 == MSR_IA32_FEATURE_CONTROL_LOCKED; 1046 == FEATURE_CONTROL_LOCKED;
1037 /* locked but not enabled */ 1047 /* locked but not enabled */
1038} 1048}
1039 1049
@@ -1045,14 +1055,14 @@ static void hardware_enable(void *garbage)
1045 1055
1046 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 1056 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1047 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1057 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1048 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | 1058 if ((old & (FEATURE_CONTROL_LOCKED |
1049 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1059 FEATURE_CONTROL_VMXON_ENABLED))
1050 != (MSR_IA32_FEATURE_CONTROL_LOCKED | 1060 != (FEATURE_CONTROL_LOCKED |
1051 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1061 FEATURE_CONTROL_VMXON_ENABLED))
1052 /* enable and lock */ 1062 /* enable and lock */
1053 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 1063 wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
1054 MSR_IA32_FEATURE_CONTROL_LOCKED | 1064 FEATURE_CONTROL_LOCKED |
1055 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); 1065 FEATURE_CONTROL_VMXON_ENABLED);
1056 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1066 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1057 asm volatile (ASM_VMX_VMXON_RAX 1067 asm volatile (ASM_VMX_VMXON_RAX
1058 : : "a"(&phys_addr), "m"(phys_addr) 1068 : : "a"(&phys_addr), "m"(phys_addr)
@@ -1120,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1120 CPU_BASED_CR3_STORE_EXITING | 1130 CPU_BASED_CR3_STORE_EXITING |
1121 CPU_BASED_USE_IO_BITMAPS | 1131 CPU_BASED_USE_IO_BITMAPS |
1122 CPU_BASED_MOV_DR_EXITING | 1132 CPU_BASED_MOV_DR_EXITING |
1123 CPU_BASED_USE_TSC_OFFSETING; 1133 CPU_BASED_USE_TSC_OFFSETING |
1134 CPU_BASED_INVLPG_EXITING;
1124 opt = CPU_BASED_TPR_SHADOW | 1135 opt = CPU_BASED_TPR_SHADOW |
1125 CPU_BASED_USE_MSR_BITMAPS | 1136 CPU_BASED_USE_MSR_BITMAPS |
1126 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 1137 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1149,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1149 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 1160 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
1150#endif 1161#endif
1151 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 1162 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
1152 /* CR3 accesses don't need to cause VM Exits when EPT enabled */ 1163 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
1164 enabled */
1153 min &= ~(CPU_BASED_CR3_LOAD_EXITING | 1165 min &= ~(CPU_BASED_CR3_LOAD_EXITING |
1154 CPU_BASED_CR3_STORE_EXITING); 1166 CPU_BASED_CR3_STORE_EXITING |
1167 CPU_BASED_INVLPG_EXITING);
1155 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 1168 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1156 &_cpu_based_exec_control) < 0) 1169 &_cpu_based_exec_control) < 0)
1157 return -EIO; 1170 return -EIO;
@@ -1288,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1288static void enter_pmode(struct kvm_vcpu *vcpu) 1301static void enter_pmode(struct kvm_vcpu *vcpu)
1289{ 1302{
1290 unsigned long flags; 1303 unsigned long flags;
1304 struct vcpu_vmx *vmx = to_vmx(vcpu);
1291 1305
1306 vmx->emulation_required = 1;
1292 vcpu->arch.rmode.active = 0; 1307 vcpu->arch.rmode.active = 0;
1293 1308
1294 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); 1309 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
@@ -1305,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1305 1320
1306 update_exception_bitmap(vcpu); 1321 update_exception_bitmap(vcpu);
1307 1322
1323 if (emulate_invalid_guest_state)
1324 return;
1325
1308 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); 1326 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1309 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); 1327 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1310 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); 1328 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
@@ -1345,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1345static void enter_rmode(struct kvm_vcpu *vcpu) 1363static void enter_rmode(struct kvm_vcpu *vcpu)
1346{ 1364{
1347 unsigned long flags; 1365 unsigned long flags;
1366 struct vcpu_vmx *vmx = to_vmx(vcpu);
1348 1367
1368 vmx->emulation_required = 1;
1349 vcpu->arch.rmode.active = 1; 1369 vcpu->arch.rmode.active = 1;
1350 1370
1351 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1371 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
@@ -1367,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1367 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 1387 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1368 update_exception_bitmap(vcpu); 1388 update_exception_bitmap(vcpu);
1369 1389
1390 if (emulate_invalid_guest_state)
1391 goto continue_rmode;
1392
1370 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); 1393 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1371 vmcs_write32(GUEST_SS_LIMIT, 0xffff); 1394 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1372 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); 1395 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
@@ -1382,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1382 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); 1405 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1383 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); 1406 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1384 1407
1408continue_rmode:
1385 kvm_mmu_reset_context(vcpu); 1409 kvm_mmu_reset_context(vcpu);
1386 init_rmode(vcpu->kvm); 1410 init_rmode(vcpu->kvm);
1387} 1411}
@@ -1715,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1715 vmcs_writel(GUEST_GDTR_BASE, dt->base); 1739 vmcs_writel(GUEST_GDTR_BASE, dt->base);
1716} 1740}
1717 1741
1742static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
1743{
1744 struct kvm_segment var;
1745 u32 ar;
1746
1747 vmx_get_segment(vcpu, &var, seg);
1748 ar = vmx_segment_access_rights(&var);
1749
1750 if (var.base != (var.selector << 4))
1751 return false;
1752 if (var.limit != 0xffff)
1753 return false;
1754 if (ar != 0xf3)
1755 return false;
1756
1757 return true;
1758}
1759
1760static bool code_segment_valid(struct kvm_vcpu *vcpu)
1761{
1762 struct kvm_segment cs;
1763 unsigned int cs_rpl;
1764
1765 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1766 cs_rpl = cs.selector & SELECTOR_RPL_MASK;
1767
1768 if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
1769 return false;
1770 if (!cs.s)
1771 return false;
1772 if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) {
1773 if (cs.dpl > cs_rpl)
1774 return false;
1775 } else if (cs.type & AR_TYPE_CODE_MASK) {
1776 if (cs.dpl != cs_rpl)
1777 return false;
1778 }
1779 if (!cs.present)
1780 return false;
1781
1782 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
1783 return true;
1784}
1785
1786static bool stack_segment_valid(struct kvm_vcpu *vcpu)
1787{
1788 struct kvm_segment ss;
1789 unsigned int ss_rpl;
1790
1791 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1792 ss_rpl = ss.selector & SELECTOR_RPL_MASK;
1793
1794 if (ss.type != 3 && ss.type != 7)
1795 return false;
1796 if (!ss.s)
1797 return false;
1798 if (ss.dpl != ss_rpl) /* DPL != RPL */
1799 return false;
1800 if (!ss.present)
1801 return false;
1802
1803 return true;
1804}
1805
1806static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
1807{
1808 struct kvm_segment var;
1809 unsigned int rpl;
1810
1811 vmx_get_segment(vcpu, &var, seg);
1812 rpl = var.selector & SELECTOR_RPL_MASK;
1813
1814 if (!var.s)
1815 return false;
1816 if (!var.present)
1817 return false;
1818 if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
1819 if (var.dpl < rpl) /* DPL < RPL */
1820 return false;
1821 }
1822
1823 /* TODO: Add other members to kvm_segment_field to allow checking for other access
1824 * rights flags
1825 */
1826 return true;
1827}
1828
1829static bool tr_valid(struct kvm_vcpu *vcpu)
1830{
1831 struct kvm_segment tr;
1832
1833 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
1834
1835 if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
1836 return false;
1837 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
1838 return false;
1839 if (!tr.present)
1840 return false;
1841
1842 return true;
1843}
1844
1845static bool ldtr_valid(struct kvm_vcpu *vcpu)
1846{
1847 struct kvm_segment ldtr;
1848
1849 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
1850
1851 if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
1852 return false;
1853 if (ldtr.type != 2)
1854 return false;
1855 if (!ldtr.present)
1856 return false;
1857
1858 return true;
1859}
1860
1861static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
1862{
1863 struct kvm_segment cs, ss;
1864
1865 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1866 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1867
1868 return ((cs.selector & SELECTOR_RPL_MASK) ==
1869 (ss.selector & SELECTOR_RPL_MASK));
1870}
1871
1872/*
1873 * Check if guest state is valid. Returns true if valid, false if
1874 * not.
1875 * We assume that registers are always usable
1876 */
1877static bool guest_state_valid(struct kvm_vcpu *vcpu)
1878{
1879 /* real mode guest state checks */
1880 if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
1881 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
1882 return false;
1883 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
1884 return false;
1885 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
1886 return false;
1887 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
1888 return false;
1889 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
1890 return false;
1891 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
1892 return false;
1893 } else {
1894 /* protected mode guest state checks */
1895 if (!cs_ss_rpl_check(vcpu))
1896 return false;
1897 if (!code_segment_valid(vcpu))
1898 return false;
1899 if (!stack_segment_valid(vcpu))
1900 return false;
1901 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
1902 return false;
1903 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
1904 return false;
1905 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
1906 return false;
1907 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
1908 return false;
1909 if (!tr_valid(vcpu))
1910 return false;
1911 if (!ldtr_valid(vcpu))
1912 return false;
1913 }
1914 /* TODO:
1915 * - Add checks on RIP
1916 * - Add checks on RFLAGS
1917 */
1918
1919 return true;
1920}
1921
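
These validity checks implement the architectural VM-entry segment rules backing the new emulate_invalid_guest_state parameter: enter_pmode()/enter_rmode() set vmx->emulation_required, and when the flag is on the guest is presumably kept in the emulator until guest_state_valid() passes, rather than having its segment state fixed up for hardware entry. A toy harness of that gate, with the control flow assumed rather than taken from this hunk:

    #include <stdio.h>
    #include <stdbool.h>

    /* Illustrative harness only: models the control flow, not kernel APIs. */
    static int emulate_invalid_guest_state = 1;   /* the new module param   */
    static int steps_until_valid = 3;             /* pretend guest fixes up */

    static bool guest_state_valid(void) { return steps_until_valid <= 0; }
    static void emulate_one_instruction(void) { steps_until_valid--; }

    static void vcpu_enter_guest(bool emulation_required)
    {
        if (emulation_required && emulate_invalid_guest_state) {
            while (!guest_state_valid())   /* cannot VMLAUNCH yet */
                emulate_one_instruction();
        }
        printf("state valid, hardware VM entry\n");
    }

    int main(void)
    {
        vcpu_enter_guest(true);
        return 0;
    }

(The full commit's handler presumably also bails out for pending signals and vcpu requests; that bookkeeping is omitted here.)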
1718static int init_rmode_tss(struct kvm *kvm) 1922static int init_rmode_tss(struct kvm *kvm)
1719{ 1923{
1720 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 1924 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
@@ -1726,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm)
1726 if (r < 0) 1930 if (r < 0)
1727 goto out; 1931 goto out;
1728 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 1932 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1729 r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); 1933 r = kvm_write_guest_page(kvm, fn++, &data,
1934 TSS_IOPB_BASE_OFFSET, sizeof(u16));
1730 if (r < 0) 1935 if (r < 0)
1731 goto out; 1936 goto out;
1732 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); 1937 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
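The named constant replacing the bare 0x66 above is the architectural offset of the I/O map base field in a 32-bit TSS; the value written there points the I/O bitmap past the TSS proper and the interrupt-redirection bitmap. A sketch of the implied layout (constant values assumed to match this series' headers):

    /* real-mode TSS layout set up here (illustration only) */
    #define TSS_BASE_SIZE        0x68  /* the 32-bit TSS structure itself */
    #define TSS_IOPB_BASE_OFFSET 0x66  /* I/O map base field in the TSS */
    /* word at offset 0x66 = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE
     *                     = offset of the I/O permission bitmap */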
@@ -1789,7 +1994,7 @@ static void seg_setup(int seg)
1789 vmcs_write16(sf->selector, 0); 1994 vmcs_write16(sf->selector, 0);
1790 vmcs_writel(sf->base, 0); 1995 vmcs_writel(sf->base, 0);
1791 vmcs_write32(sf->limit, 0xffff); 1996 vmcs_write32(sf->limit, 0xffff);
1792 vmcs_write32(sf->ar_bytes, 0x93); 1997 vmcs_write32(sf->ar_bytes, 0xf3);
1793} 1998}
1794 1999
1795static int alloc_apic_access_page(struct kvm *kvm) 2000static int alloc_apic_access_page(struct kvm *kvm)
@@ -1808,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
1808 if (r) 2013 if (r)
1809 goto out; 2014 goto out;
1810 2015
1811 down_read(&current->mm->mmap_sem);
1812 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); 2016 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
1813 up_read(&current->mm->mmap_sem);
1814out: 2017out:
1815 up_write(&kvm->slots_lock); 2018 up_write(&kvm->slots_lock);
1816 return r; 2019 return r;
@@ -1832,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm)
1832 if (r) 2035 if (r)
1833 goto out; 2036 goto out;
1834 2037
1835 down_read(&current->mm->mmap_sem);
1836 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, 2038 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
1837 VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); 2039 VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
1838 up_read(&current->mm->mmap_sem);
1839out: 2040out:
1840 up_write(&kvm->slots_lock); 2041 up_write(&kvm->slots_lock);
1841 return r; 2042 return r;
@@ -1917,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1917 } 2118 }
1918 if (!vm_need_ept()) 2119 if (!vm_need_ept())
1919 exec_control |= CPU_BASED_CR3_STORE_EXITING | 2120 exec_control |= CPU_BASED_CR3_STORE_EXITING |
1920 CPU_BASED_CR3_LOAD_EXITING; 2121 CPU_BASED_CR3_LOAD_EXITING |
2122 CPU_BASED_INVLPG_EXITING;
1921 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); 2123 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1922 2124
1923 if (cpu_has_secondary_exec_ctrls()) { 2125 if (cpu_has_secondary_exec_ctrls()) {
@@ -2019,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2019 u64 msr; 2221 u64 msr;
2020 int ret; 2222 int ret;
2021 2223
2224 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2022 down_read(&vcpu->kvm->slots_lock); 2225 down_read(&vcpu->kvm->slots_lock);
2023 if (!init_rmode(vmx->vcpu.kvm)) { 2226 if (!init_rmode(vmx->vcpu.kvm)) {
2024 ret = -ENOMEM; 2227 ret = -ENOMEM;
@@ -2036,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2036 2239
2037 fx_init(&vmx->vcpu); 2240 fx_init(&vmx->vcpu);
2038 2241
2242 seg_setup(VCPU_SREG_CS);
2039 /* 2243 /*
2040 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode 2244 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
2041 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. 2245 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
@@ -2047,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2047 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); 2251 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
2048 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); 2252 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
2049 } 2253 }
2050 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
2051 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
2052 2254
2053 seg_setup(VCPU_SREG_DS); 2255 seg_setup(VCPU_SREG_DS);
2054 seg_setup(VCPU_SREG_ES); 2256 seg_setup(VCPU_SREG_ES);
@@ -2072,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2072 2274
2073 vmcs_writel(GUEST_RFLAGS, 0x02); 2275 vmcs_writel(GUEST_RFLAGS, 0x02);
2074 if (vmx->vcpu.vcpu_id == 0) 2276 if (vmx->vcpu.vcpu_id == 0)
2075 vmcs_writel(GUEST_RIP, 0xfff0); 2277 kvm_rip_write(vcpu, 0xfff0);
2076 else 2278 else
2077 vmcs_writel(GUEST_RIP, 0); 2279 kvm_rip_write(vcpu, 0);
2078 vmcs_writel(GUEST_RSP, 0); 2280 kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
2079 2281
2080 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ 2282 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
2081 vmcs_writel(GUEST_DR7, 0x400); 2283 vmcs_writel(GUEST_DR7, 0x400);
@@ -2125,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2125 2327
2126 ret = 0; 2328 ret = 0;
2127 2329
2330 /* HACK: Don't enable emulation on guest boot/reset */
2331 vmx->emulation_required = 0;
2332
2128out: 2333out:
2129 up_read(&vcpu->kvm->slots_lock); 2334 up_read(&vcpu->kvm->slots_lock);
2130 return ret; 2335 return ret;
@@ -2136,14 +2341,15 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2136 2341
2137 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); 2342 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
2138 2343
2344 ++vcpu->stat.irq_injections;
2139 if (vcpu->arch.rmode.active) { 2345 if (vcpu->arch.rmode.active) {
2140 vmx->rmode.irq.pending = true; 2346 vmx->rmode.irq.pending = true;
2141 vmx->rmode.irq.vector = irq; 2347 vmx->rmode.irq.vector = irq;
2142 vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); 2348 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2143 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2349 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2144 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); 2350 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2145 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); 2351 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2146 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); 2352 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2147 return; 2353 return;
2148 } 2354 }
2149 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2355 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -2154,7 +2360,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2154{ 2360{
2155 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2361 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2156 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2362 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2157 vcpu->arch.nmi_pending = 0;
2158} 2363}
2159 2364
2160static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 2365static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
@@ -2166,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2166 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); 2371 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
2167 if (!vcpu->arch.irq_pending[word_index]) 2372 if (!vcpu->arch.irq_pending[word_index])
2168 clear_bit(word_index, &vcpu->arch.irq_summary); 2373 clear_bit(word_index, &vcpu->arch.irq_summary);
2169 vmx_inject_irq(vcpu, irq); 2374 kvm_queue_interrupt(vcpu, irq);
2170} 2375}
2171 2376
2172 2377
@@ -2180,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
2180 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); 2385 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2181 2386
2182 if (vcpu->arch.interrupt_window_open && 2387 if (vcpu->arch.interrupt_window_open &&
2183 vcpu->arch.irq_summary && 2388 vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
2184 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
2185 /*
2186 * If interrupts enabled, and not blocked by sti or mov ss. Good.
2187 */
2188 kvm_do_inject_irq(vcpu); 2389 kvm_do_inject_irq(vcpu);
2189 2390
2391 if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
2392 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
2393
2190 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 2394 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2191 if (!vcpu->arch.interrupt_window_open && 2395 if (!vcpu->arch.interrupt_window_open &&
2192 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) 2396 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
@@ -2237,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
2237static int handle_rmode_exception(struct kvm_vcpu *vcpu, 2441static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2238 int vec, u32 err_code) 2442 int vec, u32 err_code)
2239{ 2443{
2240 if (!vcpu->arch.rmode.active)
2241 return 0;
2242
2243 /* 2444 /*
2244	 * Instructions with the address-size override prefix (opcode 0x67) 2445
2245	 * cause an #SS fault with error code 0 in VM86 mode. 2446
@@ -2247,6 +2448,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2247 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2448 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2248 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) 2449 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
2249 return 1; 2450 return 1;
2451 /*
2452 * Forward all other exceptions that are valid in real mode.
2453 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
2454 * the required debugging infrastructure rework.
2455 */
2456 switch (vec) {
2457 case DE_VECTOR:
2458 case DB_VECTOR:
2459 case BP_VECTOR:
2460 case OF_VECTOR:
2461 case BR_VECTOR:
2462 case UD_VECTOR:
2463 case DF_VECTOR:
2464 case SS_VECTOR:
2465 case GP_VECTOR:
2466 case MF_VECTOR:
2467 kvm_queue_exception(vcpu, vec);
2468 return 1;
2469 }
2250 return 0; 2470 return 0;
2251} 2471}
2252 2472
@@ -2288,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2288 } 2508 }
2289 2509
2290 error_code = 0; 2510 error_code = 0;
2291 rip = vmcs_readl(GUEST_RIP); 2511 rip = kvm_rip_read(vcpu);
2292 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 2512 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
2293 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 2513 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2294 if (is_page_fault(intr_info)) { 2514 if (is_page_fault(intr_info)) {
@@ -2298,7 +2518,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2298 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2518 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2299 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2519 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2300 (u32)((u64)cr2 >> 32), handler); 2520 (u32)((u64)cr2 >> 32), handler);
2301 if (vect_info & VECTORING_INFO_VALID_MASK) 2521 if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
2302 kvm_mmu_unprotect_page_virt(vcpu, cr2); 2522 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2303 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2523 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2304 } 2524 }
@@ -2386,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2386 reg = (exit_qualification >> 8) & 15; 2606 reg = (exit_qualification >> 8) & 15;
2387 switch ((exit_qualification >> 4) & 3) { 2607 switch ((exit_qualification >> 4) & 3) {
2388 case 0: /* mov to cr */ 2608 case 0: /* mov to cr */
2389 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], 2609 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
2390 (u32)((u64)vcpu->arch.regs[reg] >> 32), handler); 2610 (u32)kvm_register_read(vcpu, reg),
2611 (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2612 handler);
2391 switch (cr) { 2613 switch (cr) {
2392 case 0: 2614 case 0:
2393 vcpu_load_rsp_rip(vcpu); 2615 kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
2394 kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
2395 skip_emulated_instruction(vcpu); 2616 skip_emulated_instruction(vcpu);
2396 return 1; 2617 return 1;
2397 case 3: 2618 case 3:
2398 vcpu_load_rsp_rip(vcpu); 2619 kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
2399 kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
2400 skip_emulated_instruction(vcpu); 2620 skip_emulated_instruction(vcpu);
2401 return 1; 2621 return 1;
2402 case 4: 2622 case 4:
2403 vcpu_load_rsp_rip(vcpu); 2623 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
2404 kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
2405 skip_emulated_instruction(vcpu); 2624 skip_emulated_instruction(vcpu);
2406 return 1; 2625 return 1;
2407 case 8: 2626 case 8:
2408 vcpu_load_rsp_rip(vcpu); 2627 kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
2409 kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
2410 skip_emulated_instruction(vcpu); 2628 skip_emulated_instruction(vcpu);
2411 if (irqchip_in_kernel(vcpu->kvm)) 2629 if (irqchip_in_kernel(vcpu->kvm))
2412 return 1; 2630 return 1;
@@ -2415,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2415 }; 2633 };
2416 break; 2634 break;
2417 case 2: /* clts */ 2635 case 2: /* clts */
2418 vcpu_load_rsp_rip(vcpu);
2419 vmx_fpu_deactivate(vcpu); 2636 vmx_fpu_deactivate(vcpu);
2420 vcpu->arch.cr0 &= ~X86_CR0_TS; 2637 vcpu->arch.cr0 &= ~X86_CR0_TS;
2421 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 2638 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
@@ -2426,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2426 case 1: /*mov from cr*/ 2643 case 1: /*mov from cr*/
2427 switch (cr) { 2644 switch (cr) {
2428 case 3: 2645 case 3:
2429 vcpu_load_rsp_rip(vcpu); 2646 kvm_register_write(vcpu, reg, vcpu->arch.cr3);
2430 vcpu->arch.regs[reg] = vcpu->arch.cr3;
2431 vcpu_put_rsp_rip(vcpu);
2432 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, 2647 KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
2433 (u32)vcpu->arch.regs[reg], 2648 (u32)kvm_register_read(vcpu, reg),
2434 (u32)((u64)vcpu->arch.regs[reg] >> 32), 2649 (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2435 handler); 2650 handler);
2436 skip_emulated_instruction(vcpu); 2651 skip_emulated_instruction(vcpu);
2437 return 1; 2652 return 1;
2438 case 8: 2653 case 8:
2439 vcpu_load_rsp_rip(vcpu); 2654 kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
2440 vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
2441 vcpu_put_rsp_rip(vcpu);
2442 KVMTRACE_2D(CR_READ, vcpu, (u32)cr, 2655 KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
2443 (u32)vcpu->arch.regs[reg], handler); 2656 (u32)kvm_register_read(vcpu, reg), handler);
2444 skip_emulated_instruction(vcpu); 2657 skip_emulated_instruction(vcpu);
2445 return 1; 2658 return 1;
2446 } 2659 }
@@ -2472,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2472 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 2685 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2473 dr = exit_qualification & 7; 2686 dr = exit_qualification & 7;
2474 reg = (exit_qualification >> 8) & 15; 2687 reg = (exit_qualification >> 8) & 15;
2475 vcpu_load_rsp_rip(vcpu);
2476 if (exit_qualification & 16) { 2688 if (exit_qualification & 16) {
2477 /* mov from dr */ 2689 /* mov from dr */
2478 switch (dr) { 2690 switch (dr) {
@@ -2485,12 +2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2485 default: 2697 default:
2486 val = 0; 2698 val = 0;
2487 } 2699 }
2488 vcpu->arch.regs[reg] = val; 2700 kvm_register_write(vcpu, reg, val);
2489 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); 2701 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
2490 } else { 2702 } else {
2491 /* mov to dr */ 2703 /* mov to dr */
2492 } 2704 }
2493 vcpu_put_rsp_rip(vcpu);
2494 skip_emulated_instruction(vcpu); 2705 skip_emulated_instruction(vcpu);
2495 return 1; 2706 return 1;
2496} 2707}
@@ -2583,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2583 return 1; 2794 return 1;
2584} 2795}
2585 2796
2797static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2798{
2799 u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2800
2801 kvm_mmu_invlpg(vcpu, exit_qualification);
2802 skip_emulated_instruction(vcpu);
2803 return 1;
2804}
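For an INVLPG exit the exit qualification holds the instruction's linear-address operand, which is why it can be handed straight to kvm_mmu_invlpg(). The guest-side instruction that lands in this handler is simply (sketch, guest code):

    static inline void guest_invlpg(void *addr)
    {
            /* flush one TLB entry; traps to handle_invlpg() when
             * CPU_BASED_INVLPG_EXITING is enabled */
            asm volatile("invlpg (%0)" :: "r"(addr) : "memory");
    }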
2805
2586static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2806static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2587{ 2807{
2588 skip_emulated_instruction(vcpu); 2808 skip_emulated_instruction(vcpu);
@@ -2695,6 +2915,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2695 return 1; 2915 return 1;
2696} 2916}
2697 2917
2918static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
2919 struct kvm_run *kvm_run)
2920{
2921 struct vcpu_vmx *vmx = to_vmx(vcpu);
2922 int err;
2923
2924 preempt_enable();
2925 local_irq_enable();
2926
2927 while (!guest_state_valid(vcpu)) {
2928 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2929
2930 switch (err) {
2931 case EMULATE_DONE:
2932 break;
2933 case EMULATE_DO_MMIO:
2934 kvm_report_emulation_failure(vcpu, "mmio");
2935 /* TODO: Handle MMIO */
2936 return;
2937 default:
2938 kvm_report_emulation_failure(vcpu, "emulation failure");
2939 return;
2940 }
2941
2942 if (signal_pending(current))
2943 break;
2944 if (need_resched())
2945 schedule();
2946 }
2947
2948 local_irq_disable();
2949 preempt_disable();
2950
2951	/* Guest state should be valid now; no more emulation should be needed */
2952 vmx->emulation_required = 0;
2953}
2954
2698/* 2955/*
2699 * The exit handlers return 1 if the exit was handled fully and guest execution 2956 * The exit handlers return 1 if the exit was handled fully and guest execution
2700 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 2957 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -2714,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2714 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 2971 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2715 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 2972 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2716 [EXIT_REASON_HLT] = handle_halt, 2973 [EXIT_REASON_HLT] = handle_halt,
2974 [EXIT_REASON_INVLPG] = handle_invlpg,
2717 [EXIT_REASON_VMCALL] = handle_vmcall, 2975 [EXIT_REASON_VMCALL] = handle_vmcall,
2718 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 2976 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
2719 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 2977 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
@@ -2735,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2735 struct vcpu_vmx *vmx = to_vmx(vcpu); 2993 struct vcpu_vmx *vmx = to_vmx(vcpu);
2736 u32 vectoring_info = vmx->idt_vectoring_info; 2994 u32 vectoring_info = vmx->idt_vectoring_info;
2737 2995
2738 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), 2996 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
2739 (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); 2997 (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
2740 2998
2741	/* Accesses to CR3 don't cause a VM exit in paging mode, so we need 2999
2742	 * to sync with the guest's real CR3. */ 3000
@@ -2829,88 +3087,92 @@ static void enable_intr_window(struct kvm_vcpu *vcpu)
2829 enable_irq_window(vcpu); 3087 enable_irq_window(vcpu);
2830} 3088}
2831 3089
2832static void vmx_intr_assist(struct kvm_vcpu *vcpu) 3090static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
2833{ 3091{
2834 struct vcpu_vmx *vmx = to_vmx(vcpu); 3092 u32 exit_intr_info;
2835 u32 idtv_info_field, intr_info_field, exit_intr_info_field; 3093 u32 idt_vectoring_info;
2836 int vector; 3094 bool unblock_nmi;
3095 u8 vector;
3096 int type;
3097 bool idtv_info_valid;
3098 u32 error;
2837 3099
2838 update_tpr_threshold(vcpu); 3100 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2839 3101 if (cpu_has_virtual_nmis()) {
2840 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 3102 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
2841 exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO); 3103 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
2842 idtv_info_field = vmx->idt_vectoring_info; 3104 /*
2843 if (intr_info_field & INTR_INFO_VALID_MASK) { 3105 * SDM 3: 25.7.1.2
2844 if (idtv_info_field & INTR_INFO_VALID_MASK) { 3106 * Re-set bit "block by NMI" before VM entry if vmexit caused by
2845 /* TODO: fault when IDT_Vectoring */ 3107 * a guest IRET fault.
2846 if (printk_ratelimit()) 3108 */
2847 printk(KERN_ERR "Fault when IDT_Vectoring\n"); 3109 if (unblock_nmi && vector != DF_VECTOR)
2848 } 3110 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2849 enable_intr_window(vcpu); 3111 GUEST_INTR_STATE_NMI);
2850 return;
2851 } 3112 }
2852 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2853 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2854 == INTR_TYPE_EXT_INTR
2855 && vcpu->arch.rmode.active) {
2856 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2857
2858 vmx_inject_irq(vcpu, vect);
2859 enable_intr_window(vcpu);
2860 return;
2861 }
2862
2863 KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
2864 3113
3114 idt_vectoring_info = vmx->idt_vectoring_info;
3115 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3116 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3117 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3118 if (vmx->vcpu.arch.nmi_injected) {
2865 /* 3119 /*
2866 * SDM 3: 25.7.1.2 3120 * SDM 3: 25.7.1.2
2867 * Clear bit "block by NMI" before VM entry if a NMI delivery 3121 * Clear bit "block by NMI" before VM entry if a NMI delivery
2868 * faulted. 3122 * faulted.
2869 */ 3123 */
2870 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) 3124 if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
2871 == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis()) 3125 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2872 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 3126 GUEST_INTR_STATE_NMI);
2873 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3127 else
2874 ~GUEST_INTR_STATE_NMI); 3128 vmx->vcpu.arch.nmi_injected = false;
2875 3129 }
2876 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field 3130 kvm_clear_exception_queue(&vmx->vcpu);
2877 & ~INTR_INFO_RESVD_BITS_MASK); 3131 if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
2878 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 3132 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
2879 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 3133 error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
2880 3134 kvm_queue_exception_e(&vmx->vcpu, vector, error);
2881 if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) 3135 } else
2882 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 3136 kvm_queue_exception(&vmx->vcpu, vector);
2883 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 3137 vmx->idt_vectoring_info = 0;
2884 enable_intr_window(vcpu);
2885 return;
2886 } 3138 }
3139 kvm_clear_interrupt_queue(&vmx->vcpu);
3140 if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
3141 kvm_queue_interrupt(&vmx->vcpu, vector);
3142 vmx->idt_vectoring_info = 0;
3143 }
3144}
3145
3146static void vmx_intr_assist(struct kvm_vcpu *vcpu)
3147{
3148 update_tpr_threshold(vcpu);
3149
2887 if (cpu_has_virtual_nmis()) { 3150 if (cpu_has_virtual_nmis()) {
2888 /* 3151 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
2889 * SDM 3: 25.7.1.2 3152 if (vmx_nmi_enabled(vcpu)) {
2890 * Re-set bit "block by NMI" before VM entry if vmexit caused by 3153 vcpu->arch.nmi_pending = false;
2891 * a guest IRET fault. 3154 vcpu->arch.nmi_injected = true;
2892 */ 3155 } else {
2893 if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) && 3156 enable_intr_window(vcpu);
2894 (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8) 3157 return;
2895 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 3158 }
2896 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) | 3159 }
2897 GUEST_INTR_STATE_NMI); 3160 if (vcpu->arch.nmi_injected) {
2898 else if (vcpu->arch.nmi_pending) { 3161 vmx_inject_nmi(vcpu);
2899 if (vmx_nmi_enabled(vcpu))
2900 vmx_inject_nmi(vcpu);
2901 enable_intr_window(vcpu); 3162 enable_intr_window(vcpu);
2902 return; 3163 return;
2903 } 3164 }
2904
2905 } 3165 }
2906 if (!kvm_cpu_has_interrupt(vcpu)) 3166 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
2907 return; 3167 if (vmx_irq_enabled(vcpu))
2908 if (vmx_irq_enabled(vcpu)) { 3168 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
2909 vector = kvm_cpu_get_interrupt(vcpu); 3169 else
2910 vmx_inject_irq(vcpu, vector); 3170 enable_irq_window(vcpu);
2911 kvm_timer_intr_post(vcpu, vector); 3171 }
2912 } else 3172 if (vcpu->arch.interrupt.pending) {
2913 enable_irq_window(vcpu); 3173 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
3174 kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
3175 }
2914} 3176}
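The rewritten vmx_intr_assist() reduces to a small priority scheme: NMIs win over external interrupts, and whenever the guest cannot take an event immediately an interrupt/NMI window exit is requested so injection can be retried. A condensed sketch (all helper names hypothetical, for illustration only):

    static void intr_assist_sketch(struct kvm_vcpu *vcpu)
    {
            if (nmi_pending(vcpu)) {
                    if (!guest_can_take_nmi(vcpu)) {
                            request_window(vcpu);  /* retry on a later exit */
                            return;
                    }
                    queue_nmi(vcpu);
            }
            if (irq_pending(vcpu)) {
                    if (guest_irqs_enabled(vcpu))
                            queue_irq(vcpu, next_pending_vector(vcpu));
                    else
                            request_window(vcpu);
            }
    }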
2915 3177
2916/* 3178/*
@@ -2922,9 +3184,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2922static void fixup_rmode_irq(struct vcpu_vmx *vmx) 3184static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2923{ 3185{
2924 vmx->rmode.irq.pending = 0; 3186 vmx->rmode.irq.pending = 0;
2925 if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) 3187 if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
2926 return; 3188 return;
2927 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); 3189 kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
2928 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { 3190 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
2929 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; 3191 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
2930 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; 3192 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
@@ -2936,11 +3198,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2936 | vmx->rmode.irq.vector; 3198 | vmx->rmode.irq.vector;
2937} 3199}
2938 3200
3201#ifdef CONFIG_X86_64
3202#define R "r"
3203#define Q "q"
3204#else
3205#define R "e"
3206#define Q "l"
3207#endif
3208
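The R and Q macros rely on C string-literal concatenation, so a single asm template serves both word sizes: "push %%"R"dx" expands to "push %%rdx" on 64-bit and "push %%edx" on 32-bit. A host-side illustration of the same pasting trick (not kernel code):

    #include <stdio.h>

    #define R "r"  /* would be "e" on a 32-bit build */

    int main(void)
    {
            /* adjacent string literals concatenate at compile time */
            puts("push %%" R "dx");  /* prints: push %%rdx */
            return 0;
    }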
2939static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3209static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2940{ 3210{
2941 struct vcpu_vmx *vmx = to_vmx(vcpu); 3211 struct vcpu_vmx *vmx = to_vmx(vcpu);
2942 u32 intr_info; 3212 u32 intr_info;
2943 3213
3214 /* Handle invalid guest state instead of entering VMX */
3215 if (vmx->emulation_required && emulate_invalid_guest_state) {
3216 handle_invalid_guest_state(vcpu, kvm_run);
3217 return;
3218 }
3219
3220 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3221 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
3222 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
3223 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
3224
2944 /* 3225 /*
2945 * Loading guest fpu may have cleared host cr0.ts 3226 * Loading guest fpu may have cleared host cr0.ts
2946 */ 3227 */
@@ -2948,26 +3229,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2948 3229
2949 asm( 3230 asm(
2950 /* Store host registers */ 3231 /* Store host registers */
2951#ifdef CONFIG_X86_64 3232 "push %%"R"dx; push %%"R"bp;"
2952 "push %%rdx; push %%rbp;" 3233 "push %%"R"cx \n\t"
2953 "push %%rcx \n\t" 3234 "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
2954#else 3235 "je 1f \n\t"
2955 "push %%edx; push %%ebp;" 3236 "mov %%"R"sp, %c[host_rsp](%0) \n\t"
2956 "push %%ecx \n\t"
2957#endif
2958 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" 3237 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
3238 "1: \n\t"
2959	/* Check if vmlaunch or vmresume is needed */ 3239
2960 "cmpl $0, %c[launched](%0) \n\t" 3240 "cmpl $0, %c[launched](%0) \n\t"
2961 /* Load guest registers. Don't clobber flags. */ 3241 /* Load guest registers. Don't clobber flags. */
3242 "mov %c[cr2](%0), %%"R"ax \n\t"
3243 "mov %%"R"ax, %%cr2 \n\t"
3244 "mov %c[rax](%0), %%"R"ax \n\t"
3245 "mov %c[rbx](%0), %%"R"bx \n\t"
3246 "mov %c[rdx](%0), %%"R"dx \n\t"
3247 "mov %c[rsi](%0), %%"R"si \n\t"
3248 "mov %c[rdi](%0), %%"R"di \n\t"
3249 "mov %c[rbp](%0), %%"R"bp \n\t"
2962#ifdef CONFIG_X86_64 3250#ifdef CONFIG_X86_64
2963 "mov %c[cr2](%0), %%rax \n\t"
2964 "mov %%rax, %%cr2 \n\t"
2965 "mov %c[rax](%0), %%rax \n\t"
2966 "mov %c[rbx](%0), %%rbx \n\t"
2967 "mov %c[rdx](%0), %%rdx \n\t"
2968 "mov %c[rsi](%0), %%rsi \n\t"
2969 "mov %c[rdi](%0), %%rdi \n\t"
2970 "mov %c[rbp](%0), %%rbp \n\t"
2971 "mov %c[r8](%0), %%r8 \n\t" 3251 "mov %c[r8](%0), %%r8 \n\t"
2972 "mov %c[r9](%0), %%r9 \n\t" 3252 "mov %c[r9](%0), %%r9 \n\t"
2973 "mov %c[r10](%0), %%r10 \n\t" 3253 "mov %c[r10](%0), %%r10 \n\t"
@@ -2976,18 +3256,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2976 "mov %c[r13](%0), %%r13 \n\t" 3256 "mov %c[r13](%0), %%r13 \n\t"
2977 "mov %c[r14](%0), %%r14 \n\t" 3257 "mov %c[r14](%0), %%r14 \n\t"
2978 "mov %c[r15](%0), %%r15 \n\t" 3258 "mov %c[r15](%0), %%r15 \n\t"
2979 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
2980#else
2981 "mov %c[cr2](%0), %%eax \n\t"
2982 "mov %%eax, %%cr2 \n\t"
2983 "mov %c[rax](%0), %%eax \n\t"
2984 "mov %c[rbx](%0), %%ebx \n\t"
2985 "mov %c[rdx](%0), %%edx \n\t"
2986 "mov %c[rsi](%0), %%esi \n\t"
2987 "mov %c[rdi](%0), %%edi \n\t"
2988 "mov %c[rbp](%0), %%ebp \n\t"
2989 "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
2990#endif 3259#endif
3260 "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
3261
2991 /* Enter guest mode */ 3262 /* Enter guest mode */
2992 "jne .Llaunched \n\t" 3263 "jne .Llaunched \n\t"
2993 __ex(ASM_VMX_VMLAUNCH) "\n\t" 3264 __ex(ASM_VMX_VMLAUNCH) "\n\t"
@@ -2995,15 +3266,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2995 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" 3266 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
2996 ".Lkvm_vmx_return: " 3267 ".Lkvm_vmx_return: "
2997 /* Save guest registers, load host registers, keep flags */ 3268 /* Save guest registers, load host registers, keep flags */
3269 "xchg %0, (%%"R"sp) \n\t"
3270 "mov %%"R"ax, %c[rax](%0) \n\t"
3271 "mov %%"R"bx, %c[rbx](%0) \n\t"
3272 "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
3273 "mov %%"R"dx, %c[rdx](%0) \n\t"
3274 "mov %%"R"si, %c[rsi](%0) \n\t"
3275 "mov %%"R"di, %c[rdi](%0) \n\t"
3276 "mov %%"R"bp, %c[rbp](%0) \n\t"
2998#ifdef CONFIG_X86_64 3277#ifdef CONFIG_X86_64
2999 "xchg %0, (%%rsp) \n\t"
3000 "mov %%rax, %c[rax](%0) \n\t"
3001 "mov %%rbx, %c[rbx](%0) \n\t"
3002 "pushq (%%rsp); popq %c[rcx](%0) \n\t"
3003 "mov %%rdx, %c[rdx](%0) \n\t"
3004 "mov %%rsi, %c[rsi](%0) \n\t"
3005 "mov %%rdi, %c[rdi](%0) \n\t"
3006 "mov %%rbp, %c[rbp](%0) \n\t"
3007 "mov %%r8, %c[r8](%0) \n\t" 3278 "mov %%r8, %c[r8](%0) \n\t"
3008 "mov %%r9, %c[r9](%0) \n\t" 3279 "mov %%r9, %c[r9](%0) \n\t"
3009 "mov %%r10, %c[r10](%0) \n\t" 3280 "mov %%r10, %c[r10](%0) \n\t"
@@ -3012,28 +3283,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3012 "mov %%r13, %c[r13](%0) \n\t" 3283 "mov %%r13, %c[r13](%0) \n\t"
3013 "mov %%r14, %c[r14](%0) \n\t" 3284 "mov %%r14, %c[r14](%0) \n\t"
3014 "mov %%r15, %c[r15](%0) \n\t" 3285 "mov %%r15, %c[r15](%0) \n\t"
3015 "mov %%cr2, %%rax \n\t"
3016 "mov %%rax, %c[cr2](%0) \n\t"
3017
3018 "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
3019#else
3020 "xchg %0, (%%esp) \n\t"
3021 "mov %%eax, %c[rax](%0) \n\t"
3022 "mov %%ebx, %c[rbx](%0) \n\t"
3023 "pushl (%%esp); popl %c[rcx](%0) \n\t"
3024 "mov %%edx, %c[rdx](%0) \n\t"
3025 "mov %%esi, %c[rsi](%0) \n\t"
3026 "mov %%edi, %c[rdi](%0) \n\t"
3027 "mov %%ebp, %c[rbp](%0) \n\t"
3028 "mov %%cr2, %%eax \n\t"
3029 "mov %%eax, %c[cr2](%0) \n\t"
3030
3031 "pop %%ebp; pop %%ebp; pop %%edx \n\t"
3032#endif 3286#endif
3287 "mov %%cr2, %%"R"ax \n\t"
3288 "mov %%"R"ax, %c[cr2](%0) \n\t"
3289
3290 "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t"
3033 "setbe %c[fail](%0) \n\t" 3291 "setbe %c[fail](%0) \n\t"
3034 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 3292 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
3035 [launched]"i"(offsetof(struct vcpu_vmx, launched)), 3293 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
3036 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 3294 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
3295 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
3037 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), 3296 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
3038 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), 3297 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
3039 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), 3298 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
@@ -3053,14 +3312,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3053#endif 3312#endif
3054 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) 3313 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
3055 : "cc", "memory" 3314 : "cc", "memory"
3315 , R"bx", R"di", R"si"
3056#ifdef CONFIG_X86_64 3316#ifdef CONFIG_X86_64
3057 , "rbx", "rdi", "rsi"
3058 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 3317 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
3059#else
3060 , "ebx", "edi", "rsi"
3061#endif 3318#endif
3062 ); 3319 );
3063 3320
3321 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
3322 vcpu->arch.regs_dirty = 0;
3323
3064 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 3324 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
3065 if (vmx->rmode.irq.pending) 3325 if (vmx->rmode.irq.pending)
3066 fixup_rmode_irq(vmx); 3326 fixup_rmode_irq(vmx);
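Resetting regs_avail to everything except RSP and RIP after each exit reflects that the asm block has already spilled the general registers into vcpu->arch.regs, while RSP/RIP still live only in the VMCS and are fetched lazily. A simplified sketch of the read side (a hypothetical condensation of kvm_register_read()/vmx_cache_reg(), not the actual helpers):

    static unsigned long cached_reg_read(struct kvm_vcpu *vcpu, int reg)
    {
            if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) {
                    /* only RSP/RIP can be missing right after an exit */
                    vcpu->arch.regs[reg] = vmcs_readl(reg == VCPU_REGS_RSP ?
                                                      GUEST_RSP : GUEST_RIP);
                    __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
            }
            return vcpu->arch.regs[reg];
    }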
@@ -3080,8 +3340,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3080 KVMTRACE_0D(NMI, vcpu, handler); 3340 KVMTRACE_0D(NMI, vcpu, handler);
3081 asm("int $2"); 3341 asm("int $2");
3082 } 3342 }
3343
3344 vmx_complete_interrupts(vmx);
3083} 3345}
3084 3346
3347#undef R
3348#undef Q
3349
3085static void vmx_free_vmcs(struct kvm_vcpu *vcpu) 3350static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
3086{ 3351{
3087 struct vcpu_vmx *vmx = to_vmx(vcpu); 3352 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3224,8 +3489,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3224 .set_idt = vmx_set_idt, 3489 .set_idt = vmx_set_idt,
3225 .get_gdt = vmx_get_gdt, 3490 .get_gdt = vmx_get_gdt,
3226 .set_gdt = vmx_set_gdt, 3491 .set_gdt = vmx_set_gdt,
3227 .cache_regs = vcpu_load_rsp_rip, 3492 .cache_reg = vmx_cache_reg,
3228 .decache_regs = vcpu_put_rsp_rip,
3229 .get_rflags = vmx_get_rflags, 3493 .get_rflags = vmx_get_rflags,
3230 .set_rflags = vmx_set_rflags, 3494 .set_rflags = vmx_set_rflags,
3231 3495
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 17e25995b65b..3e010d21fdd7 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,9 +331,6 @@ enum vmcs_field {
331 331
332#define AR_RESERVD_MASK 0xfffe0f00 332#define AR_RESERVD_MASK 0xfffe0f00
333 333
334#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
335#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
336
337#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 334#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
338#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 335#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10
339 336
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 19afbb644c7f..4f0677d1eae8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4,10 +4,14 @@
4 * derived from drivers/kvm/kvm_main.c 4 * derived from drivers/kvm/kvm_main.c
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008
7 * 9 *
8 * Authors: 10 * Authors:
9 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
10 * Yaniv Kamay <yaniv@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Amit Shah <amit.shah@qumranet.com>
14 * Ben-Ami Yassour <benami@il.ibm.com>
11 * 15 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See 16 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory. 17 * the COPYING file in the top-level directory.
@@ -19,14 +23,18 @@
19#include "mmu.h" 23#include "mmu.h"
20#include "i8254.h" 24#include "i8254.h"
21#include "tss.h" 25#include "tss.h"
26#include "kvm_cache_regs.h"
27#include "x86.h"
22 28
23#include <linux/clocksource.h> 29#include <linux/clocksource.h>
30#include <linux/interrupt.h>
24#include <linux/kvm.h> 31#include <linux/kvm.h>
25#include <linux/fs.h> 32#include <linux/fs.h>
26#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
27#include <linux/module.h> 34#include <linux/module.h>
28#include <linux/mman.h> 35#include <linux/mman.h>
29#include <linux/highmem.h> 36#include <linux/highmem.h>
37#include <linux/intel-iommu.h>
30 38
31#include <asm/uaccess.h> 39#include <asm/uaccess.h>
32#include <asm/msr.h> 40#include <asm/msr.h>
@@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
61 struct kvm_cpuid_entry2 __user *entries); 69 struct kvm_cpuid_entry2 __user *entries);
62 70
63struct kvm_x86_ops *kvm_x86_ops; 71struct kvm_x86_ops *kvm_x86_ops;
72EXPORT_SYMBOL_GPL(kvm_x86_ops);
64 73
65struct kvm_stats_debugfs_item debugfs_entries[] = { 74struct kvm_stats_debugfs_item debugfs_entries[] = {
66 { "pf_fixed", VCPU_STAT(pf_fixed) }, 75 { "pf_fixed", VCPU_STAT(pf_fixed) },
@@ -83,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
83 { "fpu_reload", VCPU_STAT(fpu_reload) }, 92 { "fpu_reload", VCPU_STAT(fpu_reload) },
84 { "insn_emulation", VCPU_STAT(insn_emulation) }, 93 { "insn_emulation", VCPU_STAT(insn_emulation) },
85 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 94 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
95 { "irq_injections", VCPU_STAT(irq_injections) },
86 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 96 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
87 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 97 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
88 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 98 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -90,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
90 { "mmu_flooded", VM_STAT(mmu_flooded) }, 100 { "mmu_flooded", VM_STAT(mmu_flooded) },
91 { "mmu_recycled", VM_STAT(mmu_recycled) }, 101 { "mmu_recycled", VM_STAT(mmu_recycled) },
92 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 102 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
103 { "mmu_unsync", VM_STAT(mmu_unsync) },
93 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 104 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
94 { "largepages", VM_STAT(lpages) }, 105 { "largepages", VM_STAT(lpages) },
95 { NULL } 106 { NULL }
96}; 107};
97 108
98
99unsigned long segment_base(u16 selector) 109unsigned long segment_base(u16 selector)
100{ 110{
101 struct descriptor_table gdt; 111 struct descriptor_table gdt;
@@ -352,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
352void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 362void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
353{ 363{
354 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 364 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
365 kvm_mmu_sync_roots(vcpu);
355 kvm_mmu_flush_tlb(vcpu); 366 kvm_mmu_flush_tlb(vcpu);
356 return; 367 return;
357 } 368 }
@@ -662,6 +673,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
662 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", 673 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
663 __func__, data); 674 __func__, data);
664 break; 675 break;
676 case MSR_IA32_DEBUGCTLMSR:
677 if (!data) {
678 /* We support the non-activated case already */
679 break;
680 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
681 /* Values other than LBR and BTF are vendor-specific,
682 thus reserved and should throw a #GP */
683 return 1;
684 }
685 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
686 __func__, data);
687 break;
665 case MSR_IA32_UCODE_REV: 688 case MSR_IA32_UCODE_REV:
666 case MSR_IA32_UCODE_WRITE: 689 case MSR_IA32_UCODE_WRITE:
667 break; 690 break;
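The MSR_IA32_DEBUGCTLMSR case above accepts only the two architecturally generic bits and treats everything else as reserved. The test it performs, spelled out (bit positions are architectural; the macro values are assumed to match the kernel's msr definitions):

    /* DEBUGCTLMSR_LBR = 1 << 0 (last-branch recording)
     * DEBUGCTLMSR_BTF = 1 << 1 (single-step on branches) */
    u64 allowed = (1ULL << 0) | (1ULL << 1);
    bool reserved = (data & ~allowed) != 0;  /* -> return 1, caller raises #GP */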
@@ -692,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
692 /* ...but clean it before doing the actual write */ 715 /* ...but clean it before doing the actual write */
693 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 716 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
694 717
695 down_read(&current->mm->mmap_sem);
696 vcpu->arch.time_page = 718 vcpu->arch.time_page =
697 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 719 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
698 up_read(&current->mm->mmap_sem);
699 720
700 if (is_error_page(vcpu->arch.time_page)) { 721 if (is_error_page(vcpu->arch.time_page)) {
701 kvm_release_page_clean(vcpu->arch.time_page); 722 kvm_release_page_clean(vcpu->arch.time_page);
@@ -752,8 +773,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
752 case MSR_IA32_MC0_MISC+8: 773 case MSR_IA32_MC0_MISC+8:
753 case MSR_IA32_MC0_MISC+12: 774 case MSR_IA32_MC0_MISC+12:
754 case MSR_IA32_MC0_MISC+16: 775 case MSR_IA32_MC0_MISC+16:
776 case MSR_IA32_MC0_MISC+20:
755 case MSR_IA32_UCODE_REV: 777 case MSR_IA32_UCODE_REV:
756 case MSR_IA32_EBL_CR_POWERON: 778 case MSR_IA32_EBL_CR_POWERON:
779 case MSR_IA32_DEBUGCTLMSR:
780 case MSR_IA32_LASTBRANCHFROMIP:
781 case MSR_IA32_LASTBRANCHTOIP:
782 case MSR_IA32_LASTINTFROMIP:
783 case MSR_IA32_LASTINTTOIP:
757 data = 0; 784 data = 0;
758 break; 785 break;
759 case MSR_MTRRcap: 786 case MSR_MTRRcap:
@@ -901,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext)
901 case KVM_CAP_PV_MMU: 928 case KVM_CAP_PV_MMU:
902 r = !tdp_enabled; 929 r = !tdp_enabled;
903 break; 930 break;
931 case KVM_CAP_IOMMU:
932 r = intel_iommu_found();
933 break;
904 default: 934 default:
905 r = 0; 935 r = 0;
906 break; 936 break;
@@ -1303,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1303 struct kvm_vcpu *vcpu = filp->private_data; 1333 struct kvm_vcpu *vcpu = filp->private_data;
1304 void __user *argp = (void __user *)arg; 1334 void __user *argp = (void __user *)arg;
1305 int r; 1335 int r;
1336 struct kvm_lapic_state *lapic = NULL;
1306 1337
1307 switch (ioctl) { 1338 switch (ioctl) {
1308 case KVM_GET_LAPIC: { 1339 case KVM_GET_LAPIC: {
1309 struct kvm_lapic_state lapic; 1340 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1310 1341
1311 memset(&lapic, 0, sizeof lapic); 1342 r = -ENOMEM;
1312 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); 1343 if (!lapic)
1344 goto out;
1345 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1313 if (r) 1346 if (r)
1314 goto out; 1347 goto out;
1315 r = -EFAULT; 1348 r = -EFAULT;
1316 if (copy_to_user(argp, &lapic, sizeof lapic)) 1349 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1317 goto out; 1350 goto out;
1318 r = 0; 1351 r = 0;
1319 break; 1352 break;
1320 } 1353 }
1321 case KVM_SET_LAPIC: { 1354 case KVM_SET_LAPIC: {
1322 struct kvm_lapic_state lapic; 1355 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1323 1356 r = -ENOMEM;
1357 if (!lapic)
1358 goto out;
1324 r = -EFAULT; 1359 r = -EFAULT;
1325 if (copy_from_user(&lapic, argp, sizeof lapic)) 1360 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1326 goto out; 1361 goto out;
1327 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; 1362 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1328 if (r) 1363 if (r)
1329 goto out; 1364 goto out;
1330 r = 0; 1365 r = 0;
@@ -1422,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1422 r = -EINVAL; 1457 r = -EINVAL;
1423 } 1458 }
1424out: 1459out:
1460 if (lapic)
1461 kfree(lapic);
1425 return r; 1462 return r;
1426} 1463}
1427 1464
@@ -1630,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
1630 struct kvm *kvm = filp->private_data; 1667 struct kvm *kvm = filp->private_data;
1631 void __user *argp = (void __user *)arg; 1668 void __user *argp = (void __user *)arg;
1632 int r = -EINVAL; 1669 int r = -EINVAL;
1670 /*
1671 * This union makes it completely explicit to gcc-3.x
1672 * that these two variables' stack usage should be
1673 * combined, not added together.
1674 */
1675 union {
1676 struct kvm_pit_state ps;
1677 struct kvm_memory_alias alias;
1678 } u;
1633 1679
1634 switch (ioctl) { 1680 switch (ioctl) {
1635 case KVM_SET_TSS_ADDR: 1681 case KVM_SET_TSS_ADDR:
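The union above works because its members share storage: the frame reserves the maximum of the member sizes rather than their sum, which is what older gcc allocated for two separate locals. A minimal illustration:

    #include <stdio.h>

    struct a { char buf[64]; };
    struct b { char buf[48]; };

    int main(void)
    {
            union { struct a x; struct b y; } u;
            printf("%zu\n", sizeof(u));  /* prints 64: one shared slot */
            return 0;
    }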
@@ -1661,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
1661 case KVM_GET_NR_MMU_PAGES: 1707 case KVM_GET_NR_MMU_PAGES:
1662 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 1708 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1663 break; 1709 break;
1664 case KVM_SET_MEMORY_ALIAS: { 1710 case KVM_SET_MEMORY_ALIAS:
1665 struct kvm_memory_alias alias;
1666
1667 r = -EFAULT; 1711 r = -EFAULT;
1668 if (copy_from_user(&alias, argp, sizeof alias)) 1712 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
1669 goto out; 1713 goto out;
1670 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); 1714 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
1671 if (r) 1715 if (r)
1672 goto out; 1716 goto out;
1673 break; 1717 break;
1674 }
1675 case KVM_CREATE_IRQCHIP: 1718 case KVM_CREATE_IRQCHIP:
1676 r = -ENOMEM; 1719 r = -ENOMEM;
1677 kvm->arch.vpic = kvm_create_pic(kvm); 1720 kvm->arch.vpic = kvm_create_pic(kvm);
@@ -1699,13 +1742,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
1699 goto out; 1742 goto out;
1700 if (irqchip_in_kernel(kvm)) { 1743 if (irqchip_in_kernel(kvm)) {
1701 mutex_lock(&kvm->lock); 1744 mutex_lock(&kvm->lock);
1702 if (irq_event.irq < 16) 1745 kvm_set_irq(kvm, irq_event.irq, irq_event.level);
1703 kvm_pic_set_irq(pic_irqchip(kvm),
1704 irq_event.irq,
1705 irq_event.level);
1706 kvm_ioapic_set_irq(kvm->arch.vioapic,
1707 irq_event.irq,
1708 irq_event.level);
1709 mutex_unlock(&kvm->lock); 1746 mutex_unlock(&kvm->lock);
1710 r = 0; 1747 r = 0;
1711 } 1748 }
@@ -1713,65 +1750,77 @@ long kvm_arch_vm_ioctl(struct file *filp,
1713 } 1750 }
1714 case KVM_GET_IRQCHIP: { 1751 case KVM_GET_IRQCHIP: {
1715 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1752 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1716 struct kvm_irqchip chip; 1753 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1717 1754
1718 r = -EFAULT; 1755 r = -ENOMEM;
1719 if (copy_from_user(&chip, argp, sizeof chip)) 1756 if (!chip)
1720 goto out; 1757 goto out;
1758 r = -EFAULT;
1759 if (copy_from_user(chip, argp, sizeof *chip))
1760 goto get_irqchip_out;
1721 r = -ENXIO; 1761 r = -ENXIO;
1722 if (!irqchip_in_kernel(kvm)) 1762 if (!irqchip_in_kernel(kvm))
1723 goto out; 1763 goto get_irqchip_out;
1724 r = kvm_vm_ioctl_get_irqchip(kvm, &chip); 1764 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
1725 if (r) 1765 if (r)
1726 goto out; 1766 goto get_irqchip_out;
1727 r = -EFAULT; 1767 r = -EFAULT;
1728 if (copy_to_user(argp, &chip, sizeof chip)) 1768 if (copy_to_user(argp, chip, sizeof *chip))
1729 goto out; 1769 goto get_irqchip_out;
1730 r = 0; 1770 r = 0;
1771 get_irqchip_out:
1772 kfree(chip);
1773 if (r)
1774 goto out;
1731 break; 1775 break;
1732 } 1776 }
1733 case KVM_SET_IRQCHIP: { 1777 case KVM_SET_IRQCHIP: {
1734 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1778 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1735 struct kvm_irqchip chip; 1779 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1736 1780
1737 r = -EFAULT; 1781 r = -ENOMEM;
1738 if (copy_from_user(&chip, argp, sizeof chip)) 1782 if (!chip)
1739 goto out; 1783 goto out;
1784 r = -EFAULT;
1785 if (copy_from_user(chip, argp, sizeof *chip))
1786 goto set_irqchip_out;
1740 r = -ENXIO; 1787 r = -ENXIO;
1741 if (!irqchip_in_kernel(kvm)) 1788 if (!irqchip_in_kernel(kvm))
1742 goto out; 1789 goto set_irqchip_out;
1743 r = kvm_vm_ioctl_set_irqchip(kvm, &chip); 1790 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
1744 if (r) 1791 if (r)
1745 goto out; 1792 goto set_irqchip_out;
1746 r = 0; 1793 r = 0;
1794 set_irqchip_out:
1795 kfree(chip);
1796 if (r)
1797 goto out;
1747 break; 1798 break;
1748 } 1799 }
1749 case KVM_GET_PIT: { 1800 case KVM_GET_PIT: {
1750 struct kvm_pit_state ps;
1751 r = -EFAULT; 1801 r = -EFAULT;
1752 if (copy_from_user(&ps, argp, sizeof ps)) 1802 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
1753 goto out; 1803 goto out;
1754 r = -ENXIO; 1804 r = -ENXIO;
1755 if (!kvm->arch.vpit) 1805 if (!kvm->arch.vpit)
1756 goto out; 1806 goto out;
1757 r = kvm_vm_ioctl_get_pit(kvm, &ps); 1807 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
1758 if (r) 1808 if (r)
1759 goto out; 1809 goto out;
1760 r = -EFAULT; 1810 r = -EFAULT;
1761 if (copy_to_user(argp, &ps, sizeof ps)) 1811 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
1762 goto out; 1812 goto out;
1763 r = 0; 1813 r = 0;
1764 break; 1814 break;
1765 } 1815 }
1766 case KVM_SET_PIT: { 1816 case KVM_SET_PIT: {
1767 struct kvm_pit_state ps;
1768 r = -EFAULT; 1817 r = -EFAULT;
1769 if (copy_from_user(&ps, argp, sizeof ps)) 1818 if (copy_from_user(&u.ps, argp, sizeof u.ps))
1770 goto out; 1819 goto out;
1771 r = -ENXIO; 1820 r = -ENXIO;
1772 if (!kvm->arch.vpit) 1821 if (!kvm->arch.vpit)
1773 goto out; 1822 goto out;
1774 r = kvm_vm_ioctl_set_pit(kvm, &ps); 1823 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
1775 if (r) 1824 if (r)
1776 goto out; 1825 goto out;
1777 r = 0; 1826 r = 0;
@@ -2018,9 +2067,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
2018 2067
2019 val = *(u64 *)new; 2068 val = *(u64 *)new;
2020 2069
2021 down_read(&current->mm->mmap_sem);
2022 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2070 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2023 up_read(&current->mm->mmap_sem);
2024 2071
2025 kaddr = kmap_atomic(page, KM_USER0); 2072 kaddr = kmap_atomic(page, KM_USER0);
2026 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2073 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
@@ -2040,6 +2087,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2040 2087
2041int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2088int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2042{ 2089{
2090 kvm_mmu_invlpg(vcpu, address);
2043 return X86EMUL_CONTINUE; 2091 return X86EMUL_CONTINUE;
2044} 2092}
2045 2093
@@ -2080,7 +2128,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2080void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 2128void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2081{ 2129{
2082 u8 opcodes[4]; 2130 u8 opcodes[4];
2083 unsigned long rip = vcpu->arch.rip; 2131 unsigned long rip = kvm_rip_read(vcpu);
2084 unsigned long rip_linear; 2132 unsigned long rip_linear;
2085 2133
2086 if (!printk_ratelimit()) 2134 if (!printk_ratelimit())
@@ -2102,6 +2150,14 @@ static struct x86_emulate_ops emulate_ops = {
2102 .cmpxchg_emulated = emulator_cmpxchg_emulated, 2150 .cmpxchg_emulated = emulator_cmpxchg_emulated,
2103}; 2151};
2104 2152
2153static void cache_all_regs(struct kvm_vcpu *vcpu)
2154{
2155 kvm_register_read(vcpu, VCPU_REGS_RAX);
2156 kvm_register_read(vcpu, VCPU_REGS_RSP);
2157 kvm_register_read(vcpu, VCPU_REGS_RIP);
2158 vcpu->arch.regs_dirty = ~0;
2159}
2160
2105int emulate_instruction(struct kvm_vcpu *vcpu, 2161int emulate_instruction(struct kvm_vcpu *vcpu,
2106 struct kvm_run *run, 2162 struct kvm_run *run,
2107 unsigned long cr2, 2163 unsigned long cr2,
@@ -2111,8 +2167,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2111 int r; 2167 int r;
2112 struct decode_cache *c; 2168 struct decode_cache *c;
2113 2169
2170 kvm_clear_exception_queue(vcpu);
2114 vcpu->arch.mmio_fault_cr2 = cr2; 2171 vcpu->arch.mmio_fault_cr2 = cr2;
2115 kvm_x86_ops->cache_regs(vcpu); 2172 /*
2173 * TODO: fix x86_emulate.c to use guest_read/write_register
2174 * instead of direct ->regs accesses, can save hundred cycles
2175 * on Intel for instructions that don't read/change RSP, for
2176 * for example.
2177 */
2178 cache_all_regs(vcpu);
2116 2179
2117 vcpu->mmio_is_write = 0; 2180 vcpu->mmio_is_write = 0;
2118 vcpu->arch.pio.string = 0; 2181 vcpu->arch.pio.string = 0;
@@ -2172,7 +2235,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2172 return EMULATE_DO_MMIO; 2235 return EMULATE_DO_MMIO;
2173 } 2236 }
2174 2237
2175 kvm_x86_ops->decache_regs(vcpu);
2176 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2238 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2177 2239
2178 if (vcpu->mmio_is_write) { 2240 if (vcpu->mmio_is_write) {
@@ -2225,20 +2287,19 @@ int complete_pio(struct kvm_vcpu *vcpu)
2225 struct kvm_pio_request *io = &vcpu->arch.pio; 2287 struct kvm_pio_request *io = &vcpu->arch.pio;
2226 long delta; 2288 long delta;
2227 int r; 2289 int r;
2228 2290 unsigned long val;
2229 kvm_x86_ops->cache_regs(vcpu);
2230 2291
2231 if (!io->string) { 2292 if (!io->string) {
2232 if (io->in) 2293 if (io->in) {
2233 memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, 2294 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2234 io->size); 2295 memcpy(&val, vcpu->arch.pio_data, io->size);
2296 kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2297 }
2235 } else { 2298 } else {
2236 if (io->in) { 2299 if (io->in) {
2237 r = pio_copy_data(vcpu); 2300 r = pio_copy_data(vcpu);
2238 if (r) { 2301 if (r)
2239 kvm_x86_ops->cache_regs(vcpu);
2240 return r; 2302 return r;
2241 }
2242 } 2303 }
2243 2304
2244 delta = 1; 2305 delta = 1;
@@ -2248,19 +2309,24 @@ int complete_pio(struct kvm_vcpu *vcpu)
2248 * The size of the register should really depend on 2309 * The size of the register should really depend on
2249 * current address size. 2310 * current address size.
2250 */ 2311 */
2251 vcpu->arch.regs[VCPU_REGS_RCX] -= delta; 2312 val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2313 val -= delta;
2314 kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2252 } 2315 }
2253 if (io->down) 2316 if (io->down)
2254 delta = -delta; 2317 delta = -delta;
2255 delta *= io->size; 2318 delta *= io->size;
2256 if (io->in) 2319 if (io->in) {
2257 vcpu->arch.regs[VCPU_REGS_RDI] += delta; 2320 val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2258 else 2321 val += delta;
2259 vcpu->arch.regs[VCPU_REGS_RSI] += delta; 2322 kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2323 } else {
2324 val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2325 val += delta;
2326 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2327 }
2260 } 2328 }
2261 2329
2262 kvm_x86_ops->decache_regs(vcpu);
2263
2264 io->count -= io->cur_count; 2330 io->count -= io->cur_count;
2265 io->cur_count = 0; 2331 io->cur_count = 0;
2266 2332
@@ -2313,6 +2379,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2313 int size, unsigned port) 2379 int size, unsigned port)
2314{ 2380{
2315 struct kvm_io_device *pio_dev; 2381 struct kvm_io_device *pio_dev;
2382 unsigned long val;
2316 2383
2317 vcpu->run->exit_reason = KVM_EXIT_IO; 2384 vcpu->run->exit_reason = KVM_EXIT_IO;
2318 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2385 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2333,8 +2400,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2333 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2400 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2334 handler); 2401 handler);
2335 2402
2336 kvm_x86_ops->cache_regs(vcpu); 2403 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2337 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); 2404 memcpy(vcpu->arch.pio_data, &val, 4);
2338 2405
2339 kvm_x86_ops->skip_emulated_instruction(vcpu); 2406 kvm_x86_ops->skip_emulated_instruction(vcpu);
2340 2407
@@ -2492,11 +2559,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2492 KVMTRACE_0D(HLT, vcpu, handler); 2559 KVMTRACE_0D(HLT, vcpu, handler);
2493 if (irqchip_in_kernel(vcpu->kvm)) { 2560 if (irqchip_in_kernel(vcpu->kvm)) {
2494 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 2561 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2495 up_read(&vcpu->kvm->slots_lock);
2496 kvm_vcpu_block(vcpu);
2497 down_read(&vcpu->kvm->slots_lock);
2498 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
2499 return -EINTR;
2500 return 1; 2562 return 1;
2501 } else { 2563 } else {
2502 vcpu->run->exit_reason = KVM_EXIT_HLT; 2564 vcpu->run->exit_reason = KVM_EXIT_HLT;
@@ -2519,13 +2581,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2519 unsigned long nr, a0, a1, a2, a3, ret; 2581 unsigned long nr, a0, a1, a2, a3, ret;
2520 int r = 1; 2582 int r = 1;
2521 2583
2522 kvm_x86_ops->cache_regs(vcpu); 2584 nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
2523 2585 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
2524 nr = vcpu->arch.regs[VCPU_REGS_RAX]; 2586 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
2525 a0 = vcpu->arch.regs[VCPU_REGS_RBX]; 2587 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
2526 a1 = vcpu->arch.regs[VCPU_REGS_RCX]; 2588 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
2527 a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2528 a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2529 2589
2530 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); 2590 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2531 2591
@@ -2548,8 +2608,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2548 ret = -KVM_ENOSYS; 2608 ret = -KVM_ENOSYS;
2549 break; 2609 break;
2550 } 2610 }
2551 vcpu->arch.regs[VCPU_REGS_RAX] = ret; 2611 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
2552 kvm_x86_ops->decache_regs(vcpu);
2553 ++vcpu->stat.hypercalls; 2612 ++vcpu->stat.hypercalls;
2554 return r; 2613 return r;
2555} 2614}
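
For context, the registers being read here are the guest-visible hypercall ABI: the call number in RAX, up to four arguments in RBX, RCX, RDX and RSI, and the result returned in RAX. A guest-side sketch of a one-argument call in the style of the kvm_para.h helpers; the literal vmcall is the Intel opcode, and patch_hypercall() below exists precisely to rewrite it to the vendor's instruction:

    /* Guest code, not part of this patch: issue hypercall `nr' with one
     * argument and return the host's RAX. */
    static inline long kvm_hypercall1_sketch(unsigned int nr, unsigned long a0)
    {
        long ret;
        asm volatile("vmcall"       /* AMD guests would use vmmcall */
                     : "=a"(ret)
                     : "a"(nr), "b"(a0)
                     : "memory");
        return ret;
    }
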
@@ -2559,6 +2618,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2559{ 2618{
2560 char instruction[3]; 2619 char instruction[3];
2561 int ret = 0; 2620 int ret = 0;
2621 unsigned long rip = kvm_rip_read(vcpu);
2562 2622
2563 2623
2564 /* 2624 /*
@@ -2568,9 +2628,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2568 */ 2628 */
2569 kvm_mmu_zap_all(vcpu->kvm); 2629 kvm_mmu_zap_all(vcpu->kvm);
2570 2630
2571 kvm_x86_ops->cache_regs(vcpu);
2572 kvm_x86_ops->patch_hypercall(vcpu, instruction); 2631 kvm_x86_ops->patch_hypercall(vcpu, instruction);
2573 if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) 2632 if (emulator_write_emulated(rip, instruction, 3, vcpu)
2574 != X86EMUL_CONTINUE) 2633 != X86EMUL_CONTINUE)
2575 ret = -EFAULT; 2634 ret = -EFAULT;
2576 2635
@@ -2700,13 +2759,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2700 u32 function, index; 2759 u32 function, index;
2701 struct kvm_cpuid_entry2 *e, *best; 2760 struct kvm_cpuid_entry2 *e, *best;
2702 2761
2703 kvm_x86_ops->cache_regs(vcpu); 2762 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
2704 function = vcpu->arch.regs[VCPU_REGS_RAX]; 2763 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
2705 index = vcpu->arch.regs[VCPU_REGS_RCX]; 2764 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
2706 vcpu->arch.regs[VCPU_REGS_RAX] = 0; 2765 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
2707 vcpu->arch.regs[VCPU_REGS_RBX] = 0; 2766 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
2708 vcpu->arch.regs[VCPU_REGS_RCX] = 0; 2767 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
2709 vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2710 best = NULL; 2768 best = NULL;
2711 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 2769 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2712 e = &vcpu->arch.cpuid_entries[i]; 2770 e = &vcpu->arch.cpuid_entries[i];
@@ -2724,18 +2782,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2724 best = e; 2782 best = e;
2725 } 2783 }
2726 if (best) { 2784 if (best) {
2727 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; 2785 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
2728 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; 2786 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
2729 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; 2787 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
2730 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; 2788 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
2731 } 2789 }
2732 kvm_x86_ops->decache_regs(vcpu);
2733 kvm_x86_ops->skip_emulated_instruction(vcpu); 2790 kvm_x86_ops->skip_emulated_instruction(vcpu);
2734 KVMTRACE_5D(CPUID, vcpu, function, 2791 KVMTRACE_5D(CPUID, vcpu, function,
2735 (u32)vcpu->arch.regs[VCPU_REGS_RAX], 2792 (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
2736 (u32)vcpu->arch.regs[VCPU_REGS_RBX], 2793 (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
2737 (u32)vcpu->arch.regs[VCPU_REGS_RCX], 2794 (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
2738 (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); 2795 (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
2739} 2796}
2740EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 2797EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2741 2798
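
The loop elided between these hunks picks a kvm_cpuid_entry2 for (function, index) from the table userspace installed; the registers are pre-zeroed so an unmatched leaf reads as zeroes. A condensed model of that selection, under the assumption that the fallback clamps to the highest leaf of the same basic/extended class (the kernel's matcher also honours per-entry index flags, which this sketch omits):

    #include <stdio.h>

    struct cpuid_entry { unsigned function, index, eax, ebx, ecx, edx; };

    static const struct cpuid_entry *
    find_entry(const struct cpuid_entry *e, int n, unsigned fn, unsigned idx)
    {
        const struct cpuid_entry *best = NULL;
        int i;

        for (i = 0; i < n; i++) {
            if (e[i].function == fn && e[i].index == idx)
                return &e[i];           /* exact match wins */
            /* fallback: highest leaf in the same basic/extended class */
            if (!((e[i].function ^ fn) & 0x80000000) &&
                (!best || e[i].function > best->function))
                best = &e[i];
        }
        return best;                    /* NULL: leaf reads as zeroes */
    }

    int main(void)
    {
        struct cpuid_entry tbl[] = {
            { 0, 0, 0xd, 0, 0, 0 },
            { 1, 0, 0x623, 0, 0, 0 },   /* invented family/model bits */
        };
        const struct cpuid_entry *best = find_entry(tbl, 2, 1, 0);

        printf("eax=%#x\n", best ? best->eax : 0);
        return 0;
    }
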
@@ -2776,9 +2833,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
2776 if (!apic || !apic->vapic_addr) 2833 if (!apic || !apic->vapic_addr)
2777 return; 2834 return;
2778 2835
2779 down_read(&current->mm->mmap_sem);
2780 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2836 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2781 up_read(&current->mm->mmap_sem);
2782 2837
2783 vcpu->arch.apic->vapic_page = page; 2838 vcpu->arch.apic->vapic_page = page;
2784} 2839}
@@ -2796,28 +2851,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
2796 up_read(&vcpu->kvm->slots_lock); 2851 up_read(&vcpu->kvm->slots_lock);
2797} 2852}
2798 2853
2799static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2854static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2800{ 2855{
2801 int r; 2856 int r;
2802 2857
2803 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
2804 pr_debug("vcpu %d received sipi with vector # %x\n",
2805 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2806 kvm_lapic_reset(vcpu);
2807 r = kvm_x86_ops->vcpu_reset(vcpu);
2808 if (r)
2809 return r;
2810 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2811 }
2812
2813 down_read(&vcpu->kvm->slots_lock);
2814 vapic_enter(vcpu);
2815
2816preempted:
2817 if (vcpu->guest_debug.enabled)
2818 kvm_x86_ops->guest_debug_pre(vcpu);
2819
2820again:
2821 if (vcpu->requests) 2858 if (vcpu->requests)
2822 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 2859 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
2823 kvm_mmu_unload(vcpu); 2860 kvm_mmu_unload(vcpu);
@@ -2829,6 +2866,8 @@ again:
2829 if (vcpu->requests) { 2866 if (vcpu->requests) {
2830 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 2867 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2831 __kvm_migrate_timers(vcpu); 2868 __kvm_migrate_timers(vcpu);
2869 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
2870 kvm_mmu_sync_roots(vcpu);
2832 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 2871 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2833 kvm_x86_ops->tlb_flush(vcpu); 2872 kvm_x86_ops->tlb_flush(vcpu);
2834 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 2873 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
@@ -2854,21 +2893,15 @@ again:
2854 2893
2855 local_irq_disable(); 2894 local_irq_disable();
2856 2895
2857 if (vcpu->requests || need_resched()) { 2896 if (vcpu->requests || need_resched() || signal_pending(current)) {
2858 local_irq_enable(); 2897 local_irq_enable();
2859 preempt_enable(); 2898 preempt_enable();
2860 r = 1; 2899 r = 1;
2861 goto out; 2900 goto out;
2862 } 2901 }
2863 2902
2864 if (signal_pending(current)) { 2903 if (vcpu->guest_debug.enabled)
2865 local_irq_enable(); 2904 kvm_x86_ops->guest_debug_pre(vcpu);
2866 preempt_enable();
2867 r = -EINTR;
2868 kvm_run->exit_reason = KVM_EXIT_INTR;
2869 ++vcpu->stat.signal_exits;
2870 goto out;
2871 }
2872 2905
2873 vcpu->guest_mode = 1; 2906 vcpu->guest_mode = 1;
2874 /* 2907 /*
@@ -2917,8 +2950,8 @@ again:
2917 * Profile KVM exit RIPs: 2950 * Profile KVM exit RIPs:
2918 */ 2951 */
2919 if (unlikely(prof_on == KVM_PROFILING)) { 2952 if (unlikely(prof_on == KVM_PROFILING)) {
2920 kvm_x86_ops->cache_regs(vcpu); 2953 unsigned long rip = kvm_rip_read(vcpu);
2921 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); 2954 profile_hit(KVM_PROFILING, (void *)rip);
2922 } 2955 }
2923 2956
2924 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) 2957 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
@@ -2927,26 +2960,63 @@ again:
2927 kvm_lapic_sync_from_vapic(vcpu); 2960 kvm_lapic_sync_from_vapic(vcpu);
2928 2961
2929 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 2962 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2963out:
2964 return r;
2965}
2930 2966
2931 if (r > 0) { 2967static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2932 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 2968{
2933 r = -EINTR; 2969 int r;
2934 kvm_run->exit_reason = KVM_EXIT_INTR; 2970
2935 ++vcpu->stat.request_irq_exits; 2971 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
2936 goto out; 2972 pr_debug("vcpu %d received sipi with vector # %x\n",
2937 } 2973 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2938 if (!need_resched()) 2974 kvm_lapic_reset(vcpu);
2939 goto again; 2975 r = kvm_x86_ops->vcpu_reset(vcpu);
2976 if (r)
2977 return r;
2978 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2940 } 2979 }
2941 2980
2942out: 2981 down_read(&vcpu->kvm->slots_lock);
2943 up_read(&vcpu->kvm->slots_lock); 2982 vapic_enter(vcpu);
2944 if (r > 0) { 2983
2945 kvm_resched(vcpu); 2984 r = 1;
2946 down_read(&vcpu->kvm->slots_lock); 2985 while (r > 0) {
2947 goto preempted; 2986 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
2987 r = vcpu_enter_guest(vcpu, kvm_run);
2988 else {
2989 up_read(&vcpu->kvm->slots_lock);
2990 kvm_vcpu_block(vcpu);
2991 down_read(&vcpu->kvm->slots_lock);
2992 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
2993 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
2994 vcpu->arch.mp_state =
2995 KVM_MP_STATE_RUNNABLE;
2996 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
2997 r = -EINTR;
2998 }
2999
3000 if (r > 0) {
3001 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3002 r = -EINTR;
3003 kvm_run->exit_reason = KVM_EXIT_INTR;
3004 ++vcpu->stat.request_irq_exits;
3005 }
3006 if (signal_pending(current)) {
3007 r = -EINTR;
3008 kvm_run->exit_reason = KVM_EXIT_INTR;
3009 ++vcpu->stat.signal_exits;
3010 }
3011 if (need_resched()) {
3012 up_read(&vcpu->kvm->slots_lock);
3013 kvm_resched(vcpu);
3014 down_read(&vcpu->kvm->slots_lock);
3015 }
3016 }
2948 } 3017 }
2949 3018
3019 up_read(&vcpu->kvm->slots_lock);
2950 post_kvm_run_save(vcpu, kvm_run); 3020 post_kvm_run_save(vcpu, kvm_run);
2951 3021
2952 vapic_exit(vcpu); 3022 vapic_exit(vcpu);
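
Together with the kvm_emulate_halt() hunk above, this restructuring leaves the vcpu exactly one place to sleep: emulated hlt now only records KVM_MP_STATE_HALTED, and the new __vcpu_run() loop parks in kvm_vcpu_block() until KVM_REQ_UNHALT marks it runnable again, entering the guest only from the RUNNABLE state. A compilable model of the loop's shape, where enter_guest_once(), block_until_event() and the countdown are stand-ins for vcpu_enter_guest(), kvm_vcpu_block() and real exit reasons:

    #include <stdio.h>

    enum mp_state { MP_RUNNABLE, MP_HALTED };

    static enum mp_state state = MP_HALTED;
    static int unhalt_pending = 1;  /* stands in for KVM_REQ_UNHALT */
    static int exits_left = 3;      /* let the fake guest run a few times */

    static int enter_guest_once(void)   /* models vcpu_enter_guest() */
    {
        printf("guest entry, %d exits left\n", exits_left);
        return --exits_left > 0 ? 1 : 0;    /* 0: exit to userspace */
    }

    static void block_until_event(void) /* models kvm_vcpu_block() */
    {
        if (unhalt_pending && state == MP_HALTED) {
            unhalt_pending = 0;
            state = MP_RUNNABLE;    /* the KVM_REQ_UNHALT transition */
        }
    }

    int main(void)
    {
        int r = 1;

        while (r > 0) {
            if (state == MP_RUNNABLE)
                r = enter_guest_once();
            else {
                block_until_event();
                if (state != MP_RUNNABLE)
                    r = -1;         /* -EINTR in the kernel */
            }
            /* signal_pending()/need_resched() handling elided */
        }
        return 0;
    }
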
@@ -2966,6 +3036,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2966 3036
2967 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 3037 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
2968 kvm_vcpu_block(vcpu); 3038 kvm_vcpu_block(vcpu);
3039 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
2969 r = -EAGAIN; 3040 r = -EAGAIN;
2970 goto out; 3041 goto out;
2971 } 3042 }
@@ -2999,11 +3070,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2999 } 3070 }
3000 } 3071 }
3001#endif 3072#endif
3002 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { 3073 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3003 kvm_x86_ops->cache_regs(vcpu); 3074 kvm_register_write(vcpu, VCPU_REGS_RAX,
3004 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; 3075 kvm_run->hypercall.ret);
3005 kvm_x86_ops->decache_regs(vcpu);
3006 }
3007 3076
3008 r = __vcpu_run(vcpu, kvm_run); 3077 r = __vcpu_run(vcpu, kvm_run);
3009 3078
@@ -3019,28 +3088,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3019{ 3088{
3020 vcpu_load(vcpu); 3089 vcpu_load(vcpu);
3021 3090
3022 kvm_x86_ops->cache_regs(vcpu); 3091 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3023 3092 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3024 regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; 3093 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3025 regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; 3094 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3026 regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; 3095 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3027 regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; 3096 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3028 regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; 3097 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3029 regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; 3098 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3030 regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3031 regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
3032#ifdef CONFIG_X86_64 3099#ifdef CONFIG_X86_64
3033 regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; 3100 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
3034 regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; 3101 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
3035 regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; 3102 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
3036 regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; 3103 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
3037 regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; 3104 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
3038 regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; 3105 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
3039 regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; 3106 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
3040 regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; 3107 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
3041#endif 3108#endif
3042 3109
3043 regs->rip = vcpu->arch.rip; 3110 regs->rip = kvm_rip_read(vcpu);
3044 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 3111 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
3045 3112
3046 /* 3113 /*
@@ -3058,29 +3125,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3058{ 3125{
3059 vcpu_load(vcpu); 3126 vcpu_load(vcpu);
3060 3127
3061 vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; 3128 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
3062 vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; 3129 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
3063 vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; 3130 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
3064 vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; 3131 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
3065 vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; 3132 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
3066 vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; 3133 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
3067 vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; 3134 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
3068 vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; 3135 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
3069#ifdef CONFIG_X86_64 3136#ifdef CONFIG_X86_64
3070 vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; 3137 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
3071 vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; 3138 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
3072 vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; 3139 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
3073 vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; 3140 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
3074 vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; 3141 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
3075 vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; 3142 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3076 vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; 3143 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3077 vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; 3144 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3145
3078#endif 3146#endif
3079 3147
3080 vcpu->arch.rip = regs->rip; 3148 kvm_rip_write(vcpu, regs->rip);
3081 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 3149 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3082 3150
3083 kvm_x86_ops->decache_regs(vcpu);
3084 3151
3085 vcpu->arch.exception.pending = false; 3152 vcpu->arch.exception.pending = false;
3086 3153
@@ -3294,11 +3361,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3294 return 0; 3361 return 0;
3295} 3362}
3296 3363
3364static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
3365{
3366 struct kvm_segment segvar = {
3367 .base = selector << 4,
3368 .limit = 0xffff,
3369 .selector = selector,
3370 .type = 3,
3371 .present = 1,
3372 .dpl = 3,
3373 .db = 0,
3374 .s = 1,
3375 .l = 0,
3376 .g = 0,
3377 .avl = 0,
3378 .unusable = 0,
3379 };
3380 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
3381 return 0;
3382}
3383
3297int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3384int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3298 int type_bits, int seg) 3385 int type_bits, int seg)
3299{ 3386{
3300 struct kvm_segment kvm_seg; 3387 struct kvm_segment kvm_seg;
3301 3388
3389 if (!(vcpu->arch.cr0 & X86_CR0_PE))
3390 return kvm_load_realmode_segment(vcpu, selector, seg);
3302 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 3391 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3303 return 1; 3392 return 1;
3304 kvm_seg.type |= type_bits; 3393 kvm_seg.type |= type_bits;
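
kvm_load_realmode_segment() encodes the fact that real mode has no descriptor tables: the selector alone implies the segment, with base = selector << 4, a 64 KiB limit, and type 3 (an accessed read/write data segment). The address arithmetic it bakes in, as a standalone check:

    #include <stdint.h>
    #include <stdio.h>

    /* Real mode implies the descriptor: linear = (selector << 4) + offset,
     * within a 64 KiB limit, which is exactly what the new helper sets up. */
    static uint32_t real_mode_linear(uint16_t seg, uint16_t off)
    {
        return ((uint32_t)seg << 4) + off;
    }

    int main(void)
    {
        /* f000:fff0 resolves just below 1 MiB. */
        printf("%#x\n", (unsigned)real_mode_linear(0xf000, 0xfff0));
        return 0;
    }
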
@@ -3316,17 +3405,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3316 struct tss_segment_32 *tss) 3405 struct tss_segment_32 *tss)
3317{ 3406{
3318 tss->cr3 = vcpu->arch.cr3; 3407 tss->cr3 = vcpu->arch.cr3;
3319 tss->eip = vcpu->arch.rip; 3408 tss->eip = kvm_rip_read(vcpu);
3320 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 3409 tss->eflags = kvm_x86_ops->get_rflags(vcpu);
3321 tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; 3410 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3322 tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3411 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3323 tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; 3412 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3324 tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; 3413 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3325 tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; 3414 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3326 tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; 3415 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3327 tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; 3416 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3328 tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; 3417 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3329
3330 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3418 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3331 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3419 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3332 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3420 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
@@ -3342,17 +3430,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3342{ 3430{
3343 kvm_set_cr3(vcpu, tss->cr3); 3431 kvm_set_cr3(vcpu, tss->cr3);
3344 3432
3345 vcpu->arch.rip = tss->eip; 3433 kvm_rip_write(vcpu, tss->eip);
3346 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 3434 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3347 3435
3348 vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; 3436 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
3349 vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; 3437 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
3350 vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; 3438 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
3351 vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; 3439 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
3352 vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; 3440 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
3353 vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; 3441 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
3354 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; 3442 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
3355 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; 3443 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
3356 3444
3357 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3445 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3358 return 1; 3446 return 1;
@@ -3380,16 +3468,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3380static void save_state_to_tss16(struct kvm_vcpu *vcpu, 3468static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3381 struct tss_segment_16 *tss) 3469 struct tss_segment_16 *tss)
3382{ 3470{
3383 tss->ip = vcpu->arch.rip; 3471 tss->ip = kvm_rip_read(vcpu);
3384 tss->flag = kvm_x86_ops->get_rflags(vcpu); 3472 tss->flag = kvm_x86_ops->get_rflags(vcpu);
3385 tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; 3473 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3386 tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; 3474 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3387 tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; 3475 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3388 tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; 3476 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3389 tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; 3477 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3390 tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; 3478 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3391 tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; 3479 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
3392 tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; 3480 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
3393 3481
3394 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3482 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3395 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3483 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
@@ -3402,16 +3490,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3402static int load_state_from_tss16(struct kvm_vcpu *vcpu, 3490static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3403 struct tss_segment_16 *tss) 3491 struct tss_segment_16 *tss)
3404{ 3492{
3405 vcpu->arch.rip = tss->ip; 3493 kvm_rip_write(vcpu, tss->ip);
3406 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 3494 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
3407 vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; 3495 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
3408 vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; 3496 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
3409 vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; 3497 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
3410 vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; 3498 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
3411 vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; 3499 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
3412 vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; 3500 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
3413 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; 3501 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
3414 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; 3502 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
3415 3503
3416 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3504 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3417 return 1; 3505 return 1;
@@ -3534,7 +3622,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3534 } 3622 }
3535 3623
3536 kvm_x86_ops->skip_emulated_instruction(vcpu); 3624 kvm_x86_ops->skip_emulated_instruction(vcpu);
3537 kvm_x86_ops->cache_regs(vcpu);
3538 3625
3539 if (nseg_desc.type & 8) 3626 if (nseg_desc.type & 8)
3540 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, 3627 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
@@ -3559,7 +3646,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3559 tr_seg.type = 11; 3646 tr_seg.type = 11;
3560 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3647 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3561out: 3648out:
3562 kvm_x86_ops->decache_regs(vcpu);
3563 return ret; 3649 return ret;
3564} 3650}
3565EXPORT_SYMBOL_GPL(kvm_task_switch); 3651EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -3622,6 +3708,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3622 pr_debug("Set back pending irq %d\n", 3708 pr_debug("Set back pending irq %d\n",
3623 pending_vec); 3709 pending_vec);
3624 } 3710 }
3711 kvm_pic_clear_isr_ack(vcpu->kvm);
3625 } 3712 }
3626 3713
3627 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3714 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -3634,6 +3721,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3634 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3721 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3635 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3722 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3636 3723
3724 /* Older userspace won't unhalt the vcpu on reset. */
3725 if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
3726 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
3727 !(vcpu->arch.cr0 & X86_CR0_PE))
3728 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3729
3637 vcpu_put(vcpu); 3730 vcpu_put(vcpu);
3638 3731
3639 return 0; 3732 return 0;
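
The workaround keys on the architectural reset state of vcpu 0: real mode (CR0.PE clear), CS selector 0xf000 with the special base 0xffff0000, and RIP 0xfff0, so the first fetch would come from the reset vector at 0xfffffff0. A sketch of the predicate, with an invented struct standing in for the kvm_sregs fields:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct seg_sketch { uint16_t selector; uint32_t base; };

    static bool looks_like_reset(int vcpu_id, uint64_t rip,
                                 struct seg_sketch cs, bool protected_mode)
    {
        return vcpu_id == 0 && rip == 0xfff0 &&
               cs.selector == 0xf000 && cs.base == 0xffff0000 &&
               !protected_mode;
    }

    int main(void)
    {
        struct seg_sketch cs = { 0xf000, 0xffff0000 };

        printf("reset vector %#llx, match %d\n",
               (unsigned long long)cs.base + 0xfff0,
               looks_like_reset(0, 0xfff0, cs, false));
        return 0;
    }
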
@@ -3918,6 +4011,7 @@ struct kvm *kvm_arch_create_vm(void)
3918 return ERR_PTR(-ENOMEM); 4011 return ERR_PTR(-ENOMEM);
3919 4012
3920 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4013 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4014 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
3921 4015
3922 return kvm; 4016 return kvm;
3923} 4017}
@@ -3950,6 +4044,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
3950 4044
3951void kvm_arch_destroy_vm(struct kvm *kvm) 4045void kvm_arch_destroy_vm(struct kvm *kvm)
3952{ 4046{
4047 kvm_iommu_unmap_guest(kvm);
4048 kvm_free_all_assigned_devices(kvm);
3953 kvm_free_pit(kvm); 4049 kvm_free_pit(kvm);
3954 kfree(kvm->arch.vpic); 4050 kfree(kvm->arch.vpic);
3955 kfree(kvm->arch.vioapic); 4051 kfree(kvm->arch.vioapic);
@@ -3981,7 +4077,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
3981 userspace_addr = do_mmap(NULL, 0, 4077 userspace_addr = do_mmap(NULL, 0,
3982 npages * PAGE_SIZE, 4078 npages * PAGE_SIZE,
3983 PROT_READ | PROT_WRITE, 4079 PROT_READ | PROT_WRITE,
3984 MAP_SHARED | MAP_ANONYMOUS, 4080 MAP_PRIVATE | MAP_ANONYMOUS,
3985 0); 4081 0);
3986 up_write(&current->mm->mmap_sem); 4082 up_write(&current->mm->mmap_sem);
3987 4083
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
new file mode 100644
index 000000000000..6a4be78a7384
--- /dev/null
+++ b/arch/x86/kvm/x86.h
@@ -0,0 +1,22 @@
1#ifndef ARCH_X86_KVM_X86_H
2#define ARCH_X86_KVM_X86_H
3
4#include <linux/kvm_host.h>
5
6static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
7{
8 vcpu->arch.exception.pending = false;
9}
10
11static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
12{
13 vcpu->arch.interrupt.pending = true;
14 vcpu->arch.interrupt.nr = vector;
15}
16
17static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
18{
19 vcpu->arch.interrupt.pending = false;
20}
21
22#endif
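
The new header gives x86.c and the emulator one shared, inlineable view of the pending exception and interrupt slots. A userspace model of how the interrupt slot is meant to be produced and consumed; the guest_entry() consumer is invented for illustration (in the kernel, a vendor backend injects the vector and clears the slot on entry):

    #include <stdbool.h>
    #include <stdio.h>

    struct vcpu_model {
        struct { bool pending; unsigned char nr; } interrupt;
    };

    static void queue_interrupt(struct vcpu_model *v, unsigned char vector)
    {
        v->interrupt.pending = true;    /* mirrors kvm_queue_interrupt() */
        v->interrupt.nr = vector;
    }

    static void guest_entry(struct vcpu_model *v)
    {
        if (v->interrupt.pending) {
            printf("inject vector %u\n", v->interrupt.nr);
            v->interrupt.pending = false;   /* kvm_clear_interrupt_queue() */
        }
    }

    int main(void)
    {
        struct vcpu_model v = { { false, 0 } };

        queue_interrupt(&v, 0x20);  /* e.g. a timer tick via the PIC */
        guest_entry(&v);
        return 0;
    }
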
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index f2f90468f8b1..ea051173b0da 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -26,6 +26,7 @@
26#define DPRINTF(_f, _a ...) printf(_f , ## _a) 26#define DPRINTF(_f, _a ...) printf(_f , ## _a)
27#else 27#else
28#include <linux/kvm_host.h> 28#include <linux/kvm_host.h>
29#include "kvm_cache_regs.h"
29#define DPRINTF(x...) do {} while (0) 30#define DPRINTF(x...) do {} while (0)
30#endif 31#endif
31#include <linux/module.h> 32#include <linux/module.h>
@@ -46,25 +47,26 @@
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ 47#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */ 48#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */ 49#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1) 50#define DstAcc (4<<1) /* Destination Accumulator */
51#define DstMask (7<<1)
50/* Source operand type. */ 52/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */ 53#define SrcNone (0<<4) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ 54#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */ 55#define SrcReg (1<<4) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */ 56#define SrcMem (2<<4) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ 57#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ 58#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */ 59#define SrcImm (5<<4) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ 60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3) 61#define SrcMask (7<<4)
60/* Generic ModRM decode. */ 62/* Generic ModRM decode. */
61#define ModRM (1<<6) 63#define ModRM (1<<7)
62/* Destination is only written; never read. */ 64/* Destination is only written; never read. */
63#define Mov (1<<7) 65#define Mov (1<<8)
64#define BitOp (1<<8) 66#define BitOp (1<<9)
65#define MemAbs (1<<9) /* Memory operand is absolute displacement */ 67#define MemAbs (1<<10) /* Memory operand is absolute displacement */
66#define String (1<<10) /* String instruction (rep capable) */ 68#define String (1<<12) /* String instruction (rep capable) */
67#define Stack (1<<11) /* Stack instruction (push/pop) */ 69#define Stack (1<<13) /* Stack instruction (push/pop) */
68#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 70#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
69#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 71#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
70#define GroupMask 0xff /* Group number stored in bits 0:7 */ 72#define GroupMask 0xff /* Group number stored in bits 0:7 */
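
Adding DstAcc required a third destination bit, so DstMask widens to 7<<1 and every later field shifts up one position: source moves to bits 4-6, ModRM to bit 7, and so on. Consumers of the table are unaffected because fields are always extracted through the masks, as this standalone check of the new layout shows (using the ByteOp | DstAcc | SrcImm entry the 0x38-0x3F hunk below adds for cmp al,imm8):

    #include <stdio.h>

    #define ByteOp  (1 << 0)
    #define DstAcc  (4 << 1)    /* new destination type */
    #define DstMask (7 << 1)    /* was 3<<1: widened for DstAcc */
    #define SrcImm  (5 << 4)    /* source field moved from <<3 to <<4 */
    #define SrcMask (7 << 4)

    int main(void)
    {
        unsigned int d = ByteOp | DstAcc | SrcImm;  /* the cmp al,imm8 entry */

        printf("dst=%u src=%u byteop=%u\n",
               (d & DstMask) >> 1, (d & SrcMask) >> 4, d & ByteOp);
        return 0;
    }
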
@@ -94,7 +96,7 @@ static u16 opcode_table[256] = {
94 /* 0x20 - 0x27 */ 96 /* 0x20 - 0x27 */
95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 97 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 98 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
97 SrcImmByte, SrcImm, 0, 0, 99 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
98 /* 0x28 - 0x2F */ 100 /* 0x28 - 0x2F */
99 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 101 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
100 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 102 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -106,7 +108,8 @@ static u16 opcode_table[256] = {
106 /* 0x38 - 0x3F */ 108 /* 0x38 - 0x3F */
107 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 109 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
108 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 110 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
109 0, 0, 0, 0, 111 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
112 0, 0,
110 /* 0x40 - 0x47 */ 113 /* 0x40 - 0x47 */
111 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 114 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
112 /* 0x48 - 0x4F */ 115 /* 0x48 - 0x4F */
@@ -153,9 +156,16 @@ static u16 opcode_table[256] = {
153 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 156 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
154 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 157 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
155 ByteOp | ImplicitOps | String, ImplicitOps | String, 158 ByteOp | ImplicitOps | String, ImplicitOps | String,
156 /* 0xB0 - 0xBF */ 159 /* 0xB0 - 0xB7 */
157 0, 0, 0, 0, 0, 0, 0, 0, 160 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
158 DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0, 161 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
162 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
163 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
164 /* 0xB8 - 0xBF */
165 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
166 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
167 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
168 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
159 /* 0xC0 - 0xC7 */ 169 /* 0xC0 - 0xC7 */
160 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 170 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
161 0, ImplicitOps | Stack, 0, 0, 171 0, ImplicitOps | Stack, 0, 0,
@@ -169,17 +179,20 @@ static u16 opcode_table[256] = {
169 /* 0xD8 - 0xDF */ 179 /* 0xD8 - 0xDF */
170 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0,
171 /* 0xE0 - 0xE7 */ 181 /* 0xE0 - 0xE7 */
172 0, 0, 0, 0, 0, 0, 0, 0, 182 0, 0, 0, 0,
183 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
184 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
173 /* 0xE8 - 0xEF */ 185 /* 0xE8 - 0xEF */
174 ImplicitOps | Stack, SrcImm | ImplicitOps, 186 ImplicitOps | Stack, SrcImm | ImplicitOps,
175 ImplicitOps, SrcImmByte | ImplicitOps, 187 ImplicitOps, SrcImmByte | ImplicitOps,
176 0, 0, 0, 0, 188 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
189 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
177 /* 0xF0 - 0xF7 */ 190 /* 0xF0 - 0xF7 */
178 0, 0, 0, 0, 191 0, 0, 0, 0,
179 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, 192 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
180 /* 0xF8 - 0xFF */ 193 /* 0xF8 - 0xFF */
181 ImplicitOps, 0, ImplicitOps, ImplicitOps, 194 ImplicitOps, 0, ImplicitOps, ImplicitOps,
182 0, 0, Group | Group4, Group | Group5, 195 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
183}; 196};
184 197
185static u16 twobyte_table[256] = { 198static u16 twobyte_table[256] = {
@@ -268,15 +281,16 @@ static u16 group_table[] = {
268 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 281 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
269 0, 0, 0, 0, 282 0, 0, 0, 0,
270 [Group3*8] = 283 [Group3*8] =
271 DstMem | SrcImm | ModRM | SrcImm, 0, 284 DstMem | SrcImm | ModRM, 0,
272 DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 285 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
273 0, 0, 0, 0, 286 0, 0, 0, 0,
274 [Group4*8] = 287 [Group4*8] =
275 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 288 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
276 0, 0, 0, 0, 0, 0, 289 0, 0, 0, 0, 0, 0,
277 [Group5*8] = 290 [Group5*8] =
278 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 291 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
279 SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, 292 SrcMem | ModRM | Stack, 0,
293 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
280 [Group7*8] = 294 [Group7*8] =
281 0, 0, ModRM | SrcMem, ModRM | SrcMem, 295 0, 0, ModRM | SrcMem, ModRM | SrcMem,
282 SrcNone | ModRM | DstMem | Mov, 0, 296 SrcNone | ModRM | DstMem | Mov, 0,
@@ -839,7 +853,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
839 /* Shadow copy of register state. Committed on successful emulation. */ 853 /* Shadow copy of register state. Committed on successful emulation. */
840 854
841 memset(c, 0, sizeof(struct decode_cache)); 855 memset(c, 0, sizeof(struct decode_cache));
842 c->eip = ctxt->vcpu->arch.rip; 856 c->eip = kvm_rip_read(ctxt->vcpu);
843 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 857 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
844 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 858 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
845 859
@@ -1048,6 +1062,23 @@ done_prefixes:
1048 } 1062 }
1049 c->dst.type = OP_MEM; 1063 c->dst.type = OP_MEM;
1050 break; 1064 break;
1065 case DstAcc:
1066 c->dst.type = OP_REG;
1067 c->dst.bytes = c->op_bytes;
1068 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1069 switch (c->op_bytes) {
1070 case 1:
1071 c->dst.val = *(u8 *)c->dst.ptr;
1072 break;
1073 case 2:
1074 c->dst.val = *(u16 *)c->dst.ptr;
1075 break;
1076 case 4:
1077 c->dst.val = *(u32 *)c->dst.ptr;
1078 break;
1079 }
1080 c->dst.orig_val = c->dst.val;
1081 break;
1051 } 1082 }
1052 1083
1053 if (c->rip_relative) 1084 if (c->rip_relative)
@@ -1151,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1151 case 1: /* dec */ 1182 case 1: /* dec */
1152 emulate_1op("dec", c->dst, ctxt->eflags); 1183 emulate_1op("dec", c->dst, ctxt->eflags);
1153 break; 1184 break;
1185 case 2: /* call near abs */ {
1186 long int old_eip;
1187 old_eip = c->eip;
1188 c->eip = c->src.val;
1189 c->src.val = old_eip;
1190 emulate_push(ctxt);
1191 break;
1192 }
1154 case 4: /* jmp abs */ 1193 case 4: /* jmp abs */
1155 c->eip = c->src.val; 1194 c->eip = c->src.val;
1156 break; 1195 break;
@@ -1251,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1251 u64 msr_data; 1290 u64 msr_data;
1252 unsigned long saved_eip = 0; 1291 unsigned long saved_eip = 0;
1253 struct decode_cache *c = &ctxt->decode; 1292 struct decode_cache *c = &ctxt->decode;
1293 unsigned int port;
1294 int io_dir_in;
1254 int rc = 0; 1295 int rc = 0;
1255 1296
1256 /* Shadow copy of register state. Committed on successful emulation. 1297 /* Shadow copy of register state. Committed on successful emulation.
@@ -1267,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1267 if (c->rep_prefix && (c->d & String)) { 1308 if (c->rep_prefix && (c->d & String)) {
1268 /* All REP prefixes have the same first termination condition */ 1309 /* All REP prefixes have the same first termination condition */
1269 if (c->regs[VCPU_REGS_RCX] == 0) { 1310 if (c->regs[VCPU_REGS_RCX] == 0) {
1270 ctxt->vcpu->arch.rip = c->eip; 1311 kvm_rip_write(ctxt->vcpu, c->eip);
1271 goto done; 1312 goto done;
1272 } 1313 }
1273 /* The second termination condition only applies for REPE 1314 /* The second termination condition only applies for REPE
@@ -1281,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1281 (c->b == 0xae) || (c->b == 0xaf)) { 1322 (c->b == 0xae) || (c->b == 0xaf)) {
1282 if ((c->rep_prefix == REPE_PREFIX) && 1323 if ((c->rep_prefix == REPE_PREFIX) &&
1283 ((ctxt->eflags & EFLG_ZF) == 0)) { 1324 ((ctxt->eflags & EFLG_ZF) == 0)) {
1284 ctxt->vcpu->arch.rip = c->eip; 1325 kvm_rip_write(ctxt->vcpu, c->eip);
1285 goto done; 1326 goto done;
1286 } 1327 }
1287 if ((c->rep_prefix == REPNE_PREFIX) && 1328 if ((c->rep_prefix == REPNE_PREFIX) &&
1288 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { 1329 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
1289 ctxt->vcpu->arch.rip = c->eip; 1330 kvm_rip_write(ctxt->vcpu, c->eip);
1290 goto done; 1331 goto done;
1291 } 1332 }
1292 } 1333 }
1293 c->regs[VCPU_REGS_RCX]--; 1334 c->regs[VCPU_REGS_RCX]--;
1294 c->eip = ctxt->vcpu->arch.rip; 1335 c->eip = kvm_rip_read(ctxt->vcpu);
1295 } 1336 }
1296 1337
1297 if (c->src.type == OP_MEM) { 1338 if (c->src.type == OP_MEM) {
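
These rewritten lines only swap direct rip access for the accessors, but the loop they sit in is worth spelling out: RCX == 0 terminates any REP before the iteration runs; for cmps/scas, REPE additionally stops once ZF clears and REPNE once ZF sets; otherwise RCX is decremented and RIP is wound back so the instruction re-executes. A byte-wise model of the plain (non-compare) case:

    #include <stdio.h>

    /* check count, run one iteration, then decrement: the same order the
     * emulator follows for a REP MOVSB. */
    static void rep_movsb(unsigned char *di, const unsigned char *si,
                          unsigned long *rcx)
    {
        while (*rcx) {      /* first termination condition: RCX == 0 */
            *di++ = *si++;  /* one emulated iteration */
            (*rcx)--;
        }
    }

    int main(void)
    {
        unsigned char dst[6] = { 0 };
        unsigned long rcx = 5;

        rep_movsb(dst, (const unsigned char *)"hello", &rcx);
        printf("%s rcx=%lu\n", dst, rcx);
        return 0;
    }
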
@@ -1351,27 +1392,10 @@ special_insn:
1351 sbb: /* sbb */ 1392 sbb: /* sbb */
1352 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 1393 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1353 break; 1394 break;
1354 case 0x20 ... 0x23: 1395 case 0x20 ... 0x25:
1355 and: /* and */ 1396 and: /* and */
1356 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); 1397 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
1357 break; 1398 break;
1358 case 0x24: /* and al imm8 */
1359 c->dst.type = OP_REG;
1360 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1361 c->dst.val = *(u8 *)c->dst.ptr;
1362 c->dst.bytes = 1;
1363 c->dst.orig_val = c->dst.val;
1364 goto and;
1365 case 0x25: /* and ax imm16, or eax imm32 */
1366 c->dst.type = OP_REG;
1367 c->dst.bytes = c->op_bytes;
1368 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1369 if (c->op_bytes == 2)
1370 c->dst.val = *(u16 *)c->dst.ptr;
1371 else
1372 c->dst.val = *(u32 *)c->dst.ptr;
1373 c->dst.orig_val = c->dst.val;
1374 goto and;
1375 case 0x28 ... 0x2d: 1399 case 0x28 ... 0x2d:
1376 sub: /* sub */ 1400 sub: /* sub */
1377 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); 1401 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
@@ -1659,7 +1683,7 @@ special_insn:
1659 case 0xae ... 0xaf: /* scas */ 1683 case 0xae ... 0xaf: /* scas */
1660 DPRINTF("Urk! I don't handle SCAS.\n"); 1684 DPRINTF("Urk! I don't handle SCAS.\n");
1661 goto cannot_emulate; 1685 goto cannot_emulate;
1662 case 0xb8: /* mov r, imm */ 1686 case 0xb0 ... 0xbf: /* mov r, imm */
1663 goto mov; 1687 goto mov;
1664 case 0xc0 ... 0xc1: 1688 case 0xc0 ... 0xc1:
1665 emulate_grp2(ctxt); 1689 emulate_grp2(ctxt);
@@ -1679,6 +1703,16 @@ special_insn:
1679 c->src.val = c->regs[VCPU_REGS_RCX]; 1703 c->src.val = c->regs[VCPU_REGS_RCX];
1680 emulate_grp2(ctxt); 1704 emulate_grp2(ctxt);
1681 break; 1705 break;
1706 case 0xe4: /* inb */
1707 case 0xe5: /* in */
1708 port = insn_fetch(u8, 1, c->eip);
1709 io_dir_in = 1;
1710 goto do_io;
1711 case 0xe6: /* outb */
1712 case 0xe7: /* out */
1713 port = insn_fetch(u8, 1, c->eip);
1714 io_dir_in = 0;
1715 goto do_io;
1682 case 0xe8: /* call (near) */ { 1716 case 0xe8: /* call (near) */ {
1683 long int rel; 1717 long int rel;
1684 switch (c->op_bytes) { 1718 switch (c->op_bytes) {
@@ -1729,6 +1763,22 @@ special_insn:
1729 jmp_rel(c, c->src.val); 1763 jmp_rel(c, c->src.val);
1730 c->dst.type = OP_NONE; /* Disable writeback. */ 1764 c->dst.type = OP_NONE; /* Disable writeback. */
1731 break; 1765 break;
1766 case 0xec: /* in al,dx */
1767 case 0xed: /* in (e/r)ax,dx */
1768 port = c->regs[VCPU_REGS_RDX];
1769 io_dir_in = 1;
1770 goto do_io;
1771 case 0xee: /* out al,dx */
1772 case 0xef: /* out (e/r)ax,dx */
1773 port = c->regs[VCPU_REGS_RDX];
1774 io_dir_in = 0;
1775 do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in,
1776 (c->d & ByteOp) ? 1 : c->op_bytes,
1777 port) != 0) {
1778 c->eip = saved_eip;
1779 goto cannot_emulate;
1780 }
1781 return 0;
1732 case 0xf4: /* hlt */ 1782 case 0xf4: /* hlt */
1733 ctxt->vcpu->arch.halt_request = 1; 1783 ctxt->vcpu->arch.halt_request = 1;
1734 break; 1784 break;
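
All eight new in/out opcode cases above converge on the do_io tail, which passes the access to kvm_emulate_pio() with size 1 for the ByteOp forms and op_bytes otherwise; the only per-opcode differences are where the port number comes from and the direction. Tabulated as a standalone reference:

    #include <stdio.h>

    struct io_form { unsigned char opcode; int port_from_dx; int dir_in; };

    static const struct io_form forms[] = {
        { 0xe4, 0, 1 }, /* in  al, imm8  */
        { 0xe5, 0, 1 }, /* in  eax, imm8 */
        { 0xe6, 0, 0 }, /* out imm8, al  */
        { 0xe7, 0, 0 }, /* out imm8, eax */
        { 0xec, 1, 1 }, /* in  al, dx    */
        { 0xed, 1, 1 }, /* in  eax, dx   */
        { 0xee, 1, 0 }, /* out dx, al    */
        { 0xef, 1, 0 }, /* out dx, eax   */
    };

    int main(void)
    {
        unsigned i;

        for (i = 0; i < sizeof(forms) / sizeof(forms[0]); i++)
            printf("%#x: port from %s, %s\n", (unsigned)forms[i].opcode,
                   forms[i].port_from_dx ? "DX" : "imm8",
                   forms[i].dir_in ? "in" : "out");
        return 0;
    }
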
@@ -1754,6 +1804,14 @@ special_insn:
1754 ctxt->eflags |= X86_EFLAGS_IF; 1804 ctxt->eflags |= X86_EFLAGS_IF;
1755 c->dst.type = OP_NONE; /* Disable writeback. */ 1805 c->dst.type = OP_NONE; /* Disable writeback. */
1756 break; 1806 break;
1807 case 0xfc: /* cld */
1808 ctxt->eflags &= ~EFLG_DF;
1809 c->dst.type = OP_NONE; /* Disable writeback. */
1810 break;
1811 case 0xfd: /* std */
1812 ctxt->eflags |= EFLG_DF;
1813 c->dst.type = OP_NONE; /* Disable writeback. */
1814 break;
1757 case 0xfe ... 0xff: /* Grp4/Grp5 */ 1815 case 0xfe ... 0xff: /* Grp4/Grp5 */
1758 rc = emulate_grp45(ctxt, ops); 1816 rc = emulate_grp45(ctxt, ops);
1759 if (rc != 0) 1817 if (rc != 0)
@@ -1768,7 +1826,7 @@ writeback:
1768 1826
1769 /* Commit shadow register state. */ 1827 /* Commit shadow register state. */
1770 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 1828 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
1771 ctxt->vcpu->arch.rip = c->eip; 1829 kvm_rip_write(ctxt->vcpu, c->eip);
1772 1830
1773done: 1831done:
1774 if (rc == X86EMUL_UNHANDLEABLE) { 1832 if (rc == X86EMUL_UNHANDLEABLE) {
@@ -1793,7 +1851,7 @@ twobyte_insn:
1793 goto done; 1851 goto done;
1794 1852
1795 /* Let the processor re-execute the fixed hypercall */ 1853 /* Let the processor re-execute the fixed hypercall */
1796 c->eip = ctxt->vcpu->arch.rip; 1854 c->eip = kvm_rip_read(ctxt->vcpu);
1797 /* Disable writeback. */ 1855 /* Disable writeback. */
1798 c->dst.type = OP_NONE; 1856 c->dst.type = OP_NONE;
1799 break; 1857 break;
@@ -1889,7 +1947,7 @@ twobyte_insn:
1889 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); 1947 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
1890 if (rc) { 1948 if (rc) {
1891 kvm_inject_gp(ctxt->vcpu, 0); 1949 kvm_inject_gp(ctxt->vcpu, 0);
1892 c->eip = ctxt->vcpu->arch.rip; 1950 c->eip = kvm_rip_read(ctxt->vcpu);
1893 } 1951 }
1894 rc = X86EMUL_CONTINUE; 1952 rc = X86EMUL_CONTINUE;
1895 c->dst.type = OP_NONE; 1953 c->dst.type = OP_NONE;
@@ -1899,7 +1957,7 @@ twobyte_insn:
1899 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); 1957 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
1900 if (rc) { 1958 if (rc) {
1901 kvm_inject_gp(ctxt->vcpu, 0); 1959 kvm_inject_gp(ctxt->vcpu, 0);
1902 c->eip = ctxt->vcpu->arch.rip; 1960 c->eip = kvm_rip_read(ctxt->vcpu);
1903 } else { 1961 } else {
1904 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 1962 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
1905 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 1963 c->regs[VCPU_REGS_RDX] = msr_data >> 32;