Diffstat (limited to 'drivers/kvm')
 -rw-r--r--  drivers/kvm/kvm.h         |  106
 -rw-r--r--  drivers/kvm/kvm_main.c    |  155
 -rw-r--r--  drivers/kvm/mmu.c         | 1114
 -rw-r--r--  drivers/kvm/paging_tmpl.h |  260
 -rw-r--r--  drivers/kvm/svm.c         |  113
 -rw-r--r--  drivers/kvm/vmx.c         |  175
 -rw-r--r--  drivers/kvm/x86_emulate.c |    2
 7 files changed, 1551 insertions(+), 374 deletions(-)
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 100df6f38d92..91e0c75aca8f 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -52,6 +52,8 @@
52#define KVM_MAX_VCPUS 1 52#define KVM_MAX_VCPUS 1
53#define KVM_MEMORY_SLOTS 4 53#define KVM_MEMORY_SLOTS 4
54#define KVM_NUM_MMU_PAGES 256 54#define KVM_NUM_MMU_PAGES 256
55#define KVM_MIN_FREE_MMU_PAGES 5
56#define KVM_REFILL_PAGES 25
55 57
56#define FX_IMAGE_SIZE 512 58#define FX_IMAGE_SIZE 512
57#define FX_IMAGE_ALIGN 16 59#define FX_IMAGE_ALIGN 16
@@ -89,14 +91,54 @@ typedef unsigned long hva_t;
89typedef u64 hpa_t; 91typedef u64 hpa_t;
90typedef unsigned long hfn_t; 92typedef unsigned long hfn_t;
91 93
94#define NR_PTE_CHAIN_ENTRIES 5
95
96struct kvm_pte_chain {
97 u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
98 struct hlist_node link;
99};
100
101/*
102 * kvm_mmu_page_role, below, is defined as:
103 *
104 * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
105 * bits 4:7 - page table level for this shadow (1-4)
106 * bits 8:9 - page table quadrant for 2-level guests
107 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
108 */
109union kvm_mmu_page_role {
110 unsigned word;
111 struct {
112 unsigned glevels : 4;
113 unsigned level : 4;
114 unsigned quadrant : 2;
115 unsigned pad_for_nice_hex_output : 6;
116 unsigned metaphysical : 1;
117 };
118};
119
92struct kvm_mmu_page { 120struct kvm_mmu_page {
93 struct list_head link; 121 struct list_head link;
122 struct hlist_node hash_link;
123
124 /*
125 * The following two entries are used to key the shadow page in the
126 * hash table.
127 */
128 gfn_t gfn;
129 union kvm_mmu_page_role role;
130
94 hpa_t page_hpa; 131 hpa_t page_hpa;
95 unsigned long slot_bitmap; /* One bit set per slot which has memory 132 unsigned long slot_bitmap; /* One bit set per slot which has memory
96 * in this shadow page. 133 * in this shadow page.
97 */ 134 */
98 int global; /* Set if all ptes in this page are global */ 135 int global; /* Set if all ptes in this page are global */
99 u64 *parent_pte; 136 int multimapped; /* More than one parent_pte? */
137 int root_count; /* Currently serving as active root */
138 union {
139 u64 *parent_pte; /* !multimapped */
140 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
141 };
100}; 142};
101 143
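The kvm_mmu_page_role union added above is what makes shadow pages shareable: everything that distinguishes one shadow page from another is packed into a single word, and the (gfn, role) pair keys the new mmu_page_hash lookup. A minimal user-space sketch of the packing (field names copied from the union; the sample values are made up, and C bit-field layout is compiler-dependent, so the printed value is only illustrative):

#include <stdio.h>

union mmu_page_role {
	unsigned word;
	struct {
		unsigned glevels : 4;      /* guest paging levels (2-4, 0 = real mode) */
		unsigned level : 4;        /* level of this shadow page table (1-4)    */
		unsigned quadrant : 2;     /* quarter of a 32-bit guest table          */
		unsigned pad_for_nice_hex_output : 6;
		unsigned metaphysical : 1; /* gfn is not a real guest page             */
	};
};

int main(void)
{
	union mmu_page_role role = { .word = 0 };

	role.glevels = 2;   /* 32-bit, two-level guest                  */
	role.level = 1;     /* leaf shadow page table                   */
	role.quadrant = 1;  /* second half of the guest page table page */

	/* Two shadow pages are interchangeable iff gfn and role.word both match. */
	printf("role.word = %#x\n", role.word);
	return 0;
}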
102struct vmcs { 144struct vmcs {
@@ -117,14 +159,26 @@ struct kvm_vcpu;
117struct kvm_mmu { 159struct kvm_mmu {
118 void (*new_cr3)(struct kvm_vcpu *vcpu); 160 void (*new_cr3)(struct kvm_vcpu *vcpu);
119 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 161 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
120 void (*inval_page)(struct kvm_vcpu *vcpu, gva_t gva);
121 void (*free)(struct kvm_vcpu *vcpu); 162 void (*free)(struct kvm_vcpu *vcpu);
122 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); 163 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
123 hpa_t root_hpa; 164 hpa_t root_hpa;
124 int root_level; 165 int root_level;
125 int shadow_root_level; 166 int shadow_root_level;
167
168 u64 *pae_root;
169};
170
171#define KVM_NR_MEM_OBJS 20
172
173struct kvm_mmu_memory_cache {
174 int nobjs;
175 void *objects[KVM_NR_MEM_OBJS];
126}; 176};
127 177
178/*
179 * We don't want allocation failures within the mmu code, so we preallocate
180 * enough memory for a single page fault in a cache.
181 */
128struct kvm_guest_debug { 182struct kvm_guest_debug {
129 int enabled; 183 int enabled;
130 unsigned long bp[4]; 184 unsigned long bp[4];
@@ -173,6 +227,7 @@ struct kvm_vcpu {
173 struct mutex mutex; 227 struct mutex mutex;
174 int cpu; 228 int cpu;
175 int launched; 229 int launched;
230 int interrupt_window_open;
176 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ 231 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
177#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) 232#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
178 unsigned long irq_pending[NR_IRQ_WORDS]; 233 unsigned long irq_pending[NR_IRQ_WORDS];
@@ -184,6 +239,7 @@ struct kvm_vcpu {
184 unsigned long cr3; 239 unsigned long cr3;
185 unsigned long cr4; 240 unsigned long cr4;
186 unsigned long cr8; 241 unsigned long cr8;
242 u64 pdptrs[4]; /* pae */
187 u64 shadow_efer; 243 u64 shadow_efer;
188 u64 apic_base; 244 u64 apic_base;
189 int nmsrs; 245 int nmsrs;
@@ -194,6 +250,12 @@ struct kvm_vcpu {
194 struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES]; 250 struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES];
195 struct kvm_mmu mmu; 251 struct kvm_mmu mmu;
196 252
253 struct kvm_mmu_memory_cache mmu_pte_chain_cache;
254 struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
255
256 gfn_t last_pt_write_gfn;
257 int last_pt_write_count;
258
197 struct kvm_guest_debug guest_debug; 259 struct kvm_guest_debug guest_debug;
198 260
199 char fx_buf[FX_BUF_SIZE]; 261 char fx_buf[FX_BUF_SIZE];
@@ -231,10 +293,16 @@ struct kvm {
231 spinlock_t lock; /* protects everything except vcpus */ 293 spinlock_t lock; /* protects everything except vcpus */
232 int nmemslots; 294 int nmemslots;
233 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; 295 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
296 /*
297 * Hash table of struct kvm_mmu_page.
298 */
234 struct list_head active_mmu_pages; 299 struct list_head active_mmu_pages;
300 int n_free_mmu_pages;
301 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
235 struct kvm_vcpu vcpus[KVM_MAX_VCPUS]; 302 struct kvm_vcpu vcpus[KVM_MAX_VCPUS];
236 int memory_config_version; 303 int memory_config_version;
237 int busy; 304 int busy;
305 unsigned long rmap_overflow;
238}; 306};
239 307
240struct kvm_stat { 308struct kvm_stat {
@@ -247,6 +315,9 @@ struct kvm_stat {
247 u32 io_exits; 315 u32 io_exits;
248 u32 mmio_exits; 316 u32 mmio_exits;
249 u32 signal_exits; 317 u32 signal_exits;
318 u32 irq_window_exits;
319 u32 halt_exits;
320 u32 request_irq_exits;
250 u32 irq_exits; 321 u32 irq_exits;
251}; 322};
252 323
@@ -279,6 +350,7 @@ struct kvm_arch_ops {
279 void (*set_segment)(struct kvm_vcpu *vcpu, 350 void (*set_segment)(struct kvm_vcpu *vcpu,
280 struct kvm_segment *var, int seg); 351 struct kvm_segment *var, int seg);
281 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); 352 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
353 void (*decache_cr0_cr4_guest_bits)(struct kvm_vcpu *vcpu);
282 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); 354 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
283 void (*set_cr0_no_modeswitch)(struct kvm_vcpu *vcpu, 355 void (*set_cr0_no_modeswitch)(struct kvm_vcpu *vcpu,
284 unsigned long cr0); 356 unsigned long cr0);
@@ -323,7 +395,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
323int kvm_mmu_setup(struct kvm_vcpu *vcpu); 395int kvm_mmu_setup(struct kvm_vcpu *vcpu);
324 396
325int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 397int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
326void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 398void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot);
327 399
328hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); 400hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa);
329#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) 401#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
@@ -396,6 +468,19 @@ int kvm_write_guest(struct kvm_vcpu *vcpu,
396 468
397unsigned long segment_base(u16 selector); 469unsigned long segment_base(u16 selector);
398 470
471void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
472void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
473int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
474void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
475
476static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
477 u32 error_code)
478{
479 if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
480 kvm_mmu_free_some_pages(vcpu);
481 return vcpu->mmu.page_fault(vcpu, gva, error_code);
482}
483
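kvm_mmu_page_fault() above becomes the single entry point into the shadow fault path; before dispatching to the mode-specific handler it tops the shadow-page pool back up whenever kvm->n_free_mmu_pages falls below KVM_MIN_FREE_MMU_PAGES (kvm_mmu_free_some_pages() itself is declared here but not shown in this excerpt). A stripped-down sketch of that low-watermark pattern, with stand-in names and an assumed reclaim loop:

#include <stdio.h>

#define MIN_FREE_PAGES 5   /* mirrors KVM_MIN_FREE_MMU_PAGES */
#define REFILL_PAGES  25   /* mirrors KVM_REFILL_PAGES       */

static int n_free_pages = 3;

/* Stand-in for kvm_mmu_free_some_pages(): refill the pool up to
 * REFILL_PAGES (the real code reclaims active shadow pages instead). */
static void free_some_pages(void)
{
	while (n_free_pages < REFILL_PAGES)
		++n_free_pages;
}

static void handle_page_fault(void)
{
	if (n_free_pages < MIN_FREE_PAGES)  /* refill before we might allocate */
		free_some_pages();
	/* ... dispatch to the paging-mode specific fault handler ... */
}

int main(void)
{
	handle_page_fault();
	printf("free pages after fault: %d\n", n_free_pages);
	return 0;
}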
399static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn) 484static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn)
400{ 485{
401 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 486 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
@@ -541,19 +626,4 @@ static inline u32 get_rdx_init_val(void)
541#define TSS_REDIRECTION_SIZE (256 / 8) 626#define TSS_REDIRECTION_SIZE (256 / 8)
542#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) 627#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
543 628
544#ifdef CONFIG_X86_64
545
546/*
547 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. Therefore
548 * we need to allocate shadow page tables in the first 4GB of memory, which
549 * happens to fit the DMA32 zone.
550 */
551#define GFP_KVM_MMU (GFP_KERNEL | __GFP_DMA32)
552
553#else
554
555#define GFP_KVM_MMU GFP_KERNEL
556
557#endif
558
559#endif 629#endif
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index ce7fe640f18d..67c1154960f0 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -58,6 +58,9 @@ static struct kvm_stats_debugfs_item {
58 { "io_exits", &kvm_stat.io_exits }, 58 { "io_exits", &kvm_stat.io_exits },
59 { "mmio_exits", &kvm_stat.mmio_exits }, 59 { "mmio_exits", &kvm_stat.mmio_exits },
60 { "signal_exits", &kvm_stat.signal_exits }, 60 { "signal_exits", &kvm_stat.signal_exits },
61 { "irq_window", &kvm_stat.irq_window_exits },
62 { "halt_exits", &kvm_stat.halt_exits },
63 { "request_irq", &kvm_stat.request_irq_exits },
61 { "irq_exits", &kvm_stat.irq_exits }, 64 { "irq_exits", &kvm_stat.irq_exits },
62 { 0, 0 } 65 { 0, 0 }
63}; 66};
@@ -227,6 +230,7 @@ static int kvm_dev_open(struct inode *inode, struct file *filp)
227 struct kvm_vcpu *vcpu = &kvm->vcpus[i]; 230 struct kvm_vcpu *vcpu = &kvm->vcpus[i];
228 231
229 mutex_init(&vcpu->mutex); 232 mutex_init(&vcpu->mutex);
233 vcpu->kvm = kvm;
230 vcpu->mmu.root_hpa = INVALID_PAGE; 234 vcpu->mmu.root_hpa = INVALID_PAGE;
231 INIT_LIST_HEAD(&vcpu->free_pages); 235 INIT_LIST_HEAD(&vcpu->free_pages);
232 } 236 }
@@ -268,8 +272,8 @@ static void kvm_free_physmem(struct kvm *kvm)
268 272
269static void kvm_free_vcpu(struct kvm_vcpu *vcpu) 273static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
270{ 274{
271 kvm_arch_ops->vcpu_free(vcpu);
272 kvm_mmu_destroy(vcpu); 275 kvm_mmu_destroy(vcpu);
276 kvm_arch_ops->vcpu_free(vcpu);
273} 277}
274 278
275static void kvm_free_vcpus(struct kvm *kvm) 279static void kvm_free_vcpus(struct kvm *kvm)
@@ -295,14 +299,17 @@ static void inject_gp(struct kvm_vcpu *vcpu)
295 kvm_arch_ops->inject_gp(vcpu, 0); 299 kvm_arch_ops->inject_gp(vcpu, 0);
296} 300}
297 301
298static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu, 302/*
299 unsigned long cr3) 303 * Load the pae pdptrs. Return true if they are all valid.
304 */
305static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
300{ 306{
301 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 307 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
302 unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5; 308 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
303 int i; 309 int i;
304 u64 pdpte; 310 u64 pdpte;
305 u64 *pdpt; 311 u64 *pdpt;
312 int ret;
306 struct kvm_memory_slot *memslot; 313 struct kvm_memory_slot *memslot;
307 314
308 spin_lock(&vcpu->kvm->lock); 315 spin_lock(&vcpu->kvm->lock);
@@ -310,16 +317,23 @@ static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu,
310 /* FIXME: !memslot - emulate? 0xff? */ 317 /* FIXME: !memslot - emulate? 0xff? */
311 pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0); 318 pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0);
312 319
320 ret = 1;
313 for (i = 0; i < 4; ++i) { 321 for (i = 0; i < 4; ++i) {
314 pdpte = pdpt[offset + i]; 322 pdpte = pdpt[offset + i];
315 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) 323 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
316 break; 324 ret = 0;
325 goto out;
326 }
317 } 327 }
318 328
329 for (i = 0; i < 4; ++i)
330 vcpu->pdptrs[i] = pdpt[offset + i];
331
332out:
319 kunmap_atomic(pdpt, KM_USER0); 333 kunmap_atomic(pdpt, KM_USER0);
320 spin_unlock(&vcpu->kvm->lock); 334 spin_unlock(&vcpu->kvm->lock);
321 335
322 return i != 4; 336 return ret;
323} 337}
324 338
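load_pdptrs() above replaces pdptrs_have_reserved_bits_set(): it still rejects any PDPTE that is present but has a reserved bit set (the 0xfffffff0000001e6 mask in the hunk), and in addition caches the four entries in vcpu->pdptrs so the PAE shadow roots can later be built from them. The validity test on its own, as a standalone sketch (the sample entries are made up):

#include <stdio.h>
#include <stdint.h>

#define PDPTE_RESERVED_MASK 0xfffffff0000001e6ull

/* Return 1 if a PAE page-directory-pointer-table entry is acceptable:
 * either not present, or present with no reserved bits set. */
static int pdpte_valid(uint64_t pdpte)
{
	if (!(pdpte & 1))               /* not present: always fine */
		return 1;
	return (pdpte & PDPTE_RESERVED_MASK) == 0;
}

int main(void)
{
	uint64_t good = 0x12345000ull | 1;      /* present, clean          */
	uint64_t bad  = good | (1ull << 1);     /* reserved bit 1 set      */

	printf("good: %d, bad: %d\n", pdpte_valid(good), pdpte_valid(bad));
	return 0;
}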
325void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 339void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
@@ -365,8 +379,7 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
365 } 379 }
366 } else 380 } else
367#endif 381#endif
368 if (is_pae(vcpu) && 382 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
369 pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
370 printk(KERN_DEBUG "set_cr0: #GP, pdptrs " 383 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
371 "reserved bits\n"); 384 "reserved bits\n");
372 inject_gp(vcpu); 385 inject_gp(vcpu);
@@ -387,6 +400,7 @@ EXPORT_SYMBOL_GPL(set_cr0);
387 400
388void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 401void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
389{ 402{
403 kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
390 set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); 404 set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
391} 405}
392EXPORT_SYMBOL_GPL(lmsw); 406EXPORT_SYMBOL_GPL(lmsw);
@@ -407,7 +421,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
407 return; 421 return;
408 } 422 }
409 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK) 423 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
410 && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) { 424 && !load_pdptrs(vcpu, vcpu->cr3)) {
411 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 425 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
412 inject_gp(vcpu); 426 inject_gp(vcpu);
413 } 427 }
@@ -439,7 +453,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
439 return; 453 return;
440 } 454 }
441 if (is_paging(vcpu) && is_pae(vcpu) && 455 if (is_paging(vcpu) && is_pae(vcpu) &&
442 pdptrs_have_reserved_bits_set(vcpu, cr3)) { 456 !load_pdptrs(vcpu, cr3)) {
443 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 457 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
444 "reserved bits\n"); 458 "reserved bits\n");
445 inject_gp(vcpu); 459 inject_gp(vcpu);
@@ -449,7 +463,19 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
449 463
450 vcpu->cr3 = cr3; 464 vcpu->cr3 = cr3;
451 spin_lock(&vcpu->kvm->lock); 465 spin_lock(&vcpu->kvm->lock);
452 vcpu->mmu.new_cr3(vcpu); 466 /*
467 * Does the new cr3 value map to physical memory? (Note, we
468 * catch an invalid cr3 even in real-mode, because it would
469 * cause trouble later on when we turn on paging anyway.)
470 *
471 * A real CPU would silently accept an invalid cr3 and would
472 * attempt to use it - with largely undefined (and often hard
473 * to debug) behavior on the guest side.
474 */
475 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
476 inject_gp(vcpu);
477 else
478 vcpu->mmu.new_cr3(vcpu);
453 spin_unlock(&vcpu->kvm->lock); 479 spin_unlock(&vcpu->kvm->lock);
454} 480}
455EXPORT_SYMBOL_GPL(set_cr3); 481EXPORT_SYMBOL_GPL(set_cr3);
@@ -517,7 +543,6 @@ static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n)
517 vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; 543 vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
518 544
519 vcpu->cpu = -1; /* First load will set up TR */ 545 vcpu->cpu = -1; /* First load will set up TR */
520 vcpu->kvm = kvm;
521 r = kvm_arch_ops->vcpu_create(vcpu); 546 r = kvm_arch_ops->vcpu_create(vcpu);
522 if (r < 0) 547 if (r < 0)
523 goto out_free_vcpus; 548 goto out_free_vcpus;
@@ -634,6 +659,7 @@ raced:
634 | __GFP_ZERO); 659 | __GFP_ZERO);
635 if (!new.phys_mem[i]) 660 if (!new.phys_mem[i])
636 goto out_free; 661 goto out_free;
662 new.phys_mem[i]->private = 0;
637 } 663 }
638 } 664 }
639 665
@@ -688,6 +714,13 @@ out:
688 return r; 714 return r;
689} 715}
690 716
717static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
718{
719 spin_lock(&vcpu->kvm->lock);
720 kvm_mmu_slot_remove_write_access(vcpu, slot);
721 spin_unlock(&vcpu->kvm->lock);
722}
723
691/* 724/*
692 * Get (and clear) the dirty memory log for a memory slot. 725 * Get (and clear) the dirty memory log for a memory slot.
693 */ 726 */
@@ -697,6 +730,7 @@ static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm,
697 struct kvm_memory_slot *memslot; 730 struct kvm_memory_slot *memslot;
698 int r, i; 731 int r, i;
699 int n; 732 int n;
733 int cleared;
700 unsigned long any = 0; 734 unsigned long any = 0;
701 735
702 spin_lock(&kvm->lock); 736 spin_lock(&kvm->lock);
@@ -727,15 +761,17 @@ static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm,
727 761
728 762
729 if (any) { 763 if (any) {
730 spin_lock(&kvm->lock); 764 cleared = 0;
731 kvm_mmu_slot_remove_write_access(kvm, log->slot);
732 spin_unlock(&kvm->lock);
733 memset(memslot->dirty_bitmap, 0, n);
734 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 765 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
735 struct kvm_vcpu *vcpu = vcpu_load(kvm, i); 766 struct kvm_vcpu *vcpu = vcpu_load(kvm, i);
736 767
737 if (!vcpu) 768 if (!vcpu)
738 continue; 769 continue;
770 if (!cleared) {
771 do_remove_write_access(vcpu, log->slot);
772 memset(memslot->dirty_bitmap, 0, n);
773 cleared = 1;
774 }
739 kvm_arch_ops->tlb_flush(vcpu); 775 kvm_arch_ops->tlb_flush(vcpu);
740 vcpu_put(vcpu); 776 vcpu_put(vcpu);
741 } 777 }
@@ -863,6 +899,27 @@ static int emulator_read_emulated(unsigned long addr,
863 } 899 }
864} 900}
865 901
902static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
903 unsigned long val, int bytes)
904{
905 struct kvm_memory_slot *m;
906 struct page *page;
907 void *virt;
908
909 if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
910 return 0;
911 m = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
912 if (!m)
913 return 0;
914 page = gfn_to_page(m, gpa >> PAGE_SHIFT);
915 kvm_mmu_pre_write(vcpu, gpa, bytes);
916 virt = kmap_atomic(page, KM_USER0);
917 memcpy(virt + offset_in_page(gpa), &val, bytes);
918 kunmap_atomic(virt, KM_USER0);
919 kvm_mmu_post_write(vcpu, gpa, bytes);
920 return 1;
921}
922
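emulator_write_phys() above lets emulated writes to ordinary guest RAM complete in the kernel instead of being bounced to userspace as MMIO, but only when the access stays inside one guest page; the kvm_mmu_pre_write()/kvm_mmu_post_write() bracket is what lets the shadow MMU notice writes that land on a shadowed guest page table. The page-boundary test is a plain shift-and-compare; a standalone sketch assuming PAGE_SHIFT = 12:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

/* 1 if the [gpa, gpa + bytes) range fits inside a single page. */
static int fits_in_page(uint64_t gpa, unsigned bytes)
{
	return ((gpa + bytes - 1) >> PAGE_SHIFT) == (gpa >> PAGE_SHIFT);
}

int main(void)
{
	printf("%d\n", fits_in_page(0x1ffc, 4));  /* ends at 0x1fff: ok  */
	printf("%d\n", fits_in_page(0x1ffd, 4));  /* crosses into 0x2000 */
	return 0;
}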
866static int emulator_write_emulated(unsigned long addr, 923static int emulator_write_emulated(unsigned long addr,
867 unsigned long val, 924 unsigned long val,
868 unsigned int bytes, 925 unsigned int bytes,
@@ -874,6 +931,9 @@ static int emulator_write_emulated(unsigned long addr,
874 if (gpa == UNMAPPED_GVA) 931 if (gpa == UNMAPPED_GVA)
875 return X86EMUL_PROPAGATE_FAULT; 932 return X86EMUL_PROPAGATE_FAULT;
876 933
934 if (emulator_write_phys(vcpu, gpa, val, bytes))
935 return X86EMUL_CONTINUE;
936
877 vcpu->mmio_needed = 1; 937 vcpu->mmio_needed = 1;
878 vcpu->mmio_phys_addr = gpa; 938 vcpu->mmio_phys_addr = gpa;
879 vcpu->mmio_size = bytes; 939 vcpu->mmio_size = bytes;
@@ -898,6 +958,30 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
898 return emulator_write_emulated(addr, new, bytes, ctxt); 958 return emulator_write_emulated(addr, new, bytes, ctxt);
899} 959}
900 960
961#ifdef CONFIG_X86_32
962
963static int emulator_cmpxchg8b_emulated(unsigned long addr,
964 unsigned long old_lo,
965 unsigned long old_hi,
966 unsigned long new_lo,
967 unsigned long new_hi,
968 struct x86_emulate_ctxt *ctxt)
969{
970 static int reported;
971 int r;
972
973 if (!reported) {
974 reported = 1;
975 printk(KERN_WARNING "kvm: emulating exchange8b as write\n");
976 }
977 r = emulator_write_emulated(addr, new_lo, 4, ctxt);
978 if (r != X86EMUL_CONTINUE)
979 return r;
980 return emulator_write_emulated(addr+4, new_hi, 4, ctxt);
981}
982
983#endif
984
901static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 985static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
902{ 986{
903 return kvm_arch_ops->get_segment_base(vcpu, seg); 987 return kvm_arch_ops->get_segment_base(vcpu, seg);
@@ -905,18 +989,15 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
905 989
906int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 990int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
907{ 991{
908 spin_lock(&vcpu->kvm->lock);
909 vcpu->mmu.inval_page(vcpu, address);
910 spin_unlock(&vcpu->kvm->lock);
911 kvm_arch_ops->invlpg(vcpu, address);
912 return X86EMUL_CONTINUE; 992 return X86EMUL_CONTINUE;
913} 993}
914 994
915int emulate_clts(struct kvm_vcpu *vcpu) 995int emulate_clts(struct kvm_vcpu *vcpu)
916{ 996{
917 unsigned long cr0 = vcpu->cr0; 997 unsigned long cr0;
918 998
919 cr0 &= ~CR0_TS_MASK; 999 kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1000 cr0 = vcpu->cr0 & ~CR0_TS_MASK;
920 kvm_arch_ops->set_cr0(vcpu, cr0); 1001 kvm_arch_ops->set_cr0(vcpu, cr0);
921 return X86EMUL_CONTINUE; 1002 return X86EMUL_CONTINUE;
922} 1003}
@@ -975,6 +1056,9 @@ struct x86_emulate_ops emulate_ops = {
975 .read_emulated = emulator_read_emulated, 1056 .read_emulated = emulator_read_emulated,
976 .write_emulated = emulator_write_emulated, 1057 .write_emulated = emulator_write_emulated,
977 .cmpxchg_emulated = emulator_cmpxchg_emulated, 1058 .cmpxchg_emulated = emulator_cmpxchg_emulated,
1059#ifdef CONFIG_X86_32
1060 .cmpxchg8b_emulated = emulator_cmpxchg8b_emulated,
1061#endif
978}; 1062};
979 1063
980int emulate_instruction(struct kvm_vcpu *vcpu, 1064int emulate_instruction(struct kvm_vcpu *vcpu,
@@ -1024,6 +1108,8 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
1024 } 1108 }
1025 1109
1026 if (r) { 1110 if (r) {
1111 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1112 return EMULATE_DONE;
1027 if (!vcpu->mmio_needed) { 1113 if (!vcpu->mmio_needed) {
1028 report_emulation_failure(&emulate_ctxt); 1114 report_emulation_failure(&emulate_ctxt);
1029 return EMULATE_FAIL; 1115 return EMULATE_FAIL;
@@ -1069,6 +1155,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1069 1155
1070unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 1156unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1071{ 1157{
1158 kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1072 switch (cr) { 1159 switch (cr) {
1073 case 0: 1160 case 0:
1074 return vcpu->cr0; 1161 return vcpu->cr0;
@@ -1403,6 +1490,7 @@ static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
1403 sregs->gdt.limit = dt.limit; 1490 sregs->gdt.limit = dt.limit;
1404 sregs->gdt.base = dt.base; 1491 sregs->gdt.base = dt.base;
1405 1492
1493 kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1406 sregs->cr0 = vcpu->cr0; 1494 sregs->cr0 = vcpu->cr0;
1407 sregs->cr2 = vcpu->cr2; 1495 sregs->cr2 = vcpu->cr2;
1408 sregs->cr3 = vcpu->cr3; 1496 sregs->cr3 = vcpu->cr3;
@@ -1467,11 +1555,15 @@ static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
1467#endif 1555#endif
1468 vcpu->apic_base = sregs->apic_base; 1556 vcpu->apic_base = sregs->apic_base;
1469 1557
1558 kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1559
1470 mmu_reset_needed |= vcpu->cr0 != sregs->cr0; 1560 mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
1471 kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0); 1561 kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0);
1472 1562
1473 mmu_reset_needed |= vcpu->cr4 != sregs->cr4; 1563 mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
1474 kvm_arch_ops->set_cr4(vcpu, sregs->cr4); 1564 kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
1565 if (!is_long_mode(vcpu) && is_pae(vcpu))
1566 load_pdptrs(vcpu, vcpu->cr3);
1475 1567
1476 if (mmu_reset_needed) 1568 if (mmu_reset_needed)
1477 kvm_mmu_reset_context(vcpu); 1569 kvm_mmu_reset_context(vcpu);
@@ -1693,12 +1785,12 @@ static long kvm_dev_ioctl(struct file *filp,
1693 if (copy_from_user(&kvm_run, (void *)arg, sizeof kvm_run)) 1785 if (copy_from_user(&kvm_run, (void *)arg, sizeof kvm_run))
1694 goto out; 1786 goto out;
1695 r = kvm_dev_ioctl_run(kvm, &kvm_run); 1787 r = kvm_dev_ioctl_run(kvm, &kvm_run);
1696 if (r < 0) 1788 if (r < 0 && r != -EINTR)
1697 goto out; 1789 goto out;
1698 r = -EFAULT; 1790 if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run)) {
1699 if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run)) 1791 r = -EFAULT;
1700 goto out; 1792 goto out;
1701 r = 0; 1793 }
1702 break; 1794 break;
1703 } 1795 }
1704 case KVM_GET_REGS: { 1796 case KVM_GET_REGS: {
@@ -1842,6 +1934,7 @@ static long kvm_dev_ioctl(struct file *filp,
1842 num_msrs_to_save * sizeof(u32))) 1934 num_msrs_to_save * sizeof(u32)))
1843 goto out; 1935 goto out;
1844 r = 0; 1936 r = 0;
1937 break;
1845 } 1938 }
1846 default: 1939 default:
1847 ; 1940 ;
@@ -1944,17 +2037,17 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
1944 return -EEXIST; 2037 return -EEXIST;
1945 } 2038 }
1946 2039
1947 kvm_arch_ops = ops; 2040 if (!ops->cpu_has_kvm_support()) {
1948
1949 if (!kvm_arch_ops->cpu_has_kvm_support()) {
1950 printk(KERN_ERR "kvm: no hardware support\n"); 2041 printk(KERN_ERR "kvm: no hardware support\n");
1951 return -EOPNOTSUPP; 2042 return -EOPNOTSUPP;
1952 } 2043 }
1953 if (kvm_arch_ops->disabled_by_bios()) { 2044 if (ops->disabled_by_bios()) {
1954 printk(KERN_ERR "kvm: disabled by bios\n"); 2045 printk(KERN_ERR "kvm: disabled by bios\n");
1955 return -EOPNOTSUPP; 2046 return -EOPNOTSUPP;
1956 } 2047 }
1957 2048
2049 kvm_arch_ops = ops;
2050
1958 r = kvm_arch_ops->hardware_setup(); 2051 r = kvm_arch_ops->hardware_setup();
1959 if (r < 0) 2052 if (r < 0)
1960 return r; 2053 return r;
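The kvm_init_arch() hunk above is purely an ordering fix: kvm_arch_ops is now published only after cpu_has_kvm_support() and disabled_by_bios() have passed, so a failed probe no longer leaves the global pointing at an arch backend that was never set up. The same pattern in miniature (the ops structure and helpers below are made-up stand-ins, not the real kvm_arch_ops):

#include <stdio.h>
#include <errno.h>

struct arch_ops {
	int (*cpu_has_support)(void);
	int (*disabled_by_bios)(void);
};

static struct arch_ops *global_ops;   /* stand-in for kvm_arch_ops */

static int init_arch(struct arch_ops *ops)
{
	/* Validate through the caller's pointer first ... */
	if (!ops->cpu_has_support())
		return -EOPNOTSUPP;
	if (ops->disabled_by_bios())
		return -EOPNOTSUPP;

	/* ... and only publish it once success is certain. */
	global_ops = ops;
	return 0;
}

static int yes(void) { return 1; }
static int no(void)  { return 0; }

int main(void)
{
	struct arch_ops ops = { .cpu_has_support = yes, .disabled_by_bios = no };

	printf("init_arch: %d\n", init_arch(&ops));
	return 0;
}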
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index 790423c5f23d..c6f972914f08 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -26,7 +26,31 @@
26#include "vmx.h" 26#include "vmx.h"
27#include "kvm.h" 27#include "kvm.h"
28 28
29#undef MMU_DEBUG
30
31#undef AUDIT
32
33#ifdef AUDIT
34static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
35#else
36static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
37#endif
38
39#ifdef MMU_DEBUG
40
41#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
42#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
43
44#else
45
29#define pgprintk(x...) do { } while (0) 46#define pgprintk(x...) do { } while (0)
47#define rmap_printk(x...) do { } while (0)
48
49#endif
50
51#if defined(MMU_DEBUG) || defined(AUDIT)
52static int dbg = 1;
53#endif
30 54
31#define ASSERT(x) \ 55#define ASSERT(x) \
32 if (!(x)) { \ 56 if (!(x)) { \
@@ -34,8 +58,10 @@
34 __FILE__, __LINE__, #x); \ 58 __FILE__, __LINE__, #x); \
35 } 59 }
36 60
37#define PT64_ENT_PER_PAGE 512 61#define PT64_PT_BITS 9
38#define PT32_ENT_PER_PAGE 1024 62#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
63#define PT32_PT_BITS 10
64#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
39 65
40#define PT_WRITABLE_SHIFT 1 66#define PT_WRITABLE_SHIFT 1
41 67
@@ -125,6 +151,13 @@
125#define PT_DIRECTORY_LEVEL 2 151#define PT_DIRECTORY_LEVEL 2
126#define PT_PAGE_TABLE_LEVEL 1 152#define PT_PAGE_TABLE_LEVEL 1
127 153
154#define RMAP_EXT 4
155
156struct kvm_rmap_desc {
157 u64 *shadow_ptes[RMAP_EXT];
158 struct kvm_rmap_desc *more;
159};
160
128static int is_write_protection(struct kvm_vcpu *vcpu) 161static int is_write_protection(struct kvm_vcpu *vcpu)
129{ 162{
130 return vcpu->cr0 & CR0_WP_MASK; 163 return vcpu->cr0 & CR0_WP_MASK;
@@ -150,32 +183,272 @@ static int is_io_pte(unsigned long pte)
150 return pte & PT_SHADOW_IO_MARK; 183 return pte & PT_SHADOW_IO_MARK;
151} 184}
152 185
186static int is_rmap_pte(u64 pte)
187{
188 return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
189 == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
190}
191
192static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
193 size_t objsize, int min)
194{
195 void *obj;
196
197 if (cache->nobjs >= min)
198 return 0;
199 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
200 obj = kzalloc(objsize, GFP_NOWAIT);
201 if (!obj)
202 return -ENOMEM;
203 cache->objects[cache->nobjs++] = obj;
204 }
205 return 0;
206}
207
208static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
209{
210 while (mc->nobjs)
211 kfree(mc->objects[--mc->nobjs]);
212}
213
214static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
215{
216 int r;
217
218 r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
219 sizeof(struct kvm_pte_chain), 4);
220 if (r)
221 goto out;
222 r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
223 sizeof(struct kvm_rmap_desc), 1);
224out:
225 return r;
226}
227
228static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
229{
230 mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
231 mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
232}
233
234static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
235 size_t size)
236{
237 void *p;
238
239 BUG_ON(!mc->nobjs);
240 p = mc->objects[--mc->nobjs];
241 memset(p, 0, size);
242 return p;
243}
244
245static void mmu_memory_cache_free(struct kvm_mmu_memory_cache *mc, void *obj)
246{
247 if (mc->nobjs < KVM_NR_MEM_OBJS)
248 mc->objects[mc->nobjs++] = obj;
249 else
250 kfree(obj);
251}
252
253static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
254{
255 return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
256 sizeof(struct kvm_pte_chain));
257}
258
259static void mmu_free_pte_chain(struct kvm_vcpu *vcpu,
260 struct kvm_pte_chain *pc)
261{
262 mmu_memory_cache_free(&vcpu->mmu_pte_chain_cache, pc);
263}
264
265static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
266{
267 return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
268 sizeof(struct kvm_rmap_desc));
269}
270
271static void mmu_free_rmap_desc(struct kvm_vcpu *vcpu,
272 struct kvm_rmap_desc *rd)
273{
274 mmu_memory_cache_free(&vcpu->mmu_rmap_desc_cache, rd);
275}
276
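mmu_topup_memory_cache()/mmu_memory_cache_alloc() above implement the preallocation promised by the comment in kvm.h: each per-vcpu cache is filled up to KVM_NR_MEM_OBJS objects with GFP_NOWAIT before the fault path runs, and allocations inside the fault path simply pop from the cache, so they cannot fail there. A compact user-space sketch of the pattern (calloc stands in for kzalloc; the object size is illustrative):

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define NR_MEM_OBJS 20   /* mirrors KVM_NR_MEM_OBJS */

struct memory_cache {
	int nobjs;
	void *objects[NR_MEM_OBJS];
};

/* Fill the cache to capacity; called where failing/retrying is allowed. */
static int cache_topup(struct memory_cache *mc, size_t objsize, int min)
{
	if (mc->nobjs >= min)
		return 0;
	while (mc->nobjs < NR_MEM_OBJS) {
		void *obj = calloc(1, objsize);
		if (!obj)
			return -1;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

/* Pop an object; by construction this cannot fail inside the fault path. */
static void *cache_alloc(struct memory_cache *mc, size_t size)
{
	void *p = mc->objects[--mc->nobjs];
	memset(p, 0, size);
	return p;
}

int main(void)
{
	struct memory_cache rmap_cache = { 0 };

	if (cache_topup(&rmap_cache, 64, 1) == 0) {
		void *desc = cache_alloc(&rmap_cache, 64);
		printf("got object %p, %d left\n", desc, rmap_cache.nobjs);
		free(desc);
	}
	while (rmap_cache.nobjs)
		free(rmap_cache.objects[--rmap_cache.nobjs]);
	return 0;
}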
277/*
278 * Reverse mapping data structures:
279 *
280 * If page->private bit zero is zero, then page->private points to the
281 * shadow page table entry that points to page_address(page).
282 *
283 * If page->private bit zero is one, (then page->private & ~1) points
284 * to a struct kvm_rmap_desc containing more mappings.
285 */
286static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
287{
288 struct page *page;
289 struct kvm_rmap_desc *desc;
290 int i;
291
292 if (!is_rmap_pte(*spte))
293 return;
294 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
295 if (!page->private) {
296 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
297 page->private = (unsigned long)spte;
298 } else if (!(page->private & 1)) {
299 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
300 desc = mmu_alloc_rmap_desc(vcpu);
301 desc->shadow_ptes[0] = (u64 *)page->private;
302 desc->shadow_ptes[1] = spte;
303 page->private = (unsigned long)desc | 1;
304 } else {
305 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
306 desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
307 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
308 desc = desc->more;
309 if (desc->shadow_ptes[RMAP_EXT-1]) {
310 desc->more = mmu_alloc_rmap_desc(vcpu);
311 desc = desc->more;
312 }
313 for (i = 0; desc->shadow_ptes[i]; ++i)
314 ;
315 desc->shadow_ptes[i] = spte;
316 }
317}
318
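rmap_add() above implements the encoding described in the comment: page->private doubles as a tagged pointer, holding either a single shadow-pte pointer (bit 0 clear) or, once the page is writably mapped more than once, a pointer to a chain of kvm_rmap_desc blocks with bit 0 set. rmap_remove() below undoes the transitions, and rmap_write_protect() walks the same structure when a gfn starts being shadowed as a page table. A user-space sketch of just the tagging scheme (allocation failures ignored):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RMAP_EXT 4

struct rmap_desc {
	uint64_t *shadow_ptes[RMAP_EXT];
	struct rmap_desc *more;
};

/* 0 -> no mapping, bit 0 clear -> single spte, bit 0 set -> desc chain */
static unsigned long rmap;

static void rmap_add(uint64_t *spte)
{
	if (!rmap) {                                /* 0 -> 1 */
		rmap = (unsigned long)spte;
	} else if (!(rmap & 1)) {                   /* 1 -> many */
		struct rmap_desc *d = calloc(1, sizeof(*d));
		d->shadow_ptes[0] = (uint64_t *)rmap;
		d->shadow_ptes[1] = spte;
		rmap = (unsigned long)d | 1;
	} else {                                    /* many -> many */
		struct rmap_desc *d = (struct rmap_desc *)(rmap & ~1ul);
		int i;
		while (d->shadow_ptes[RMAP_EXT - 1] && d->more)
			d = d->more;
		if (d->shadow_ptes[RMAP_EXT - 1]) {
			d->more = calloc(1, sizeof(*d));
			d = d->more;
		}
		for (i = 0; d->shadow_ptes[i]; ++i)
			;
		d->shadow_ptes[i] = spte;
	}
}

int main(void)
{
	uint64_t sptes[6];

	for (int i = 0; i < 6; ++i)
		rmap_add(&sptes[i]);
	printf("chained: %lu\n", rmap & 1);         /* prints 1 */
	return 0;
}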
319static void rmap_desc_remove_entry(struct kvm_vcpu *vcpu,
320 struct page *page,
321 struct kvm_rmap_desc *desc,
322 int i,
323 struct kvm_rmap_desc *prev_desc)
324{
325 int j;
326
327 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
328 ;
329 desc->shadow_ptes[i] = desc->shadow_ptes[j];
330 desc->shadow_ptes[j] = 0;
331 if (j != 0)
332 return;
333 if (!prev_desc && !desc->more)
334 page->private = (unsigned long)desc->shadow_ptes[0];
335 else
336 if (prev_desc)
337 prev_desc->more = desc->more;
338 else
339 page->private = (unsigned long)desc->more | 1;
340 mmu_free_rmap_desc(vcpu, desc);
341}
342
343static void rmap_remove(struct kvm_vcpu *vcpu, u64 *spte)
344{
345 struct page *page;
346 struct kvm_rmap_desc *desc;
347 struct kvm_rmap_desc *prev_desc;
348 int i;
349
350 if (!is_rmap_pte(*spte))
351 return;
352 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
353 if (!page->private) {
354 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
355 BUG();
356 } else if (!(page->private & 1)) {
357 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
358 if ((u64 *)page->private != spte) {
359 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
360 spte, *spte);
361 BUG();
362 }
363 page->private = 0;
364 } else {
365 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
366 desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
367 prev_desc = NULL;
368 while (desc) {
369 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
370 if (desc->shadow_ptes[i] == spte) {
371 rmap_desc_remove_entry(vcpu, page,
372 desc, i,
373 prev_desc);
374 return;
375 }
376 prev_desc = desc;
377 desc = desc->more;
378 }
379 BUG();
380 }
381}
382
383static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
384{
385 struct kvm *kvm = vcpu->kvm;
386 struct page *page;
387 struct kvm_memory_slot *slot;
388 struct kvm_rmap_desc *desc;
389 u64 *spte;
390
391 slot = gfn_to_memslot(kvm, gfn);
392 BUG_ON(!slot);
393 page = gfn_to_page(slot, gfn);
394
395 while (page->private) {
396 if (!(page->private & 1))
397 spte = (u64 *)page->private;
398 else {
399 desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
400 spte = desc->shadow_ptes[0];
401 }
402 BUG_ON(!spte);
403 BUG_ON((*spte & PT64_BASE_ADDR_MASK) !=
404 page_to_pfn(page) << PAGE_SHIFT);
405 BUG_ON(!(*spte & PT_PRESENT_MASK));
406 BUG_ON(!(*spte & PT_WRITABLE_MASK));
407 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
408 rmap_remove(vcpu, spte);
409 kvm_arch_ops->tlb_flush(vcpu);
410 *spte &= ~(u64)PT_WRITABLE_MASK;
411 }
412}
413
414static int is_empty_shadow_page(hpa_t page_hpa)
415{
416 u64 *pos;
417 u64 *end;
418
419 for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64);
420 pos != end; pos++)
421 if (*pos != 0) {
422 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
423 pos, *pos);
424 return 0;
425 }
426 return 1;
427}
428
153static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) 429static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
154{ 430{
155 struct kvm_mmu_page *page_head = page_header(page_hpa); 431 struct kvm_mmu_page *page_head = page_header(page_hpa);
156 432
433 ASSERT(is_empty_shadow_page(page_hpa));
157 list_del(&page_head->link); 434 list_del(&page_head->link);
158 page_head->page_hpa = page_hpa; 435 page_head->page_hpa = page_hpa;
159 list_add(&page_head->link, &vcpu->free_pages); 436 list_add(&page_head->link, &vcpu->free_pages);
437 ++vcpu->kvm->n_free_mmu_pages;
160} 438}
161 439
162static int is_empty_shadow_page(hpa_t page_hpa) 440static unsigned kvm_page_table_hashfn(gfn_t gfn)
163{ 441{
164 u32 *pos; 442 return gfn;
165 u32 *end;
166 for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32);
167 pos != end; pos++)
168 if (*pos != 0)
169 return 0;
170 return 1;
171} 443}
172 444
173static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte) 445static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
446 u64 *parent_pte)
174{ 447{
175 struct kvm_mmu_page *page; 448 struct kvm_mmu_page *page;
176 449
177 if (list_empty(&vcpu->free_pages)) 450 if (list_empty(&vcpu->free_pages))
178 return INVALID_PAGE; 451 return NULL;
179 452
180 page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); 453 page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
181 list_del(&page->link); 454 list_del(&page->link);
@@ -183,8 +456,239 @@ static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte)
183 ASSERT(is_empty_shadow_page(page->page_hpa)); 456 ASSERT(is_empty_shadow_page(page->page_hpa));
184 page->slot_bitmap = 0; 457 page->slot_bitmap = 0;
185 page->global = 1; 458 page->global = 1;
459 page->multimapped = 0;
186 page->parent_pte = parent_pte; 460 page->parent_pte = parent_pte;
187 return page->page_hpa; 461 --vcpu->kvm->n_free_mmu_pages;
462 return page;
463}
464
465static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
466 struct kvm_mmu_page *page, u64 *parent_pte)
467{
468 struct kvm_pte_chain *pte_chain;
469 struct hlist_node *node;
470 int i;
471
472 if (!parent_pte)
473 return;
474 if (!page->multimapped) {
475 u64 *old = page->parent_pte;
476
477 if (!old) {
478 page->parent_pte = parent_pte;
479 return;
480 }
481 page->multimapped = 1;
482 pte_chain = mmu_alloc_pte_chain(vcpu);
483 INIT_HLIST_HEAD(&page->parent_ptes);
484 hlist_add_head(&pte_chain->link, &page->parent_ptes);
485 pte_chain->parent_ptes[0] = old;
486 }
487 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
488 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
489 continue;
490 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
491 if (!pte_chain->parent_ptes[i]) {
492 pte_chain->parent_ptes[i] = parent_pte;
493 return;
494 }
495 }
496 pte_chain = mmu_alloc_pte_chain(vcpu);
497 BUG_ON(!pte_chain);
498 hlist_add_head(&pte_chain->link, &page->parent_ptes);
499 pte_chain->parent_ptes[0] = parent_pte;
500}
501
502static void mmu_page_remove_parent_pte(struct kvm_vcpu *vcpu,
503 struct kvm_mmu_page *page,
504 u64 *parent_pte)
505{
506 struct kvm_pte_chain *pte_chain;
507 struct hlist_node *node;
508 int i;
509
510 if (!page->multimapped) {
511 BUG_ON(page->parent_pte != parent_pte);
512 page->parent_pte = NULL;
513 return;
514 }
515 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
516 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
517 if (!pte_chain->parent_ptes[i])
518 break;
519 if (pte_chain->parent_ptes[i] != parent_pte)
520 continue;
521 while (i + 1 < NR_PTE_CHAIN_ENTRIES
522 && pte_chain->parent_ptes[i + 1]) {
523 pte_chain->parent_ptes[i]
524 = pte_chain->parent_ptes[i + 1];
525 ++i;
526 }
527 pte_chain->parent_ptes[i] = NULL;
528 if (i == 0) {
529 hlist_del(&pte_chain->link);
530 mmu_free_pte_chain(vcpu, pte_chain);
531 if (hlist_empty(&page->parent_ptes)) {
532 page->multimapped = 0;
533 page->parent_pte = NULL;
534 }
535 }
536 return;
537 }
538 BUG();
539}
540
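mmu_page_add_parent_pte()/mmu_page_remove_parent_pte() above apply the same one-inline-then-a-chain idea to a shadow page's parents: the page keeps a bare parent_pte pointer until a second parent appears, at which point multimapped is set and the parents move into kvm_pte_chain blocks of NR_PTE_CHAIN_ENTRIES slots. A simplified sketch of the state machine (a singly linked list stands in for the hlist, and allocation failures are ignored):

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

#define NR_CHAIN_ENTRIES 5

struct pte_chain {
	uint64_t *parent_ptes[NR_CHAIN_ENTRIES];
	struct pte_chain *next;
};

struct shadow_page {
	int multimapped;
	union {
		uint64_t *parent_pte;       /* !multimapped: zero or one parent */
		struct pte_chain *parents;  /* multimapped: chain of parents    */
	};
};

static void add_parent(struct shadow_page *sp, uint64_t *parent_pte)
{
	struct pte_chain *pc;
	int i;

	if (!sp->multimapped) {
		if (!sp->parent_pte) {          /* 0 -> 1 */
			sp->parent_pte = parent_pte;
			return;
		}
		pc = calloc(1, sizeof(*pc));    /* 1 -> many */
		pc->parent_ptes[0] = sp->parent_pte;
		sp->multimapped = 1;
		sp->parents = pc;
	}
	for (pc = sp->parents; pc; pc = pc->next)
		for (i = 0; i < NR_CHAIN_ENTRIES; ++i)
			if (!pc->parent_ptes[i]) {
				pc->parent_ptes[i] = parent_pte;
				return;
			}
	pc = calloc(1, sizeof(*pc));            /* all blocks full: new block */
	pc->parent_ptes[0] = parent_pte;
	pc->next = sp->parents;
	sp->parents = pc;
}

int main(void)
{
	struct shadow_page sp = { 0 };
	uint64_t ptes[7];

	for (int i = 0; i < 7; ++i)
		add_parent(&sp, &ptes[i]);
	printf("multimapped = %d\n", sp.multimapped);   /* prints 1 */
	return 0;
}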
541static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
542 gfn_t gfn)
543{
544 unsigned index;
545 struct hlist_head *bucket;
546 struct kvm_mmu_page *page;
547 struct hlist_node *node;
548
549 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
550 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
551 bucket = &vcpu->kvm->mmu_page_hash[index];
552 hlist_for_each_entry(page, node, bucket, hash_link)
553 if (page->gfn == gfn && !page->role.metaphysical) {
554 pgprintk("%s: found role %x\n",
555 __FUNCTION__, page->role.word);
556 return page;
557 }
558 return NULL;
559}
560
561static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
562 gfn_t gfn,
563 gva_t gaddr,
564 unsigned level,
565 int metaphysical,
566 u64 *parent_pte)
567{
568 union kvm_mmu_page_role role;
569 unsigned index;
570 unsigned quadrant;
571 struct hlist_head *bucket;
572 struct kvm_mmu_page *page;
573 struct hlist_node *node;
574
575 role.word = 0;
576 role.glevels = vcpu->mmu.root_level;
577 role.level = level;
578 role.metaphysical = metaphysical;
579 if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
580 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
581 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
582 role.quadrant = quadrant;
583 }
584 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
585 gfn, role.word);
586 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
587 bucket = &vcpu->kvm->mmu_page_hash[index];
588 hlist_for_each_entry(page, node, bucket, hash_link)
589 if (page->gfn == gfn && page->role.word == role.word) {
590 mmu_page_add_parent_pte(vcpu, page, parent_pte);
591 pgprintk("%s: found\n", __FUNCTION__);
592 return page;
593 }
594 page = kvm_mmu_alloc_page(vcpu, parent_pte);
595 if (!page)
596 return page;
597 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
598 page->gfn = gfn;
599 page->role = role;
600 hlist_add_head(&page->hash_link, bucket);
601 if (!metaphysical)
602 rmap_write_protect(vcpu, gfn);
603 return page;
604}
605
606static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
607 struct kvm_mmu_page *page)
608{
609 unsigned i;
610 u64 *pt;
611 u64 ent;
612
613 pt = __va(page->page_hpa);
614
615 if (page->role.level == PT_PAGE_TABLE_LEVEL) {
616 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
617 if (pt[i] & PT_PRESENT_MASK)
618 rmap_remove(vcpu, &pt[i]);
619 pt[i] = 0;
620 }
621 kvm_arch_ops->tlb_flush(vcpu);
622 return;
623 }
624
625 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
626 ent = pt[i];
627
628 pt[i] = 0;
629 if (!(ent & PT_PRESENT_MASK))
630 continue;
631 ent &= PT64_BASE_ADDR_MASK;
632 mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]);
633 }
634}
635
636static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
637 struct kvm_mmu_page *page,
638 u64 *parent_pte)
639{
640 mmu_page_remove_parent_pte(vcpu, page, parent_pte);
641}
642
643static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
644 struct kvm_mmu_page *page)
645{
646 u64 *parent_pte;
647
648 while (page->multimapped || page->parent_pte) {
649 if (!page->multimapped)
650 parent_pte = page->parent_pte;
651 else {
652 struct kvm_pte_chain *chain;
653
654 chain = container_of(page->parent_ptes.first,
655 struct kvm_pte_chain, link);
656 parent_pte = chain->parent_ptes[0];
657 }
658 BUG_ON(!parent_pte);
659 kvm_mmu_put_page(vcpu, page, parent_pte);
660 *parent_pte = 0;
661 }
662 kvm_mmu_page_unlink_children(vcpu, page);
663 if (!page->root_count) {
664 hlist_del(&page->hash_link);
665 kvm_mmu_free_page(vcpu, page->page_hpa);
666 } else {
667 list_del(&page->link);
668 list_add(&page->link, &vcpu->kvm->active_mmu_pages);
669 }
670}
671
672static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
673{
674 unsigned index;
675 struct hlist_head *bucket;
676 struct kvm_mmu_page *page;
677 struct hlist_node *node, *n;
678 int r;
679
680 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
681 r = 0;
682 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
683 bucket = &vcpu->kvm->mmu_page_hash[index];
684 hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
685 if (page->gfn == gfn && !page->role.metaphysical) {
686 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
687 page->role.word);
688 kvm_mmu_zap_page(vcpu, page);
689 r = 1;
690 }
691 return r;
188} 692}
189 693
190static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) 694static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
@@ -225,35 +729,6 @@ hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
225 return gpa_to_hpa(vcpu, gpa); 729 return gpa_to_hpa(vcpu, gpa);
226} 730}
227 731
228
229static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
230 int level)
231{
232 ASSERT(vcpu);
233 ASSERT(VALID_PAGE(page_hpa));
234 ASSERT(level <= PT64_ROOT_LEVEL && level > 0);
235
236 if (level == 1)
237 memset(__va(page_hpa), 0, PAGE_SIZE);
238 else {
239 u64 *pos;
240 u64 *end;
241
242 for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
243 pos != end; pos++) {
244 u64 current_ent = *pos;
245
246 *pos = 0;
247 if (is_present_pte(current_ent))
248 release_pt_page_64(vcpu,
249 current_ent &
250 PT64_BASE_ADDR_MASK,
251 level - 1);
252 }
253 }
254 kvm_mmu_free_page(vcpu, page_hpa);
255}
256
257static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 732static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
258{ 733{
259} 734}
@@ -266,52 +741,109 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
266 for (; ; level--) { 741 for (; ; level--) {
267 u32 index = PT64_INDEX(v, level); 742 u32 index = PT64_INDEX(v, level);
268 u64 *table; 743 u64 *table;
744 u64 pte;
269 745
270 ASSERT(VALID_PAGE(table_addr)); 746 ASSERT(VALID_PAGE(table_addr));
271 table = __va(table_addr); 747 table = __va(table_addr);
272 748
273 if (level == 1) { 749 if (level == 1) {
750 pte = table[index];
751 if (is_present_pte(pte) && is_writeble_pte(pte))
752 return 0;
274 mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); 753 mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
275 page_header_update_slot(vcpu->kvm, table, v); 754 page_header_update_slot(vcpu->kvm, table, v);
276 table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | 755 table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
277 PT_USER_MASK; 756 PT_USER_MASK;
757 rmap_add(vcpu, &table[index]);
278 return 0; 758 return 0;
279 } 759 }
280 760
281 if (table[index] == 0) { 761 if (table[index] == 0) {
282 hpa_t new_table = kvm_mmu_alloc_page(vcpu, 762 struct kvm_mmu_page *new_table;
283 &table[index]); 763 gfn_t pseudo_gfn;
284 764
285 if (!VALID_PAGE(new_table)) { 765 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
766 >> PAGE_SHIFT;
767 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
768 v, level - 1,
769 1, &table[index]);
770 if (!new_table) {
286 pgprintk("nonpaging_map: ENOMEM\n"); 771 pgprintk("nonpaging_map: ENOMEM\n");
287 return -ENOMEM; 772 return -ENOMEM;
288 } 773 }
289 774
290 if (level == PT32E_ROOT_LEVEL) 775 table[index] = new_table->page_hpa | PT_PRESENT_MASK
291 table[index] = new_table | PT_PRESENT_MASK; 776 | PT_WRITABLE_MASK | PT_USER_MASK;
292 else
293 table[index] = new_table | PT_PRESENT_MASK |
294 PT_WRITABLE_MASK | PT_USER_MASK;
295 } 777 }
296 table_addr = table[index] & PT64_BASE_ADDR_MASK; 778 table_addr = table[index] & PT64_BASE_ADDR_MASK;
297 } 779 }
298} 780}
299 781
300static void nonpaging_flush(struct kvm_vcpu *vcpu) 782static void mmu_free_roots(struct kvm_vcpu *vcpu)
301{ 783{
302 hpa_t root = vcpu->mmu.root_hpa; 784 int i;
785 struct kvm_mmu_page *page;
303 786
304 ++kvm_stat.tlb_flush; 787#ifdef CONFIG_X86_64
305 pgprintk("nonpaging_flush\n"); 788 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
306 ASSERT(VALID_PAGE(root)); 789 hpa_t root = vcpu->mmu.root_hpa;
307 release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level); 790
308 root = kvm_mmu_alloc_page(vcpu, NULL); 791 ASSERT(VALID_PAGE(root));
309 ASSERT(VALID_PAGE(root)); 792 page = page_header(root);
310 vcpu->mmu.root_hpa = root; 793 --page->root_count;
311 if (is_paging(vcpu)) 794 vcpu->mmu.root_hpa = INVALID_PAGE;
312 root |= (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)); 795 return;
313 kvm_arch_ops->set_cr3(vcpu, root); 796 }
314 kvm_arch_ops->tlb_flush(vcpu); 797#endif
798 for (i = 0; i < 4; ++i) {
799 hpa_t root = vcpu->mmu.pae_root[i];
800
801 ASSERT(VALID_PAGE(root));
802 root &= PT64_BASE_ADDR_MASK;
803 page = page_header(root);
804 --page->root_count;
805 vcpu->mmu.pae_root[i] = INVALID_PAGE;
806 }
807 vcpu->mmu.root_hpa = INVALID_PAGE;
808}
809
810static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
811{
812 int i;
813 gfn_t root_gfn;
814 struct kvm_mmu_page *page;
815
816 root_gfn = vcpu->cr3 >> PAGE_SHIFT;
817
818#ifdef CONFIG_X86_64
819 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
820 hpa_t root = vcpu->mmu.root_hpa;
821
822 ASSERT(!VALID_PAGE(root));
823 page = kvm_mmu_get_page(vcpu, root_gfn, 0,
824 PT64_ROOT_LEVEL, 0, NULL);
825 root = page->page_hpa;
826 ++page->root_count;
827 vcpu->mmu.root_hpa = root;
828 return;
829 }
830#endif
831 for (i = 0; i < 4; ++i) {
832 hpa_t root = vcpu->mmu.pae_root[i];
833
834 ASSERT(!VALID_PAGE(root));
835 if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL)
836 root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
837 else if (vcpu->mmu.root_level == 0)
838 root_gfn = 0;
839 page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
840 PT32_ROOT_LEVEL, !is_paging(vcpu),
841 NULL);
842 root = page->page_hpa;
843 ++page->root_count;
844 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
845 }
846 vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
315} 847}
316 848
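mmu_alloc_roots() above is where the cached pdptrs and the pae_root scratch table pay off: with a 64-bit shadow the root is a single shadow page keyed on cr3's frame, but everything run under a PAE-format shadow gets its four top-level entries from pae_root[], each backed by its own shadow page keyed on the matching PDPTE (or on cr3/frame 0 for 32-bit and non-paged guests, distinguished by quadrant or marked metaphysical). A small sketch of how the backing frame is chosen per entry, following the code above (the sample values are made up; root_level 0 means the guest is not paging):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PT32_ROOT_LEVEL  2
#define PT32E_ROOT_LEVEL 3

/* Pick the guest frame that backs pae_root[i], as mmu_alloc_roots() does. */
static uint64_t root_gfn_for(int i, int root_level,
			     uint64_t cr3, const uint64_t pdptrs[4])
{
	if (root_level == PT32E_ROOT_LEVEL)     /* PAE guest: one PDPTE each */
		return pdptrs[i] >> PAGE_SHIFT;
	if (root_level == 0)                    /* real mode / no paging     */
		return 0;
	return cr3 >> PAGE_SHIFT;               /* 32-bit guest: same frame,
						   told apart by quadrant    */
}

int main(void)
{
	uint64_t pdptrs[4] = { 0x1000 | 1, 0x2000 | 1, 0x3000 | 1, 0x4000 | 1 };

	for (int i = 0; i < 4; ++i)
		printf("pae_root[%d] <- gfn %llx\n", i,
		       (unsigned long long)root_gfn_for(i, PT32E_ROOT_LEVEL,
							0x5000, pdptrs));
	return 0;
}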
317static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 849static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -322,43 +854,29 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
322static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 854static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
323 u32 error_code) 855 u32 error_code)
324{ 856{
325 int ret;
326 gpa_t addr = gva; 857 gpa_t addr = gva;
858 hpa_t paddr;
859 int r;
860
861 r = mmu_topup_memory_caches(vcpu);
862 if (r)
863 return r;
327 864
328 ASSERT(vcpu); 865 ASSERT(vcpu);
329 ASSERT(VALID_PAGE(vcpu->mmu.root_hpa)); 866 ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
330 867
331 for (;;) {
332 hpa_t paddr;
333
334 paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
335 868
336 if (is_error_hpa(paddr)) 869 paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
337 return 1;
338 870
339 ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr); 871 if (is_error_hpa(paddr))
340 if (ret) { 872 return 1;
341 nonpaging_flush(vcpu);
342 continue;
343 }
344 break;
345 }
346 return ret;
347}
348 873
349static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr) 874 return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
350{
351} 875}
352 876
353static void nonpaging_free(struct kvm_vcpu *vcpu) 877static void nonpaging_free(struct kvm_vcpu *vcpu)
354{ 878{
355 hpa_t root; 879 mmu_free_roots(vcpu);
356
357 ASSERT(vcpu);
358 root = vcpu->mmu.root_hpa;
359 if (VALID_PAGE(root))
360 release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level);
361 vcpu->mmu.root_hpa = INVALID_PAGE;
362} 880}
363 881
364static int nonpaging_init_context(struct kvm_vcpu *vcpu) 882static int nonpaging_init_context(struct kvm_vcpu *vcpu)
@@ -367,40 +885,31 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
367 885
368 context->new_cr3 = nonpaging_new_cr3; 886 context->new_cr3 = nonpaging_new_cr3;
369 context->page_fault = nonpaging_page_fault; 887 context->page_fault = nonpaging_page_fault;
370 context->inval_page = nonpaging_inval_page;
371 context->gva_to_gpa = nonpaging_gva_to_gpa; 888 context->gva_to_gpa = nonpaging_gva_to_gpa;
372 context->free = nonpaging_free; 889 context->free = nonpaging_free;
373 context->root_level = PT32E_ROOT_LEVEL; 890 context->root_level = 0;
374 context->shadow_root_level = PT32E_ROOT_LEVEL; 891 context->shadow_root_level = PT32E_ROOT_LEVEL;
375 context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); 892 mmu_alloc_roots(vcpu);
376 ASSERT(VALID_PAGE(context->root_hpa)); 893 ASSERT(VALID_PAGE(context->root_hpa));
377 kvm_arch_ops->set_cr3(vcpu, context->root_hpa); 894 kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
378 return 0; 895 return 0;
379} 896}
380 897
381
382static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) 898static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
383{ 899{
384 struct kvm_mmu_page *page, *npage;
385
386 list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages,
387 link) {
388 if (page->global)
389 continue;
390
391 if (!page->parent_pte)
392 continue;
393
394 *page->parent_pte = 0;
395 release_pt_page_64(vcpu, page->page_hpa, 1);
396 }
397 ++kvm_stat.tlb_flush; 900 ++kvm_stat.tlb_flush;
398 kvm_arch_ops->tlb_flush(vcpu); 901 kvm_arch_ops->tlb_flush(vcpu);
399} 902}
400 903
401static void paging_new_cr3(struct kvm_vcpu *vcpu) 904static void paging_new_cr3(struct kvm_vcpu *vcpu)
402{ 905{
906 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
907 mmu_free_roots(vcpu);
908 if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
909 kvm_mmu_free_some_pages(vcpu);
910 mmu_alloc_roots(vcpu);
403 kvm_mmu_flush_tlb(vcpu); 911 kvm_mmu_flush_tlb(vcpu);
912 kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
404} 913}
405 914
406static void mark_pagetable_nonglobal(void *shadow_pte) 915static void mark_pagetable_nonglobal(void *shadow_pte)
@@ -412,7 +921,8 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu,
412 u64 *shadow_pte, 921 u64 *shadow_pte,
413 gpa_t gaddr, 922 gpa_t gaddr,
414 int dirty, 923 int dirty,
415 u64 access_bits) 924 u64 access_bits,
925 gfn_t gfn)
416{ 926{
417 hpa_t paddr; 927 hpa_t paddr;
418 928
@@ -420,13 +930,10 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu,
420 if (!dirty) 930 if (!dirty)
421 access_bits &= ~PT_WRITABLE_MASK; 931 access_bits &= ~PT_WRITABLE_MASK;
422 932
423 if (access_bits & PT_WRITABLE_MASK) 933 paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
424 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
425 934
426 *shadow_pte |= access_bits; 935 *shadow_pte |= access_bits;
427 936
428 paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
429
430 if (!(*shadow_pte & PT_GLOBAL_MASK)) 937 if (!(*shadow_pte & PT_GLOBAL_MASK))
431 mark_pagetable_nonglobal(shadow_pte); 938 mark_pagetable_nonglobal(shadow_pte);
432 939
@@ -434,10 +941,31 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu,
434 *shadow_pte |= gaddr; 941 *shadow_pte |= gaddr;
435 *shadow_pte |= PT_SHADOW_IO_MARK; 942 *shadow_pte |= PT_SHADOW_IO_MARK;
436 *shadow_pte &= ~PT_PRESENT_MASK; 943 *shadow_pte &= ~PT_PRESENT_MASK;
437 } else { 944 return;
438 *shadow_pte |= paddr; 945 }
439 page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); 946
947 *shadow_pte |= paddr;
948
949 if (access_bits & PT_WRITABLE_MASK) {
950 struct kvm_mmu_page *shadow;
951
952 shadow = kvm_mmu_lookup_page(vcpu, gfn);
953 if (shadow) {
954 pgprintk("%s: found shadow page for %lx, marking ro\n",
955 __FUNCTION__, gfn);
956 access_bits &= ~PT_WRITABLE_MASK;
957 if (is_writeble_pte(*shadow_pte)) {
958 *shadow_pte &= ~PT_WRITABLE_MASK;
959 kvm_arch_ops->tlb_flush(vcpu);
960 }
961 }
440 } 962 }
963
964 if (access_bits & PT_WRITABLE_MASK)
965 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
966
967 page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
968 rmap_add(vcpu, shadow_pte);
441} 969}
442 970
443static void inject_page_fault(struct kvm_vcpu *vcpu, 971static void inject_page_fault(struct kvm_vcpu *vcpu,
@@ -474,41 +1002,6 @@ static int may_access(u64 pte, int write, int user)
474 return 1; 1002 return 1;
475} 1003}
476 1004
477/*
478 * Remove a shadow pte.
479 */
480static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
481{
482 hpa_t page_addr = vcpu->mmu.root_hpa;
483 int level = vcpu->mmu.shadow_root_level;
484
485 ++kvm_stat.invlpg;
486
487 for (; ; level--) {
488 u32 index = PT64_INDEX(addr, level);
489 u64 *table = __va(page_addr);
490
491 if (level == PT_PAGE_TABLE_LEVEL ) {
492 table[index] = 0;
493 return;
494 }
495
496 if (!is_present_pte(table[index]))
497 return;
498
499 page_addr = table[index] & PT64_BASE_ADDR_MASK;
500
501 if (level == PT_DIRECTORY_LEVEL &&
502 (table[index] & PT_SHADOW_PS_MARK)) {
503 table[index] = 0;
504 release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL);
505
506 kvm_arch_ops->tlb_flush(vcpu);
507 return;
508 }
509 }
510}
511
512static void paging_free(struct kvm_vcpu *vcpu) 1005static void paging_free(struct kvm_vcpu *vcpu)
513{ 1006{
514 nonpaging_free(vcpu); 1007 nonpaging_free(vcpu);
@@ -522,37 +1015,40 @@ static void paging_free(struct kvm_vcpu *vcpu)
522#include "paging_tmpl.h" 1015#include "paging_tmpl.h"
523#undef PTTYPE 1016#undef PTTYPE
524 1017
525static int paging64_init_context(struct kvm_vcpu *vcpu) 1018static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
526{ 1019{
527 struct kvm_mmu *context = &vcpu->mmu; 1020 struct kvm_mmu *context = &vcpu->mmu;
528 1021
529 ASSERT(is_pae(vcpu)); 1022 ASSERT(is_pae(vcpu));
530 context->new_cr3 = paging_new_cr3; 1023 context->new_cr3 = paging_new_cr3;
531 context->page_fault = paging64_page_fault; 1024 context->page_fault = paging64_page_fault;
532 context->inval_page = paging_inval_page;
533 context->gva_to_gpa = paging64_gva_to_gpa; 1025 context->gva_to_gpa = paging64_gva_to_gpa;
534 context->free = paging_free; 1026 context->free = paging_free;
535 context->root_level = PT64_ROOT_LEVEL; 1027 context->root_level = level;
536 context->shadow_root_level = PT64_ROOT_LEVEL; 1028 context->shadow_root_level = level;
537 context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); 1029 mmu_alloc_roots(vcpu);
538 ASSERT(VALID_PAGE(context->root_hpa)); 1030 ASSERT(VALID_PAGE(context->root_hpa));
539 kvm_arch_ops->set_cr3(vcpu, context->root_hpa | 1031 kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
540 (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); 1032 (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
541 return 0; 1033 return 0;
542} 1034}
543 1035
1036static int paging64_init_context(struct kvm_vcpu *vcpu)
1037{
1038 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1039}
1040
544static int paging32_init_context(struct kvm_vcpu *vcpu) 1041static int paging32_init_context(struct kvm_vcpu *vcpu)
545{ 1042{
546 struct kvm_mmu *context = &vcpu->mmu; 1043 struct kvm_mmu *context = &vcpu->mmu;
547 1044
548 context->new_cr3 = paging_new_cr3; 1045 context->new_cr3 = paging_new_cr3;
549 context->page_fault = paging32_page_fault; 1046 context->page_fault = paging32_page_fault;
550 context->inval_page = paging_inval_page;
551 context->gva_to_gpa = paging32_gva_to_gpa; 1047 context->gva_to_gpa = paging32_gva_to_gpa;
552 context->free = paging_free; 1048 context->free = paging_free;
553 context->root_level = PT32_ROOT_LEVEL; 1049 context->root_level = PT32_ROOT_LEVEL;
554 context->shadow_root_level = PT32E_ROOT_LEVEL; 1050 context->shadow_root_level = PT32E_ROOT_LEVEL;
555 context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); 1051 mmu_alloc_roots(vcpu);
556 ASSERT(VALID_PAGE(context->root_hpa)); 1052 ASSERT(VALID_PAGE(context->root_hpa));
557 kvm_arch_ops->set_cr3(vcpu, context->root_hpa | 1053 kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
558 (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); 1054 (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
@@ -561,14 +1057,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
561 1057
562static int paging32E_init_context(struct kvm_vcpu *vcpu) 1058static int paging32E_init_context(struct kvm_vcpu *vcpu)
563{ 1059{
564 int ret; 1060 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
565
566 if ((ret = paging64_init_context(vcpu)))
567 return ret;
568
569 vcpu->mmu.root_level = PT32E_ROOT_LEVEL;
570 vcpu->mmu.shadow_root_level = PT32E_ROOT_LEVEL;
571 return 0;
572} 1061}
573 1062
574static int init_kvm_mmu(struct kvm_vcpu *vcpu) 1063static int init_kvm_mmu(struct kvm_vcpu *vcpu)
@@ -597,41 +1086,161 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
597 1086
598int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 1087int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
599{ 1088{
1089 int r;
1090
600 destroy_kvm_mmu(vcpu); 1091 destroy_kvm_mmu(vcpu);
601 return init_kvm_mmu(vcpu); 1092 r = init_kvm_mmu(vcpu);
1093 if (r < 0)
1094 goto out;
1095 r = mmu_topup_memory_caches(vcpu);
1096out:
1097 return r;
602} 1098}
603 1099
604static void free_mmu_pages(struct kvm_vcpu *vcpu) 1100void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
605{ 1101{
606 while (!list_empty(&vcpu->free_pages)) { 1102 gfn_t gfn = gpa >> PAGE_SHIFT;
1103 struct kvm_mmu_page *page;
1104 struct kvm_mmu_page *child;
1105 struct hlist_node *node, *n;
1106 struct hlist_head *bucket;
1107 unsigned index;
1108 u64 *spte;
1109 u64 pte;
1110 unsigned offset = offset_in_page(gpa);
1111 unsigned pte_size;
1112 unsigned page_offset;
1113 unsigned misaligned;
1114 int level;
1115 int flooded = 0;
1116
1117 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1118 if (gfn == vcpu->last_pt_write_gfn) {
1119 ++vcpu->last_pt_write_count;
1120 if (vcpu->last_pt_write_count >= 3)
1121 flooded = 1;
1122 } else {
1123 vcpu->last_pt_write_gfn = gfn;
1124 vcpu->last_pt_write_count = 1;
1125 }
1126 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1127 bucket = &vcpu->kvm->mmu_page_hash[index];
1128 hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
1129 if (page->gfn != gfn || page->role.metaphysical)
1130 continue;
1131 pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1132 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1133 if (misaligned || flooded) {
1134 /*
1135 * Misaligned accesses are too much trouble to fix
1136 * up; also, they usually indicate a page is not used
1137 * as a page table.
1138 *
1139 * If we're seeing too many writes to a page,
1140 * it may no longer be a page table, or we may be
1141 * forking, in which case it is better to unmap the
1142 * page.
1143 */
1144 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1145 gpa, bytes, page->role.word);
1146 kvm_mmu_zap_page(vcpu, page);
1147 continue;
1148 }
1149 page_offset = offset;
1150 level = page->role.level;
1151 if (page->role.glevels == PT32_ROOT_LEVEL) {
1152 page_offset <<= 1; /* 32->64 */
1153 page_offset &= ~PAGE_MASK;
1154 }
1155 spte = __va(page->page_hpa);
1156 spte += page_offset / sizeof(*spte);
1157 pte = *spte;
1158 if (is_present_pte(pte)) {
1159 if (level == PT_PAGE_TABLE_LEVEL)
1160 rmap_remove(vcpu, spte);
1161 else {
1162 child = page_header(pte & PT64_BASE_ADDR_MASK);
1163 mmu_page_remove_parent_pte(vcpu, child, spte);
1164 }
1165 }
1166 *spte = 0;
1167 }
1168}
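
The misalignment test and the 32->64 offset doubling in kvm_mmu_pre_write() above are terse; the same arithmetic as a standalone sketch, with made-up values (the pte size, offsets and 4K page size here are illustrative assumptions, not taken from the patch):

#include <stdio.h>

int main(void)
{
	/* A 32-bit guest: its page-table entries are 4 bytes each. */
	unsigned pte_size = 4;
	unsigned offset   = 0x1e;	/* 4-byte guest write at 0x1e...     */
	unsigned bytes    = 4;		/* ...straddles two 4-byte pte slots */
	unsigned misaligned, page_offset;

	/* Non-zero iff the write does not stay inside a single guest pte. */
	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
	printf("misaligned = %#x\n", misaligned);	/* prints 0x3c */

	/* Each 4-byte guest pte is shadowed by an 8-byte shadow pte, so the
	 * byte offset is doubled and then folded back into one 4K page. */
	offset = 0x7f8;
	page_offset = offset << 1;	/* 32->64 */
	page_offset &= 4096 - 1;	/* ~PAGE_MASK, assuming 4K pages */
	printf("shadow offset = %#x\n", page_offset);	/* prints 0xff0 */
	return 0;
}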
1169
1170void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
1171{
1172}
1173
1174int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1175{
1176 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
1177
1178 return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
1179}
1180
1181void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1182{
1183 while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
607 struct kvm_mmu_page *page; 1184 struct kvm_mmu_page *page;
608 1185
1186 page = container_of(vcpu->kvm->active_mmu_pages.prev,
1187 struct kvm_mmu_page, link);
1188 kvm_mmu_zap_page(vcpu, page);
1189 }
1190}
1191EXPORT_SYMBOL_GPL(kvm_mmu_free_some_pages);
1192
1193static void free_mmu_pages(struct kvm_vcpu *vcpu)
1194{
1195 struct kvm_mmu_page *page;
1196
1197 while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
1198 page = container_of(vcpu->kvm->active_mmu_pages.next,
1199 struct kvm_mmu_page, link);
1200 kvm_mmu_zap_page(vcpu, page);
1201 }
1202 while (!list_empty(&vcpu->free_pages)) {
609 page = list_entry(vcpu->free_pages.next, 1203 page = list_entry(vcpu->free_pages.next,
610 struct kvm_mmu_page, link); 1204 struct kvm_mmu_page, link);
611 list_del(&page->link); 1205 list_del(&page->link);
612 __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT)); 1206 __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
613 page->page_hpa = INVALID_PAGE; 1207 page->page_hpa = INVALID_PAGE;
614 } 1208 }
1209 free_page((unsigned long)vcpu->mmu.pae_root);
615} 1210}
616 1211
617static int alloc_mmu_pages(struct kvm_vcpu *vcpu) 1212static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
618{ 1213{
1214 struct page *page;
619 int i; 1215 int i;
620 1216
621 ASSERT(vcpu); 1217 ASSERT(vcpu);
622 1218
623 for (i = 0; i < KVM_NUM_MMU_PAGES; i++) { 1219 for (i = 0; i < KVM_NUM_MMU_PAGES; i++) {
624 struct page *page;
625 struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i]; 1220 struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];
626 1221
627 INIT_LIST_HEAD(&page_header->link); 1222 INIT_LIST_HEAD(&page_header->link);
628 if ((page = alloc_page(GFP_KVM_MMU)) == NULL) 1223 if ((page = alloc_page(GFP_KERNEL)) == NULL)
629 goto error_1; 1224 goto error_1;
630 page->private = (unsigned long)page_header; 1225 page->private = (unsigned long)page_header;
631 page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT; 1226 page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
632 memset(__va(page_header->page_hpa), 0, PAGE_SIZE); 1227 memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
633 list_add(&page_header->link, &vcpu->free_pages); 1228 list_add(&page_header->link, &vcpu->free_pages);
1229 ++vcpu->kvm->n_free_mmu_pages;
634 } 1230 }
1231
1232 /*
1233 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1234 * Therefore we need to allocate shadow page tables in the first
1235 * 4GB of memory, which happens to fit the DMA32 zone.
1236 */
1237 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1238 if (!page)
1239 goto error_1;
1240 vcpu->mmu.pae_root = page_address(page);
1241 for (i = 0; i < 4; ++i)
1242 vcpu->mmu.pae_root[i] = INVALID_PAGE;
1243
635 return 0; 1244 return 0;
636 1245
637error_1: 1246error_1:
@@ -663,10 +1272,12 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
663 1272
664 destroy_kvm_mmu(vcpu); 1273 destroy_kvm_mmu(vcpu);
665 free_mmu_pages(vcpu); 1274 free_mmu_pages(vcpu);
1275 mmu_free_memory_caches(vcpu);
666} 1276}
667 1277
668void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 1278void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot)
669{ 1279{
1280 struct kvm *kvm = vcpu->kvm;
670 struct kvm_mmu_page *page; 1281 struct kvm_mmu_page *page;
671 1282
672 list_for_each_entry(page, &kvm->active_mmu_pages, link) { 1283 list_for_each_entry(page, &kvm->active_mmu_pages, link) {
@@ -679,8 +1290,169 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
679 pt = __va(page->page_hpa); 1290 pt = __va(page->page_hpa);
680 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 1291 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
681 /* avoid RMW */ 1292 /* avoid RMW */
682 if (pt[i] & PT_WRITABLE_MASK) 1293 if (pt[i] & PT_WRITABLE_MASK) {
1294 rmap_remove(vcpu, &pt[i]);
683 pt[i] &= ~PT_WRITABLE_MASK; 1295 pt[i] &= ~PT_WRITABLE_MASK;
1296 }
1297 }
1298}
1299
1300#ifdef AUDIT
1301
1302static const char *audit_msg;
1303
1304static gva_t canonicalize(gva_t gva)
1305{
1306#ifdef CONFIG_X86_64
1307 gva = (long long)(gva << 16) >> 16;
1308#endif
1309 return gva;
1310}
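
A quick check of the sign extension above: x86_64 virtual addresses carry 48 significant bits, so canonicalize() must copy bit 47 into bits 48-63. With an illustrative value (64-bit build assumed):

#include <stdio.h>

int main(void)
{
	unsigned long gva = 0x0000800000000000ul;	/* bit 47 set         */
	gva = (long long)(gva << 16) >> 16;		/* as canonicalize()  */
	printf("%#lx\n", gva);				/* 0xffff800000000000 */
	return 0;
}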
684 1311
1312static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1313 gva_t va, int level)
1314{
1315 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1316 int i;
1317 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1318
1319 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1320 u64 ent = pt[i];
1321
1322 if (!(ent & PT_PRESENT_MASK))
1323 continue;
1324
1325 va = canonicalize(va);
1326 if (level > 1)
1327 audit_mappings_page(vcpu, ent, va, level - 1);
1328 else {
1329 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
1330 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
1331
1332 if ((ent & PT_PRESENT_MASK)
1333 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1334 printk(KERN_ERR "audit error: (%s) levels %d"
1335 " gva %lx gpa %llx hpa %llx ent %llx\n",
1336 audit_msg, vcpu->mmu.root_level,
1337 va, gpa, hpa, ent);
1338 }
685 } 1339 }
686} 1340}
1341
1342static void audit_mappings(struct kvm_vcpu *vcpu)
1343{
1344 int i;
1345
1346 if (vcpu->mmu.root_level == 4)
1347 audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
1348 else
1349 for (i = 0; i < 4; ++i)
1350 if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
1351 audit_mappings_page(vcpu,
1352 vcpu->mmu.pae_root[i],
1353 i << 30,
1354 2);
1355}
1356
1357static int count_rmaps(struct kvm_vcpu *vcpu)
1358{
1359 int nmaps = 0;
1360 int i, j, k;
1361
1362 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1363 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1364 struct kvm_rmap_desc *d;
1365
1366 for (j = 0; j < m->npages; ++j) {
1367 struct page *page = m->phys_mem[j];
1368
1369 if (!page->private)
1370 continue;
1371 if (!(page->private & 1)) {
1372 ++nmaps;
1373 continue;
1374 }
1375 d = (struct kvm_rmap_desc *)(page->private & ~1ul);
1376 while (d) {
1377 for (k = 0; k < RMAP_EXT; ++k)
1378 if (d->shadow_ptes[k])
1379 ++nmaps;
1380 else
1381 break;
1382 d = d->more;
1383 }
1384 }
1385 }
1386 return nmaps;
1387}
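
count_rmaps() above decodes the reverse-mapping scheme this series stores in page->private: 0 means no reverse mapping recorded, a pointer with bit 0 clear is the single mapping spte itself, and a pointer tagged with bit 0 leads to a chain of kvm_rmap_desc blocks. A user-space model of that decoding (the RMAP_EXT value is assumed for illustration):

#include <stdio.h>

#define RMAP_EXT 4			/* assumed size, illustration only */

struct kvm_rmap_desc {
	unsigned long long *shadow_ptes[RMAP_EXT];
	struct kvm_rmap_desc *more;
};

/* 0 = unmapped, low bit clear = single spte pointer, low bit set = chain. */
static int count_mappings(unsigned long priv)
{
	struct kvm_rmap_desc *d;
	int n = 0, k;

	if (!priv)
		return 0;
	if (!(priv & 1))
		return 1;
	for (d = (struct kvm_rmap_desc *)(priv & ~1ul); d; d = d->more)
		for (k = 0; k < RMAP_EXT && d->shadow_ptes[k]; ++k)
			++n;
	return n;
}

int main(void)
{
	unsigned long long spte = 0;
	struct kvm_rmap_desc d = { { &spte, &spte, 0, 0 }, 0 };

	printf("%d %d %d\n", count_mappings(0),
	       count_mappings((unsigned long)&spte),
	       count_mappings((unsigned long)&d | 1));	/* prints 0 1 2 */
	return 0;
}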
1388
1389static int count_writable_mappings(struct kvm_vcpu *vcpu)
1390{
1391 int nmaps = 0;
1392 struct kvm_mmu_page *page;
1393 int i;
1394
1395 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1396 u64 *pt = __va(page->page_hpa);
1397
1398 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1399 continue;
1400
1401 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1402 u64 ent = pt[i];
1403
1404 if (!(ent & PT_PRESENT_MASK))
1405 continue;
1406 if (!(ent & PT_WRITABLE_MASK))
1407 continue;
1408 ++nmaps;
1409 }
1410 }
1411 return nmaps;
1412}
1413
1414static void audit_rmap(struct kvm_vcpu *vcpu)
1415{
1416 int n_rmap = count_rmaps(vcpu);
1417 int n_actual = count_writable_mappings(vcpu);
1418
1419 if (n_rmap != n_actual)
1420 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1421 __FUNCTION__, audit_msg, n_rmap, n_actual);
1422}
1423
1424static void audit_write_protection(struct kvm_vcpu *vcpu)
1425{
1426 struct kvm_mmu_page *page;
1427
1428 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1429 hfn_t hfn;
1430 struct page *pg;
1431
1432 if (page->role.metaphysical)
1433 continue;
1434
1435 hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
1436 >> PAGE_SHIFT;
1437 pg = pfn_to_page(hfn);
1438 if (pg->private)
1439 printk(KERN_ERR "%s: (%s) shadow page has writable"
1440 " mappings: gfn %lx role %x\n",
1441 __FUNCTION__, audit_msg, page->gfn,
1442 page->role.word);
1443 }
1444}
1445
1446static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1447{
1448 int olddbg = dbg;
1449
1450 dbg = 0;
1451 audit_msg = msg;
1452 audit_rmap(vcpu);
1453 audit_write_protection(vcpu);
1454 audit_mappings(vcpu);
1455 dbg = olddbg;
1456}
1457
1458#endif
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
index 09bb9b4ed12d..2dbf4307ed9e 100644
--- a/drivers/kvm/paging_tmpl.h
+++ b/drivers/kvm/paging_tmpl.h
@@ -32,6 +32,11 @@
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) 33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK 34 #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
35 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4
37 #else
38 #define PT_MAX_FULL_LEVELS 2
39 #endif
35#elif PTTYPE == 32 40#elif PTTYPE == 32
36 #define pt_element_t u32 41 #define pt_element_t u32
37 #define guest_walker guest_walker32 42 #define guest_walker guest_walker32
@@ -42,6 +47,7 @@
42 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 47 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
43 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) 48 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
44 #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK 49 #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
50 #define PT_MAX_FULL_LEVELS 2
45#else 51#else
46 #error Invalid PTTYPE value 52 #error Invalid PTTYPE value
47#endif 53#endif
@@ -52,93 +58,126 @@
52 */ 58 */
53struct guest_walker { 59struct guest_walker {
54 int level; 60 int level;
61 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
55 pt_element_t *table; 62 pt_element_t *table;
63 pt_element_t *ptep;
56 pt_element_t inherited_ar; 64 pt_element_t inherited_ar;
65 gfn_t gfn;
57}; 66};
58 67
59static void FNAME(init_walker)(struct guest_walker *walker, 68/*
60 struct kvm_vcpu *vcpu) 69 * Fetch a guest pte for a guest virtual address
70 */
71static void FNAME(walk_addr)(struct guest_walker *walker,
72 struct kvm_vcpu *vcpu, gva_t addr)
61{ 73{
62 hpa_t hpa; 74 hpa_t hpa;
63 struct kvm_memory_slot *slot; 75 struct kvm_memory_slot *slot;
76 pt_element_t *ptep;
77 pt_element_t root;
78 gfn_t table_gfn;
64 79
80 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
65 walker->level = vcpu->mmu.root_level; 81 walker->level = vcpu->mmu.root_level;
66 slot = gfn_to_memslot(vcpu->kvm, 82 walker->table = NULL;
67 (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); 83 root = vcpu->cr3;
68 hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK); 84#if PTTYPE == 64
85 if (!is_long_mode(vcpu)) {
86 walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
87 root = *walker->ptep;
88 if (!(root & PT_PRESENT_MASK))
89 return;
90 --walker->level;
91 }
92#endif
93 table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
94 walker->table_gfn[walker->level - 1] = table_gfn;
95 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
96 walker->level - 1, table_gfn);
97 slot = gfn_to_memslot(vcpu->kvm, table_gfn);
98 hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
69 walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); 99 walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0);
70 100
71 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 101 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
72 (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0); 102 (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0);
73 103
74 walker->table = (pt_element_t *)( (unsigned long)walker->table |
75 (unsigned long)(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) );
76 walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; 104 walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
105
106 for (;;) {
107 int index = PT_INDEX(addr, walker->level);
108 hpa_t paddr;
109
110 ptep = &walker->table[index];
111 ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
112 ((unsigned long)ptep & PAGE_MASK));
113
114 if (is_present_pte(*ptep) && !(*ptep & PT_ACCESSED_MASK))
115 *ptep |= PT_ACCESSED_MASK;
116
117 if (!is_present_pte(*ptep))
118 break;
119
120 if (walker->level == PT_PAGE_TABLE_LEVEL) {
121 walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
122 >> PAGE_SHIFT;
123 break;
124 }
125
126 if (walker->level == PT_DIRECTORY_LEVEL
127 && (*ptep & PT_PAGE_SIZE_MASK)
128 && (PTTYPE == 64 || is_pse(vcpu))) {
129 walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
130 >> PAGE_SHIFT;
131 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
132 break;
133 }
134
135 if (walker->level != 3 || is_long_mode(vcpu))
136 walker->inherited_ar &= walker->table[index];
137 table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
138 paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK);
139 kunmap_atomic(walker->table, KM_USER0);
140 walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
141 KM_USER0);
142 --walker->level;
143 walker->table_gfn[walker->level - 1 ] = table_gfn;
144 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
145 walker->level - 1, table_gfn);
146 }
147 walker->ptep = ptep;
148 pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
77} 149}
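
The large-page branch of walk_addr() above computes the 4K frame backing the faulting address as the large page's base frame plus the pte-level index of the address. With illustrative numbers and local stand-ins for the 32-bit non-PAE constants (these are not the driver's definitions):

#include <stdio.h>

#define PAGE_SHIFT		12
#define PT_DIR_BASE_ADDR_MASK	0xffc00000u	/* 4MB-aligned base */
#define PT_INDEX(addr)		(((addr) >> PAGE_SHIFT) & 0x3ff)

int main(void)
{
	unsigned int pde  = 0x00c000e7;	/* guest PDE: 4MB page at 0x00c00000 */
	unsigned int addr = 0x00c56123;	/* faulting guest virtual address    */
	unsigned int gfn;

	gfn  = (pde & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;	/* 0xc00  */
	gfn += PT_INDEX(addr);					/* + 0x56 */
	printf("gfn = %#x\n", gfn);				/* 0xc56  */
	return 0;
}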
78 150
79static void FNAME(release_walker)(struct guest_walker *walker) 151static void FNAME(release_walker)(struct guest_walker *walker)
80{ 152{
81 kunmap_atomic(walker->table, KM_USER0); 153 if (walker->table)
154 kunmap_atomic(walker->table, KM_USER0);
82} 155}
83 156
84static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, 157static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte,
85 u64 *shadow_pte, u64 access_bits) 158 u64 *shadow_pte, u64 access_bits, gfn_t gfn)
86{ 159{
87 ASSERT(*shadow_pte == 0); 160 ASSERT(*shadow_pte == 0);
88 access_bits &= guest_pte; 161 access_bits &= guest_pte;
89 *shadow_pte = (guest_pte & PT_PTE_COPY_MASK); 162 *shadow_pte = (guest_pte & PT_PTE_COPY_MASK);
90 set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, 163 set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK,
91 guest_pte & PT_DIRTY_MASK, access_bits); 164 guest_pte & PT_DIRTY_MASK, access_bits, gfn);
92} 165}
93 166
94static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, 167static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde,
95 u64 *shadow_pte, u64 access_bits, 168 u64 *shadow_pte, u64 access_bits, gfn_t gfn)
96 int index)
97{ 169{
98 gpa_t gaddr; 170 gpa_t gaddr;
99 171
100 ASSERT(*shadow_pte == 0); 172 ASSERT(*shadow_pte == 0);
101 access_bits &= guest_pde; 173 access_bits &= guest_pde;
102 gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) + PAGE_SIZE * index; 174 gaddr = (gpa_t)gfn << PAGE_SHIFT;
103 if (PTTYPE == 32 && is_cpuid_PSE36()) 175 if (PTTYPE == 32 && is_cpuid_PSE36())
104 gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) << 176 gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) <<
105 (32 - PT32_DIR_PSE36_SHIFT); 177 (32 - PT32_DIR_PSE36_SHIFT);
106 *shadow_pte = guest_pde & PT_PTE_COPY_MASK; 178 *shadow_pte = guest_pde & PT_PTE_COPY_MASK;
107 set_pte_common(vcpu, shadow_pte, gaddr, 179 set_pte_common(vcpu, shadow_pte, gaddr,
108 guest_pde & PT_DIRTY_MASK, access_bits); 180 guest_pde & PT_DIRTY_MASK, access_bits, gfn);
109}
110
111/*
112 * Fetch a guest pte from a specific level in the paging hierarchy.
113 */
114static pt_element_t *FNAME(fetch_guest)(struct kvm_vcpu *vcpu,
115 struct guest_walker *walker,
116 int level,
117 gva_t addr)
118{
119
120 ASSERT(level > 0 && level <= walker->level);
121
122 for (;;) {
123 int index = PT_INDEX(addr, walker->level);
124 hpa_t paddr;
125
126 ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
127 ((unsigned long)&walker->table[index] & PAGE_MASK));
128 if (level == walker->level ||
129 !is_present_pte(walker->table[index]) ||
130 (walker->level == PT_DIRECTORY_LEVEL &&
131 (walker->table[index] & PT_PAGE_SIZE_MASK) &&
132 (PTTYPE == 64 || is_pse(vcpu))))
133 return &walker->table[index];
134 if (walker->level != 3 || is_long_mode(vcpu))
135 walker->inherited_ar &= walker->table[index];
136 paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK);
137 kunmap_atomic(walker->table, KM_USER0);
138 walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
139 KM_USER0);
140 --walker->level;
141 }
142} 181}
143 182
144/* 183/*
@@ -150,15 +189,26 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
150 hpa_t shadow_addr; 189 hpa_t shadow_addr;
151 int level; 190 int level;
152 u64 *prev_shadow_ent = NULL; 191 u64 *prev_shadow_ent = NULL;
192 pt_element_t *guest_ent = walker->ptep;
193
194 if (!is_present_pte(*guest_ent))
195 return NULL;
153 196
154 shadow_addr = vcpu->mmu.root_hpa; 197 shadow_addr = vcpu->mmu.root_hpa;
155 level = vcpu->mmu.shadow_root_level; 198 level = vcpu->mmu.shadow_root_level;
199 if (level == PT32E_ROOT_LEVEL) {
200 shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
201 shadow_addr &= PT64_BASE_ADDR_MASK;
202 --level;
203 }
156 204
157 for (; ; level--) { 205 for (; ; level--) {
158 u32 index = SHADOW_PT_INDEX(addr, level); 206 u32 index = SHADOW_PT_INDEX(addr, level);
159 u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index; 207 u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
160 pt_element_t *guest_ent; 208 struct kvm_mmu_page *shadow_page;
161 u64 shadow_pte; 209 u64 shadow_pte;
210 int metaphysical;
211 gfn_t table_gfn;
162 212
163 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { 213 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
164 if (level == PT_PAGE_TABLE_LEVEL) 214 if (level == PT_PAGE_TABLE_LEVEL)
@@ -168,21 +218,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
168 continue; 218 continue;
169 } 219 }
170 220
171 if (PTTYPE == 32 && level > PT32_ROOT_LEVEL) {
172 ASSERT(level == PT32E_ROOT_LEVEL);
173 guest_ent = FNAME(fetch_guest)(vcpu, walker,
174 PT32_ROOT_LEVEL, addr);
175 } else
176 guest_ent = FNAME(fetch_guest)(vcpu, walker,
177 level, addr);
178
179 if (!is_present_pte(*guest_ent))
180 return NULL;
181
182 /* Don't set accessed bit on PAE PDPTRs */
183 if (vcpu->mmu.root_level != 3 || walker->level != 3)
184 *guest_ent |= PT_ACCESSED_MASK;
185
186 if (level == PT_PAGE_TABLE_LEVEL) { 221 if (level == PT_PAGE_TABLE_LEVEL) {
187 222
188 if (walker->level == PT_DIRECTORY_LEVEL) { 223 if (walker->level == PT_DIRECTORY_LEVEL) {
@@ -190,21 +225,30 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
190 *prev_shadow_ent |= PT_SHADOW_PS_MARK; 225 *prev_shadow_ent |= PT_SHADOW_PS_MARK;
191 FNAME(set_pde)(vcpu, *guest_ent, shadow_ent, 226 FNAME(set_pde)(vcpu, *guest_ent, shadow_ent,
192 walker->inherited_ar, 227 walker->inherited_ar,
193 PT_INDEX(addr, PT_PAGE_TABLE_LEVEL)); 228 walker->gfn);
194 } else { 229 } else {
195 ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); 230 ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
196 FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, walker->inherited_ar); 231 FNAME(set_pte)(vcpu, *guest_ent, shadow_ent,
232 walker->inherited_ar,
233 walker->gfn);
197 } 234 }
198 return shadow_ent; 235 return shadow_ent;
199 } 236 }
200 237
201 shadow_addr = kvm_mmu_alloc_page(vcpu, shadow_ent); 238 if (level - 1 == PT_PAGE_TABLE_LEVEL
202 if (!VALID_PAGE(shadow_addr)) 239 && walker->level == PT_DIRECTORY_LEVEL) {
203 return ERR_PTR(-ENOMEM); 240 metaphysical = 1;
204 shadow_pte = shadow_addr | PT_PRESENT_MASK; 241 table_gfn = (*guest_ent & PT_BASE_ADDR_MASK)
205 if (vcpu->mmu.root_level > 3 || level != 3) 242 >> PAGE_SHIFT;
206 shadow_pte |= PT_ACCESSED_MASK 243 } else {
207 | PT_WRITABLE_MASK | PT_USER_MASK; 244 metaphysical = 0;
245 table_gfn = walker->table_gfn[level - 2];
246 }
247 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
248 metaphysical, shadow_ent);
249 shadow_addr = shadow_page->page_hpa;
250 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
251 | PT_WRITABLE_MASK | PT_USER_MASK;
208 *shadow_ent = shadow_pte; 252 *shadow_ent = shadow_pte;
209 prev_shadow_ent = shadow_ent; 253 prev_shadow_ent = shadow_ent;
210 } 254 }
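
One detail of the fetch above worth spelling out: with a PAE shadow (PT32E_ROOT_LEVEL), the walk does not start at a single root page but at one of the four pae_root entries, chosen by bits 31:30 of the address. With an arbitrary illustrative address:

#include <stdio.h>

int main(void)
{
	unsigned long addr = 0xb7501000ul;	/* arbitrary guest address */

	/* Mirrors shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3] above. */
	printf("pae_root index = %lu\n", (addr >> 30) & 3);	/* prints 2 */
	return 0;
}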
@@ -221,11 +265,13 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
221 u64 *shadow_ent, 265 u64 *shadow_ent,
222 struct guest_walker *walker, 266 struct guest_walker *walker,
223 gva_t addr, 267 gva_t addr,
224 int user) 268 int user,
269 int *write_pt)
225{ 270{
226 pt_element_t *guest_ent; 271 pt_element_t *guest_ent;
227 int writable_shadow; 272 int writable_shadow;
228 gfn_t gfn; 273 gfn_t gfn;
274 struct kvm_mmu_page *page;
229 275
230 if (is_writeble_pte(*shadow_ent)) 276 if (is_writeble_pte(*shadow_ent))
231 return 0; 277 return 0;
@@ -250,17 +296,35 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
250 *shadow_ent &= ~PT_USER_MASK; 296 *shadow_ent &= ~PT_USER_MASK;
251 } 297 }
252 298
253 guest_ent = FNAME(fetch_guest)(vcpu, walker, PT_PAGE_TABLE_LEVEL, addr); 299 guest_ent = walker->ptep;
254 300
255 if (!is_present_pte(*guest_ent)) { 301 if (!is_present_pte(*guest_ent)) {
256 *shadow_ent = 0; 302 *shadow_ent = 0;
257 return 0; 303 return 0;
258 } 304 }
259 305
260 gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 306 gfn = walker->gfn;
307
308 if (user) {
309 /*
310 * Usermode page faults won't be for page table updates.
311 */
312 while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
313 pgprintk("%s: zap %lx %x\n",
314 __FUNCTION__, gfn, page->role.word);
315 kvm_mmu_zap_page(vcpu, page);
316 }
317 } else if (kvm_mmu_lookup_page(vcpu, gfn)) {
318 pgprintk("%s: found shadow page for %lx, marking ro\n",
319 __FUNCTION__, gfn);
320 *guest_ent |= PT_DIRTY_MASK;
321 *write_pt = 1;
322 return 0;
323 }
261 mark_page_dirty(vcpu->kvm, gfn); 324 mark_page_dirty(vcpu->kvm, gfn);
262 *shadow_ent |= PT_WRITABLE_MASK; 325 *shadow_ent |= PT_WRITABLE_MASK;
263 *guest_ent |= PT_DIRTY_MASK; 326 *guest_ent |= PT_DIRTY_MASK;
327 rmap_add(vcpu, shadow_ent);
264 328
265 return 1; 329 return 1;
266} 330}
@@ -276,7 +340,8 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
276 * - normal guest page fault due to the guest pte marked not present, not 340 * - normal guest page fault due to the guest pte marked not present, not
277 * writable, or not executable 341 * writable, or not executable
278 * 342 *
279 * Returns: 1 if we need to emulate the instruction, 0 otherwise 343 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
344 * a negative value on error.
280 */ 345 */
281static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, 346static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
282 u32 error_code) 347 u32 error_code)
@@ -287,39 +352,47 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
287 struct guest_walker walker; 352 struct guest_walker walker;
288 u64 *shadow_pte; 353 u64 *shadow_pte;
289 int fixed; 354 int fixed;
355 int write_pt = 0;
356 int r;
357
358 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
359 kvm_mmu_audit(vcpu, "pre page fault");
360
361 r = mmu_topup_memory_caches(vcpu);
362 if (r)
363 return r;
290 364
291 /* 365 /*
292 * Look up the shadow pte for the faulting address. 366 * Look up the shadow pte for the faulting address.
293 */ 367 */
294 for (;;) { 368 FNAME(walk_addr)(&walker, vcpu, addr);
295 FNAME(init_walker)(&walker, vcpu); 369 shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
296 shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
297 if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */
298 nonpaging_flush(vcpu);
299 FNAME(release_walker)(&walker);
300 continue;
301 }
302 break;
303 }
304 370
305 /* 371 /*
306 * The page is not mapped by the guest. Let the guest handle it. 372 * The page is not mapped by the guest. Let the guest handle it.
307 */ 373 */
308 if (!shadow_pte) { 374 if (!shadow_pte) {
375 pgprintk("%s: not mapped\n", __FUNCTION__);
309 inject_page_fault(vcpu, addr, error_code); 376 inject_page_fault(vcpu, addr, error_code);
310 FNAME(release_walker)(&walker); 377 FNAME(release_walker)(&walker);
311 return 0; 378 return 0;
312 } 379 }
313 380
381 pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__,
382 shadow_pte, *shadow_pte);
383
314 /* 384 /*
315 * Update the shadow pte. 385 * Update the shadow pte.
316 */ 386 */
317 if (write_fault) 387 if (write_fault)
318 fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, 388 fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr,
319 user_fault); 389 user_fault, &write_pt);
320 else 390 else
321 fixed = fix_read_pf(shadow_pte); 391 fixed = fix_read_pf(shadow_pte);
322 392
393 pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__,
394 shadow_pte, *shadow_pte);
395
323 FNAME(release_walker)(&walker); 396 FNAME(release_walker)(&walker);
324 397
325 /* 398 /*
@@ -331,20 +404,23 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
331 pgprintk("%s: io work, no access\n", __FUNCTION__); 404 pgprintk("%s: io work, no access\n", __FUNCTION__);
332 inject_page_fault(vcpu, addr, 405 inject_page_fault(vcpu, addr,
333 error_code | PFERR_PRESENT_MASK); 406 error_code | PFERR_PRESENT_MASK);
407 kvm_mmu_audit(vcpu, "post page fault (io)");
334 return 0; 408 return 0;
335 } 409 }
336 410
337 /* 411 /*
338 * pte not present, guest page fault. 412 * pte not present, guest page fault.
339 */ 413 */
340 if (pte_present && !fixed) { 414 if (pte_present && !fixed && !write_pt) {
341 inject_page_fault(vcpu, addr, error_code); 415 inject_page_fault(vcpu, addr, error_code);
416 kvm_mmu_audit(vcpu, "post page fault (guest)");
342 return 0; 417 return 0;
343 } 418 }
344 419
345 ++kvm_stat.pf_fixed; 420 ++kvm_stat.pf_fixed;
421 kvm_mmu_audit(vcpu, "post page fault (fixed)");
346 422
347 return 0; 423 return write_pt;
348} 424}
349 425
350static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 426static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -353,9 +429,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
353 pt_element_t guest_pte; 429 pt_element_t guest_pte;
354 gpa_t gpa; 430 gpa_t gpa;
355 431
356 FNAME(init_walker)(&walker, vcpu); 432 FNAME(walk_addr)(&walker, vcpu, vaddr);
357 guest_pte = *FNAME(fetch_guest)(vcpu, &walker, PT_PAGE_TABLE_LEVEL, 433 guest_pte = *walker.ptep;
358 vaddr);
359 FNAME(release_walker)(&walker); 434 FNAME(release_walker)(&walker);
360 435
361 if (!is_present_pte(guest_pte)) 436 if (!is_present_pte(guest_pte))
@@ -389,3 +464,4 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
389#undef PT_PTE_COPY_MASK 464#undef PT_PTE_COPY_MASK
390#undef PT_NON_PTE_COPY_MASK 465#undef PT_NON_PTE_COPY_MASK
391#undef PT_DIR_BASE_ADDR_MASK 466#undef PT_DIR_BASE_ADDR_MASK
467#undef PT_MAX_FULL_LEVELS
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index fa0428735717..ccc06b1b91b5 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -235,6 +235,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
235 235
236 vcpu->rip = vcpu->svm->vmcb->save.rip = vcpu->svm->next_rip; 236 vcpu->rip = vcpu->svm->vmcb->save.rip = vcpu->svm->next_rip;
237 vcpu->svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 237 vcpu->svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
238
239 vcpu->interrupt_window_open = 1;
238} 240}
239 241
240static int has_svm(void) 242static int has_svm(void)
@@ -495,7 +497,6 @@ static void init_vmcb(struct vmcb *vmcb)
495 /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */ 497 /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */
496 (1ULL << INTERCEPT_CPUID) | 498 (1ULL << INTERCEPT_CPUID) |
497 (1ULL << INTERCEPT_HLT) | 499 (1ULL << INTERCEPT_HLT) |
498 (1ULL << INTERCEPT_INVLPG) |
499 (1ULL << INTERCEPT_INVLPGA) | 500 (1ULL << INTERCEPT_INVLPGA) |
500 (1ULL << INTERCEPT_IOIO_PROT) | 501 (1ULL << INTERCEPT_IOIO_PROT) |
501 (1ULL << INTERCEPT_MSR_PROT) | 502 (1ULL << INTERCEPT_MSR_PROT) |
@@ -700,6 +701,10 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
700 vcpu->svm->vmcb->save.gdtr.base = dt->base ; 701 vcpu->svm->vmcb->save.gdtr.base = dt->base ;
701} 702}
702 703
704static void svm_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu)
705{
706}
707
703static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 708static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
704{ 709{
705#ifdef CONFIG_X86_64 710#ifdef CONFIG_X86_64
@@ -847,6 +852,7 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
847 u64 fault_address; 852 u64 fault_address;
848 u32 error_code; 853 u32 error_code;
849 enum emulation_result er; 854 enum emulation_result er;
855 int r;
850 856
851 if (is_external_interrupt(exit_int_info)) 857 if (is_external_interrupt(exit_int_info))
852 push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); 858 push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
@@ -855,7 +861,12 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
855 861
856 fault_address = vcpu->svm->vmcb->control.exit_info_2; 862 fault_address = vcpu->svm->vmcb->control.exit_info_2;
857 error_code = vcpu->svm->vmcb->control.exit_info_1; 863 error_code = vcpu->svm->vmcb->control.exit_info_1;
858 if (!vcpu->mmu.page_fault(vcpu, fault_address, error_code)) { 864 r = kvm_mmu_page_fault(vcpu, fault_address, error_code);
865 if (r < 0) {
866 spin_unlock(&vcpu->kvm->lock);
867 return r;
868 }
869 if (!r) {
859 spin_unlock(&vcpu->kvm->lock); 870 spin_unlock(&vcpu->kvm->lock);
860 return 1; 871 return 1;
861 } 872 }
@@ -1031,10 +1042,11 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1031{ 1042{
1032 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; 1043 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1;
1033 skip_emulated_instruction(vcpu); 1044 skip_emulated_instruction(vcpu);
1034 if (vcpu->irq_summary && (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)) 1045 if (vcpu->irq_summary)
1035 return 1; 1046 return 1;
1036 1047
1037 kvm_run->exit_reason = KVM_EXIT_HLT; 1048 kvm_run->exit_reason = KVM_EXIT_HLT;
1049 ++kvm_stat.halt_exits;
1038 return 0; 1050 return 0;
1039} 1051}
1040 1052
@@ -1186,6 +1198,23 @@ static int msr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1186 return rdmsr_interception(vcpu, kvm_run); 1198 return rdmsr_interception(vcpu, kvm_run);
1187} 1199}
1188 1200
1201static int interrupt_window_interception(struct kvm_vcpu *vcpu,
1202 struct kvm_run *kvm_run)
1203{
1204 /*
1205 * If the user space waits to inject interrupts, exit as soon as
1206 * possible
1207 */
1208 if (kvm_run->request_interrupt_window &&
1209 !vcpu->irq_summary) {
1210 ++kvm_stat.irq_window_exits;
1211 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1212 return 0;
1213 }
1214
1215 return 1;
1216}
1217
1189static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, 1218static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu,
1190 struct kvm_run *kvm_run) = { 1219 struct kvm_run *kvm_run) = {
1191 [SVM_EXIT_READ_CR0] = emulate_on_interception, 1220 [SVM_EXIT_READ_CR0] = emulate_on_interception,
@@ -1210,6 +1239,7 @@ static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu,
1210 [SVM_EXIT_NMI] = nop_on_interception, 1239 [SVM_EXIT_NMI] = nop_on_interception,
1211 [SVM_EXIT_SMI] = nop_on_interception, 1240 [SVM_EXIT_SMI] = nop_on_interception,
1212 [SVM_EXIT_INIT] = nop_on_interception, 1241 [SVM_EXIT_INIT] = nop_on_interception,
1242 [SVM_EXIT_VINTR] = interrupt_window_interception,
1213 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ 1243 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
1214 [SVM_EXIT_CPUID] = cpuid_interception, 1244 [SVM_EXIT_CPUID] = cpuid_interception,
1215 [SVM_EXIT_HLT] = halt_interception, 1245 [SVM_EXIT_HLT] = halt_interception,
@@ -1278,15 +1308,11 @@ static void pre_svm_run(struct kvm_vcpu *vcpu)
1278} 1308}
1279 1309
1280 1310
1281static inline void kvm_try_inject_irq(struct kvm_vcpu *vcpu) 1311static inline void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1282{ 1312{
1283 struct vmcb_control_area *control; 1313 struct vmcb_control_area *control;
1284 1314
1285 if (!vcpu->irq_summary)
1286 return;
1287
1288 control = &vcpu->svm->vmcb->control; 1315 control = &vcpu->svm->vmcb->control;
1289
1290 control->int_vector = pop_irq(vcpu); 1316 control->int_vector = pop_irq(vcpu);
1291 control->int_ctl &= ~V_INTR_PRIO_MASK; 1317 control->int_ctl &= ~V_INTR_PRIO_MASK;
1292 control->int_ctl |= V_IRQ_MASK | 1318 control->int_ctl |= V_IRQ_MASK |
@@ -1301,6 +1327,59 @@ static void kvm_reput_irq(struct kvm_vcpu *vcpu)
1301 control->int_ctl &= ~V_IRQ_MASK; 1327 control->int_ctl &= ~V_IRQ_MASK;
1302 push_irq(vcpu, control->int_vector); 1328 push_irq(vcpu, control->int_vector);
1303 } 1329 }
1330
1331 vcpu->interrupt_window_open =
1332 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
1333}
1334
1335static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1336 struct kvm_run *kvm_run)
1337{
1338 struct vmcb_control_area *control = &vcpu->svm->vmcb->control;
1339
1340 vcpu->interrupt_window_open =
1341 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
1342 (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF));
1343
1344 if (vcpu->interrupt_window_open && vcpu->irq_summary)
1345 /*
1346 * If interrupts enabled, and not blocked by sti or mov ss. Good.
1347 */
1348 kvm_do_inject_irq(vcpu);
1349
1350 /*
1351 * Interrupts blocked. Wait for unblock.
1352 */
1353 if (!vcpu->interrupt_window_open &&
1354 (vcpu->irq_summary || kvm_run->request_interrupt_window)) {
1355 control->intercept |= 1ULL << INTERCEPT_VINTR;
1356 } else
1357 control->intercept &= ~(1ULL << INTERCEPT_VINTR);
1358}
1359
1360static void post_kvm_run_save(struct kvm_vcpu *vcpu,
1361 struct kvm_run *kvm_run)
1362{
1363 kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open &&
1364 vcpu->irq_summary == 0);
1365 kvm_run->if_flag = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF) != 0;
1366 kvm_run->cr8 = vcpu->cr8;
1367 kvm_run->apic_base = vcpu->apic_base;
1368}
1369
1370/*
1371 * Check if userspace requested an interrupt window, and that the
1372 * interrupt window is open.
1373 *
1374 * No need to exit to userspace if we already have an interrupt queued.
1375 */
1376static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1377 struct kvm_run *kvm_run)
1378{
1379 return (!vcpu->irq_summary &&
1380 kvm_run->request_interrupt_window &&
1381 vcpu->interrupt_window_open &&
1382 (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF));
1304} 1383}
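
The request_interrupt_window / ready_for_interrupt_injection fields saved by post_kvm_run_save() form a small protocol with user space: user space asks for an exit as soon as the guest can take an interrupt, and the kernel reports whether injection is currently possible. A hypothetical user-space loop, where run_vcpu(), irq_pending(), next_irq(), queue_irq() and wait_for_irq() are placeholders rather than real KVM interfaces:

extern int irq_pending(void);
extern int next_irq(void);
extern void queue_irq(int irq);
extern void run_vcpu(struct kvm_run *run);	/* enters the guest */
extern void wait_for_irq(void);

void vcpu_loop(struct kvm_run *run)
{
	for (;;) {
		/* Only ask for window exits while an irq is actually queued. */
		run->request_interrupt_window = irq_pending();

		run_vcpu(run);

		if (run->ready_for_interrupt_injection && irq_pending())
			queue_irq(next_irq());	/* inject before next entry */

		if (run->exit_reason == KVM_EXIT_HLT && !irq_pending())
			wait_for_irq();		/* guest idle, nothing to do */
		/* KVM_EXIT_IRQ_WINDOW_OPEN simply loops back and injects. */
	}
}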
1305 1384
1306static void save_db_regs(unsigned long *db_regs) 1385static void save_db_regs(unsigned long *db_regs)
@@ -1324,9 +1403,10 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1324 u16 fs_selector; 1403 u16 fs_selector;
1325 u16 gs_selector; 1404 u16 gs_selector;
1326 u16 ldt_selector; 1405 u16 ldt_selector;
1406 int r;
1327 1407
1328again: 1408again:
1329 kvm_try_inject_irq(vcpu); 1409 do_interrupt_requests(vcpu, kvm_run);
1330 1410
1331 clgi(); 1411 clgi();
1332 1412
@@ -1487,18 +1567,28 @@ again:
1487 if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 1567 if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
1488 kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; 1568 kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY;
1489 kvm_run->exit_reason = vcpu->svm->vmcb->control.exit_code; 1569 kvm_run->exit_reason = vcpu->svm->vmcb->control.exit_code;
1570 post_kvm_run_save(vcpu, kvm_run);
1490 return 0; 1571 return 0;
1491 } 1572 }
1492 1573
1493 if (handle_exit(vcpu, kvm_run)) { 1574 r = handle_exit(vcpu, kvm_run);
1575 if (r > 0) {
1494 if (signal_pending(current)) { 1576 if (signal_pending(current)) {
1495 ++kvm_stat.signal_exits; 1577 ++kvm_stat.signal_exits;
1578 post_kvm_run_save(vcpu, kvm_run);
1579 return -EINTR;
1580 }
1581
1582 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
1583 ++kvm_stat.request_irq_exits;
1584 post_kvm_run_save(vcpu, kvm_run);
1496 return -EINTR; 1585 return -EINTR;
1497 } 1586 }
1498 kvm_resched(vcpu); 1587 kvm_resched(vcpu);
1499 goto again; 1588 goto again;
1500 } 1589 }
1501 return 0; 1590 post_kvm_run_save(vcpu, kvm_run);
1591 return r;
1502} 1592}
1503 1593
1504static void svm_flush_tlb(struct kvm_vcpu *vcpu) 1594static void svm_flush_tlb(struct kvm_vcpu *vcpu)
@@ -1565,6 +1655,7 @@ static struct kvm_arch_ops svm_arch_ops = {
1565 .get_segment = svm_get_segment, 1655 .get_segment = svm_get_segment,
1566 .set_segment = svm_set_segment, 1656 .set_segment = svm_set_segment,
1567 .get_cs_db_l_bits = svm_get_cs_db_l_bits, 1657 .get_cs_db_l_bits = svm_get_cs_db_l_bits,
1658 .decache_cr0_cr4_guest_bits = svm_decache_cr0_cr4_guest_bits,
1568 .set_cr0 = svm_set_cr0, 1659 .set_cr0 = svm_set_cr0,
1569 .set_cr0_no_modeswitch = svm_set_cr0, 1660 .set_cr0_no_modeswitch = svm_set_cr0,
1570 .set_cr3 = svm_set_cr3, 1661 .set_cr3 = svm_set_cr3,
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index d0a2c2d5342a..d4701cb4c654 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -116,7 +116,7 @@ static void vmcs_clear(struct vmcs *vmcs)
116static void __vcpu_clear(void *arg) 116static void __vcpu_clear(void *arg)
117{ 117{
118 struct kvm_vcpu *vcpu = arg; 118 struct kvm_vcpu *vcpu = arg;
119 int cpu = smp_processor_id(); 119 int cpu = raw_smp_processor_id();
120 120
121 if (vcpu->cpu == cpu) 121 if (vcpu->cpu == cpu)
122 vmcs_clear(vcpu->vmcs); 122 vmcs_clear(vcpu->vmcs);
@@ -152,15 +152,21 @@ static u64 vmcs_read64(unsigned long field)
152#endif 152#endif
153} 153}
154 154
155static noinline void vmwrite_error(unsigned long field, unsigned long value)
156{
157 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
158 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
159 dump_stack();
160}
161
155static void vmcs_writel(unsigned long field, unsigned long value) 162static void vmcs_writel(unsigned long field, unsigned long value)
156{ 163{
157 u8 error; 164 u8 error;
158 165
159 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" 166 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
160 : "=q"(error) : "a"(value), "d"(field) : "cc" ); 167 : "=q"(error) : "a"(value), "d"(field) : "cc" );
161 if (error) 168 if (unlikely(error))
162 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", 169 vmwrite_error(field, value);
163 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
164} 170}
165 171
166static void vmcs_write16(unsigned long field, u16 value) 172static void vmcs_write16(unsigned long field, u16 value)
@@ -263,6 +269,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
263 if (interruptibility & 3) 269 if (interruptibility & 3)
264 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 270 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
265 interruptibility & ~3); 271 interruptibility & ~3);
272 vcpu->interrupt_window_open = 1;
266} 273}
267 274
268static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) 275static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
@@ -541,7 +548,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
541 548
542static struct vmcs *alloc_vmcs(void) 549static struct vmcs *alloc_vmcs(void)
543{ 550{
544 return alloc_vmcs_cpu(smp_processor_id()); 551 return alloc_vmcs_cpu(raw_smp_processor_id());
545} 552}
546 553
547static void free_vmcs(struct vmcs *vmcs) 554static void free_vmcs(struct vmcs *vmcs)
@@ -736,6 +743,15 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
736 743
737#endif 744#endif
738 745
746static void vmx_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu)
747{
748 vcpu->cr0 &= KVM_GUEST_CR0_MASK;
749 vcpu->cr0 |= vmcs_readl(GUEST_CR0) & ~KVM_GUEST_CR0_MASK;
750
751 vcpu->cr4 &= KVM_GUEST_CR4_MASK;
752 vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
753}
754
739static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 755static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
740{ 756{
741 if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) 757 if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
@@ -1011,8 +1027,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1011 vmcs_writel(GUEST_RIP, 0xfff0); 1027 vmcs_writel(GUEST_RIP, 0xfff0);
1012 vmcs_writel(GUEST_RSP, 0); 1028 vmcs_writel(GUEST_RSP, 0);
1013 1029
1014 vmcs_writel(GUEST_CR3, 0);
1015
1016 //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 1030 //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1017 vmcs_writel(GUEST_DR7, 0x400); 1031 vmcs_writel(GUEST_DR7, 0x400);
1018 1032
@@ -1049,7 +1063,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1049 | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ 1063 | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */
1050 | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ 1064 | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */
1051 | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */ 1065 | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */
1052 | CPU_BASED_INVDPG_EXITING
1053 | CPU_BASED_MOV_DR_EXITING 1066 | CPU_BASED_MOV_DR_EXITING
1054 | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ 1067 | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */
1055 ); 1068 );
@@ -1094,14 +1107,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1094 rdmsrl(MSR_IA32_SYSENTER_EIP, a); 1107 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
1095 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ 1108 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1096 1109
1097 ret = -ENOMEM;
1098 vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
1099 if (!vcpu->guest_msrs)
1100 goto out;
1101 vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
1102 if (!vcpu->host_msrs)
1103 goto out_free_guest_msrs;
1104
1105 for (i = 0; i < NR_VMX_MSR; ++i) { 1110 for (i = 0; i < NR_VMX_MSR; ++i) {
1106 u32 index = vmx_msr_index[i]; 1111 u32 index = vmx_msr_index[i];
1107 u32 data_low, data_high; 1112 u32 data_low, data_high;
@@ -1155,8 +1160,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1155 1160
1156 return 0; 1161 return 0;
1157 1162
1158out_free_guest_msrs:
1159 kfree(vcpu->guest_msrs);
1160out: 1163out:
1161 return ret; 1164 return ret;
1162} 1165}
@@ -1224,21 +1227,34 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1224 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 1227 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1225} 1228}
1226 1229
1227static void kvm_try_inject_irq(struct kvm_vcpu *vcpu) 1230
1231static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1232 struct kvm_run *kvm_run)
1228{ 1233{
1229 if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) 1234 u32 cpu_based_vm_exec_control;
1230 && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0) 1235
1236 vcpu->interrupt_window_open =
1237 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
1238 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
1239
1240 if (vcpu->interrupt_window_open &&
1241 vcpu->irq_summary &&
1242 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
1231 /* 1243 /*
1232 * Interrupts enabled, and not blocked by sti or mov ss. Good. 1244 * If interrupts enabled, and not blocked by sti or mov ss. Good.
1233 */ 1245 */
1234 kvm_do_inject_irq(vcpu); 1246 kvm_do_inject_irq(vcpu);
1235 else 1247
1248 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
1249 if (!vcpu->interrupt_window_open &&
1250 (vcpu->irq_summary || kvm_run->request_interrupt_window))
1236 /* 1251 /*
1237 * Interrupts blocked. Wait for unblock. 1252 * Interrupts blocked. Wait for unblock.
1238 */ 1253 */
1239 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1254 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
1240 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) 1255 else
1241 | CPU_BASED_VIRTUAL_INTR_PENDING); 1256 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
1257 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
1242} 1258}
1243 1259
1244static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) 1260static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
@@ -1277,6 +1293,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1277 unsigned long cr2, rip; 1293 unsigned long cr2, rip;
1278 u32 vect_info; 1294 u32 vect_info;
1279 enum emulation_result er; 1295 enum emulation_result er;
1296 int r;
1280 1297
1281 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 1298 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1282 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 1299 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
@@ -1305,7 +1322,12 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1305 cr2 = vmcs_readl(EXIT_QUALIFICATION); 1322 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1306 1323
1307 spin_lock(&vcpu->kvm->lock); 1324 spin_lock(&vcpu->kvm->lock);
1308 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) { 1325 r = kvm_mmu_page_fault(vcpu, cr2, error_code);
1326 if (r < 0) {
1327 spin_unlock(&vcpu->kvm->lock);
1328 return r;
1329 }
1330 if (!r) {
1309 spin_unlock(&vcpu->kvm->lock); 1331 spin_unlock(&vcpu->kvm->lock);
1310 return 1; 1332 return 1;
1311 } 1333 }
@@ -1425,17 +1447,6 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1425 return 0; 1447 return 0;
1426} 1448}
1427 1449
1428static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1429{
1430 u64 address = vmcs_read64(EXIT_QUALIFICATION);
1431 int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1432 spin_lock(&vcpu->kvm->lock);
1433 vcpu->mmu.inval_page(vcpu, address);
1434 spin_unlock(&vcpu->kvm->lock);
1435 vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
1436 return 1;
1437}
1438
1439static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1450static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1440{ 1451{
1441 u64 exit_qualification; 1452 u64 exit_qualification;
@@ -1575,23 +1586,40 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1575 return 1; 1586 return 1;
1576} 1587}
1577 1588
1589static void post_kvm_run_save(struct kvm_vcpu *vcpu,
1590 struct kvm_run *kvm_run)
1591{
1592 kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0;
1593 kvm_run->cr8 = vcpu->cr8;
1594 kvm_run->apic_base = vcpu->apic_base;
1595 kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open &&
1596 vcpu->irq_summary == 0);
1597}
1598
1578static int handle_interrupt_window(struct kvm_vcpu *vcpu, 1599static int handle_interrupt_window(struct kvm_vcpu *vcpu,
1579 struct kvm_run *kvm_run) 1600 struct kvm_run *kvm_run)
1580{ 1601{
1581 /* Turn off interrupt window reporting. */ 1602 /*
1582 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1603 * If the user space waits to inject interrupts, exit as soon as
1583 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) 1604 * possible
1584 & ~CPU_BASED_VIRTUAL_INTR_PENDING); 1605 */
1606 if (kvm_run->request_interrupt_window &&
1607 !vcpu->irq_summary) {
1608 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1609 ++kvm_stat.irq_window_exits;
1610 return 0;
1611 }
1585 return 1; 1612 return 1;
1586} 1613}
1587 1614
1588static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1615static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1589{ 1616{
1590 skip_emulated_instruction(vcpu); 1617 skip_emulated_instruction(vcpu);
1591 if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) 1618 if (vcpu->irq_summary)
1592 return 1; 1619 return 1;
1593 1620
1594 kvm_run->exit_reason = KVM_EXIT_HLT; 1621 kvm_run->exit_reason = KVM_EXIT_HLT;
1622 ++kvm_stat.halt_exits;
1595 return 0; 1623 return 0;
1596} 1624}
1597 1625
@@ -1605,7 +1633,6 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
1605 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 1633 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
1606 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 1634 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
1607 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 1635 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
1608 [EXIT_REASON_INVLPG] = handle_invlpg,
1609 [EXIT_REASON_CR_ACCESS] = handle_cr, 1636 [EXIT_REASON_CR_ACCESS] = handle_cr,
1610 [EXIT_REASON_DR_ACCESS] = handle_dr, 1637 [EXIT_REASON_DR_ACCESS] = handle_dr,
1611 [EXIT_REASON_CPUID] = handle_cpuid, 1638 [EXIT_REASON_CPUID] = handle_cpuid,
@@ -1642,11 +1669,27 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1642 return 0; 1669 return 0;
1643} 1670}
1644 1671
1672/*
1673 * Check if userspace requested an interrupt window, and that the
1674 * interrupt window is open.
1675 *
1676 * No need to exit to userspace if we already have an interrupt queued.
1677 */
1678static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1679 struct kvm_run *kvm_run)
1680{
1681 return (!vcpu->irq_summary &&
1682 kvm_run->request_interrupt_window &&
1683 vcpu->interrupt_window_open &&
1684 (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
1685}
1686
1645static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1687static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1646{ 1688{
1647 u8 fail; 1689 u8 fail;
1648 u16 fs_sel, gs_sel, ldt_sel; 1690 u16 fs_sel, gs_sel, ldt_sel;
1649 int fs_gs_ldt_reload_needed; 1691 int fs_gs_ldt_reload_needed;
1692 int r;
1650 1693
1651again: 1694again:
1652 /* 1695 /*
@@ -1673,9 +1716,7 @@ again:
1673 vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); 1716 vmcs_writel(HOST_GS_BASE, segment_base(gs_sel));
1674#endif 1717#endif
1675 1718
1676 if (vcpu->irq_summary && 1719 do_interrupt_requests(vcpu, kvm_run);
1677 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
1678 kvm_try_inject_irq(vcpu);
1679 1720
1680 if (vcpu->guest_debug.enabled) 1721 if (vcpu->guest_debug.enabled)
1681 kvm_guest_debug_pre(vcpu); 1722 kvm_guest_debug_pre(vcpu);
@@ -1812,6 +1853,7 @@ again:
1812 1853
1813 fx_save(vcpu->guest_fx_image); 1854 fx_save(vcpu->guest_fx_image);
1814 fx_restore(vcpu->host_fx_image); 1855 fx_restore(vcpu->host_fx_image);
1856 vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
1815 1857
1816#ifndef CONFIG_X86_64 1858#ifndef CONFIG_X86_64
1817 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 1859 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
@@ -1821,6 +1863,7 @@ again:
1821 if (fail) { 1863 if (fail) {
1822 kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; 1864 kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY;
1823 kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); 1865 kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
1866 r = 0;
1824 } else { 1867 } else {
1825 if (fs_gs_ldt_reload_needed) { 1868 if (fs_gs_ldt_reload_needed) {
1826 load_ldt(ldt_sel); 1869 load_ldt(ldt_sel);
@@ -1840,17 +1883,28 @@ again:
1840 } 1883 }
1841 vcpu->launched = 1; 1884 vcpu->launched = 1;
1842 kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; 1885 kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT;
1843 if (kvm_handle_exit(kvm_run, vcpu)) { 1886 r = kvm_handle_exit(kvm_run, vcpu);
1887 if (r > 0) {
1844 /* Give scheduler a chance to reschedule. */ 1888 /* Give scheduler a chance to reschedule. */
1845 if (signal_pending(current)) { 1889 if (signal_pending(current)) {
1846 ++kvm_stat.signal_exits; 1890 ++kvm_stat.signal_exits;
1891 post_kvm_run_save(vcpu, kvm_run);
1892 return -EINTR;
1893 }
1894
1895 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
1896 ++kvm_stat.request_irq_exits;
1897 post_kvm_run_save(vcpu, kvm_run);
1847 return -EINTR; 1898 return -EINTR;
1848 } 1899 }
1900
1849 kvm_resched(vcpu); 1901 kvm_resched(vcpu);
1850 goto again; 1902 goto again;
1851 } 1903 }
1852 } 1904 }
1853 return 0; 1905
1906 post_kvm_run_save(vcpu, kvm_run);
1907 return r;
1854} 1908}
1855 1909
1856static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1910static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
@@ -1906,13 +1960,33 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
1906{ 1960{
1907 struct vmcs *vmcs; 1961 struct vmcs *vmcs;
1908 1962
1963 vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
1964 if (!vcpu->guest_msrs)
1965 return -ENOMEM;
1966
1967 vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
1968 if (!vcpu->host_msrs)
1969 goto out_free_guest_msrs;
1970
1909 vmcs = alloc_vmcs(); 1971 vmcs = alloc_vmcs();
1910 if (!vmcs) 1972 if (!vmcs)
1911 return -ENOMEM; 1973 goto out_free_msrs;
1974
1912 vmcs_clear(vmcs); 1975 vmcs_clear(vmcs);
1913 vcpu->vmcs = vmcs; 1976 vcpu->vmcs = vmcs;
1914 vcpu->launched = 0; 1977 vcpu->launched = 0;
1978
1915 return 0; 1979 return 0;
1980
1981out_free_msrs:
1982 kfree(vcpu->host_msrs);
1983 vcpu->host_msrs = NULL;
1984
1985out_free_guest_msrs:
1986 kfree(vcpu->guest_msrs);
1987 vcpu->guest_msrs = NULL;
1988
1989 return -ENOMEM;
1916} 1990}
1917 1991
1918static struct kvm_arch_ops vmx_arch_ops = { 1992static struct kvm_arch_ops vmx_arch_ops = {
@@ -1936,6 +2010,7 @@ static struct kvm_arch_ops vmx_arch_ops = {
1936 .get_segment = vmx_get_segment, 2010 .get_segment = vmx_get_segment,
1937 .set_segment = vmx_set_segment, 2011 .set_segment = vmx_set_segment,
1938 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 2012 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
2013 .decache_cr0_cr4_guest_bits = vmx_decache_cr0_cr4_guest_bits,
1939 .set_cr0 = vmx_set_cr0, 2014 .set_cr0 = vmx_set_cr0,
1940 .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch, 2015 .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch,
1941 .set_cr3 = vmx_set_cr3, 2016 .set_cr3 = vmx_set_cr3,
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
index 1bff3e925fda..be70795b4822 100644
--- a/drivers/kvm/x86_emulate.c
+++ b/drivers/kvm/x86_emulate.c
@@ -1323,7 +1323,7 @@ twobyte_special_insn:
1323 ctxt)) != 0)) 1323 ctxt)) != 0))
1324 goto done; 1324 goto done;
1325 if ((old_lo != _regs[VCPU_REGS_RAX]) 1325 if ((old_lo != _regs[VCPU_REGS_RAX])
1326 || (old_hi != _regs[VCPU_REGS_RDI])) { 1326 || (old_hi != _regs[VCPU_REGS_RDX])) {
1327 _regs[VCPU_REGS_RAX] = old_lo; 1327 _regs[VCPU_REGS_RAX] = old_lo;
1328 _regs[VCPU_REGS_RDX] = old_hi; 1328 _regs[VCPU_REGS_RDX] = old_hi;
1329 _eflags &= ~EFLG_ZF; 1329 _eflags &= ~EFLG_ZF;