Diffstat (limited to 'drivers/kvm/mmu.c')
 -rw-r--r--  drivers/kvm/mmu.c  1114
 1 file changed, 943 insertions(+), 171 deletions(-)
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index 790423c5f23d..c6f972914f08 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -26,7 +26,31 @@
26#include "vmx.h" 26#include "vmx.h"
27#include "kvm.h" 27#include "kvm.h"
28 28
29#undef MMU_DEBUG
30
31#undef AUDIT
32
33#ifdef AUDIT
34static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
35#else
36static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
37#endif
38
39#ifdef MMU_DEBUG
40
41#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
42#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
43
44#else
45
29#define pgprintk(x...) do { } while (0) 46#define pgprintk(x...) do { } while (0)
47#define rmap_printk(x...) do { } while (0)
48
49#endif
50
51#if defined(MMU_DEBUG) || defined(AUDIT)
52static int dbg = 1;
53#endif
30 54
31#define ASSERT(x) \ 55#define ASSERT(x) \
32 if (!(x)) { \ 56 if (!(x)) { \
@@ -34,8 +58,10 @@
 		__FILE__, __LINE__, #x); \
 	}
 
-#define PT64_ENT_PER_PAGE 512
-#define PT32_ENT_PER_PAGE 1024
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
 
 #define PT_WRITABLE_SHIFT 1
 
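The rewritten constants keep the same values as the old literals; the arithmetic works out because a 4 KB page holds either 2^9 = 512 eight-byte 64-bit PTEs (512 * 8 = 4096) or 2^10 = 1024 four-byte 32-bit PTEs (1024 * 4 = 4096). Expressing the entry counts as 1 << PT_BITS makes the per-level index width reusable, which the quadrant computation introduced further down in kvm_mmu_get_page() depends on.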
@@ -125,6 +151,13 @@
 #define PT_DIRECTORY_LEVEL 2
 #define PT_PAGE_TABLE_LEVEL 1
 
+#define RMAP_EXT 4
+
+struct kvm_rmap_desc {
+	u64 *shadow_ptes[RMAP_EXT];
+	struct kvm_rmap_desc *more;
+};
+
 static int is_write_protection(struct kvm_vcpu *vcpu)
 {
 	return vcpu->cr0 & CR0_WP_MASK;
@@ -150,32 +183,272 @@ static int is_io_pte(unsigned long pte)
 	return pte & PT_SHADOW_IO_MARK;
 }
 
+static int is_rmap_pte(u64 pte)
+{
+	return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
+		== (PT_WRITABLE_MASK | PT_PRESENT_MASK);
+}
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+				  size_t objsize, int min)
+{
+	void *obj;
+
+	if (cache->nobjs >= min)
+		return 0;
+	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+		obj = kzalloc(objsize, GFP_NOWAIT);
+		if (!obj)
+			return -ENOMEM;
+		cache->objects[cache->nobjs++] = obj;
+	}
+	return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+	while (mc->nobjs)
+		kfree(mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
+				   sizeof(struct kvm_pte_chain), 4);
+	if (r)
+		goto out;
+	r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
+				   sizeof(struct kvm_rmap_desc), 1);
+out:
+	return r;
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+	mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
+	mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
+				    size_t size)
+{
+	void *p;
+
+	BUG_ON(!mc->nobjs);
+	p = mc->objects[--mc->nobjs];
+	memset(p, 0, size);
+	return p;
+}
+
+static void mmu_memory_cache_free(struct kvm_mmu_memory_cache *mc, void *obj)
+{
+	if (mc->nobjs < KVM_NR_MEM_OBJS)
+		mc->objects[mc->nobjs++] = obj;
+	else
+		kfree(obj);
+}
+
+static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
+{
+	return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
+				      sizeof(struct kvm_pte_chain));
+}
+
+static void mmu_free_pte_chain(struct kvm_vcpu *vcpu,
+			       struct kvm_pte_chain *pc)
+{
+	mmu_memory_cache_free(&vcpu->mmu_pte_chain_cache, pc);
+}
+
+static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+{
+	return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
+				      sizeof(struct kvm_rmap_desc));
+}
+
+static void mmu_free_rmap_desc(struct kvm_vcpu *vcpu,
+			       struct kvm_rmap_desc *rd)
+{
+	mmu_memory_cache_free(&vcpu->mmu_rmap_desc_cache, rd);
+}
+
+/*
+ * Reverse mapping data structures:
+ *
+ * If page->private bit zero is zero, then page->private points to the
+ * shadow page table entry that points to page_address(page).
+ *
+ * If page->private bit zero is one, (then page->private & ~1) points
+ * to a struct kvm_rmap_desc containing more mappings.
+ */
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
+{
+	struct page *page;
+	struct kvm_rmap_desc *desc;
+	int i;
+
+	if (!is_rmap_pte(*spte))
+		return;
+	page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+	if (!page->private) {
+		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
+		page->private = (unsigned long)spte;
+	} else if (!(page->private & 1)) {
+		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
+		desc = mmu_alloc_rmap_desc(vcpu);
+		desc->shadow_ptes[0] = (u64 *)page->private;
+		desc->shadow_ptes[1] = spte;
+		page->private = (unsigned long)desc | 1;
+	} else {
+		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
+		desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
+		while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
+			desc = desc->more;
+		if (desc->shadow_ptes[RMAP_EXT-1]) {
+			desc->more = mmu_alloc_rmap_desc(vcpu);
+			desc = desc->more;
+		}
+		for (i = 0; desc->shadow_ptes[i]; ++i)
+			;
+		desc->shadow_ptes[i] = spte;
+	}
+}
+
+static void rmap_desc_remove_entry(struct kvm_vcpu *vcpu,
+				   struct page *page,
+				   struct kvm_rmap_desc *desc,
+				   int i,
+				   struct kvm_rmap_desc *prev_desc)
+{
+	int j;
+
+	for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
+		;
+	desc->shadow_ptes[i] = desc->shadow_ptes[j];
+	desc->shadow_ptes[j] = 0;
+	if (j != 0)
+		return;
+	if (!prev_desc && !desc->more)
+		page->private = (unsigned long)desc->shadow_ptes[0];
+	else
+		if (prev_desc)
+			prev_desc->more = desc->more;
+		else
+			page->private = (unsigned long)desc->more | 1;
+	mmu_free_rmap_desc(vcpu, desc);
+}
+
+static void rmap_remove(struct kvm_vcpu *vcpu, u64 *spte)
+{
+	struct page *page;
+	struct kvm_rmap_desc *desc;
+	struct kvm_rmap_desc *prev_desc;
+	int i;
+
+	if (!is_rmap_pte(*spte))
+		return;
+	page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+	if (!page->private) {
+		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+		BUG();
+	} else if (!(page->private & 1)) {
+		rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
+		if ((u64 *)page->private != spte) {
+			printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
+			       spte, *spte);
+			BUG();
+		}
+		page->private = 0;
+	} else {
+		rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
+		desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
+		prev_desc = NULL;
+		while (desc) {
+			for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
+				if (desc->shadow_ptes[i] == spte) {
+					rmap_desc_remove_entry(vcpu, page,
+							       desc, i,
+							       prev_desc);
+					return;
+				}
+			prev_desc = desc;
+			desc = desc->more;
+		}
+		BUG();
+	}
+}
+
+static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct page *page;
+	struct kvm_memory_slot *slot;
+	struct kvm_rmap_desc *desc;
+	u64 *spte;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	BUG_ON(!slot);
+	page = gfn_to_page(slot, gfn);
+
+	while (page->private) {
+		if (!(page->private & 1))
+			spte = (u64 *)page->private;
+		else {
+			desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
+			spte = desc->shadow_ptes[0];
+		}
+		BUG_ON(!spte);
+		BUG_ON((*spte & PT64_BASE_ADDR_MASK) !=
+		       page_to_pfn(page) << PAGE_SHIFT);
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		BUG_ON(!(*spte & PT_WRITABLE_MASK));
+		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+		rmap_remove(vcpu, spte);
+		kvm_arch_ops->tlb_flush(vcpu);
+		*spte &= ~(u64)PT_WRITABLE_MASK;
+	}
+}
+
+static int is_empty_shadow_page(hpa_t page_hpa)
+{
+	u64 *pos;
+	u64 *end;
+
+	for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64);
+	     pos != end; pos++)
+		if (*pos != 0) {
+			printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+			       pos, *pos);
+			return 0;
+		}
+	return 1;
+}
+
 static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
 {
 	struct kvm_mmu_page *page_head = page_header(page_hpa);
 
+	ASSERT(is_empty_shadow_page(page_hpa));
 	list_del(&page_head->link);
 	page_head->page_hpa = page_hpa;
 	list_add(&page_head->link, &vcpu->free_pages);
+	++vcpu->kvm->n_free_mmu_pages;
 }
 
-static int is_empty_shadow_page(hpa_t page_hpa)
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
 {
-	u32 *pos;
-	u32 *end;
-	for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32);
-	     pos != end; pos++)
-		if (*pos != 0)
-			return 0;
-	return 1;
+	return gfn;
 }
 
-static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte)
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+					       u64 *parent_pte)
 {
 	struct kvm_mmu_page *page;
 
 	if (list_empty(&vcpu->free_pages))
-		return INVALID_PAGE;
+		return NULL;
 
 	page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
 	list_del(&page->link);
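The comment above rmap_add() describes a pointer-tagging scheme: page->private holds either a bare spte pointer (tag bit 0 clear) or a pointer to a struct kvm_rmap_desc overflow block (tag bit 0 set). A minimal standalone sketch of the same encoding, with hypothetical names and outside any kernel context:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define RMAP_EXT 4

    struct rmap_desc {
    	uint64_t *shadow_ptes[RMAP_EXT];
    	struct rmap_desc *more;
    };

    /* Bit 0 selects the representation: 0 = a lone spte pointer,
     * 1 = a pointer to an overflow descriptor. */
    static int is_desc(unsigned long private)
    {
    	return private & 1;
    }

    static struct rmap_desc *to_desc(unsigned long private)
    {
    	return (struct rmap_desc *)(private & ~1ul);
    }

    int main(void)
    {
    	uint64_t spte = 0;
    	struct rmap_desc d = { { &spte }, 0 };
    	unsigned long one = (unsigned long)&spte;   /* tag bit clear */
    	unsigned long many = (unsigned long)&d | 1; /* tag bit set */

    	assert(!is_desc(one));
    	assert(is_desc(many) && to_desc(many)->shadow_ptes[0] == &spte);
    	printf("rmap tagging ok\n");
    	return 0;
    }

The tag bit is available because both pointer types are at least word-aligned; the kernel code relies on the same invariant when it masks with ~1ul.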
@@ -183,8 +456,239 @@ static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte)
 	ASSERT(is_empty_shadow_page(page->page_hpa));
 	page->slot_bitmap = 0;
 	page->global = 1;
+	page->multimapped = 0;
 	page->parent_pte = parent_pte;
-	return page->page_hpa;
+	--vcpu->kvm->n_free_mmu_pages;
+	return page;
+}
+
+static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+				    struct kvm_mmu_page *page, u64 *parent_pte)
+{
+	struct kvm_pte_chain *pte_chain;
+	struct hlist_node *node;
+	int i;
+
+	if (!parent_pte)
+		return;
+	if (!page->multimapped) {
+		u64 *old = page->parent_pte;
+
+		if (!old) {
+			page->parent_pte = parent_pte;
+			return;
+		}
+		page->multimapped = 1;
+		pte_chain = mmu_alloc_pte_chain(vcpu);
+		INIT_HLIST_HEAD(&page->parent_ptes);
+		hlist_add_head(&pte_chain->link, &page->parent_ptes);
+		pte_chain->parent_ptes[0] = old;
+	}
+	hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
+		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
+			continue;
+		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
+			if (!pte_chain->parent_ptes[i]) {
+				pte_chain->parent_ptes[i] = parent_pte;
+				return;
+			}
+	}
+	pte_chain = mmu_alloc_pte_chain(vcpu);
+	BUG_ON(!pte_chain);
+	hlist_add_head(&pte_chain->link, &page->parent_ptes);
+	pte_chain->parent_ptes[0] = parent_pte;
+}
+
+static void mmu_page_remove_parent_pte(struct kvm_vcpu *vcpu,
+				       struct kvm_mmu_page *page,
+				       u64 *parent_pte)
+{
+	struct kvm_pte_chain *pte_chain;
+	struct hlist_node *node;
+	int i;
+
+	if (!page->multimapped) {
+		BUG_ON(page->parent_pte != parent_pte);
+		page->parent_pte = NULL;
+		return;
+	}
+	hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
+		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+			if (!pte_chain->parent_ptes[i])
+				break;
+			if (pte_chain->parent_ptes[i] != parent_pte)
+				continue;
+			while (i + 1 < NR_PTE_CHAIN_ENTRIES
+			       && pte_chain->parent_ptes[i + 1]) {
+				pte_chain->parent_ptes[i]
+					= pte_chain->parent_ptes[i + 1];
+				++i;
+			}
+			pte_chain->parent_ptes[i] = NULL;
+			if (i == 0) {
+				hlist_del(&pte_chain->link);
+				mmu_free_pte_chain(vcpu, pte_chain);
+				if (hlist_empty(&page->parent_ptes)) {
+					page->multimapped = 0;
+					page->parent_pte = NULL;
+				}
+			}
+			return;
+		}
+	BUG();
+}
+
+static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
+						gfn_t gfn)
+{
+	unsigned index;
+	struct hlist_head *bucket;
+	struct kvm_mmu_page *page;
+	struct hlist_node *node;
+
+	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+	bucket = &vcpu->kvm->mmu_page_hash[index];
+	hlist_for_each_entry(page, node, bucket, hash_link)
+		if (page->gfn == gfn && !page->role.metaphysical) {
+			pgprintk("%s: found role %x\n",
+				 __FUNCTION__, page->role.word);
+			return page;
+		}
+	return NULL;
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
+					     gfn_t gfn,
+					     gva_t gaddr,
+					     unsigned level,
+					     int metaphysical,
+					     u64 *parent_pte)
+{
+	union kvm_mmu_page_role role;
+	unsigned index;
+	unsigned quadrant;
+	struct hlist_head *bucket;
+	struct kvm_mmu_page *page;
+	struct hlist_node *node;
+
+	role.word = 0;
+	role.glevels = vcpu->mmu.root_level;
+	role.level = level;
+	role.metaphysical = metaphysical;
+	if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
+		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
+		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
+		role.quadrant = quadrant;
+	}
+	pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
+		 gfn, role.word);
+	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+	bucket = &vcpu->kvm->mmu_page_hash[index];
+	hlist_for_each_entry(page, node, bucket, hash_link)
+		if (page->gfn == gfn && page->role.word == role.word) {
+			mmu_page_add_parent_pte(vcpu, page, parent_pte);
+			pgprintk("%s: found\n", __FUNCTION__);
+			return page;
+		}
+	page = kvm_mmu_alloc_page(vcpu, parent_pte);
+	if (!page)
+		return page;
+	pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
+	page->gfn = gfn;
+	page->role = role;
+	hlist_add_head(&page->hash_link, bucket);
+	if (!metaphysical)
+		rmap_write_protect(vcpu, gfn);
+	return page;
+}
+
+static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
+					 struct kvm_mmu_page *page)
+{
+	unsigned i;
+	u64 *pt;
+	u64 ent;
+
+	pt = __va(page->page_hpa);
+
+	if (page->role.level == PT_PAGE_TABLE_LEVEL) {
+		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+			if (pt[i] & PT_PRESENT_MASK)
+				rmap_remove(vcpu, &pt[i]);
+			pt[i] = 0;
+		}
+		kvm_arch_ops->tlb_flush(vcpu);
+		return;
+	}
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		ent = pt[i];
+
+		pt[i] = 0;
+		if (!(ent & PT_PRESENT_MASK))
+			continue;
+		ent &= PT64_BASE_ADDR_MASK;
+		mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]);
+	}
+}
+
+static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
+			     struct kvm_mmu_page *page,
+			     u64 *parent_pte)
+{
+	mmu_page_remove_parent_pte(vcpu, page, parent_pte);
+}
+
+static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
+			     struct kvm_mmu_page *page)
+{
+	u64 *parent_pte;
+
+	while (page->multimapped || page->parent_pte) {
+		if (!page->multimapped)
+			parent_pte = page->parent_pte;
+		else {
+			struct kvm_pte_chain *chain;
+
+			chain = container_of(page->parent_ptes.first,
+					     struct kvm_pte_chain, link);
+			parent_pte = chain->parent_ptes[0];
+		}
+		BUG_ON(!parent_pte);
+		kvm_mmu_put_page(vcpu, page, parent_pte);
+		*parent_pte = 0;
+	}
+	kvm_mmu_page_unlink_children(vcpu, page);
+	if (!page->root_count) {
+		hlist_del(&page->hash_link);
+		kvm_mmu_free_page(vcpu, page->page_hpa);
+	} else {
+		list_del(&page->link);
+		list_add(&page->link, &vcpu->kvm->active_mmu_pages);
+	}
+}
+
+static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	unsigned index;
+	struct hlist_head *bucket;
+	struct kvm_mmu_page *page;
+	struct hlist_node *node, *n;
+	int r;
+
+	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+	r = 0;
+	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+	bucket = &vcpu->kvm->mmu_page_hash[index];
+	hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
+		if (page->gfn == gfn && !page->role.metaphysical) {
+			pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+				 page->role.word);
+			kvm_mmu_zap_page(vcpu, page);
+			r = 1;
+		}
+	return r;
 }
 
 static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
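kvm_mmu_get_page() above keys shadow pages on the pair (gfn, role): the hash bucket is chosen by gfn alone, and role.word is compared during the bucket walk, so one guest page table may legitimately have several shadows differing in level, quadrant, or guest paging mode. A toy illustration of that lookup discipline (simplified hypothetical types; the real code uses hlist and union kvm_mmu_page_role):

    #include <stdint.h>
    #include <stdio.h>

    #define NUM_BUCKETS 16

    typedef uint64_t gfn_t;

    struct shadow_page {
    	gfn_t gfn;
    	uint32_t role;            /* packed level/quadrant/mode bits */
    	struct shadow_page *next; /* hash chain */
    };

    static struct shadow_page *buckets[NUM_BUCKETS];

    static struct shadow_page *lookup_or_add(struct shadow_page *fresh,
    					 gfn_t gfn, uint32_t role)
    {
    	unsigned idx = gfn % NUM_BUCKETS;
    	struct shadow_page *p;

    	/* Bucket chosen by gfn alone; role disambiguates inside it. */
    	for (p = buckets[idx]; p; p = p->next)
    		if (p->gfn == gfn && p->role == role)
    			return p;          /* reuse the existing shadow */

    	fresh->gfn = gfn;                  /* miss: install a new shadow */
    	fresh->role = role;
    	fresh->next = buckets[idx];
    	buckets[idx] = fresh;
    	return fresh;
    }

    int main(void)
    {
    	struct shadow_page a = {0}, b = {0};

    	/* Second call finds the first shadow; prints "reused: 1". */
    	printf("reused: %d\n",
    	       lookup_or_add(&a, 42, 1) == lookup_or_add(&b, 42, 1));
    	return 0;
    }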
@@ -225,35 +729,6 @@ hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
 	return gpa_to_hpa(vcpu, gpa);
 }
 
-
-static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
-			       int level)
-{
-	ASSERT(vcpu);
-	ASSERT(VALID_PAGE(page_hpa));
-	ASSERT(level <= PT64_ROOT_LEVEL && level > 0);
-
-	if (level == 1)
-		memset(__va(page_hpa), 0, PAGE_SIZE);
-	else {
-		u64 *pos;
-		u64 *end;
-
-		for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
-		     pos != end; pos++) {
-			u64 current_ent = *pos;
-
-			*pos = 0;
-			if (is_present_pte(current_ent))
-				release_pt_page_64(vcpu,
-						   current_ent &
-						   PT64_BASE_ADDR_MASK,
-						   level - 1);
-		}
-	}
-	kvm_mmu_free_page(vcpu, page_hpa);
-}
-
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
@@ -266,52 +741,109 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
 	for (; ; level--) {
 		u32 index = PT64_INDEX(v, level);
 		u64 *table;
+		u64 pte;
 
 		ASSERT(VALID_PAGE(table_addr));
 		table = __va(table_addr);
 
 		if (level == 1) {
+			pte = table[index];
+			if (is_present_pte(pte) && is_writeble_pte(pte))
+				return 0;
 			mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
 			page_header_update_slot(vcpu->kvm, table, v);
 			table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
 								PT_USER_MASK;
+			rmap_add(vcpu, &table[index]);
 			return 0;
 		}
 
 		if (table[index] == 0) {
-			hpa_t new_table = kvm_mmu_alloc_page(vcpu,
-							     &table[index]);
+			struct kvm_mmu_page *new_table;
+			gfn_t pseudo_gfn;
 
-			if (!VALID_PAGE(new_table)) {
+			pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
+				>> PAGE_SHIFT;
+			new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
+						     v, level - 1,
+						     1, &table[index]);
+			if (!new_table) {
 				pgprintk("nonpaging_map: ENOMEM\n");
 				return -ENOMEM;
 			}
 
-			if (level == PT32E_ROOT_LEVEL)
-				table[index] = new_table | PT_PRESENT_MASK;
-			else
-				table[index] = new_table | PT_PRESENT_MASK |
-					PT_WRITABLE_MASK | PT_USER_MASK;
+			table[index] = new_table->page_hpa | PT_PRESENT_MASK
+				| PT_WRITABLE_MASK | PT_USER_MASK;
 		}
 		table_addr = table[index] & PT64_BASE_ADDR_MASK;
 	}
 }
 
-static void nonpaging_flush(struct kvm_vcpu *vcpu)
+static void mmu_free_roots(struct kvm_vcpu *vcpu)
 {
-	hpa_t root = vcpu->mmu.root_hpa;
+	int i;
+	struct kvm_mmu_page *page;
 
-	++kvm_stat.tlb_flush;
-	pgprintk("nonpaging_flush\n");
-	ASSERT(VALID_PAGE(root));
-	release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level);
-	root = kvm_mmu_alloc_page(vcpu, NULL);
-	ASSERT(VALID_PAGE(root));
-	vcpu->mmu.root_hpa = root;
-	if (is_paging(vcpu))
-		root |= (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK));
-	kvm_arch_ops->set_cr3(vcpu, root);
-	kvm_arch_ops->tlb_flush(vcpu);
+#ifdef CONFIG_X86_64
+	if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->mmu.root_hpa;
+
+		ASSERT(VALID_PAGE(root));
+		page = page_header(root);
+		--page->root_count;
+		vcpu->mmu.root_hpa = INVALID_PAGE;
+		return;
+	}
+#endif
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->mmu.pae_root[i];
+
+		ASSERT(VALID_PAGE(root));
+		root &= PT64_BASE_ADDR_MASK;
+		page = page_header(root);
+		--page->root_count;
+		vcpu->mmu.pae_root[i] = INVALID_PAGE;
+	}
+	vcpu->mmu.root_hpa = INVALID_PAGE;
+}
+
+static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+	int i;
+	gfn_t root_gfn;
+	struct kvm_mmu_page *page;
+
+	root_gfn = vcpu->cr3 >> PAGE_SHIFT;
+
+#ifdef CONFIG_X86_64
+	if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->mmu.root_hpa;
+
+		ASSERT(!VALID_PAGE(root));
+		page = kvm_mmu_get_page(vcpu, root_gfn, 0,
+					PT64_ROOT_LEVEL, 0, NULL);
+		root = page->page_hpa;
+		++page->root_count;
+		vcpu->mmu.root_hpa = root;
+		return;
+	}
+#endif
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->mmu.pae_root[i];
+
+		ASSERT(!VALID_PAGE(root));
+		if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL)
+			root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
+		else if (vcpu->mmu.root_level == 0)
+			root_gfn = 0;
+		page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
+					PT32_ROOT_LEVEL, !is_paging(vcpu),
+					NULL);
+		root = page->page_hpa;
+		++page->root_count;
+		vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
+	}
+	vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
 }
 
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
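A worked example for the PAE path above: each of the four pae_root entries is built with gaddr = i << 30, so each shadows exactly one gigabyte of guest address space. The quadrant field computed in kvm_mmu_get_page() extends the same idea one level down: a 32-bit guest packs 1024 four-byte entries into a page table while a shadow page holds only 512 eight-byte entries, so each guest table needs 2^(PT32_PT_BITS - PT64_PT_BITS) = 2 shadows per level of translation, and the quadrant bits (one bit at level 1, two bits at level 2, per the (PT32_PT_BITS - PT64_PT_BITS) * level mask) record which half, or at the directory level which quarter, of the guest table a given shadow covers.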
@@ -322,43 +854,29 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 			       u32 error_code)
 {
-	int ret;
 	gpa_t addr = gva;
+	hpa_t paddr;
+	int r;
+
+	r = mmu_topup_memory_caches(vcpu);
+	if (r)
+		return r;
 
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
 
-	for (;;) {
-		hpa_t paddr;
-
-		paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
-
-		if (is_error_hpa(paddr))
-			return 1;
+
+	paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
 
-		ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
-		if (ret) {
-			nonpaging_flush(vcpu);
-			continue;
-		}
-		break;
-	}
-	return ret;
-}
+	if (is_error_hpa(paddr))
+		return 1;
 
-static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
-{
+	return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
 }
 
 static void nonpaging_free(struct kvm_vcpu *vcpu)
 {
-	hpa_t root;
-
-	ASSERT(vcpu);
-	root = vcpu->mmu.root_hpa;
-	if (VALID_PAGE(root))
-		release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level);
-	vcpu->mmu.root_hpa = INVALID_PAGE;
+	mmu_free_roots(vcpu);
 }
 
 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
@@ -367,40 +885,31 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = nonpaging_page_fault;
-	context->inval_page = nonpaging_inval_page;
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->free = nonpaging_free;
-	context->root_level = PT32E_ROOT_LEVEL;
+	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
-	context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
+	mmu_alloc_roots(vcpu);
 	ASSERT(VALID_PAGE(context->root_hpa));
 	kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
 	return 0;
 }
 
-
 static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
-	struct kvm_mmu_page *page, *npage;
-
-	list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages,
-				 link) {
-		if (page->global)
-			continue;
-
-		if (!page->parent_pte)
-			continue;
-
-		*page->parent_pte = 0;
-		release_pt_page_64(vcpu, page->page_hpa, 1);
-	}
 	++kvm_stat.tlb_flush;
 	kvm_arch_ops->tlb_flush(vcpu);
 }
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
+	pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
+	mmu_free_roots(vcpu);
+	if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+		kvm_mmu_free_some_pages(vcpu);
+	mmu_alloc_roots(vcpu);
 	kvm_mmu_flush_tlb(vcpu);
+	kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
 }
 
 static void mark_pagetable_nonglobal(void *shadow_pte)
@@ -412,7 +921,8 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu,
 				 u64 *shadow_pte,
 				 gpa_t gaddr,
 				 int dirty,
-				 u64 access_bits)
+				 u64 access_bits,
+				 gfn_t gfn)
 {
 	hpa_t paddr;
 
@@ -420,13 +930,10 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu,
 	if (!dirty)
 		access_bits &= ~PT_WRITABLE_MASK;
 
-	if (access_bits & PT_WRITABLE_MASK)
-		mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
+	paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
 
 	*shadow_pte |= access_bits;
 
-	paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
-
 	if (!(*shadow_pte & PT_GLOBAL_MASK))
 		mark_pagetable_nonglobal(shadow_pte);
 
@@ -434,10 +941,31 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu,
 		*shadow_pte |= gaddr;
 		*shadow_pte |= PT_SHADOW_IO_MARK;
 		*shadow_pte &= ~PT_PRESENT_MASK;
-	} else {
-		*shadow_pte |= paddr;
-		page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
+		return;
+	}
+
+	*shadow_pte |= paddr;
+
+	if (access_bits & PT_WRITABLE_MASK) {
+		struct kvm_mmu_page *shadow;
+
+		shadow = kvm_mmu_lookup_page(vcpu, gfn);
+		if (shadow) {
+			pgprintk("%s: found shadow page for %lx, marking ro\n",
+				 __FUNCTION__, gfn);
+			access_bits &= ~PT_WRITABLE_MASK;
+			if (is_writeble_pte(*shadow_pte)) {
+				*shadow_pte &= ~PT_WRITABLE_MASK;
+				kvm_arch_ops->tlb_flush(vcpu);
+			}
+		}
 	}
+
+	if (access_bits & PT_WRITABLE_MASK)
+		mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
+
+	page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
+	rmap_add(vcpu, shadow_pte);
 }
 
 static void inject_page_fault(struct kvm_vcpu *vcpu,
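The tail of set_pte_common() above enforces the central invariant of the caching MMU: a guest frame that is itself shadowed as a page table must never be mapped writable, or the guest could change a page table without the write being trapped and mirrored into the shadow. A condensed sketch of just that decision, with a hypothetical helper name:

    #include <stdint.h>
    #include <stdio.h>

    #define PT_WRITABLE_MASK (1ull << 1)

    /* finalize_spte is illustrative, not a kernel symbol: write access
     * is granted only when the target frame is not itself a shadowed
     * guest page table, so writes to live page tables always fault and
     * reach kvm_mmu_pre_write(). */
    static uint64_t finalize_spte(uint64_t spte, int want_write,
    			      int gfn_is_shadowed)
    {
    	if (want_write && !gfn_is_shadowed)
    		return spte | PT_WRITABLE_MASK;
    	return spte & ~PT_WRITABLE_MASK;
    }

    int main(void)
    {
    	printf("%llx %llx\n",
    	       (unsigned long long)finalize_spte(0x1, 1, 0),  /* 3: writable */
    	       (unsigned long long)finalize_spte(0x1, 1, 1)); /* 1: read-only */
    	return 0;
    }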
@@ -474,41 +1002,6 @@ static int may_access(u64 pte, int write, int user)
 	return 1;
 }
 
-/*
- * Remove a shadow pte.
- */
-static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
-{
-	hpa_t page_addr = vcpu->mmu.root_hpa;
-	int level = vcpu->mmu.shadow_root_level;
-
-	++kvm_stat.invlpg;
-
-	for (; ; level--) {
-		u32 index = PT64_INDEX(addr, level);
-		u64 *table = __va(page_addr);
-
-		if (level == PT_PAGE_TABLE_LEVEL ) {
-			table[index] = 0;
-			return;
-		}
-
-		if (!is_present_pte(table[index]))
-			return;
-
-		page_addr = table[index] & PT64_BASE_ADDR_MASK;
-
-		if (level == PT_DIRECTORY_LEVEL &&
-		    (table[index] & PT_SHADOW_PS_MARK)) {
-			table[index] = 0;
-			release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL);
-
-			kvm_arch_ops->tlb_flush(vcpu);
-			return;
-		}
-	}
-}
-
 static void paging_free(struct kvm_vcpu *vcpu)
 {
 	nonpaging_free(vcpu);
@@ -522,37 +1015,40 @@ static void paging_free(struct kvm_vcpu *vcpu)
 #include "paging_tmpl.h"
 #undef PTTYPE
 
-static int paging64_init_context(struct kvm_vcpu *vcpu)
+static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 {
 	struct kvm_mmu *context = &vcpu->mmu;
 
 	ASSERT(is_pae(vcpu));
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging64_page_fault;
-	context->inval_page = paging_inval_page;
 	context->gva_to_gpa = paging64_gva_to_gpa;
 	context->free = paging_free;
-	context->root_level = PT64_ROOT_LEVEL;
-	context->shadow_root_level = PT64_ROOT_LEVEL;
-	context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
+	context->root_level = level;
+	context->shadow_root_level = level;
+	mmu_alloc_roots(vcpu);
 	ASSERT(VALID_PAGE(context->root_hpa));
 	kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
 		    (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
 	return 0;
 }
 
+static int paging64_init_context(struct kvm_vcpu *vcpu)
+{
+	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
+}
+
 static int paging32_init_context(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = &vcpu->mmu;
 
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
-	context->inval_page = paging_inval_page;
 	context->gva_to_gpa = paging32_gva_to_gpa;
 	context->free = paging_free;
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
-	context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
+	mmu_alloc_roots(vcpu);
 	ASSERT(VALID_PAGE(context->root_hpa));
 	kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
 		    (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
@@ -561,14 +1057,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 
 static int paging32E_init_context(struct kvm_vcpu *vcpu)
 {
-	int ret;
-
-	if ((ret = paging64_init_context(vcpu)))
-		return ret;
-
-	vcpu->mmu.root_level = PT32E_ROOT_LEVEL;
-	vcpu->mmu.shadow_root_level = PT32E_ROOT_LEVEL;
-	return 0;
+	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
 }
 
 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
@@ -597,41 +1086,161 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
 {
+	int r;
+
 	destroy_kvm_mmu(vcpu);
-	return init_kvm_mmu(vcpu);
+	r = init_kvm_mmu(vcpu);
+	if (r < 0)
+		goto out;
+	r = mmu_topup_memory_caches(vcpu);
+out:
+	return r;
 }
 
-static void free_mmu_pages(struct kvm_vcpu *vcpu)
+void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
 {
-	while (!list_empty(&vcpu->free_pages)) {
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	struct kvm_mmu_page *page;
+	struct kvm_mmu_page *child;
+	struct hlist_node *node, *n;
+	struct hlist_head *bucket;
+	unsigned index;
+	u64 *spte;
+	u64 pte;
+	unsigned offset = offset_in_page(gpa);
+	unsigned pte_size;
+	unsigned page_offset;
+	unsigned misaligned;
+	int level;
+	int flooded = 0;
+
+	pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+	if (gfn == vcpu->last_pt_write_gfn) {
+		++vcpu->last_pt_write_count;
+		if (vcpu->last_pt_write_count >= 3)
+			flooded = 1;
+	} else {
+		vcpu->last_pt_write_gfn = gfn;
+		vcpu->last_pt_write_count = 1;
+	}
+	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+	bucket = &vcpu->kvm->mmu_page_hash[index];
+	hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
+		if (page->gfn != gfn || page->role.metaphysical)
+			continue;
+		pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+		if (misaligned || flooded) {
+			/*
+			 * Misaligned accesses are too much trouble to fix
+			 * up; also, they usually indicate a page is not used
+			 * as a page table.
+			 *
+			 * If we're seeing too many writes to a page,
+			 * it may no longer be a page table, or we may be
+			 * forking, in which case it is better to unmap the
+			 * page.
+			 */
+			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+				 gpa, bytes, page->role.word);
+			kvm_mmu_zap_page(vcpu, page);
+			continue;
+		}
+		page_offset = offset;
+		level = page->role.level;
+		if (page->role.glevels == PT32_ROOT_LEVEL) {
+			page_offset <<= 1;	/* 32->64 */
+			page_offset &= ~PAGE_MASK;
+		}
+		spte = __va(page->page_hpa);
+		spte += page_offset / sizeof(*spte);
+		pte = *spte;
+		if (is_present_pte(pte)) {
+			if (level == PT_PAGE_TABLE_LEVEL)
+				rmap_remove(vcpu, spte);
+			else {
+				child = page_header(pte & PT64_BASE_ADDR_MASK);
+				mmu_page_remove_parent_pte(vcpu, child, spte);
+			}
+		}
+		*spte = 0;
+	}
+}
+
+void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
+{
+}
+
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
+
+	return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
+}
+
+void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+	while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
 		struct kvm_mmu_page *page;
 
+		page = container_of(vcpu->kvm->active_mmu_pages.prev,
+				    struct kvm_mmu_page, link);
+		kvm_mmu_zap_page(vcpu, page);
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_some_pages);
+
+static void free_mmu_pages(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *page;
+
+	while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
+		page = container_of(vcpu->kvm->active_mmu_pages.next,
+				    struct kvm_mmu_page, link);
+		kvm_mmu_zap_page(vcpu, page);
+	}
+	while (!list_empty(&vcpu->free_pages)) {
 		page = list_entry(vcpu->free_pages.next,
 				  struct kvm_mmu_page, link);
 		list_del(&page->link);
 		__free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
 		page->page_hpa = INVALID_PAGE;
 	}
+	free_page((unsigned long)vcpu->mmu.pae_root);
 }
 
 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 {
+	struct page *page;
 	int i;
 
 	ASSERT(vcpu);
 
 	for (i = 0; i < KVM_NUM_MMU_PAGES; i++) {
-		struct page *page;
 		struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];
 
 		INIT_LIST_HEAD(&page_header->link);
-		if ((page = alloc_page(GFP_KVM_MMU)) == NULL)
+		if ((page = alloc_page(GFP_KERNEL)) == NULL)
 			goto error_1;
 		page->private = (unsigned long)page_header;
 		page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
 		memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
 		list_add(&page_header->link, &vcpu->free_pages);
+		++vcpu->kvm->n_free_mmu_pages;
 	}
+
+	/*
+	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
+	 * Therefore we need to allocate shadow page tables in the first
+	 * 4GB of memory, which happens to fit the DMA32 zone.
+	 */
+	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+	if (!page)
+		goto error_1;
+	vcpu->mmu.pae_root = page_address(page);
+	for (i = 0; i < 4; ++i)
+		vcpu->mmu.pae_root[i] = INVALID_PAGE;
+
 	return 0;
 
 error_1:
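kvm_mmu_pre_write() above uses two cheap heuristics to decide that a write-protected page probably is not a page table anymore: misaligned writes, and repeated writes to the same gfn (the "flood" counter). A standalone sketch of both tests, assuming 8-byte PTEs and hypothetical names:

    #include <stdio.h>

    /* Flood detection: three or more consecutive emulated writes to
     * the same gfn suggest the page is plain data, not a live page
     * table, so it is better to unmap (zap) it. */
    struct flood_state {
    	unsigned long last_gfn;
    	int count;
    };

    static int is_flooded(struct flood_state *s, unsigned long gfn)
    {
    	if (gfn == s->last_gfn)
    		return ++s->count >= 3;
    	s->last_gfn = gfn;
    	s->count = 1;
    	return 0;
    }

    /* A write is "misaligned" if it does not stay inside one pte_size
     * slot; same expression as in kvm_mmu_pre_write(). */
    static int is_misaligned(unsigned offset, unsigned bytes,
    			 unsigned pte_size)
    {
    	return (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
    }

    int main(void)
    {
    	struct flood_state s = { 0, 0 };
    	int a = is_flooded(&s, 5);
    	int b = is_flooded(&s, 5);
    	int c = is_flooded(&s, 5);

    	printf("%d %d %d\n", a, b, c);               /* 0 0 1 */
    	printf("%d\n", is_misaligned(6, 4, 8) != 0); /* crosses a slot: 1 */
    	return 0;
    }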
@@ -663,10 +1272,12 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 
 	destroy_kvm_mmu(vcpu);
 	free_mmu_pages(vcpu);
+	mmu_free_memory_caches(vcpu);
 }
 
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot)
 {
+	struct kvm *kvm = vcpu->kvm;
 	struct kvm_mmu_page *page;
 
 	list_for_each_entry(page, &kvm->active_mmu_pages, link) {
@@ -679,8 +1290,169 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 		pt = __va(page->page_hpa);
 		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
 			/* avoid RMW */
-			if (pt[i] & PT_WRITABLE_MASK)
+			if (pt[i] & PT_WRITABLE_MASK) {
+				rmap_remove(vcpu, &pt[i]);
 				pt[i] &= ~PT_WRITABLE_MASK;
+			}
+	}
+}
+
+#ifdef AUDIT
+
+static const char *audit_msg;
+
+static gva_t canonicalize(gva_t gva)
+{
+#ifdef CONFIG_X86_64
+	gva = (long long)(gva << 16) >> 16;
+#endif
+	return gva;
+}
 
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+				gva_t va, int level)
+{
+	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+	int i;
+	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+		u64 ent = pt[i];
+
+		if (!ent & PT_PRESENT_MASK)
+			continue;
+
+		va = canonicalize(va);
+		if (level > 1)
+			audit_mappings_page(vcpu, ent, va, level - 1);
+		else {
+			gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
+			hpa_t hpa = gpa_to_hpa(vcpu, gpa);
+
+			if ((ent & PT_PRESENT_MASK)
+			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
+				printk(KERN_ERR "audit error: (%s) levels %d"
+				       " gva %lx gpa %llx hpa %llx ent %llx\n",
+				       audit_msg, vcpu->mmu.root_level,
+				       va, gpa, hpa, ent);
+		}
 	}
 }
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	if (vcpu->mmu.root_level == 4)
+		audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
+	else
+		for (i = 0; i < 4; ++i)
+			if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
+				audit_mappings_page(vcpu,
+						    vcpu->mmu.pae_root[i],
+						    i << 30,
+						    2);
+}
+
+static int count_rmaps(struct kvm_vcpu *vcpu)
+{
+	int nmaps = 0;
+	int i, j, k;
+
+	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+		struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
+		struct kvm_rmap_desc *d;
+
+		for (j = 0; j < m->npages; ++j) {
+			struct page *page = m->phys_mem[j];
+
+			if (!page->private)
+				continue;
+			if (!(page->private & 1)) {
+				++nmaps;
+				continue;
+			}
+			d = (struct kvm_rmap_desc *)(page->private & ~1ul);
+			while (d) {
+				for (k = 0; k < RMAP_EXT; ++k)
+					if (d->shadow_ptes[k])
+						++nmaps;
+					else
+						break;
+				d = d->more;
+			}
+		}
+	}
+	return nmaps;
+}
+
+static int count_writable_mappings(struct kvm_vcpu *vcpu)
+{
+	int nmaps = 0;
+	struct kvm_mmu_page *page;
+	int i;
+
+	list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
+		u64 *pt = __va(page->page_hpa);
+
+		if (page->role.level != PT_PAGE_TABLE_LEVEL)
+			continue;
+
+		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+			u64 ent = pt[i];
+
+			if (!(ent & PT_PRESENT_MASK))
+				continue;
+			if (!(ent & PT_WRITABLE_MASK))
+				continue;
+			++nmaps;
+		}
+	}
+	return nmaps;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+	int n_rmap = count_rmaps(vcpu);
+	int n_actual = count_writable_mappings(vcpu);
+
+	if (n_rmap != n_actual)
+		printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
+		       __FUNCTION__, audit_msg, n_rmap, n_actual);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *page;
+
+	list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
+		hfn_t hfn;
+		struct page *pg;
+
+		if (page->role.metaphysical)
+			continue;
+
+		hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
+			>> PAGE_SHIFT;
+		pg = pfn_to_page(hfn);
+		if (pg->private)
+			printk(KERN_ERR "%s: (%s) shadow page has writable"
+			       " mappings: gfn %lx role %x\n",
+			       __FUNCTION__, audit_msg, page->gfn,
+			       page->role.word);
+	}
+}
+
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+{
+	int olddbg = dbg;
+
+	dbg = 0;
+	audit_msg = msg;
+	audit_rmap(vcpu);
+	audit_write_protection(vcpu);
+	audit_mappings(vcpu);
+	dbg = olddbg;
+}
+
+#endif
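Both debug facilities in this patch are compile-time switches, off by default: the header hunk #undefs MMU_DEBUG and AUDIT, so pgprintk()/rmap_printk() compile to empty statements and kvm_mmu_audit() to an empty stub. Enabling them means editing the top of mmu.c rather than Kconfig, along these lines:

    #define MMU_DEBUG	/* replaces "#undef MMU_DEBUG" at the top of mmu.c */
    #define AUDIT	/* replaces "#undef AUDIT"; either switch defines dbg = 1 */

With AUDIT on, kvm_mmu_audit() cross-checks the rmap population count against the writable sptes actually installed, re-walks every gva to gpa to hpa translation, and verifies that no shadowed guest page table has been left with writable mappings.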