aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--arch/x86/kvm/mmu.c587
1 files changed, 384 insertions, 203 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0ef5bb2b4043..eca41ae9f453 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h"
21 22
22#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
23#include <linux/types.h> 24#include <linux/types.h>
@@ -107,6 +108,9 @@ module_param(oos_shadow, bool, 0644);
107 108
108#define PT32_LEVEL_MASK(level) \ 109#define PT32_LEVEL_MASK(level) \
109 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) 110 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
111#define PT32_LVL_OFFSET_MASK(level) \
112 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
113 * PT32_LEVEL_BITS))) - 1))
110 114
111#define PT32_INDEX(address, level)\ 115#define PT32_INDEX(address, level)\
112 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) 116 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
@@ -115,10 +119,19 @@ module_param(oos_shadow, bool, 0644);
115#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) 119#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
116#define PT64_DIR_BASE_ADDR_MASK \ 120#define PT64_DIR_BASE_ADDR_MASK \
117 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) 121 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
122#define PT64_LVL_ADDR_MASK(level) \
123 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
124 * PT64_LEVEL_BITS))) - 1))
125#define PT64_LVL_OFFSET_MASK(level) \
126 (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
127 * PT64_LEVEL_BITS))) - 1))
118 128
119#define PT32_BASE_ADDR_MASK PAGE_MASK 129#define PT32_BASE_ADDR_MASK PAGE_MASK
120#define PT32_DIR_BASE_ADDR_MASK \ 130#define PT32_DIR_BASE_ADDR_MASK \
121 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) 131 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
132#define PT32_LVL_ADDR_MASK(level) \
133 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
134 * PT32_LEVEL_BITS))) - 1))
122 135
123#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 136#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
124 | PT64_NX_MASK) 137 | PT64_NX_MASK)
@@ -129,6 +142,7 @@ module_param(oos_shadow, bool, 0644);
129#define PFERR_RSVD_MASK (1U << 3) 142#define PFERR_RSVD_MASK (1U << 3)
130#define PFERR_FETCH_MASK (1U << 4) 143#define PFERR_FETCH_MASK (1U << 4)
131 144
145#define PT_PDPE_LEVEL 3
132#define PT_DIRECTORY_LEVEL 2 146#define PT_DIRECTORY_LEVEL 2
133#define PT_PAGE_TABLE_LEVEL 1 147#define PT_PAGE_TABLE_LEVEL 1
134 148
@@ -139,10 +153,13 @@ module_param(oos_shadow, bool, 0644);
139#define ACC_USER_MASK PT_USER_MASK 153#define ACC_USER_MASK PT_USER_MASK
140#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 154#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
141 155
156#define CREATE_TRACE_POINTS
157#include "mmutrace.h"
158
142#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 159#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
143 160
144struct kvm_rmap_desc { 161struct kvm_rmap_desc {
145 u64 *shadow_ptes[RMAP_EXT]; 162 u64 *sptes[RMAP_EXT];
146 struct kvm_rmap_desc *more; 163 struct kvm_rmap_desc *more;
147}; 164};
148 165
@@ -239,16 +256,25 @@ static int is_writeble_pte(unsigned long pte)
239 return pte & PT_WRITABLE_MASK; 256 return pte & PT_WRITABLE_MASK;
240} 257}
241 258
242static int is_dirty_pte(unsigned long pte) 259static int is_dirty_gpte(unsigned long pte)
243{ 260{
244 return pte & shadow_dirty_mask; 261 return pte & PT_DIRTY_MASK;
245} 262}
246 263
247static int is_rmap_pte(u64 pte) 264static int is_rmap_spte(u64 pte)
248{ 265{
249 return is_shadow_present_pte(pte); 266 return is_shadow_present_pte(pte);
250} 267}
251 268
269static int is_last_spte(u64 pte, int level)
270{
271 if (level == PT_PAGE_TABLE_LEVEL)
272 return 1;
273 if (is_large_pte(pte))
274 return 1;
275 return 0;
276}
277
252static pfn_t spte_to_pfn(u64 pte) 278static pfn_t spte_to_pfn(u64 pte)
253{ 279{
254 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 280 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -261,7 +287,7 @@ static gfn_t pse36_gfn_delta(u32 gpte)
261 return (gpte & PT32_DIR_PSE36_MASK) << shift; 287 return (gpte & PT32_DIR_PSE36_MASK) << shift;
262} 288}
263 289
264static void set_shadow_pte(u64 *sptep, u64 spte) 290static void __set_spte(u64 *sptep, u64 spte)
265{ 291{
266#ifdef CONFIG_X86_64 292#ifdef CONFIG_X86_64
267 set_64bit((unsigned long *)sptep, spte); 293 set_64bit((unsigned long *)sptep, spte);
@@ -380,37 +406,52 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
380 * Return the pointer to the largepage write count for a given 406 * Return the pointer to the largepage write count for a given
381 * gfn, handling slots that are not large page aligned. 407 * gfn, handling slots that are not large page aligned.
382 */ 408 */
383static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot) 409static int *slot_largepage_idx(gfn_t gfn,
410 struct kvm_memory_slot *slot,
411 int level)
384{ 412{
385 unsigned long idx; 413 unsigned long idx;
386 414
387 idx = (gfn / KVM_PAGES_PER_HPAGE) - 415 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
388 (slot->base_gfn / KVM_PAGES_PER_HPAGE); 416 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
389 return &slot->lpage_info[idx].write_count; 417 return &slot->lpage_info[level - 2][idx].write_count;
390} 418}
391 419
392static void account_shadowed(struct kvm *kvm, gfn_t gfn) 420static void account_shadowed(struct kvm *kvm, gfn_t gfn)
393{ 421{
422 struct kvm_memory_slot *slot;
394 int *write_count; 423 int *write_count;
424 int i;
395 425
396 gfn = unalias_gfn(kvm, gfn); 426 gfn = unalias_gfn(kvm, gfn);
397 write_count = slot_largepage_idx(gfn, 427
398 gfn_to_memslot_unaliased(kvm, gfn)); 428 slot = gfn_to_memslot_unaliased(kvm, gfn);
399 *write_count += 1; 429 for (i = PT_DIRECTORY_LEVEL;
430 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
431 write_count = slot_largepage_idx(gfn, slot, i);
432 *write_count += 1;
433 }
400} 434}
401 435
402static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 436static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
403{ 437{
438 struct kvm_memory_slot *slot;
404 int *write_count; 439 int *write_count;
440 int i;
405 441
406 gfn = unalias_gfn(kvm, gfn); 442 gfn = unalias_gfn(kvm, gfn);
407 write_count = slot_largepage_idx(gfn, 443 for (i = PT_DIRECTORY_LEVEL;
408 gfn_to_memslot_unaliased(kvm, gfn)); 444 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
409 *write_count -= 1; 445 slot = gfn_to_memslot_unaliased(kvm, gfn);
410 WARN_ON(*write_count < 0); 446 write_count = slot_largepage_idx(gfn, slot, i);
447 *write_count -= 1;
448 WARN_ON(*write_count < 0);
449 }
411} 450}
412 451
413static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) 452static int has_wrprotected_page(struct kvm *kvm,
453 gfn_t gfn,
454 int level)
414{ 455{
415 struct kvm_memory_slot *slot; 456 struct kvm_memory_slot *slot;
416 int *largepage_idx; 457 int *largepage_idx;
@@ -418,47 +459,67 @@ static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
418 gfn = unalias_gfn(kvm, gfn); 459 gfn = unalias_gfn(kvm, gfn);
419 slot = gfn_to_memslot_unaliased(kvm, gfn); 460 slot = gfn_to_memslot_unaliased(kvm, gfn);
420 if (slot) { 461 if (slot) {
421 largepage_idx = slot_largepage_idx(gfn, slot); 462 largepage_idx = slot_largepage_idx(gfn, slot, level);
422 return *largepage_idx; 463 return *largepage_idx;
423 } 464 }
424 465
425 return 1; 466 return 1;
426} 467}
427 468
428static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) 469static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
429{ 470{
471 unsigned long page_size = PAGE_SIZE;
430 struct vm_area_struct *vma; 472 struct vm_area_struct *vma;
431 unsigned long addr; 473 unsigned long addr;
432 int ret = 0; 474 int i, ret = 0;
433 475
434 addr = gfn_to_hva(kvm, gfn); 476 addr = gfn_to_hva(kvm, gfn);
435 if (kvm_is_error_hva(addr)) 477 if (kvm_is_error_hva(addr))
436 return ret; 478 return page_size;
437 479
438 down_read(&current->mm->mmap_sem); 480 down_read(&current->mm->mmap_sem);
439 vma = find_vma(current->mm, addr); 481 vma = find_vma(current->mm, addr);
440 if (vma && is_vm_hugetlb_page(vma)) 482 if (!vma)
441 ret = 1; 483 goto out;
484
485 page_size = vma_kernel_pagesize(vma);
486
487out:
442 up_read(&current->mm->mmap_sem); 488 up_read(&current->mm->mmap_sem);
443 489
490 for (i = PT_PAGE_TABLE_LEVEL;
491 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
492 if (page_size >= KVM_HPAGE_SIZE(i))
493 ret = i;
494 else
495 break;
496 }
497
444 return ret; 498 return ret;
445} 499}
446 500
447static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) 501static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
448{ 502{
449 struct kvm_memory_slot *slot; 503 struct kvm_memory_slot *slot;
450 504 int host_level;
451 if (has_wrprotected_page(vcpu->kvm, large_gfn)) 505 int level = PT_PAGE_TABLE_LEVEL;
452 return 0;
453
454 if (!host_largepage_backed(vcpu->kvm, large_gfn))
455 return 0;
456 506
457 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 507 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
458 if (slot && slot->dirty_bitmap) 508 if (slot && slot->dirty_bitmap)
459 return 0; 509 return PT_PAGE_TABLE_LEVEL;
460 510
461 return 1; 511 host_level = host_mapping_level(vcpu->kvm, large_gfn);
512
513 if (host_level == PT_PAGE_TABLE_LEVEL)
514 return host_level;
515
516 for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) {
517
518 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
519 break;
520 }
521
522 return level - 1;
462} 523}
463 524
464/* 525/*
@@ -466,19 +527,19 @@ static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
466 * Note: gfn must be unaliased before this function get called 527 * Note: gfn must be unaliased before this function get called
467 */ 528 */
468 529
469static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) 530static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
470{ 531{
471 struct kvm_memory_slot *slot; 532 struct kvm_memory_slot *slot;
472 unsigned long idx; 533 unsigned long idx;
473 534
474 slot = gfn_to_memslot(kvm, gfn); 535 slot = gfn_to_memslot(kvm, gfn);
475 if (!lpage) 536 if (likely(level == PT_PAGE_TABLE_LEVEL))
476 return &slot->rmap[gfn - slot->base_gfn]; 537 return &slot->rmap[gfn - slot->base_gfn];
477 538
478 idx = (gfn / KVM_PAGES_PER_HPAGE) - 539 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
479 (slot->base_gfn / KVM_PAGES_PER_HPAGE); 540 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
480 541
481 return &slot->lpage_info[idx].rmap_pde; 542 return &slot->lpage_info[level - 2][idx].rmap_pde;
482} 543}
483 544
484/* 545/*
@@ -494,42 +555,42 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
494 * the spte was not added. 555 * the spte was not added.
495 * 556 *
496 */ 557 */
497static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) 558static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
498{ 559{
499 struct kvm_mmu_page *sp; 560 struct kvm_mmu_page *sp;
500 struct kvm_rmap_desc *desc; 561 struct kvm_rmap_desc *desc;
501 unsigned long *rmapp; 562 unsigned long *rmapp;
502 int i, count = 0; 563 int i, count = 0;
503 564
504 if (!is_rmap_pte(*spte)) 565 if (!is_rmap_spte(*spte))
505 return count; 566 return count;
506 gfn = unalias_gfn(vcpu->kvm, gfn); 567 gfn = unalias_gfn(vcpu->kvm, gfn);
507 sp = page_header(__pa(spte)); 568 sp = page_header(__pa(spte));
508 sp->gfns[spte - sp->spt] = gfn; 569 sp->gfns[spte - sp->spt] = gfn;
509 rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); 570 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
510 if (!*rmapp) { 571 if (!*rmapp) {
511 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); 572 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
512 *rmapp = (unsigned long)spte; 573 *rmapp = (unsigned long)spte;
513 } else if (!(*rmapp & 1)) { 574 } else if (!(*rmapp & 1)) {
514 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); 575 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
515 desc = mmu_alloc_rmap_desc(vcpu); 576 desc = mmu_alloc_rmap_desc(vcpu);
516 desc->shadow_ptes[0] = (u64 *)*rmapp; 577 desc->sptes[0] = (u64 *)*rmapp;
517 desc->shadow_ptes[1] = spte; 578 desc->sptes[1] = spte;
518 *rmapp = (unsigned long)desc | 1; 579 *rmapp = (unsigned long)desc | 1;
519 } else { 580 } else {
520 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 581 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
521 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 582 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
522 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) { 583 while (desc->sptes[RMAP_EXT-1] && desc->more) {
523 desc = desc->more; 584 desc = desc->more;
524 count += RMAP_EXT; 585 count += RMAP_EXT;
525 } 586 }
526 if (desc->shadow_ptes[RMAP_EXT-1]) { 587 if (desc->sptes[RMAP_EXT-1]) {
527 desc->more = mmu_alloc_rmap_desc(vcpu); 588 desc->more = mmu_alloc_rmap_desc(vcpu);
528 desc = desc->more; 589 desc = desc->more;
529 } 590 }
530 for (i = 0; desc->shadow_ptes[i]; ++i) 591 for (i = 0; desc->sptes[i]; ++i)
531 ; 592 ;
532 desc->shadow_ptes[i] = spte; 593 desc->sptes[i] = spte;
533 } 594 }
534 return count; 595 return count;
535} 596}
@@ -541,14 +602,14 @@ static void rmap_desc_remove_entry(unsigned long *rmapp,
541{ 602{
542 int j; 603 int j;
543 604
544 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) 605 for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
545 ; 606 ;
546 desc->shadow_ptes[i] = desc->shadow_ptes[j]; 607 desc->sptes[i] = desc->sptes[j];
547 desc->shadow_ptes[j] = NULL; 608 desc->sptes[j] = NULL;
548 if (j != 0) 609 if (j != 0)
549 return; 610 return;
550 if (!prev_desc && !desc->more) 611 if (!prev_desc && !desc->more)
551 *rmapp = (unsigned long)desc->shadow_ptes[0]; 612 *rmapp = (unsigned long)desc->sptes[0];
552 else 613 else
553 if (prev_desc) 614 if (prev_desc)
554 prev_desc->more = desc->more; 615 prev_desc->more = desc->more;
@@ -566,7 +627,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
566 unsigned long *rmapp; 627 unsigned long *rmapp;
567 int i; 628 int i;
568 629
569 if (!is_rmap_pte(*spte)) 630 if (!is_rmap_spte(*spte))
570 return; 631 return;
571 sp = page_header(__pa(spte)); 632 sp = page_header(__pa(spte));
572 pfn = spte_to_pfn(*spte); 633 pfn = spte_to_pfn(*spte);
@@ -576,7 +637,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
576 kvm_release_pfn_dirty(pfn); 637 kvm_release_pfn_dirty(pfn);
577 else 638 else
578 kvm_release_pfn_clean(pfn); 639 kvm_release_pfn_clean(pfn);
579 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte)); 640 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
580 if (!*rmapp) { 641 if (!*rmapp) {
581 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 642 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
582 BUG(); 643 BUG();
@@ -593,8 +654,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
593 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 654 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
594 prev_desc = NULL; 655 prev_desc = NULL;
595 while (desc) { 656 while (desc) {
596 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) 657 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
597 if (desc->shadow_ptes[i] == spte) { 658 if (desc->sptes[i] == spte) {
598 rmap_desc_remove_entry(rmapp, 659 rmap_desc_remove_entry(rmapp,
599 desc, i, 660 desc, i,
600 prev_desc); 661 prev_desc);
@@ -625,10 +686,10 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
625 prev_desc = NULL; 686 prev_desc = NULL;
626 prev_spte = NULL; 687 prev_spte = NULL;
627 while (desc) { 688 while (desc) {
628 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { 689 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
629 if (prev_spte == spte) 690 if (prev_spte == spte)
630 return desc->shadow_ptes[i]; 691 return desc->sptes[i];
631 prev_spte = desc->shadow_ptes[i]; 692 prev_spte = desc->sptes[i];
632 } 693 }
633 desc = desc->more; 694 desc = desc->more;
634 } 695 }
@@ -639,10 +700,10 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
639{ 700{
640 unsigned long *rmapp; 701 unsigned long *rmapp;
641 u64 *spte; 702 u64 *spte;
642 int write_protected = 0; 703 int i, write_protected = 0;
643 704
644 gfn = unalias_gfn(kvm, gfn); 705 gfn = unalias_gfn(kvm, gfn);
645 rmapp = gfn_to_rmap(kvm, gfn, 0); 706 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
646 707
647 spte = rmap_next(kvm, rmapp, NULL); 708 spte = rmap_next(kvm, rmapp, NULL);
648 while (spte) { 709 while (spte) {
@@ -650,7 +711,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
650 BUG_ON(!(*spte & PT_PRESENT_MASK)); 711 BUG_ON(!(*spte & PT_PRESENT_MASK));
651 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 712 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
652 if (is_writeble_pte(*spte)) { 713 if (is_writeble_pte(*spte)) {
653 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); 714 __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
654 write_protected = 1; 715 write_protected = 1;
655 } 716 }
656 spte = rmap_next(kvm, rmapp, spte); 717 spte = rmap_next(kvm, rmapp, spte);
@@ -664,21 +725,24 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
664 } 725 }
665 726
666 /* check for huge page mappings */ 727 /* check for huge page mappings */
667 rmapp = gfn_to_rmap(kvm, gfn, 1); 728 for (i = PT_DIRECTORY_LEVEL;
668 spte = rmap_next(kvm, rmapp, NULL); 729 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
669 while (spte) { 730 rmapp = gfn_to_rmap(kvm, gfn, i);
670 BUG_ON(!spte); 731 spte = rmap_next(kvm, rmapp, NULL);
671 BUG_ON(!(*spte & PT_PRESENT_MASK)); 732 while (spte) {
672 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 733 BUG_ON(!spte);
673 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 734 BUG_ON(!(*spte & PT_PRESENT_MASK));
674 if (is_writeble_pte(*spte)) { 735 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
675 rmap_remove(kvm, spte); 736 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
676 --kvm->stat.lpages; 737 if (is_writeble_pte(*spte)) {
677 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 738 rmap_remove(kvm, spte);
678 spte = NULL; 739 --kvm->stat.lpages;
679 write_protected = 1; 740 __set_spte(spte, shadow_trap_nonpresent_pte);
741 spte = NULL;
742 write_protected = 1;
743 }
744 spte = rmap_next(kvm, rmapp, spte);
680 } 745 }
681 spte = rmap_next(kvm, rmapp, spte);
682 } 746 }
683 747
684 return write_protected; 748 return write_protected;
@@ -693,7 +757,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
693 BUG_ON(!(*spte & PT_PRESENT_MASK)); 757 BUG_ON(!(*spte & PT_PRESENT_MASK));
694 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 758 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
695 rmap_remove(kvm, spte); 759 rmap_remove(kvm, spte);
696 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 760 __set_spte(spte, shadow_trap_nonpresent_pte);
697 need_tlb_flush = 1; 761 need_tlb_flush = 1;
698 } 762 }
699 return need_tlb_flush; 763 return need_tlb_flush;
@@ -702,7 +766,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
702static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 766static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
703 int (*handler)(struct kvm *kvm, unsigned long *rmapp)) 767 int (*handler)(struct kvm *kvm, unsigned long *rmapp))
704{ 768{
705 int i; 769 int i, j;
706 int retval = 0; 770 int retval = 0;
707 771
708 /* 772 /*
@@ -721,11 +785,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
721 end = start + (memslot->npages << PAGE_SHIFT); 785 end = start + (memslot->npages << PAGE_SHIFT);
722 if (hva >= start && hva < end) { 786 if (hva >= start && hva < end) {
723 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 787 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
788
724 retval |= handler(kvm, &memslot->rmap[gfn_offset]); 789 retval |= handler(kvm, &memslot->rmap[gfn_offset]);
725 retval |= handler(kvm, 790
726 &memslot->lpage_info[ 791 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
727 gfn_offset / 792 int idx = gfn_offset;
728 KVM_PAGES_PER_HPAGE].rmap_pde); 793 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
794 retval |= handler(kvm,
795 &memslot->lpage_info[j][idx].rmap_pde);
796 }
729 } 797 }
730 } 798 }
731 799
@@ -763,12 +831,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
763 831
764#define RMAP_RECYCLE_THRESHOLD 1000 832#define RMAP_RECYCLE_THRESHOLD 1000
765 833
766static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) 834static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
767{ 835{
768 unsigned long *rmapp; 836 unsigned long *rmapp;
837 struct kvm_mmu_page *sp;
838
839 sp = page_header(__pa(spte));
769 840
770 gfn = unalias_gfn(vcpu->kvm, gfn); 841 gfn = unalias_gfn(vcpu->kvm, gfn);
771 rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); 842 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
772 843
773 kvm_unmap_rmapp(vcpu->kvm, rmapp); 844 kvm_unmap_rmapp(vcpu->kvm, rmapp);
774 kvm_flush_remote_tlbs(vcpu->kvm); 845 kvm_flush_remote_tlbs(vcpu->kvm);
@@ -1109,6 +1180,7 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1109 return 1; 1180 return 1;
1110 } 1181 }
1111 1182
1183 trace_kvm_mmu_sync_page(sp);
1112 if (rmap_write_protect(vcpu->kvm, sp->gfn)) 1184 if (rmap_write_protect(vcpu->kvm, sp->gfn))
1113 kvm_flush_remote_tlbs(vcpu->kvm); 1185 kvm_flush_remote_tlbs(vcpu->kvm);
1114 kvm_unlink_unsync_page(vcpu->kvm, sp); 1186 kvm_unlink_unsync_page(vcpu->kvm, sp);
@@ -1231,8 +1303,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1231 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1303 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1232 role.quadrant = quadrant; 1304 role.quadrant = quadrant;
1233 } 1305 }
1234 pgprintk("%s: looking gfn %lx role %x\n", __func__,
1235 gfn, role.word);
1236 index = kvm_page_table_hashfn(gfn); 1306 index = kvm_page_table_hashfn(gfn);
1237 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1307 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1238 hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) 1308 hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
@@ -1249,14 +1319,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1249 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); 1319 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1250 kvm_mmu_mark_parents_unsync(vcpu, sp); 1320 kvm_mmu_mark_parents_unsync(vcpu, sp);
1251 } 1321 }
1252 pgprintk("%s: found\n", __func__); 1322 trace_kvm_mmu_get_page(sp, false);
1253 return sp; 1323 return sp;
1254 } 1324 }
1255 ++vcpu->kvm->stat.mmu_cache_miss; 1325 ++vcpu->kvm->stat.mmu_cache_miss;
1256 sp = kvm_mmu_alloc_page(vcpu, parent_pte); 1326 sp = kvm_mmu_alloc_page(vcpu, parent_pte);
1257 if (!sp) 1327 if (!sp)
1258 return sp; 1328 return sp;
1259 pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
1260 sp->gfn = gfn; 1329 sp->gfn = gfn;
1261 sp->role = role; 1330 sp->role = role;
1262 hlist_add_head(&sp->hash_link, bucket); 1331 hlist_add_head(&sp->hash_link, bucket);
@@ -1269,6 +1338,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1269 vcpu->arch.mmu.prefetch_page(vcpu, sp); 1338 vcpu->arch.mmu.prefetch_page(vcpu, sp);
1270 else 1339 else
1271 nonpaging_prefetch_page(vcpu, sp); 1340 nonpaging_prefetch_page(vcpu, sp);
1341 trace_kvm_mmu_get_page(sp, true);
1272 return sp; 1342 return sp;
1273} 1343}
1274 1344
@@ -1292,6 +1362,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1292{ 1362{
1293 if (iterator->level < PT_PAGE_TABLE_LEVEL) 1363 if (iterator->level < PT_PAGE_TABLE_LEVEL)
1294 return false; 1364 return false;
1365
1366 if (iterator->level == PT_PAGE_TABLE_LEVEL)
1367 if (is_large_pte(*iterator->sptep))
1368 return false;
1369
1295 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); 1370 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1296 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; 1371 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1297 return true; 1372 return true;
@@ -1312,25 +1387,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1312 1387
1313 pt = sp->spt; 1388 pt = sp->spt;
1314 1389
1315 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1316 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1317 if (is_shadow_present_pte(pt[i]))
1318 rmap_remove(kvm, &pt[i]);
1319 pt[i] = shadow_trap_nonpresent_pte;
1320 }
1321 return;
1322 }
1323
1324 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 1390 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1325 ent = pt[i]; 1391 ent = pt[i];
1326 1392
1327 if (is_shadow_present_pte(ent)) { 1393 if (is_shadow_present_pte(ent)) {
1328 if (!is_large_pte(ent)) { 1394 if (!is_last_spte(ent, sp->role.level)) {
1329 ent &= PT64_BASE_ADDR_MASK; 1395 ent &= PT64_BASE_ADDR_MASK;
1330 mmu_page_remove_parent_pte(page_header(ent), 1396 mmu_page_remove_parent_pte(page_header(ent),
1331 &pt[i]); 1397 &pt[i]);
1332 } else { 1398 } else {
1333 --kvm->stat.lpages; 1399 if (is_large_pte(ent))
1400 --kvm->stat.lpages;
1334 rmap_remove(kvm, &pt[i]); 1401 rmap_remove(kvm, &pt[i]);
1335 } 1402 }
1336 } 1403 }
@@ -1346,10 +1413,10 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1346static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) 1413static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1347{ 1414{
1348 int i; 1415 int i;
1416 struct kvm_vcpu *vcpu;
1349 1417
1350 for (i = 0; i < KVM_MAX_VCPUS; ++i) 1418 kvm_for_each_vcpu(i, vcpu, kvm)
1351 if (kvm->vcpus[i]) 1419 vcpu->arch.last_pte_updated = NULL;
1352 kvm->vcpus[i]->arch.last_pte_updated = NULL;
1353} 1420}
1354 1421
1355static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) 1422static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1368,7 +1435,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1368 } 1435 }
1369 BUG_ON(!parent_pte); 1436 BUG_ON(!parent_pte);
1370 kvm_mmu_put_page(sp, parent_pte); 1437 kvm_mmu_put_page(sp, parent_pte);
1371 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); 1438 __set_spte(parent_pte, shadow_trap_nonpresent_pte);
1372 } 1439 }
1373} 1440}
1374 1441
@@ -1400,6 +1467,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1400static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1467static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1401{ 1468{
1402 int ret; 1469 int ret;
1470
1471 trace_kvm_mmu_zap_page(sp);
1403 ++kvm->stat.mmu_shadow_zapped; 1472 ++kvm->stat.mmu_shadow_zapped;
1404 ret = mmu_zap_unsync_children(kvm, sp); 1473 ret = mmu_zap_unsync_children(kvm, sp);
1405 kvm_mmu_page_unlink_children(kvm, sp); 1474 kvm_mmu_page_unlink_children(kvm, sp);
@@ -1516,7 +1585,7 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1516 1585
1517 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 1586 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1518 if (pt[i] == shadow_notrap_nonpresent_pte) 1587 if (pt[i] == shadow_notrap_nonpresent_pte)
1519 set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); 1588 __set_spte(&pt[i], shadow_trap_nonpresent_pte);
1520 } 1589 }
1521} 1590}
1522 1591
@@ -1646,6 +1715,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1646 struct kvm_mmu_page *s; 1715 struct kvm_mmu_page *s;
1647 struct hlist_node *node, *n; 1716 struct hlist_node *node, *n;
1648 1717
1718 trace_kvm_mmu_unsync_page(sp);
1649 index = kvm_page_table_hashfn(sp->gfn); 1719 index = kvm_page_table_hashfn(sp->gfn);
1650 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1720 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1651 /* don't unsync if pagetable is shadowed with multiple roles */ 1721 /* don't unsync if pagetable is shadowed with multiple roles */
@@ -1682,9 +1752,9 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1682 return 0; 1752 return 0;
1683} 1753}
1684 1754
1685static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1755static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1686 unsigned pte_access, int user_fault, 1756 unsigned pte_access, int user_fault,
1687 int write_fault, int dirty, int largepage, 1757 int write_fault, int dirty, int level,
1688 gfn_t gfn, pfn_t pfn, bool speculative, 1758 gfn_t gfn, pfn_t pfn, bool speculative,
1689 bool can_unsync) 1759 bool can_unsync)
1690{ 1760{
@@ -1707,7 +1777,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1707 spte |= shadow_nx_mask; 1777 spte |= shadow_nx_mask;
1708 if (pte_access & ACC_USER_MASK) 1778 if (pte_access & ACC_USER_MASK)
1709 spte |= shadow_user_mask; 1779 spte |= shadow_user_mask;
1710 if (largepage) 1780 if (level > PT_PAGE_TABLE_LEVEL)
1711 spte |= PT_PAGE_SIZE_MASK; 1781 spte |= PT_PAGE_SIZE_MASK;
1712 if (tdp_enabled) 1782 if (tdp_enabled)
1713 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 1783 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
@@ -1718,7 +1788,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1718 if ((pte_access & ACC_WRITE_MASK) 1788 if ((pte_access & ACC_WRITE_MASK)
1719 || (write_fault && !is_write_protection(vcpu) && !user_fault)) { 1789 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
1720 1790
1721 if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { 1791 if (level > PT_PAGE_TABLE_LEVEL &&
1792 has_wrprotected_page(vcpu->kvm, gfn, level)) {
1722 ret = 1; 1793 ret = 1;
1723 spte = shadow_trap_nonpresent_pte; 1794 spte = shadow_trap_nonpresent_pte;
1724 goto set_pte; 1795 goto set_pte;
@@ -1732,7 +1803,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1732 * is responsibility of mmu_get_page / kvm_sync_page. 1803 * is responsibility of mmu_get_page / kvm_sync_page.
1733 * Same reasoning can be applied to dirty page accounting. 1804 * Same reasoning can be applied to dirty page accounting.
1734 */ 1805 */
1735 if (!can_unsync && is_writeble_pte(*shadow_pte)) 1806 if (!can_unsync && is_writeble_pte(*sptep))
1736 goto set_pte; 1807 goto set_pte;
1737 1808
1738 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 1809 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1749,65 +1820,67 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1749 mark_page_dirty(vcpu->kvm, gfn); 1820 mark_page_dirty(vcpu->kvm, gfn);
1750 1821
1751set_pte: 1822set_pte:
1752 set_shadow_pte(shadow_pte, spte); 1823 __set_spte(sptep, spte);
1753 return ret; 1824 return ret;
1754} 1825}
1755 1826
1756static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1827static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1757 unsigned pt_access, unsigned pte_access, 1828 unsigned pt_access, unsigned pte_access,
1758 int user_fault, int write_fault, int dirty, 1829 int user_fault, int write_fault, int dirty,
1759 int *ptwrite, int largepage, gfn_t gfn, 1830 int *ptwrite, int level, gfn_t gfn,
1760 pfn_t pfn, bool speculative) 1831 pfn_t pfn, bool speculative)
1761{ 1832{
1762 int was_rmapped = 0; 1833 int was_rmapped = 0;
1763 int was_writeble = is_writeble_pte(*shadow_pte); 1834 int was_writeble = is_writeble_pte(*sptep);
1764 int rmap_count; 1835 int rmap_count;
1765 1836
1766 pgprintk("%s: spte %llx access %x write_fault %d" 1837 pgprintk("%s: spte %llx access %x write_fault %d"
1767 " user_fault %d gfn %lx\n", 1838 " user_fault %d gfn %lx\n",
1768 __func__, *shadow_pte, pt_access, 1839 __func__, *sptep, pt_access,
1769 write_fault, user_fault, gfn); 1840 write_fault, user_fault, gfn);
1770 1841
1771 if (is_rmap_pte(*shadow_pte)) { 1842 if (is_rmap_spte(*sptep)) {
1772 /* 1843 /*
1773 * If we overwrite a PTE page pointer with a 2MB PMD, unlink 1844 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1774 * the parent of the now unreachable PTE. 1845 * the parent of the now unreachable PTE.
1775 */ 1846 */
1776 if (largepage && !is_large_pte(*shadow_pte)) { 1847 if (level > PT_PAGE_TABLE_LEVEL &&
1848 !is_large_pte(*sptep)) {
1777 struct kvm_mmu_page *child; 1849 struct kvm_mmu_page *child;
1778 u64 pte = *shadow_pte; 1850 u64 pte = *sptep;
1779 1851
1780 child = page_header(pte & PT64_BASE_ADDR_MASK); 1852 child = page_header(pte & PT64_BASE_ADDR_MASK);
1781 mmu_page_remove_parent_pte(child, shadow_pte); 1853 mmu_page_remove_parent_pte(child, sptep);
1782 } else if (pfn != spte_to_pfn(*shadow_pte)) { 1854 } else if (pfn != spte_to_pfn(*sptep)) {
1783 pgprintk("hfn old %lx new %lx\n", 1855 pgprintk("hfn old %lx new %lx\n",
1784 spte_to_pfn(*shadow_pte), pfn); 1856 spte_to_pfn(*sptep), pfn);
1785 rmap_remove(vcpu->kvm, shadow_pte); 1857 rmap_remove(vcpu->kvm, sptep);
1786 } else 1858 } else
1787 was_rmapped = 1; 1859 was_rmapped = 1;
1788 } 1860 }
1789 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, 1861
1790 dirty, largepage, gfn, pfn, speculative, true)) { 1862 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
1863 dirty, level, gfn, pfn, speculative, true)) {
1791 if (write_fault) 1864 if (write_fault)
1792 *ptwrite = 1; 1865 *ptwrite = 1;
1793 kvm_x86_ops->tlb_flush(vcpu); 1866 kvm_x86_ops->tlb_flush(vcpu);
1794 } 1867 }
1795 1868
1796 pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); 1869 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
1797 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", 1870 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1798 is_large_pte(*shadow_pte)? "2MB" : "4kB", 1871 is_large_pte(*sptep)? "2MB" : "4kB",
1799 is_present_pte(*shadow_pte)?"RW":"R", gfn, 1872 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
1800 *shadow_pte, shadow_pte); 1873 *sptep, sptep);
1801 if (!was_rmapped && is_large_pte(*shadow_pte)) 1874 if (!was_rmapped && is_large_pte(*sptep))
1802 ++vcpu->kvm->stat.lpages; 1875 ++vcpu->kvm->stat.lpages;
1803 1876
1804 page_header_update_slot(vcpu->kvm, shadow_pte, gfn); 1877 page_header_update_slot(vcpu->kvm, sptep, gfn);
1805 if (!was_rmapped) { 1878 if (!was_rmapped) {
1806 rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); 1879 rmap_count = rmap_add(vcpu, sptep, gfn);
1807 if (!is_rmap_pte(*shadow_pte)) 1880 if (!is_rmap_spte(*sptep))
1808 kvm_release_pfn_clean(pfn); 1881 kvm_release_pfn_clean(pfn);
1809 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 1882 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1810 rmap_recycle(vcpu, gfn, largepage); 1883 rmap_recycle(vcpu, sptep, gfn);
1811 } else { 1884 } else {
1812 if (was_writeble) 1885 if (was_writeble)
1813 kvm_release_pfn_dirty(pfn); 1886 kvm_release_pfn_dirty(pfn);
@@ -1815,7 +1888,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1815 kvm_release_pfn_clean(pfn); 1888 kvm_release_pfn_clean(pfn);
1816 } 1889 }
1817 if (speculative) { 1890 if (speculative) {
1818 vcpu->arch.last_pte_updated = shadow_pte; 1891 vcpu->arch.last_pte_updated = sptep;
1819 vcpu->arch.last_pte_gfn = gfn; 1892 vcpu->arch.last_pte_gfn = gfn;
1820 } 1893 }
1821} 1894}
@@ -1825,7 +1898,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
1825} 1898}
1826 1899
1827static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 1900static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1828 int largepage, gfn_t gfn, pfn_t pfn) 1901 int level, gfn_t gfn, pfn_t pfn)
1829{ 1902{
1830 struct kvm_shadow_walk_iterator iterator; 1903 struct kvm_shadow_walk_iterator iterator;
1831 struct kvm_mmu_page *sp; 1904 struct kvm_mmu_page *sp;
@@ -1833,11 +1906,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1833 gfn_t pseudo_gfn; 1906 gfn_t pseudo_gfn;
1834 1907
1835 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 1908 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
1836 if (iterator.level == PT_PAGE_TABLE_LEVEL 1909 if (iterator.level == level) {
1837 || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
1838 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 1910 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
1839 0, write, 1, &pt_write, 1911 0, write, 1, &pt_write,
1840 largepage, gfn, pfn, false); 1912 level, gfn, pfn, false);
1841 ++vcpu->stat.pf_fixed; 1913 ++vcpu->stat.pf_fixed;
1842 break; 1914 break;
1843 } 1915 }
@@ -1853,10 +1925,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1853 return -ENOMEM; 1925 return -ENOMEM;
1854 } 1926 }
1855 1927
1856 set_shadow_pte(iterator.sptep, 1928 __set_spte(iterator.sptep,
1857 __pa(sp->spt) 1929 __pa(sp->spt)
1858 | PT_PRESENT_MASK | PT_WRITABLE_MASK 1930 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1859 | shadow_user_mask | shadow_x_mask); 1931 | shadow_user_mask | shadow_x_mask);
1860 } 1932 }
1861 } 1933 }
1862 return pt_write; 1934 return pt_write;
@@ -1865,14 +1937,20 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1865static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 1937static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1866{ 1938{
1867 int r; 1939 int r;
1868 int largepage = 0; 1940 int level;
1869 pfn_t pfn; 1941 pfn_t pfn;
1870 unsigned long mmu_seq; 1942 unsigned long mmu_seq;
1871 1943
1872 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1944 level = mapping_level(vcpu, gfn);
1873 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1945
1874 largepage = 1; 1946 /*
1875 } 1947 * This path builds a PAE pagetable - so we can map 2mb pages at
1948 * maximum. Therefore check if the level is larger than that.
1949 */
1950 if (level > PT_DIRECTORY_LEVEL)
1951 level = PT_DIRECTORY_LEVEL;
1952
1953 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
1876 1954
1877 mmu_seq = vcpu->kvm->mmu_notifier_seq; 1955 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1878 smp_rmb(); 1956 smp_rmb();
@@ -1888,7 +1966,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1888 if (mmu_notifier_retry(vcpu, mmu_seq)) 1966 if (mmu_notifier_retry(vcpu, mmu_seq))
1889 goto out_unlock; 1967 goto out_unlock;
1890 kvm_mmu_free_some_pages(vcpu); 1968 kvm_mmu_free_some_pages(vcpu);
1891 r = __direct_map(vcpu, v, write, largepage, gfn, pfn); 1969 r = __direct_map(vcpu, v, write, level, gfn, pfn);
1892 spin_unlock(&vcpu->kvm->mmu_lock); 1970 spin_unlock(&vcpu->kvm->mmu_lock);
1893 1971
1894 1972
@@ -1954,6 +2032,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
1954 gfn_t root_gfn; 2032 gfn_t root_gfn;
1955 struct kvm_mmu_page *sp; 2033 struct kvm_mmu_page *sp;
1956 int direct = 0; 2034 int direct = 0;
2035 u64 pdptr;
1957 2036
1958 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; 2037 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1959 2038
@@ -1981,11 +2060,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
1981 2060
1982 ASSERT(!VALID_PAGE(root)); 2061 ASSERT(!VALID_PAGE(root));
1983 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 2062 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1984 if (!is_present_pte(vcpu->arch.pdptrs[i])) { 2063 pdptr = kvm_pdptr_read(vcpu, i);
2064 if (!is_present_gpte(pdptr)) {
1985 vcpu->arch.mmu.pae_root[i] = 0; 2065 vcpu->arch.mmu.pae_root[i] = 0;
1986 continue; 2066 continue;
1987 } 2067 }
1988 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; 2068 root_gfn = pdptr >> PAGE_SHIFT;
1989 } else if (vcpu->arch.mmu.root_level == 0) 2069 } else if (vcpu->arch.mmu.root_level == 0)
1990 root_gfn = 0; 2070 root_gfn = 0;
1991 if (mmu_check_root(vcpu, root_gfn)) 2071 if (mmu_check_root(vcpu, root_gfn))
@@ -2062,7 +2142,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2062{ 2142{
2063 pfn_t pfn; 2143 pfn_t pfn;
2064 int r; 2144 int r;
2065 int largepage = 0; 2145 int level;
2066 gfn_t gfn = gpa >> PAGE_SHIFT; 2146 gfn_t gfn = gpa >> PAGE_SHIFT;
2067 unsigned long mmu_seq; 2147 unsigned long mmu_seq;
2068 2148
@@ -2073,10 +2153,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2073 if (r) 2153 if (r)
2074 return r; 2154 return r;
2075 2155
2076 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 2156 level = mapping_level(vcpu, gfn);
2077 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 2157
2078 largepage = 1; 2158 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2079 } 2159
2080 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2160 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2081 smp_rmb(); 2161 smp_rmb();
2082 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2162 pfn = gfn_to_pfn(vcpu->kvm, gfn);
@@ -2089,7 +2169,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2089 goto out_unlock; 2169 goto out_unlock;
2090 kvm_mmu_free_some_pages(vcpu); 2170 kvm_mmu_free_some_pages(vcpu);
2091 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 2171 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
2092 largepage, gfn, pfn); 2172 level, gfn, pfn);
2093 spin_unlock(&vcpu->kvm->mmu_lock); 2173 spin_unlock(&vcpu->kvm->mmu_lock);
2094 2174
2095 return r; 2175 return r;
@@ -2206,7 +2286,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2206 context->rsvd_bits_mask[0][0] = exb_bit_rsvd | 2286 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2207 rsvd_bits(maxphyaddr, 51); 2287 rsvd_bits(maxphyaddr, 51);
2208 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; 2288 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2209 context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2]; 2289 context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
2290 rsvd_bits(maxphyaddr, 51) |
2291 rsvd_bits(13, 29);
2210 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2292 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2211 rsvd_bits(maxphyaddr, 51) | 2293 rsvd_bits(maxphyaddr, 51) |
2212 rsvd_bits(13, 20); /* large page */ 2294 rsvd_bits(13, 20); /* large page */
@@ -2357,8 +2439,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2357 spin_unlock(&vcpu->kvm->mmu_lock); 2439 spin_unlock(&vcpu->kvm->mmu_lock);
2358 if (r) 2440 if (r)
2359 goto out; 2441 goto out;
2442 /* set_cr3() should ensure TLB has been flushed */
2360 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2443 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2361 kvm_mmu_flush_tlb(vcpu);
2362out: 2444out:
2363 return r; 2445 return r;
2364} 2446}
@@ -2378,15 +2460,14 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2378 2460
2379 pte = *spte; 2461 pte = *spte;
2380 if (is_shadow_present_pte(pte)) { 2462 if (is_shadow_present_pte(pte)) {
2381 if (sp->role.level == PT_PAGE_TABLE_LEVEL || 2463 if (is_last_spte(pte, sp->role.level))
2382 is_large_pte(pte))
2383 rmap_remove(vcpu->kvm, spte); 2464 rmap_remove(vcpu->kvm, spte);
2384 else { 2465 else {
2385 child = page_header(pte & PT64_BASE_ADDR_MASK); 2466 child = page_header(pte & PT64_BASE_ADDR_MASK);
2386 mmu_page_remove_parent_pte(child, spte); 2467 mmu_page_remove_parent_pte(child, spte);
2387 } 2468 }
2388 } 2469 }
2389 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 2470 __set_spte(spte, shadow_trap_nonpresent_pte);
2390 if (is_large_pte(pte)) 2471 if (is_large_pte(pte))
2391 --vcpu->kvm->stat.lpages; 2472 --vcpu->kvm->stat.lpages;
2392} 2473}
@@ -2397,11 +2478,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2397 const void *new) 2478 const void *new)
2398{ 2479{
2399 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 2480 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
2400 if (!vcpu->arch.update_pte.largepage || 2481 ++vcpu->kvm->stat.mmu_pde_zapped;
2401 sp->role.glevels == PT32_ROOT_LEVEL) { 2482 return;
2402 ++vcpu->kvm->stat.mmu_pde_zapped;
2403 return;
2404 }
2405 } 2483 }
2406 2484
2407 ++vcpu->kvm->stat.mmu_pte_updated; 2485 ++vcpu->kvm->stat.mmu_pte_updated;
@@ -2447,8 +2525,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2447 u64 gpte = 0; 2525 u64 gpte = 0;
2448 pfn_t pfn; 2526 pfn_t pfn;
2449 2527
2450 vcpu->arch.update_pte.largepage = 0;
2451
2452 if (bytes != 4 && bytes != 8) 2528 if (bytes != 4 && bytes != 8)
2453 return; 2529 return;
2454 2530
@@ -2472,14 +2548,10 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2472 if ((bytes == 4) && (gpa % 4 == 0)) 2548 if ((bytes == 4) && (gpa % 4 == 0))
2473 memcpy((void *)&gpte, new, 4); 2549 memcpy((void *)&gpte, new, 4);
2474 } 2550 }
2475 if (!is_present_pte(gpte)) 2551 if (!is_present_gpte(gpte))
2476 return; 2552 return;
2477 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2553 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
2478 2554
2479 if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
2480 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
2481 vcpu->arch.update_pte.largepage = 1;
2482 }
2483 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; 2555 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
2484 smp_rmb(); 2556 smp_rmb();
2485 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2557 pfn = gfn_to_pfn(vcpu->kvm, gfn);
@@ -2622,6 +2694,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2622 gpa_t gpa; 2694 gpa_t gpa;
2623 int r; 2695 int r;
2624 2696
2697 if (tdp_enabled)
2698 return 0;
2699
2625 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 2700 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
2626 2701
2627 spin_lock(&vcpu->kvm->mmu_lock); 2702 spin_lock(&vcpu->kvm->mmu_lock);
@@ -2633,7 +2708,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2633 2708
2634void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 2709void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2635{ 2710{
2636 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { 2711 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
2712 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2637 struct kvm_mmu_page *sp; 2713 struct kvm_mmu_page *sp;
2638 2714
2639 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 2715 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
@@ -2670,8 +2746,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2670 ++vcpu->stat.mmio_exits; 2746 ++vcpu->stat.mmio_exits;
2671 return 0; 2747 return 0;
2672 case EMULATE_FAIL: 2748 case EMULATE_FAIL:
2673 kvm_report_emulation_failure(vcpu, "pagetable"); 2749 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2674 return 1; 2750 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2751 return 0;
2675 default: 2752 default:
2676 BUG(); 2753 BUG();
2677 } 2754 }
@@ -2712,12 +2789,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
2712 2789
2713 ASSERT(vcpu); 2790 ASSERT(vcpu);
2714 2791
2715 if (vcpu->kvm->arch.n_requested_mmu_pages)
2716 vcpu->kvm->arch.n_free_mmu_pages =
2717 vcpu->kvm->arch.n_requested_mmu_pages;
2718 else
2719 vcpu->kvm->arch.n_free_mmu_pages =
2720 vcpu->kvm->arch.n_alloc_mmu_pages;
2721 /* 2792 /*
2722 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. 2793 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
2723 * Therefore we need to allocate shadow page tables in the first 2794 * Therefore we need to allocate shadow page tables in the first
@@ -3029,6 +3100,24 @@ out:
3029 return r; 3100 return r;
3030} 3101}
3031 3102
3103int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3104{
3105 struct kvm_shadow_walk_iterator iterator;
3106 int nr_sptes = 0;
3107
3108 spin_lock(&vcpu->kvm->mmu_lock);
3109 for_each_shadow_entry(vcpu, addr, iterator) {
3110 sptes[iterator.level-1] = *iterator.sptep;
3111 nr_sptes++;
3112 if (!is_shadow_present_pte(*iterator.sptep))
3113 break;
3114 }
3115 spin_unlock(&vcpu->kvm->mmu_lock);
3116
3117 return nr_sptes;
3118}
3119EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3120
3032#ifdef AUDIT 3121#ifdef AUDIT
3033 3122
3034static const char *audit_msg; 3123static const char *audit_msg;
@@ -3041,6 +3130,54 @@ static gva_t canonicalize(gva_t gva)
3041 return gva; 3130 return gva;
3042} 3131}
3043 3132
3133
3134typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
3135 u64 *sptep);
3136
3137static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3138 inspect_spte_fn fn)
3139{
3140 int i;
3141
3142 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3143 u64 ent = sp->spt[i];
3144
3145 if (is_shadow_present_pte(ent)) {
3146 if (!is_last_spte(ent, sp->role.level)) {
3147 struct kvm_mmu_page *child;
3148 child = page_header(ent & PT64_BASE_ADDR_MASK);
3149 __mmu_spte_walk(kvm, child, fn);
3150 } else
3151 fn(kvm, sp, &sp->spt[i]);
3152 }
3153 }
3154}
3155
3156static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
3157{
3158 int i;
3159 struct kvm_mmu_page *sp;
3160
3161 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3162 return;
3163 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3164 hpa_t root = vcpu->arch.mmu.root_hpa;
3165 sp = page_header(root);
3166 __mmu_spte_walk(vcpu->kvm, sp, fn);
3167 return;
3168 }
3169 for (i = 0; i < 4; ++i) {
3170 hpa_t root = vcpu->arch.mmu.pae_root[i];
3171
3172 if (root && VALID_PAGE(root)) {
3173 root &= PT64_BASE_ADDR_MASK;
3174 sp = page_header(root);
3175 __mmu_spte_walk(vcpu->kvm, sp, fn);
3176 }
3177 }
3178 return;
3179}
3180
3044static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, 3181static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3045 gva_t va, int level) 3182 gva_t va, int level)
3046{ 3183{
@@ -3055,20 +3192,19 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3055 continue; 3192 continue;
3056 3193
3057 va = canonicalize(va); 3194 va = canonicalize(va);
3058 if (level > 1) { 3195 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3059 if (ent == shadow_notrap_nonpresent_pte) 3196 audit_mappings_page(vcpu, ent, va, level - 1);
3060 printk(KERN_ERR "audit: (%s) nontrapping pte" 3197 else {
3061 " in nonleaf level: levels %d gva %lx"
3062 " level %d pte %llx\n", audit_msg,
3063 vcpu->arch.mmu.root_level, va, level, ent);
3064 else
3065 audit_mappings_page(vcpu, ent, va, level - 1);
3066 } else {
3067 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); 3198 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
3068 gfn_t gfn = gpa >> PAGE_SHIFT; 3199 gfn_t gfn = gpa >> PAGE_SHIFT;
3069 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); 3200 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3070 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; 3201 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3071 3202
3203 if (is_error_pfn(pfn)) {
3204 kvm_release_pfn_clean(pfn);
3205 continue;
3206 }
3207
3072 if (is_shadow_present_pte(ent) 3208 if (is_shadow_present_pte(ent)
3073 && (ent & PT64_BASE_ADDR_MASK) != hpa) 3209 && (ent & PT64_BASE_ADDR_MASK) != hpa)
3074 printk(KERN_ERR "xx audit error: (%s) levels %d" 3210 printk(KERN_ERR "xx audit error: (%s) levels %d"
@@ -3122,7 +3258,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3122 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 3258 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3123 while (d) { 3259 while (d) {
3124 for (k = 0; k < RMAP_EXT; ++k) 3260 for (k = 0; k < RMAP_EXT; ++k)
3125 if (d->shadow_ptes[k]) 3261 if (d->sptes[k])
3126 ++nmaps; 3262 ++nmaps;
3127 else 3263 else
3128 break; 3264 break;
@@ -3133,9 +3269,48 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3133 return nmaps; 3269 return nmaps;
3134} 3270}
3135 3271
3136static int count_writable_mappings(struct kvm_vcpu *vcpu) 3272void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
3273{
3274 unsigned long *rmapp;
3275 struct kvm_mmu_page *rev_sp;
3276 gfn_t gfn;
3277
3278 if (*sptep & PT_WRITABLE_MASK) {
3279 rev_sp = page_header(__pa(sptep));
3280 gfn = rev_sp->gfns[sptep - rev_sp->spt];
3281
3282 if (!gfn_to_memslot(kvm, gfn)) {
3283 if (!printk_ratelimit())
3284 return;
3285 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3286 audit_msg, gfn);
3287 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3288 audit_msg, sptep - rev_sp->spt,
3289 rev_sp->gfn);
3290 dump_stack();
3291 return;
3292 }
3293
3294 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
3295 is_large_pte(*sptep));
3296 if (!*rmapp) {
3297 if (!printk_ratelimit())
3298 return;
3299 printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
3300 audit_msg, *sptep);
3301 dump_stack();
3302 }
3303 }
3304
3305}
3306
3307void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
3308{
3309 mmu_spte_walk(vcpu, inspect_spte_has_rmap);
3310}
3311
3312static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3137{ 3313{
3138 int nmaps = 0;
3139 struct kvm_mmu_page *sp; 3314 struct kvm_mmu_page *sp;
3140 int i; 3315 int i;
3141 3316
@@ -3152,20 +3327,16 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu)
3152 continue; 3327 continue;
3153 if (!(ent & PT_WRITABLE_MASK)) 3328 if (!(ent & PT_WRITABLE_MASK))
3154 continue; 3329 continue;
3155 ++nmaps; 3330 inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]);
3156 } 3331 }
3157 } 3332 }
3158 return nmaps; 3333 return;
3159} 3334}
3160 3335
3161static void audit_rmap(struct kvm_vcpu *vcpu) 3336static void audit_rmap(struct kvm_vcpu *vcpu)
3162{ 3337{
3163 int n_rmap = count_rmaps(vcpu); 3338 check_writable_mappings_rmap(vcpu);
3164 int n_actual = count_writable_mappings(vcpu); 3339 count_rmaps(vcpu);
3165
3166 if (n_rmap != n_actual)
3167 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
3168 __func__, audit_msg, n_rmap, n_actual);
3169} 3340}
3170 3341
3171static void audit_write_protection(struct kvm_vcpu *vcpu) 3342static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -3173,20 +3344,28 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
3173 struct kvm_mmu_page *sp; 3344 struct kvm_mmu_page *sp;
3174 struct kvm_memory_slot *slot; 3345 struct kvm_memory_slot *slot;
3175 unsigned long *rmapp; 3346 unsigned long *rmapp;
3347 u64 *spte;
3176 gfn_t gfn; 3348 gfn_t gfn;
3177 3349
3178 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { 3350 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3179 if (sp->role.direct) 3351 if (sp->role.direct)
3180 continue; 3352 continue;
3353 if (sp->unsync)
3354 continue;
3181 3355
3182 gfn = unalias_gfn(vcpu->kvm, sp->gfn); 3356 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
3183 slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); 3357 slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
3184 rmapp = &slot->rmap[gfn - slot->base_gfn]; 3358 rmapp = &slot->rmap[gfn - slot->base_gfn];
3185 if (*rmapp) 3359
3186 printk(KERN_ERR "%s: (%s) shadow page has writable" 3360 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3187 " mappings: gfn %lx role %x\n", 3361 while (spte) {
3362 if (*spte & PT_WRITABLE_MASK)
3363 printk(KERN_ERR "%s: (%s) shadow page has "
3364 "writable mappings: gfn %lx role %x\n",
3188 __func__, audit_msg, sp->gfn, 3365 __func__, audit_msg, sp->gfn,
3189 sp->role.word); 3366 sp->role.word);
3367 spte = rmap_next(vcpu->kvm, rmapp, spte);
3368 }
3190 } 3369 }
3191} 3370}
3192 3371
@@ -3198,7 +3377,9 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
3198 audit_msg = msg; 3377 audit_msg = msg;
3199 audit_rmap(vcpu); 3378 audit_rmap(vcpu);
3200 audit_write_protection(vcpu); 3379 audit_write_protection(vcpu);
3201 audit_mappings(vcpu); 3380 if (strcmp("pre pte write", audit_msg) != 0)
3381 audit_mappings(vcpu);
3382 audit_writable_sptes_have_rmaps(vcpu);
3202 dbg = olddbg; 3383 dbg = olddbg;
3203} 3384}
3204 3385