Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r-- | arch/x86/kvm/mmu.c | 1371 |
1 file changed, 780 insertions, 591 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 311f6dad8951..aee38623b768 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,7 +7,7 @@ | |||
7 | * MMU support | 7 | * MMU support |
8 | * | 8 | * |
9 | * Copyright (C) 2006 Qumranet, Inc. | 9 | * Copyright (C) 2006 Qumranet, Inc. |
10 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | 10 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
11 | * | 11 | * |
12 | * Authors: | 12 | * Authors: |
13 | * Yaniv Kamay <yaniv@qumranet.com> | 13 | * Yaniv Kamay <yaniv@qumranet.com> |
@@ -18,9 +18,11 @@ | |||
18 | * | 18 | * |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include "irq.h" | ||
21 | #include "mmu.h" | 22 | #include "mmu.h" |
22 | #include "x86.h" | 23 | #include "x86.h" |
23 | #include "kvm_cache_regs.h" | 24 | #include "kvm_cache_regs.h" |
25 | #include "x86.h" | ||
24 | 26 | ||
25 | #include <linux/kvm_host.h> | 27 | #include <linux/kvm_host.h> |
26 | #include <linux/types.h> | 28 | #include <linux/types.h> |
@@ -49,15 +51,25 @@ | |||
49 | */ | 51 | */ |
50 | bool tdp_enabled = false; | 52 | bool tdp_enabled = false; |
51 | 53 | ||
52 | #undef MMU_DEBUG | 54 | enum { |
55 | AUDIT_PRE_PAGE_FAULT, | ||
56 | AUDIT_POST_PAGE_FAULT, | ||
57 | AUDIT_PRE_PTE_WRITE, | ||
58 | AUDIT_POST_PTE_WRITE, | ||
59 | AUDIT_PRE_SYNC, | ||
60 | AUDIT_POST_SYNC | ||
61 | }; | ||
53 | 62 | ||
54 | #undef AUDIT | 63 | char *audit_point_name[] = { |
64 | "pre page fault", | ||
65 | "post page fault", | ||
66 | "pre pte write", | ||
67 | "post pte write", | ||
68 | "pre sync", | ||
69 | "post sync" | ||
70 | }; | ||
55 | 71 | ||
56 | #ifdef AUDIT | 72 | #undef MMU_DEBUG |
57 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); | ||
58 | #else | ||
59 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} | ||
60 | #endif | ||
61 | 73 | ||
62 | #ifdef MMU_DEBUG | 74 | #ifdef MMU_DEBUG |
63 | 75 | ||
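
The hunk above replaces the old compile-time AUDIT hook with named audit points; the enum values are used directly as indices into audit_point_name[]. A stand-alone sketch of that lookup (plain user-space C, not part of the patch):

#include <stdio.h>

enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};

static const char *audit_point_name[] = {
	"pre page fault",
	"post page fault",
	"pre pte write",
	"post pte write",
	"pre sync",
	"post sync"
};

int main(void)
{
	/* an audit hook invoked at AUDIT_POST_SYNC would report: */
	printf("audit point: %s\n", audit_point_name[AUDIT_POST_SYNC]);
	return 0;
}
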
@@ -71,7 +83,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} | |||
71 | 83 | ||
72 | #endif | 84 | #endif |
73 | 85 | ||
74 | #if defined(MMU_DEBUG) || defined(AUDIT) | 86 | #ifdef MMU_DEBUG |
75 | static int dbg = 0; | 87 | static int dbg = 0; |
76 | module_param(dbg, bool, 0644); | 88 | module_param(dbg, bool, 0644); |
77 | #endif | 89 | #endif |
@@ -89,6 +101,8 @@ module_param(oos_shadow, bool, 0644); | |||
89 | } | 101 | } |
90 | #endif | 102 | #endif |
91 | 103 | ||
104 | #define PTE_PREFETCH_NUM 8 | ||
105 | |||
92 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 | 106 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 |
93 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 | 107 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 |
94 | 108 | ||
@@ -97,9 +111,6 @@ module_param(oos_shadow, bool, 0644); | |||
97 | #define PT64_LEVEL_SHIFT(level) \ | 111 | #define PT64_LEVEL_SHIFT(level) \ |
98 | (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) | 112 | (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) |
99 | 113 | ||
100 | #define PT64_LEVEL_MASK(level) \ | ||
101 | (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) | ||
102 | |||
103 | #define PT64_INDEX(address, level)\ | 114 | #define PT64_INDEX(address, level)\ |
104 | (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) | 115 | (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) |
105 | 116 | ||
@@ -109,8 +120,6 @@ module_param(oos_shadow, bool, 0644); | |||
109 | #define PT32_LEVEL_SHIFT(level) \ | 120 | #define PT32_LEVEL_SHIFT(level) \ |
110 | (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) | 121 | (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) |
111 | 122 | ||
112 | #define PT32_LEVEL_MASK(level) \ | ||
113 | (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) | ||
114 | #define PT32_LVL_OFFSET_MASK(level) \ | 123 | #define PT32_LVL_OFFSET_MASK(level) \ |
115 | (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ | 124 | (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ |
116 | * PT32_LEVEL_BITS))) - 1)) | 125 | * PT32_LEVEL_BITS))) - 1)) |
@@ -178,10 +187,10 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); | |||
178 | static struct kmem_cache *pte_chain_cache; | 187 | static struct kmem_cache *pte_chain_cache; |
179 | static struct kmem_cache *rmap_desc_cache; | 188 | static struct kmem_cache *rmap_desc_cache; |
180 | static struct kmem_cache *mmu_page_header_cache; | 189 | static struct kmem_cache *mmu_page_header_cache; |
190 | static struct percpu_counter kvm_total_used_mmu_pages; | ||
181 | 191 | ||
182 | static u64 __read_mostly shadow_trap_nonpresent_pte; | 192 | static u64 __read_mostly shadow_trap_nonpresent_pte; |
183 | static u64 __read_mostly shadow_notrap_nonpresent_pte; | 193 | static u64 __read_mostly shadow_notrap_nonpresent_pte; |
184 | static u64 __read_mostly shadow_base_present_pte; | ||
185 | static u64 __read_mostly shadow_nx_mask; | 194 | static u64 __read_mostly shadow_nx_mask; |
186 | static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ | 195 | static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ |
187 | static u64 __read_mostly shadow_user_mask; | 196 | static u64 __read_mostly shadow_user_mask; |
@@ -200,12 +209,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) | |||
200 | } | 209 | } |
201 | EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); | 210 | EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); |
202 | 211 | ||
203 | void kvm_mmu_set_base_ptes(u64 base_pte) | ||
204 | { | ||
205 | shadow_base_present_pte = base_pte; | ||
206 | } | ||
207 | EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); | ||
208 | |||
209 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | 212 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, |
210 | u64 dirty_mask, u64 nx_mask, u64 x_mask) | 213 | u64 dirty_mask, u64 nx_mask, u64 x_mask) |
211 | { | 214 | { |
@@ -299,18 +302,50 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte) | |||
299 | #endif | 302 | #endif |
300 | } | 303 | } |
301 | 304 | ||
305 | static bool spte_has_volatile_bits(u64 spte) | ||
306 | { | ||
307 | if (!shadow_accessed_mask) | ||
308 | return false; | ||
309 | |||
310 | if (!is_shadow_present_pte(spte)) | ||
311 | return false; | ||
312 | |||
313 | if ((spte & shadow_accessed_mask) && | ||
314 | (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) | ||
315 | return false; | ||
316 | |||
317 | return true; | ||
318 | } | ||
319 | |||
320 | static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) | ||
321 | { | ||
322 | return (old_spte & bit_mask) && !(new_spte & bit_mask); | ||
323 | } | ||
324 | |||
302 | static void update_spte(u64 *sptep, u64 new_spte) | 325 | static void update_spte(u64 *sptep, u64 new_spte) |
303 | { | 326 | { |
304 | u64 old_spte; | 327 | u64 mask, old_spte = *sptep; |
328 | |||
329 | WARN_ON(!is_rmap_spte(new_spte)); | ||
330 | |||
331 | new_spte |= old_spte & shadow_dirty_mask; | ||
332 | |||
333 | mask = shadow_accessed_mask; | ||
334 | if (is_writable_pte(old_spte)) | ||
335 | mask |= shadow_dirty_mask; | ||
305 | 336 | ||
306 | if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || | 337 | if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) |
307 | !is_rmap_spte(*sptep)) | ||
308 | __set_spte(sptep, new_spte); | 338 | __set_spte(sptep, new_spte); |
309 | else { | 339 | else |
310 | old_spte = __xchg_spte(sptep, new_spte); | 340 | old_spte = __xchg_spte(sptep, new_spte); |
311 | if (old_spte & shadow_accessed_mask) | 341 | |
312 | mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); | 342 | if (!shadow_accessed_mask) |
313 | } | 343 | return; |
344 | |||
345 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) | ||
346 | kvm_set_pfn_accessed(spte_to_pfn(old_spte)); | ||
347 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) | ||
348 | kvm_set_pfn_dirty(spte_to_pfn(old_spte)); | ||
314 | } | 349 | } |
315 | 350 | ||
316 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | 351 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, |
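
The new spte_has_volatile_bits()/update_spte() pair decides when a plain store is safe and when an atomic exchange is needed so that accessed/dirty bits set by hardware are not lost, and it propagates any bits the exchange observed being cleared. A minimal user-space model of that decision, not the kernel code; bit positions are borrowed from legacy x86 PTEs purely for illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_PRESENT   (1ull << 0)
#define SPTE_WRITABLE  (1ull << 1)
#define SPTE_ACCESSED  (1ull << 5)
#define SPTE_DIRTY     (1ull << 6)

/* "Volatile" means hardware may still set A/D bits underneath us, so a
 * plain store to the spte could lose an update. */
static bool spte_has_volatile_bits(uint64_t spte)
{
	if (!(spte & SPTE_PRESENT))
		return false;
	/* accessed already set, and either read-only or already dirty:
	 * there is nothing left for hardware to flip */
	if ((spte & SPTE_ACCESSED) &&
	    (!(spte & SPTE_WRITABLE) || (spte & SPTE_DIRTY)))
		return false;
	return true;
}

int main(void)
{
	uint64_t old = SPTE_PRESENT | SPTE_WRITABLE | SPTE_ACCESSED;

	/* writable and not yet dirty: hardware may still set the dirty
	 * bit, so update_spte() must use an atomic exchange */
	printf("needs xchg: %s\n",
	       spte_has_volatile_bits(old) ? "yes" : "no");
	return 0;
}
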
@@ -339,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, | |||
339 | static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, | 374 | static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, |
340 | int min) | 375 | int min) |
341 | { | 376 | { |
342 | struct page *page; | 377 | void *page; |
343 | 378 | ||
344 | if (cache->nobjs >= min) | 379 | if (cache->nobjs >= min) |
345 | return 0; | 380 | return 0; |
346 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { | 381 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { |
347 | page = alloc_page(GFP_KERNEL); | 382 | page = (void *)__get_free_page(GFP_KERNEL); |
348 | if (!page) | 383 | if (!page) |
349 | return -ENOMEM; | 384 | return -ENOMEM; |
350 | cache->objects[cache->nobjs++] = page_address(page); | 385 | cache->objects[cache->nobjs++] = page; |
351 | } | 386 | } |
352 | return 0; | 387 | return 0; |
353 | } | 388 | } |
@@ -367,7 +402,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) | |||
367 | if (r) | 402 | if (r) |
368 | goto out; | 403 | goto out; |
369 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, | 404 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, |
370 | rmap_desc_cache, 4); | 405 | rmap_desc_cache, 4 + PTE_PREFETCH_NUM); |
371 | if (r) | 406 | if (r) |
372 | goto out; | 407 | goto out; |
373 | r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); | 408 | r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); |
@@ -437,46 +472,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) | |||
437 | } | 472 | } |
438 | 473 | ||
439 | /* | 474 | /* |
440 | * Return the pointer to the largepage write count for a given | 475 | * Return the pointer to the large page information for a given gfn, |
441 | * gfn, handling slots that are not large page aligned. | 476 | * handling slots that are not large page aligned. |
442 | */ | 477 | */ |
443 | static int *slot_largepage_idx(gfn_t gfn, | 478 | static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, |
444 | struct kvm_memory_slot *slot, | 479 | struct kvm_memory_slot *slot, |
445 | int level) | 480 | int level) |
446 | { | 481 | { |
447 | unsigned long idx; | 482 | unsigned long idx; |
448 | 483 | ||
449 | idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - | 484 | idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - |
450 | (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); | 485 | (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); |
451 | return &slot->lpage_info[level - 2][idx].write_count; | 486 | return &slot->lpage_info[level - 2][idx]; |
452 | } | 487 | } |
453 | 488 | ||
454 | static void account_shadowed(struct kvm *kvm, gfn_t gfn) | 489 | static void account_shadowed(struct kvm *kvm, gfn_t gfn) |
455 | { | 490 | { |
456 | struct kvm_memory_slot *slot; | 491 | struct kvm_memory_slot *slot; |
457 | int *write_count; | 492 | struct kvm_lpage_info *linfo; |
458 | int i; | 493 | int i; |
459 | 494 | ||
460 | slot = gfn_to_memslot(kvm, gfn); | 495 | slot = gfn_to_memslot(kvm, gfn); |
461 | for (i = PT_DIRECTORY_LEVEL; | 496 | for (i = PT_DIRECTORY_LEVEL; |
462 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | 497 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
463 | write_count = slot_largepage_idx(gfn, slot, i); | 498 | linfo = lpage_info_slot(gfn, slot, i); |
464 | *write_count += 1; | 499 | linfo->write_count += 1; |
465 | } | 500 | } |
466 | } | 501 | } |
467 | 502 | ||
468 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) | 503 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) |
469 | { | 504 | { |
470 | struct kvm_memory_slot *slot; | 505 | struct kvm_memory_slot *slot; |
471 | int *write_count; | 506 | struct kvm_lpage_info *linfo; |
472 | int i; | 507 | int i; |
473 | 508 | ||
474 | slot = gfn_to_memslot(kvm, gfn); | 509 | slot = gfn_to_memslot(kvm, gfn); |
475 | for (i = PT_DIRECTORY_LEVEL; | 510 | for (i = PT_DIRECTORY_LEVEL; |
476 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | 511 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
477 | write_count = slot_largepage_idx(gfn, slot, i); | 512 | linfo = lpage_info_slot(gfn, slot, i); |
478 | *write_count -= 1; | 513 | linfo->write_count -= 1; |
479 | WARN_ON(*write_count < 0); | 514 | WARN_ON(linfo->write_count < 0); |
480 | } | 515 | } |
481 | } | 516 | } |
482 | 517 | ||
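
lpage_info_slot() centralizes the large-page index math that account_shadowed(), unaccount_shadowed() and has_wrprotected_page() previously open-coded: the gfn and the slot's base gfn are both reduced to huge-page granules before subtracting, so slots that are not large-page aligned still land on the right kvm_lpage_info entry. A small stand-alone model of the arithmetic (the sample gfns are made up):

#include <stdint.h>
#include <stdio.h>

/* KVM_HPAGE_GFN_SHIFT(level) is 9 * (level - 1) on x86 */
static uint64_t lpage_idx(uint64_t gfn, uint64_t base_gfn, int level)
{
	int shift = 9 * (level - 1);

	return (gfn >> shift) - (base_gfn >> shift);
}

int main(void)
{
	uint64_t base_gfn = 0x1000, gfn = 0x1a37;	/* sample slot and gfn */

	/* level 2 (2MB pages): both gfns collapse to 512-page granules,
	 * so the index is correct even for unaligned slots */
	printf("level 2 index: %llu\n",
	       (unsigned long long)lpage_idx(gfn, base_gfn, 2));
	return 0;
}
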
@@ -485,12 +520,12 @@ static int has_wrprotected_page(struct kvm *kvm, | |||
485 | int level) | 520 | int level) |
486 | { | 521 | { |
487 | struct kvm_memory_slot *slot; | 522 | struct kvm_memory_slot *slot; |
488 | int *largepage_idx; | 523 | struct kvm_lpage_info *linfo; |
489 | 524 | ||
490 | slot = gfn_to_memslot(kvm, gfn); | 525 | slot = gfn_to_memslot(kvm, gfn); |
491 | if (slot) { | 526 | if (slot) { |
492 | largepage_idx = slot_largepage_idx(gfn, slot, level); | 527 | linfo = lpage_info_slot(gfn, slot, level); |
493 | return *largepage_idx; | 528 | return linfo->write_count; |
494 | } | 529 | } |
495 | 530 | ||
496 | return 1; | 531 | return 1; |
@@ -514,14 +549,28 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) | |||
514 | return ret; | 549 | return ret; |
515 | } | 550 | } |
516 | 551 | ||
517 | static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) | 552 | static struct kvm_memory_slot * |
553 | gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, | ||
554 | bool no_dirty_log) | ||
518 | { | 555 | { |
519 | struct kvm_memory_slot *slot; | 556 | struct kvm_memory_slot *slot; |
520 | int host_level, level, max_level; | ||
521 | 557 | ||
522 | slot = gfn_to_memslot(vcpu->kvm, large_gfn); | 558 | slot = gfn_to_memslot(vcpu->kvm, gfn); |
523 | if (slot && slot->dirty_bitmap) | 559 | if (!slot || slot->flags & KVM_MEMSLOT_INVALID || |
524 | return PT_PAGE_TABLE_LEVEL; | 560 | (no_dirty_log && slot->dirty_bitmap)) |
561 | slot = NULL; | ||
562 | |||
563 | return slot; | ||
564 | } | ||
565 | |||
566 | static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) | ||
567 | { | ||
568 | return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); | ||
569 | } | ||
570 | |||
571 | static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) | ||
572 | { | ||
573 | int host_level, level, max_level; | ||
525 | 574 | ||
526 | host_level = host_mapping_level(vcpu->kvm, large_gfn); | 575 | host_level = host_mapping_level(vcpu->kvm, large_gfn); |
527 | 576 | ||
@@ -545,16 +594,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) | |||
545 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) | 594 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) |
546 | { | 595 | { |
547 | struct kvm_memory_slot *slot; | 596 | struct kvm_memory_slot *slot; |
548 | unsigned long idx; | 597 | struct kvm_lpage_info *linfo; |
549 | 598 | ||
550 | slot = gfn_to_memslot(kvm, gfn); | 599 | slot = gfn_to_memslot(kvm, gfn); |
551 | if (likely(level == PT_PAGE_TABLE_LEVEL)) | 600 | if (likely(level == PT_PAGE_TABLE_LEVEL)) |
552 | return &slot->rmap[gfn - slot->base_gfn]; | 601 | return &slot->rmap[gfn - slot->base_gfn]; |
553 | 602 | ||
554 | idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - | 603 | linfo = lpage_info_slot(gfn, slot, level); |
555 | (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); | ||
556 | 604 | ||
557 | return &slot->lpage_info[level - 2][idx].rmap_pde; | 605 | return &linfo->rmap_pde; |
558 | } | 606 | } |
559 | 607 | ||
560 | /* | 608 | /* |
@@ -591,6 +639,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | |||
591 | desc->sptes[0] = (u64 *)*rmapp; | 639 | desc->sptes[0] = (u64 *)*rmapp; |
592 | desc->sptes[1] = spte; | 640 | desc->sptes[1] = spte; |
593 | *rmapp = (unsigned long)desc | 1; | 641 | *rmapp = (unsigned long)desc | 1; |
642 | ++count; | ||
594 | } else { | 643 | } else { |
595 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); | 644 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); |
596 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 645 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); |
@@ -603,7 +652,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | |||
603 | desc = desc->more; | 652 | desc = desc->more; |
604 | } | 653 | } |
605 | for (i = 0; desc->sptes[i]; ++i) | 654 | for (i = 0; desc->sptes[i]; ++i) |
606 | ; | 655 | ++count; |
607 | desc->sptes[i] = spte; | 656 | desc->sptes[i] = spte; |
608 | } | 657 | } |
609 | return count; | 658 | return count; |
@@ -645,18 +694,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
645 | gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); | 694 | gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); |
646 | rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); | 695 | rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); |
647 | if (!*rmapp) { | 696 | if (!*rmapp) { |
648 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); | 697 | printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte); |
649 | BUG(); | 698 | BUG(); |
650 | } else if (!(*rmapp & 1)) { | 699 | } else if (!(*rmapp & 1)) { |
651 | rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); | 700 | rmap_printk("rmap_remove: %p 1->0\n", spte); |
652 | if ((u64 *)*rmapp != spte) { | 701 | if ((u64 *)*rmapp != spte) { |
653 | printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", | 702 | printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); |
654 | spte, *spte); | ||
655 | BUG(); | 703 | BUG(); |
656 | } | 704 | } |
657 | *rmapp = 0; | 705 | *rmapp = 0; |
658 | } else { | 706 | } else { |
659 | rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); | 707 | rmap_printk("rmap_remove: %p many->many\n", spte); |
660 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 708 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); |
661 | prev_desc = NULL; | 709 | prev_desc = NULL; |
662 | while (desc) { | 710 | while (desc) { |
@@ -670,35 +718,36 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
670 | prev_desc = desc; | 718 | prev_desc = desc; |
671 | desc = desc->more; | 719 | desc = desc->more; |
672 | } | 720 | } |
673 | pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); | 721 | pr_err("rmap_remove: %p many->many\n", spte); |
674 | BUG(); | 722 | BUG(); |
675 | } | 723 | } |
676 | } | 724 | } |
677 | 725 | ||
678 | static void set_spte_track_bits(u64 *sptep, u64 new_spte) | 726 | static int set_spte_track_bits(u64 *sptep, u64 new_spte) |
679 | { | 727 | { |
680 | pfn_t pfn; | 728 | pfn_t pfn; |
681 | u64 old_spte = *sptep; | 729 | u64 old_spte = *sptep; |
682 | 730 | ||
683 | if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || | 731 | if (!spte_has_volatile_bits(old_spte)) |
684 | old_spte & shadow_accessed_mask) { | ||
685 | __set_spte(sptep, new_spte); | 732 | __set_spte(sptep, new_spte); |
686 | } else | 733 | else |
687 | old_spte = __xchg_spte(sptep, new_spte); | 734 | old_spte = __xchg_spte(sptep, new_spte); |
688 | 735 | ||
689 | if (!is_rmap_spte(old_spte)) | 736 | if (!is_rmap_spte(old_spte)) |
690 | return; | 737 | return 0; |
738 | |||
691 | pfn = spte_to_pfn(old_spte); | 739 | pfn = spte_to_pfn(old_spte); |
692 | if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) | 740 | if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) |
693 | kvm_set_pfn_accessed(pfn); | 741 | kvm_set_pfn_accessed(pfn); |
694 | if (is_writable_pte(old_spte)) | 742 | if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) |
695 | kvm_set_pfn_dirty(pfn); | 743 | kvm_set_pfn_dirty(pfn); |
744 | return 1; | ||
696 | } | 745 | } |
697 | 746 | ||
698 | static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) | 747 | static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) |
699 | { | 748 | { |
700 | set_spte_track_bits(sptep, new_spte); | 749 | if (set_spte_track_bits(sptep, new_spte)) |
701 | rmap_remove(kvm, sptep); | 750 | rmap_remove(kvm, sptep); |
702 | } | 751 | } |
703 | 752 | ||
704 | static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) | 753 | static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) |
@@ -746,13 +795,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
746 | } | 795 | } |
747 | spte = rmap_next(kvm, rmapp, spte); | 796 | spte = rmap_next(kvm, rmapp, spte); |
748 | } | 797 | } |
749 | if (write_protected) { | ||
750 | pfn_t pfn; | ||
751 | |||
752 | spte = rmap_next(kvm, rmapp, NULL); | ||
753 | pfn = spte_to_pfn(*spte); | ||
754 | kvm_set_pfn_dirty(pfn); | ||
755 | } | ||
756 | 798 | ||
757 | /* check for huge page mappings */ | 799 | /* check for huge page mappings */ |
758 | for (i = PT_DIRECTORY_LEVEL; | 800 | for (i = PT_DIRECTORY_LEVEL; |
@@ -848,19 +890,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | |||
848 | end = start + (memslot->npages << PAGE_SHIFT); | 890 | end = start + (memslot->npages << PAGE_SHIFT); |
849 | if (hva >= start && hva < end) { | 891 | if (hva >= start && hva < end) { |
850 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; | 892 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; |
893 | gfn_t gfn = memslot->base_gfn + gfn_offset; | ||
851 | 894 | ||
852 | ret = handler(kvm, &memslot->rmap[gfn_offset], data); | 895 | ret = handler(kvm, &memslot->rmap[gfn_offset], data); |
853 | 896 | ||
854 | for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { | 897 | for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { |
855 | unsigned long idx; | 898 | struct kvm_lpage_info *linfo; |
856 | int sh; | 899 | |
857 | 900 | linfo = lpage_info_slot(gfn, memslot, | |
858 | sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); | 901 | PT_DIRECTORY_LEVEL + j); |
859 | idx = ((memslot->base_gfn+gfn_offset) >> sh) - | 902 | ret |= handler(kvm, &linfo->rmap_pde, data); |
860 | (memslot->base_gfn >> sh); | ||
861 | ret |= handler(kvm, | ||
862 | &memslot->lpage_info[j][idx].rmap_pde, | ||
863 | data); | ||
864 | } | 903 | } |
865 | trace_kvm_age_page(hva, memslot, ret); | 904 | trace_kvm_age_page(hva, memslot, ret); |
866 | retval |= ret; | 905 | retval |= ret; |
@@ -911,6 +950,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
911 | return young; | 950 | return young; |
912 | } | 951 | } |
913 | 952 | ||
953 | static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | ||
954 | unsigned long data) | ||
955 | { | ||
956 | u64 *spte; | ||
957 | int young = 0; | ||
958 | |||
959 | /* | ||
960 | * If there's no access bit in the secondary pte set by the | ||
961 | * hardware it's up to gup-fast/gup to set the access bit in | ||
962 | * the primary pte or in the page structure. | ||
963 | */ | ||
964 | if (!shadow_accessed_mask) | ||
965 | goto out; | ||
966 | |||
967 | spte = rmap_next(kvm, rmapp, NULL); | ||
968 | while (spte) { | ||
969 | u64 _spte = *spte; | ||
970 | BUG_ON(!(_spte & PT_PRESENT_MASK)); | ||
971 | young = _spte & PT_ACCESSED_MASK; | ||
972 | if (young) { | ||
973 | young = 1; | ||
974 | break; | ||
975 | } | ||
976 | spte = rmap_next(kvm, rmapp, spte); | ||
977 | } | ||
978 | out: | ||
979 | return young; | ||
980 | } | ||
981 | |||
914 | #define RMAP_RECYCLE_THRESHOLD 1000 | 982 | #define RMAP_RECYCLE_THRESHOLD 1000 |
915 | 983 | ||
916 | static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | 984 | static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) |
@@ -931,6 +999,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva) | |||
931 | return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); | 999 | return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); |
932 | } | 1000 | } |
933 | 1001 | ||
1002 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) | ||
1003 | { | ||
1004 | return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); | ||
1005 | } | ||
1006 | |||
934 | #ifdef MMU_DEBUG | 1007 | #ifdef MMU_DEBUG |
935 | static int is_empty_shadow_page(u64 *spt) | 1008 | static int is_empty_shadow_page(u64 *spt) |
936 | { | 1009 | { |
@@ -947,16 +1020,28 @@ static int is_empty_shadow_page(u64 *spt) | |||
947 | } | 1020 | } |
948 | #endif | 1021 | #endif |
949 | 1022 | ||
1023 | /* | ||
1024 | * This value is the sum of all of the kvm instances's | ||
1025 | * kvm->arch.n_used_mmu_pages values. We need a global, | ||
1026 | * aggregate version in order to make the slab shrinker | ||
1027 | * faster | ||
1028 | */ | ||
1029 | static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) | ||
1030 | { | ||
1031 | kvm->arch.n_used_mmu_pages += nr; | ||
1032 | percpu_counter_add(&kvm_total_used_mmu_pages, nr); | ||
1033 | } | ||
1034 | |||
950 | static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1035 | static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) |
951 | { | 1036 | { |
952 | ASSERT(is_empty_shadow_page(sp->spt)); | 1037 | ASSERT(is_empty_shadow_page(sp->spt)); |
953 | hlist_del(&sp->hash_link); | 1038 | hlist_del(&sp->hash_link); |
954 | list_del(&sp->link); | 1039 | list_del(&sp->link); |
955 | __free_page(virt_to_page(sp->spt)); | 1040 | free_page((unsigned long)sp->spt); |
956 | if (!sp->role.direct) | 1041 | if (!sp->role.direct) |
957 | __free_page(virt_to_page(sp->gfns)); | 1042 | free_page((unsigned long)sp->gfns); |
958 | kmem_cache_free(mmu_page_header_cache, sp); | 1043 | kmem_cache_free(mmu_page_header_cache, sp); |
959 | ++kvm->arch.n_free_mmu_pages; | 1044 | kvm_mod_used_mmu_pages(kvm, -1); |
960 | } | 1045 | } |
961 | 1046 | ||
962 | static unsigned kvm_page_table_hashfn(gfn_t gfn) | 1047 | static unsigned kvm_page_table_hashfn(gfn_t gfn) |
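
As the comment notes, kvm_total_used_mmu_pages aggregates every VM's n_used_mmu_pages so the slab shrinker can read one global value instead of walking all VMs. A toy single-threaded model of that bookkeeping (a plain long stands in for the percpu_counter):

#include <stdio.h>

static long kvm_total_used_mmu_pages;	/* percpu_counter in the kernel */

struct kvm_arch_model {
	long n_used_mmu_pages;
};

static void mod_used_mmu_pages(struct kvm_arch_model *arch, int nr)
{
	arch->n_used_mmu_pages += nr;
	kvm_total_used_mmu_pages += nr;	/* percpu_counter_add() */
}

int main(void)
{
	struct kvm_arch_model vm1 = { 0 }, vm2 = { 0 };

	mod_used_mmu_pages(&vm1, +1);	/* kvm_mmu_alloc_page() */
	mod_used_mmu_pages(&vm2, +1);
	mod_used_mmu_pages(&vm1, -1);	/* kvm_mmu_free_page() */

	/* the shrinker only ever needs this one number */
	printf("global used mmu pages: %ld\n", kvm_total_used_mmu_pages);
	return 0;
}
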
@@ -979,7 +1064,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | |||
979 | bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); | 1064 | bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); |
980 | sp->multimapped = 0; | 1065 | sp->multimapped = 0; |
981 | sp->parent_pte = parent_pte; | 1066 | sp->parent_pte = parent_pte; |
982 | --vcpu->kvm->arch.n_free_mmu_pages; | 1067 | kvm_mod_used_mmu_pages(vcpu->kvm, +1); |
983 | return sp; | 1068 | return sp; |
984 | } | 1069 | } |
985 | 1070 | ||
@@ -1110,7 +1195,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | |||
1110 | } | 1195 | } |
1111 | 1196 | ||
1112 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, | 1197 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, |
1113 | struct kvm_mmu_page *sp, bool clear_unsync) | 1198 | struct kvm_mmu_page *sp) |
1114 | { | 1199 | { |
1115 | return 1; | 1200 | return 1; |
1116 | } | 1201 | } |
@@ -1119,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) | |||
1119 | { | 1204 | { |
1120 | } | 1205 | } |
1121 | 1206 | ||
1207 | static void nonpaging_update_pte(struct kvm_vcpu *vcpu, | ||
1208 | struct kvm_mmu_page *sp, u64 *spte, | ||
1209 | const void *pte) | ||
1210 | { | ||
1211 | WARN_ON(1); | ||
1212 | } | ||
1213 | |||
1122 | #define KVM_PAGE_ARRAY_NR 16 | 1214 | #define KVM_PAGE_ARRAY_NR 16 |
1123 | 1215 | ||
1124 | struct kvm_mmu_pages { | 1216 | struct kvm_mmu_pages { |
@@ -1240,7 +1332,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
1240 | if (clear_unsync) | 1332 | if (clear_unsync) |
1241 | kvm_unlink_unsync_page(vcpu->kvm, sp); | 1333 | kvm_unlink_unsync_page(vcpu->kvm, sp); |
1242 | 1334 | ||
1243 | if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { | 1335 | if (vcpu->arch.mmu.sync_page(vcpu, sp)) { |
1244 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); | 1336 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); |
1245 | return 1; | 1337 | return 1; |
1246 | } | 1338 | } |
@@ -1281,12 +1373,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) | |||
1281 | continue; | 1373 | continue; |
1282 | 1374 | ||
1283 | WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); | 1375 | WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); |
1376 | kvm_unlink_unsync_page(vcpu->kvm, s); | ||
1284 | if ((s->role.cr4_pae != !!is_pae(vcpu)) || | 1377 | if ((s->role.cr4_pae != !!is_pae(vcpu)) || |
1285 | (vcpu->arch.mmu.sync_page(vcpu, s, true))) { | 1378 | (vcpu->arch.mmu.sync_page(vcpu, s))) { |
1286 | kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); | 1379 | kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); |
1287 | continue; | 1380 | continue; |
1288 | } | 1381 | } |
1289 | kvm_unlink_unsync_page(vcpu->kvm, s); | ||
1290 | flush = true; | 1382 | flush = true; |
1291 | } | 1383 | } |
1292 | 1384 | ||
@@ -1403,7 +1495,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1403 | if (role.direct) | 1495 | if (role.direct) |
1404 | role.cr4_pae = 0; | 1496 | role.cr4_pae = 0; |
1405 | role.access = access; | 1497 | role.access = access; |
1406 | if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { | 1498 | if (!vcpu->arch.mmu.direct_map |
1499 | && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { | ||
1407 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); | 1500 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); |
1408 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | 1501 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; |
1409 | role.quadrant = quadrant; | 1502 | role.quadrant = quadrant; |
@@ -1458,6 +1551,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, | |||
1458 | iterator->addr = addr; | 1551 | iterator->addr = addr; |
1459 | iterator->shadow_addr = vcpu->arch.mmu.root_hpa; | 1552 | iterator->shadow_addr = vcpu->arch.mmu.root_hpa; |
1460 | iterator->level = vcpu->arch.mmu.shadow_root_level; | 1553 | iterator->level = vcpu->arch.mmu.shadow_root_level; |
1554 | |||
1555 | if (iterator->level == PT64_ROOT_LEVEL && | ||
1556 | vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && | ||
1557 | !vcpu->arch.mmu.direct_map) | ||
1558 | --iterator->level; | ||
1559 | |||
1461 | if (iterator->level == PT32E_ROOT_LEVEL) { | 1560 | if (iterator->level == PT32E_ROOT_LEVEL) { |
1462 | iterator->shadow_addr | 1561 | iterator->shadow_addr |
1463 | = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | 1562 | = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; |
@@ -1665,41 +1764,31 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, | |||
1665 | 1764 | ||
1666 | /* | 1765 | /* |
1667 | * Changing the number of mmu pages allocated to the vm | 1766 | * Changing the number of mmu pages allocated to the vm |
1668 | * Note: if kvm_nr_mmu_pages is too small, you will get dead lock | 1767 | * Note: if goal_nr_mmu_pages is too small, you will get dead lock |
1669 | */ | 1768 | */ |
1670 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) | 1769 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) |
1671 | { | 1770 | { |
1672 | int used_pages; | ||
1673 | LIST_HEAD(invalid_list); | 1771 | LIST_HEAD(invalid_list); |
1674 | |||
1675 | used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; | ||
1676 | used_pages = max(0, used_pages); | ||
1677 | |||
1678 | /* | 1772 | /* |
1679 | * If we set the number of mmu pages to be smaller be than the | 1773 | * If we set the number of mmu pages to be smaller be than the |
1680 | * number of actived pages , we must to free some mmu pages before we | 1774 | * number of actived pages , we must to free some mmu pages before we |
1681 | * change the value | 1775 | * change the value |
1682 | */ | 1776 | */ |
1683 | 1777 | ||
1684 | if (used_pages > kvm_nr_mmu_pages) { | 1778 | if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { |
1685 | while (used_pages > kvm_nr_mmu_pages && | 1779 | while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && |
1686 | !list_empty(&kvm->arch.active_mmu_pages)) { | 1780 | !list_empty(&kvm->arch.active_mmu_pages)) { |
1687 | struct kvm_mmu_page *page; | 1781 | struct kvm_mmu_page *page; |
1688 | 1782 | ||
1689 | page = container_of(kvm->arch.active_mmu_pages.prev, | 1783 | page = container_of(kvm->arch.active_mmu_pages.prev, |
1690 | struct kvm_mmu_page, link); | 1784 | struct kvm_mmu_page, link); |
1691 | used_pages -= kvm_mmu_prepare_zap_page(kvm, page, | 1785 | kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); |
1692 | &invalid_list); | 1786 | kvm_mmu_commit_zap_page(kvm, &invalid_list); |
1693 | } | 1787 | } |
1694 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | 1788 | goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; |
1695 | kvm_nr_mmu_pages = used_pages; | ||
1696 | kvm->arch.n_free_mmu_pages = 0; | ||
1697 | } | 1789 | } |
1698 | else | ||
1699 | kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages | ||
1700 | - kvm->arch.n_alloc_mmu_pages; | ||
1701 | 1790 | ||
1702 | kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; | 1791 | kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; |
1703 | } | 1792 | } |
1704 | 1793 | ||
1705 | static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | 1794 | static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) |
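
With n_used_mmu_pages tracked directly, kvm_mmu_change_mmu_pages() no longer needs the used_pages/n_free_mmu_pages juggling: it zaps pages while usage exceeds the goal, then records the result in n_max_mmu_pages. A trivial user-space model of that loop with made-up counts:

#include <stdio.h>

int main(void)
{
	unsigned int n_used_mmu_pages = 12, goal_nr_mmu_pages = 8;

	if (n_used_mmu_pages > goal_nr_mmu_pages) {
		/* zap the oldest active pages until usage fits the goal */
		while (n_used_mmu_pages > goal_nr_mmu_pages)
			n_used_mmu_pages--;	/* kvm_mmu_prepare_zap_page() + commit */
		goal_nr_mmu_pages = n_used_mmu_pages;
	}

	printf("n_max_mmu_pages is now %u\n", goal_nr_mmu_pages);
	return 0;
}
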
@@ -1709,11 +1798,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | |||
1709 | LIST_HEAD(invalid_list); | 1798 | LIST_HEAD(invalid_list); |
1710 | int r; | 1799 | int r; |
1711 | 1800 | ||
1712 | pgprintk("%s: looking for gfn %lx\n", __func__, gfn); | 1801 | pgprintk("%s: looking for gfn %llx\n", __func__, gfn); |
1713 | r = 0; | 1802 | r = 0; |
1714 | 1803 | ||
1715 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { | 1804 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { |
1716 | pgprintk("%s: gfn %lx role %x\n", __func__, gfn, | 1805 | pgprintk("%s: gfn %llx role %x\n", __func__, gfn, |
1717 | sp->role.word); | 1806 | sp->role.word); |
1718 | r = 1; | 1807 | r = 1; |
1719 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); | 1808 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); |
@@ -1729,7 +1818,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) | |||
1729 | LIST_HEAD(invalid_list); | 1818 | LIST_HEAD(invalid_list); |
1730 | 1819 | ||
1731 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { | 1820 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { |
1732 | pgprintk("%s: zap %lx %x\n", | 1821 | pgprintk("%s: zap %llx %x\n", |
1733 | __func__, gfn, sp->role.word); | 1822 | __func__, gfn, sp->role.word); |
1734 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); | 1823 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); |
1735 | } | 1824 | } |
@@ -1915,9 +2004,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1915 | unsigned pte_access, int user_fault, | 2004 | unsigned pte_access, int user_fault, |
1916 | int write_fault, int dirty, int level, | 2005 | int write_fault, int dirty, int level, |
1917 | gfn_t gfn, pfn_t pfn, bool speculative, | 2006 | gfn_t gfn, pfn_t pfn, bool speculative, |
1918 | bool can_unsync, bool reset_host_protection) | 2007 | bool can_unsync, bool host_writable) |
1919 | { | 2008 | { |
1920 | u64 spte; | 2009 | u64 spte, entry = *sptep; |
1921 | int ret = 0; | 2010 | int ret = 0; |
1922 | 2011 | ||
1923 | /* | 2012 | /* |
@@ -1925,7 +2014,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1925 | * whether the guest actually used the pte (in order to detect | 2014 | * whether the guest actually used the pte (in order to detect |
1926 | * demand paging). | 2015 | * demand paging). |
1927 | */ | 2016 | */ |
1928 | spte = shadow_base_present_pte | shadow_dirty_mask; | 2017 | spte = PT_PRESENT_MASK; |
1929 | if (!speculative) | 2018 | if (!speculative) |
1930 | spte |= shadow_accessed_mask; | 2019 | spte |= shadow_accessed_mask; |
1931 | if (!dirty) | 2020 | if (!dirty) |
@@ -1942,14 +2031,16 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1942 | spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, | 2031 | spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, |
1943 | kvm_is_mmio_pfn(pfn)); | 2032 | kvm_is_mmio_pfn(pfn)); |
1944 | 2033 | ||
1945 | if (reset_host_protection) | 2034 | if (host_writable) |
1946 | spte |= SPTE_HOST_WRITEABLE; | 2035 | spte |= SPTE_HOST_WRITEABLE; |
2036 | else | ||
2037 | pte_access &= ~ACC_WRITE_MASK; | ||
1947 | 2038 | ||
1948 | spte |= (u64)pfn << PAGE_SHIFT; | 2039 | spte |= (u64)pfn << PAGE_SHIFT; |
1949 | 2040 | ||
1950 | if ((pte_access & ACC_WRITE_MASK) | 2041 | if ((pte_access & ACC_WRITE_MASK) |
1951 | || (!tdp_enabled && write_fault && !is_write_protection(vcpu) | 2042 | || (!vcpu->arch.mmu.direct_map && write_fault |
1952 | && !user_fault)) { | 2043 | && !is_write_protection(vcpu) && !user_fault)) { |
1953 | 2044 | ||
1954 | if (level > PT_PAGE_TABLE_LEVEL && | 2045 | if (level > PT_PAGE_TABLE_LEVEL && |
1955 | has_wrprotected_page(vcpu->kvm, gfn, level)) { | 2046 | has_wrprotected_page(vcpu->kvm, gfn, level)) { |
@@ -1960,7 +2051,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1960 | 2051 | ||
1961 | spte |= PT_WRITABLE_MASK; | 2052 | spte |= PT_WRITABLE_MASK; |
1962 | 2053 | ||
1963 | if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) | 2054 | if (!vcpu->arch.mmu.direct_map |
2055 | && !(pte_access & ACC_WRITE_MASK)) | ||
1964 | spte &= ~PT_USER_MASK; | 2056 | spte &= ~PT_USER_MASK; |
1965 | 2057 | ||
1966 | /* | 2058 | /* |
@@ -1973,7 +2065,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1973 | goto set_pte; | 2065 | goto set_pte; |
1974 | 2066 | ||
1975 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { | 2067 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { |
1976 | pgprintk("%s: found shadow page for %lx, marking ro\n", | 2068 | pgprintk("%s: found shadow page for %llx, marking ro\n", |
1977 | __func__, gfn); | 2069 | __func__, gfn); |
1978 | ret = 1; | 2070 | ret = 1; |
1979 | pte_access &= ~ACC_WRITE_MASK; | 2071 | pte_access &= ~ACC_WRITE_MASK; |
@@ -1986,9 +2078,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1986 | mark_page_dirty(vcpu->kvm, gfn); | 2078 | mark_page_dirty(vcpu->kvm, gfn); |
1987 | 2079 | ||
1988 | set_pte: | 2080 | set_pte: |
1989 | if (is_writable_pte(*sptep) && !is_writable_pte(spte)) | ||
1990 | kvm_set_pfn_dirty(pfn); | ||
1991 | update_spte(sptep, spte); | 2081 | update_spte(sptep, spte); |
2082 | /* | ||
2083 | * If we overwrite a writable spte with a read-only one we | ||
2084 | * should flush remote TLBs. Otherwise rmap_write_protect | ||
2085 | * will find a read-only spte, even though the writable spte | ||
2086 | * might be cached on a CPU's TLB. | ||
2087 | */ | ||
2088 | if (is_writable_pte(entry) && !is_writable_pte(*sptep)) | ||
2089 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
1992 | done: | 2090 | done: |
1993 | return ret; | 2091 | return ret; |
1994 | } | 2092 | } |
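
The hunk above replaces the unconditional kvm_set_pfn_dirty() with a check on the old spte value: only a writable-to-read-only transition forces a remote TLB flush, because another CPU may still hold the writable translation. A small sketch of that check with made-up spte values (PT_WRITABLE_MASK is bit 1 on x86):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_WRITABLE_MASK (1ull << 1)

static bool is_writable_pte(uint64_t pte)
{
	return pte & PT_WRITABLE_MASK;
}

int main(void)
{
	uint64_t entry    = 0xabc000ull | 0x1ull | PT_WRITABLE_MASK; /* old spte */
	uint64_t new_spte = entry & ~PT_WRITABLE_MASK;	/* write access revoked */

	/* another CPU may still have the writable translation cached */
	if (is_writable_pte(entry) && !is_writable_pte(new_spte))
		printf("kvm_flush_remote_tlbs() needed\n");
	return 0;
}
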
@@ -1998,13 +2096,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1998 | int user_fault, int write_fault, int dirty, | 2096 | int user_fault, int write_fault, int dirty, |
1999 | int *ptwrite, int level, gfn_t gfn, | 2097 | int *ptwrite, int level, gfn_t gfn, |
2000 | pfn_t pfn, bool speculative, | 2098 | pfn_t pfn, bool speculative, |
2001 | bool reset_host_protection) | 2099 | bool host_writable) |
2002 | { | 2100 | { |
2003 | int was_rmapped = 0; | 2101 | int was_rmapped = 0; |
2004 | int rmap_count; | 2102 | int rmap_count; |
2005 | 2103 | ||
2006 | pgprintk("%s: spte %llx access %x write_fault %d" | 2104 | pgprintk("%s: spte %llx access %x write_fault %d" |
2007 | " user_fault %d gfn %lx\n", | 2105 | " user_fault %d gfn %llx\n", |
2008 | __func__, *sptep, pt_access, | 2106 | __func__, *sptep, pt_access, |
2009 | write_fault, user_fault, gfn); | 2107 | write_fault, user_fault, gfn); |
2010 | 2108 | ||
@@ -2023,7 +2121,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2023 | __set_spte(sptep, shadow_trap_nonpresent_pte); | 2121 | __set_spte(sptep, shadow_trap_nonpresent_pte); |
2024 | kvm_flush_remote_tlbs(vcpu->kvm); | 2122 | kvm_flush_remote_tlbs(vcpu->kvm); |
2025 | } else if (pfn != spte_to_pfn(*sptep)) { | 2123 | } else if (pfn != spte_to_pfn(*sptep)) { |
2026 | pgprintk("hfn old %lx new %lx\n", | 2124 | pgprintk("hfn old %llx new %llx\n", |
2027 | spte_to_pfn(*sptep), pfn); | 2125 | spte_to_pfn(*sptep), pfn); |
2028 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); | 2126 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); |
2029 | kvm_flush_remote_tlbs(vcpu->kvm); | 2127 | kvm_flush_remote_tlbs(vcpu->kvm); |
@@ -2033,14 +2131,14 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2033 | 2131 | ||
2034 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, | 2132 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, |
2035 | dirty, level, gfn, pfn, speculative, true, | 2133 | dirty, level, gfn, pfn, speculative, true, |
2036 | reset_host_protection)) { | 2134 | host_writable)) { |
2037 | if (write_fault) | 2135 | if (write_fault) |
2038 | *ptwrite = 1; | 2136 | *ptwrite = 1; |
2039 | kvm_mmu_flush_tlb(vcpu); | 2137 | kvm_mmu_flush_tlb(vcpu); |
2040 | } | 2138 | } |
2041 | 2139 | ||
2042 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); | 2140 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); |
2043 | pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", | 2141 | pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", |
2044 | is_large_pte(*sptep)? "2MB" : "4kB", | 2142 | is_large_pte(*sptep)? "2MB" : "4kB", |
2045 | *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, | 2143 | *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, |
2046 | *sptep, sptep); | 2144 | *sptep, sptep); |
@@ -2064,8 +2162,95 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | |||
2064 | { | 2162 | { |
2065 | } | 2163 | } |
2066 | 2164 | ||
2165 | static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, | ||
2166 | bool no_dirty_log) | ||
2167 | { | ||
2168 | struct kvm_memory_slot *slot; | ||
2169 | unsigned long hva; | ||
2170 | |||
2171 | slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); | ||
2172 | if (!slot) { | ||
2173 | get_page(bad_page); | ||
2174 | return page_to_pfn(bad_page); | ||
2175 | } | ||
2176 | |||
2177 | hva = gfn_to_hva_memslot(slot, gfn); | ||
2178 | |||
2179 | return hva_to_pfn_atomic(vcpu->kvm, hva); | ||
2180 | } | ||
2181 | |||
2182 | static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, | ||
2183 | struct kvm_mmu_page *sp, | ||
2184 | u64 *start, u64 *end) | ||
2185 | { | ||
2186 | struct page *pages[PTE_PREFETCH_NUM]; | ||
2187 | unsigned access = sp->role.access; | ||
2188 | int i, ret; | ||
2189 | gfn_t gfn; | ||
2190 | |||
2191 | gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); | ||
2192 | if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK)) | ||
2193 | return -1; | ||
2194 | |||
2195 | ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); | ||
2196 | if (ret <= 0) | ||
2197 | return -1; | ||
2198 | |||
2199 | for (i = 0; i < ret; i++, gfn++, start++) | ||
2200 | mmu_set_spte(vcpu, start, ACC_ALL, | ||
2201 | access, 0, 0, 1, NULL, | ||
2202 | sp->role.level, gfn, | ||
2203 | page_to_pfn(pages[i]), true, true); | ||
2204 | |||
2205 | return 0; | ||
2206 | } | ||
2207 | |||
2208 | static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, | ||
2209 | struct kvm_mmu_page *sp, u64 *sptep) | ||
2210 | { | ||
2211 | u64 *spte, *start = NULL; | ||
2212 | int i; | ||
2213 | |||
2214 | WARN_ON(!sp->role.direct); | ||
2215 | |||
2216 | i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); | ||
2217 | spte = sp->spt + i; | ||
2218 | |||
2219 | for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { | ||
2220 | if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { | ||
2221 | if (!start) | ||
2222 | continue; | ||
2223 | if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) | ||
2224 | break; | ||
2225 | start = NULL; | ||
2226 | } else if (!start) | ||
2227 | start = spte; | ||
2228 | } | ||
2229 | } | ||
2230 | |||
2231 | static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) | ||
2232 | { | ||
2233 | struct kvm_mmu_page *sp; | ||
2234 | |||
2235 | /* | ||
2236 | * Since it's no accessed bit on EPT, it's no way to | ||
2237 | * distinguish between actually accessed translations | ||
2238 | * and prefetched, so disable pte prefetch if EPT is | ||
2239 | * enabled. | ||
2240 | */ | ||
2241 | if (!shadow_accessed_mask) | ||
2242 | return; | ||
2243 | |||
2244 | sp = page_header(__pa(sptep)); | ||
2245 | if (sp->role.level > PT_PAGE_TABLE_LEVEL) | ||
2246 | return; | ||
2247 | |||
2248 | __direct_pte_prefetch(vcpu, sp, sptep); | ||
2249 | } | ||
2250 | |||
2067 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | 2251 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, |
2068 | int level, gfn_t gfn, pfn_t pfn) | 2252 | int map_writable, int level, gfn_t gfn, pfn_t pfn, |
2253 | bool prefault) | ||
2069 | { | 2254 | { |
2070 | struct kvm_shadow_walk_iterator iterator; | 2255 | struct kvm_shadow_walk_iterator iterator; |
2071 | struct kvm_mmu_page *sp; | 2256 | struct kvm_mmu_page *sp; |
@@ -2074,9 +2259,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2074 | 2259 | ||
2075 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { | 2260 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { |
2076 | if (iterator.level == level) { | 2261 | if (iterator.level == level) { |
2077 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, | 2262 | unsigned pte_access = ACC_ALL; |
2263 | |||
2264 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, | ||
2078 | 0, write, 1, &pt_write, | 2265 | 0, write, 1, &pt_write, |
2079 | level, gfn, pfn, false, true); | 2266 | level, gfn, pfn, prefault, map_writable); |
2267 | direct_pte_prefetch(vcpu, iterator.sptep); | ||
2080 | ++vcpu->stat.pf_fixed; | 2268 | ++vcpu->stat.pf_fixed; |
2081 | break; | 2269 | break; |
2082 | } | 2270 | } |
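
__direct_pte_prefetch() rounds the faulting spte's index down to a PTE_PREFETCH_NUM-aligned group and prefetches the still-nonpresent neighbours in that window. The window arithmetic in isolation (hypothetical index, user-space only):

#include <stdio.h>

#define PTE_PREFETCH_NUM 8

int main(void)
{
	int fault_index = 203;	/* hypothetical index of sptep in sp->spt */
	int start = fault_index & ~(PTE_PREFETCH_NUM - 1);

	/* sptes 200..207 form the window; only entries that are still
	 * shadow_trap_nonpresent_pte around the fault get prefetched */
	printf("prefetch window: [%d, %d]\n",
	       start, start + PTE_PREFETCH_NUM - 1);
	return 0;
}
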
@@ -2098,28 +2286,31 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2098 | __set_spte(iterator.sptep, | 2286 | __set_spte(iterator.sptep, |
2099 | __pa(sp->spt) | 2287 | __pa(sp->spt) |
2100 | | PT_PRESENT_MASK | PT_WRITABLE_MASK | 2288 | | PT_PRESENT_MASK | PT_WRITABLE_MASK |
2101 | | shadow_user_mask | shadow_x_mask); | 2289 | | shadow_user_mask | shadow_x_mask |
2290 | | shadow_accessed_mask); | ||
2102 | } | 2291 | } |
2103 | } | 2292 | } |
2104 | return pt_write; | 2293 | return pt_write; |
2105 | } | 2294 | } |
2106 | 2295 | ||
2107 | static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) | 2296 | static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) |
2108 | { | 2297 | { |
2109 | char buf[1]; | 2298 | siginfo_t info; |
2110 | void __user *hva; | ||
2111 | int r; | ||
2112 | 2299 | ||
2113 | /* Touch the page, so send SIGBUS */ | 2300 | info.si_signo = SIGBUS; |
2114 | hva = (void __user *)gfn_to_hva(kvm, gfn); | 2301 | info.si_errno = 0; |
2115 | r = copy_from_user(buf, hva, 1); | 2302 | info.si_code = BUS_MCEERR_AR; |
2303 | info.si_addr = (void __user *)address; | ||
2304 | info.si_addr_lsb = PAGE_SHIFT; | ||
2305 | |||
2306 | send_sig_info(SIGBUS, &info, tsk); | ||
2116 | } | 2307 | } |
2117 | 2308 | ||
2118 | static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) | 2309 | static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) |
2119 | { | 2310 | { |
2120 | kvm_release_pfn_clean(pfn); | 2311 | kvm_release_pfn_clean(pfn); |
2121 | if (is_hwpoison_pfn(pfn)) { | 2312 | if (is_hwpoison_pfn(pfn)) { |
2122 | kvm_send_hwpoison_signal(kvm, gfn); | 2313 | kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); |
2123 | return 0; | 2314 | return 0; |
2124 | } else if (is_fault_pfn(pfn)) | 2315 | } else if (is_fault_pfn(pfn)) |
2125 | return -EFAULT; | 2316 | return -EFAULT; |
@@ -2127,27 +2318,81 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) | |||
2127 | return 1; | 2318 | return 1; |
2128 | } | 2319 | } |
2129 | 2320 | ||
2130 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | 2321 | static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, |
2322 | gfn_t *gfnp, pfn_t *pfnp, int *levelp) | ||
2323 | { | ||
2324 | pfn_t pfn = *pfnp; | ||
2325 | gfn_t gfn = *gfnp; | ||
2326 | int level = *levelp; | ||
2327 | |||
2328 | /* | ||
2329 | * Check if it's a transparent hugepage. If this would be an | ||
2330 | * hugetlbfs page, level wouldn't be set to | ||
2331 | * PT_PAGE_TABLE_LEVEL and there would be no adjustment done | ||
2332 | * here. | ||
2333 | */ | ||
2334 | if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && | ||
2335 | level == PT_PAGE_TABLE_LEVEL && | ||
2336 | PageTransCompound(pfn_to_page(pfn)) && | ||
2337 | !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { | ||
2338 | unsigned long mask; | ||
2339 | /* | ||
2340 | * mmu_notifier_retry was successful and we hold the | ||
2341 | * mmu_lock here, so the pmd can't become splitting | ||
2342 | * from under us, and in turn | ||
2343 | * __split_huge_page_refcount() can't run from under | ||
2344 | * us and we can safely transfer the refcount from | ||
2345 | * PG_tail to PG_head as we switch the pfn to tail to | ||
2346 | * head. | ||
2347 | */ | ||
2348 | *levelp = level = PT_DIRECTORY_LEVEL; | ||
2349 | mask = KVM_PAGES_PER_HPAGE(level) - 1; | ||
2350 | VM_BUG_ON((gfn & mask) != (pfn & mask)); | ||
2351 | if (pfn & mask) { | ||
2352 | gfn &= ~mask; | ||
2353 | *gfnp = gfn; | ||
2354 | kvm_release_pfn_clean(pfn); | ||
2355 | pfn &= ~mask; | ||
2356 | if (!get_page_unless_zero(pfn_to_page(pfn))) | ||
2357 | BUG(); | ||
2358 | *pfnp = pfn; | ||
2359 | } | ||
2360 | } | ||
2361 | } | ||
2362 | |||
2363 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | ||
2364 | gva_t gva, pfn_t *pfn, bool write, bool *writable); | ||
2365 | |||
2366 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, | ||
2367 | bool prefault) | ||
2131 | { | 2368 | { |
2132 | int r; | 2369 | int r; |
2133 | int level; | 2370 | int level; |
2371 | int force_pt_level; | ||
2134 | pfn_t pfn; | 2372 | pfn_t pfn; |
2135 | unsigned long mmu_seq; | 2373 | unsigned long mmu_seq; |
2374 | bool map_writable; | ||
2136 | 2375 | ||
2137 | level = mapping_level(vcpu, gfn); | 2376 | force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); |
2138 | 2377 | if (likely(!force_pt_level)) { | |
2139 | /* | 2378 | level = mapping_level(vcpu, gfn); |
2140 | * This path builds a PAE pagetable - so we can map 2mb pages at | 2379 | /* |
2141 | * maximum. Therefore check if the level is larger than that. | 2380 | * This path builds a PAE pagetable - so we can map |
2142 | */ | 2381 | * 2mb pages at maximum. Therefore check if the level |
2143 | if (level > PT_DIRECTORY_LEVEL) | 2382 | * is larger than that. |
2144 | level = PT_DIRECTORY_LEVEL; | 2383 | */ |
2384 | if (level > PT_DIRECTORY_LEVEL) | ||
2385 | level = PT_DIRECTORY_LEVEL; | ||
2145 | 2386 | ||
2146 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); | 2387 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); |
2388 | } else | ||
2389 | level = PT_PAGE_TABLE_LEVEL; | ||
2147 | 2390 | ||
2148 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2391 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
2149 | smp_rmb(); | 2392 | smp_rmb(); |
2150 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2393 | |
2394 | if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) | ||
2395 | return 0; | ||
2151 | 2396 | ||
2152 | /* mmio */ | 2397 | /* mmio */ |
2153 | if (is_error_pfn(pfn)) | 2398 | if (is_error_pfn(pfn)) |
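
transparent_hugepage_adjust() promotes a 4kB mapping to a 2MB one when the backing page is a transparent hugepage: it relies on gfn and pfn being equally misaligned within the huge page, rounds both down to the head, and moves the page reference to the head page. The alignment step on its own, with sample values chosen so the low bits match as the VM_BUG_ON requires:

#include <stdint.h>
#include <stdio.h>

#define PAGES_PER_2M_HPAGE 512ull	/* KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) */

int main(void)
{
	uint64_t mask = PAGES_PER_2M_HPAGE - 1;
	uint64_t gfn = 0x12345, pfn = 0x98745;	/* both end in 0x145 */

	/* round both down to the huge-page head; the refcount is then
	 * moved from the tail page to the head page in the real code */
	printf("head gfn %#llx -> head pfn %#llx\n",
	       (unsigned long long)(gfn & ~mask),
	       (unsigned long long)(pfn & ~mask));
	return 0;
}
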
@@ -2157,7 +2402,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
2157 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 2402 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
2158 | goto out_unlock; | 2403 | goto out_unlock; |
2159 | kvm_mmu_free_some_pages(vcpu); | 2404 | kvm_mmu_free_some_pages(vcpu); |
2160 | r = __direct_map(vcpu, v, write, level, gfn, pfn); | 2405 | if (likely(!force_pt_level)) |
2406 | transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); | ||
2407 | r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, | ||
2408 | prefault); | ||
2161 | spin_unlock(&vcpu->kvm->mmu_lock); | 2409 | spin_unlock(&vcpu->kvm->mmu_lock); |
2162 | 2410 | ||
2163 | 2411 | ||
@@ -2179,7 +2427,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) | |||
2179 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | 2427 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) |
2180 | return; | 2428 | return; |
2181 | spin_lock(&vcpu->kvm->mmu_lock); | 2429 | spin_lock(&vcpu->kvm->mmu_lock); |
2182 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | 2430 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && |
2431 | (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || | ||
2432 | vcpu->arch.mmu.direct_map)) { | ||
2183 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2433 | hpa_t root = vcpu->arch.mmu.root_hpa; |
2184 | 2434 | ||
2185 | sp = page_header(root); | 2435 | sp = page_header(root); |
@@ -2222,83 +2472,163 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) | |||
2222 | return ret; | 2472 | return ret; |
2223 | } | 2473 | } |
2224 | 2474 | ||
2225 | static int mmu_alloc_roots(struct kvm_vcpu *vcpu) | 2475 | static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) |
2226 | { | 2476 | { |
2227 | int i; | ||
2228 | gfn_t root_gfn; | ||
2229 | struct kvm_mmu_page *sp; | 2477 | struct kvm_mmu_page *sp; |
2230 | int direct = 0; | 2478 | unsigned i; |
2231 | u64 pdptr; | ||
2232 | |||
2233 | root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; | ||
2234 | 2479 | ||
2235 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | 2480 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { |
2481 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2482 | kvm_mmu_free_some_pages(vcpu); | ||
2483 | sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, | ||
2484 | 1, ACC_ALL, NULL); | ||
2485 | ++sp->root_count; | ||
2486 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2487 | vcpu->arch.mmu.root_hpa = __pa(sp->spt); | ||
2488 | } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { | ||
2489 | for (i = 0; i < 4; ++i) { | ||
2490 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
2491 | |||
2492 | ASSERT(!VALID_PAGE(root)); | ||
2493 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2494 | kvm_mmu_free_some_pages(vcpu); | ||
2495 | sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), | ||
2496 | i << 30, | ||
2497 | PT32_ROOT_LEVEL, 1, ACC_ALL, | ||
2498 | NULL); | ||
2499 | root = __pa(sp->spt); | ||
2500 | ++sp->root_count; | ||
2501 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2502 | vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; | ||
2503 | } | ||
2504 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | ||
2505 | } else | ||
2506 | BUG(); | ||
2507 | |||
2508 | return 0; | ||
2509 | } | ||
2510 | |||
2511 | static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) | ||
2512 | { | ||
2513 | struct kvm_mmu_page *sp; | ||
2514 | u64 pdptr, pm_mask; | ||
2515 | gfn_t root_gfn; | ||
2516 | int i; | ||
2517 | |||
2518 | root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; | ||
2519 | |||
2520 | if (mmu_check_root(vcpu, root_gfn)) | ||
2521 | return 1; | ||
2522 | |||
2523 | /* | ||
2524 | * Do we shadow a long mode page table? If so we need to | ||
2525 | * write-protect the guests page table root. | ||
2526 | */ | ||
2527 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { | ||
2236 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2528 | hpa_t root = vcpu->arch.mmu.root_hpa; |
2237 | 2529 | ||
2238 | ASSERT(!VALID_PAGE(root)); | 2530 | ASSERT(!VALID_PAGE(root)); |
2239 | if (mmu_check_root(vcpu, root_gfn)) | 2531 | |
2240 | return 1; | ||
2241 | if (tdp_enabled) { | ||
2242 | direct = 1; | ||
2243 | root_gfn = 0; | ||
2244 | } | ||
2245 | spin_lock(&vcpu->kvm->mmu_lock); | 2532 | spin_lock(&vcpu->kvm->mmu_lock); |
2246 | kvm_mmu_free_some_pages(vcpu); | 2533 | kvm_mmu_free_some_pages(vcpu); |
2247 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, | 2534 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, |
2248 | PT64_ROOT_LEVEL, direct, | 2535 | 0, ACC_ALL, NULL); |
2249 | ACC_ALL, NULL); | ||
2250 | root = __pa(sp->spt); | 2536 | root = __pa(sp->spt); |
2251 | ++sp->root_count; | 2537 | ++sp->root_count; |
2252 | spin_unlock(&vcpu->kvm->mmu_lock); | 2538 | spin_unlock(&vcpu->kvm->mmu_lock); |
2253 | vcpu->arch.mmu.root_hpa = root; | 2539 | vcpu->arch.mmu.root_hpa = root; |
2254 | return 0; | 2540 | return 0; |
2255 | } | 2541 | } |
2256 | direct = !is_paging(vcpu); | 2542 | |
2543 | /* | ||
2544 | * We shadow a 32 bit page table. This may be a legacy 2-level | ||
2545 | * or a PAE 3-level page table. In either case we need to be aware that | ||
2546 | * the shadow page table may be a PAE or a long mode page table. | ||
2547 | */ | ||
2548 | pm_mask = PT_PRESENT_MASK; | ||
2549 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) | ||
2550 | pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; | ||
2551 | |||
2257 | for (i = 0; i < 4; ++i) { | 2552 | for (i = 0; i < 4; ++i) { |
2258 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | 2553 | hpa_t root = vcpu->arch.mmu.pae_root[i]; |
2259 | 2554 | ||
2260 | ASSERT(!VALID_PAGE(root)); | 2555 | ASSERT(!VALID_PAGE(root)); |
2261 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { | 2556 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { |
2262 | pdptr = kvm_pdptr_read(vcpu, i); | 2557 | pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i); |
2263 | if (!is_present_gpte(pdptr)) { | 2558 | if (!is_present_gpte(pdptr)) { |
2264 | vcpu->arch.mmu.pae_root[i] = 0; | 2559 | vcpu->arch.mmu.pae_root[i] = 0; |
2265 | continue; | 2560 | continue; |
2266 | } | 2561 | } |
2267 | root_gfn = pdptr >> PAGE_SHIFT; | 2562 | root_gfn = pdptr >> PAGE_SHIFT; |
2268 | } else if (vcpu->arch.mmu.root_level == 0) | 2563 | if (mmu_check_root(vcpu, root_gfn)) |
2269 | root_gfn = 0; | 2564 | return 1; |
2270 | if (mmu_check_root(vcpu, root_gfn)) | ||
2271 | return 1; | ||
2272 | if (tdp_enabled) { | ||
2273 | direct = 1; | ||
2274 | root_gfn = i << 30; | ||
2275 | } | 2565 | } |
2276 | spin_lock(&vcpu->kvm->mmu_lock); | 2566 | spin_lock(&vcpu->kvm->mmu_lock); |
2277 | kvm_mmu_free_some_pages(vcpu); | 2567 | kvm_mmu_free_some_pages(vcpu); |
2278 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | 2568 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, |
2279 | PT32_ROOT_LEVEL, direct, | 2569 | PT32_ROOT_LEVEL, 0, |
2280 | ACC_ALL, NULL); | 2570 | ACC_ALL, NULL); |
2281 | root = __pa(sp->spt); | 2571 | root = __pa(sp->spt); |
2282 | ++sp->root_count; | 2572 | ++sp->root_count; |
2283 | spin_unlock(&vcpu->kvm->mmu_lock); | 2573 | spin_unlock(&vcpu->kvm->mmu_lock); |
2284 | 2574 | ||
2285 | vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; | 2575 | vcpu->arch.mmu.pae_root[i] = root | pm_mask; |
2286 | } | 2576 | } |
2287 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | 2577 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); |
2578 | |||
2579 | /* | ||
2580 | * If we shadow a 32 bit page table with a long mode page | ||
2581 | * table we enter this path. | ||
2582 | */ | ||
2583 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
2584 | if (vcpu->arch.mmu.lm_root == NULL) { | ||
2585 | /* | ||
2586 | * The additional page necessary for this is only | ||
2587 | * allocated on demand. | ||
2588 | */ | ||
2589 | |||
2590 | u64 *lm_root; | ||
2591 | |||
2592 | lm_root = (void*)get_zeroed_page(GFP_KERNEL); | ||
2593 | if (lm_root == NULL) | ||
2594 | return 1; | ||
2595 | |||
2596 | lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; | ||
2597 | |||
2598 | vcpu->arch.mmu.lm_root = lm_root; | ||
2599 | } | ||
2600 | |||
2601 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); | ||
2602 | } | ||
2603 | |||
2288 | return 0; | 2604 | return 0; |
2289 | } | 2605 | } |
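The pm_mask handling above covers two cases: when the shadow table is itself PAE, the four pae_root entries are PDPTEs and may carry only the present bit, but when a 32-bit guest is shadowed by a long-mode table they behave like ordinary upper-level entries and also need the accessed, writable and user bits. A small self-contained illustration of that mask composition (the bit positions follow the x86 page-table format; the rest is a sketch, not kernel code):

#include <stdio.h>
#include <stdint.h>

#define PT_PRESENT_MASK		(1ULL << 0)
#define PT_WRITABLE_MASK	(1ULL << 1)
#define PT_USER_MASK		(1ULL << 2)
#define PT_ACCESSED_MASK	(1ULL << 5)

int main(void)
{
	int shadow_is_long_mode = 1;		/* shadowing a 32-bit guest with a 4-level table */
	uint64_t pm_mask = PT_PRESENT_MASK;
	uint64_t root_pa = 0x12345000ULL;	/* page-aligned address of a shadow page */

	if (shadow_is_long_mode)
		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;

	/* this is the shape of what ends up in pae_root[i] (and in lm_root[0]) */
	printf("pm_mask=%#llx entry=%#llx\n",
	       (unsigned long long)pm_mask, (unsigned long long)(root_pa | pm_mask));
	return 0;
}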
2290 | 2606 | ||
2607 | static int mmu_alloc_roots(struct kvm_vcpu *vcpu) | ||
2608 | { | ||
2609 | if (vcpu->arch.mmu.direct_map) | ||
2610 | return mmu_alloc_direct_roots(vcpu); | ||
2611 | else | ||
2612 | return mmu_alloc_shadow_roots(vcpu); | ||
2613 | } | ||
2614 | |||
2291 | static void mmu_sync_roots(struct kvm_vcpu *vcpu) | 2615 | static void mmu_sync_roots(struct kvm_vcpu *vcpu) |
2292 | { | 2616 | { |
2293 | int i; | 2617 | int i; |
2294 | struct kvm_mmu_page *sp; | 2618 | struct kvm_mmu_page *sp; |
2295 | 2619 | ||
2620 | if (vcpu->arch.mmu.direct_map) | ||
2621 | return; | ||
2622 | |||
2296 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | 2623 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) |
2297 | return; | 2624 | return; |
2298 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | 2625 | |
2626 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); | ||
2627 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { | ||
2299 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2628 | hpa_t root = vcpu->arch.mmu.root_hpa; |
2300 | sp = page_header(root); | 2629 | sp = page_header(root); |
2301 | mmu_sync_children(vcpu, sp); | 2630 | mmu_sync_children(vcpu, sp); |
2631 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); | ||
2302 | return; | 2632 | return; |
2303 | } | 2633 | } |
2304 | for (i = 0; i < 4; ++i) { | 2634 | for (i = 0; i < 4; ++i) { |
@@ -2310,6 +2640,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2310 | mmu_sync_children(vcpu, sp); | 2640 | mmu_sync_children(vcpu, sp); |
2311 | } | 2641 | } |
2312 | } | 2642 | } |
2643 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); | ||
2313 | } | 2644 | } |
2314 | 2645 | ||
2315 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | 2646 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) |
@@ -2320,15 +2651,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2320 | } | 2651 | } |
2321 | 2652 | ||
2322 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, | 2653 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, |
2323 | u32 access, u32 *error) | 2654 | u32 access, struct x86_exception *exception) |
2324 | { | 2655 | { |
2325 | if (error) | 2656 | if (exception) |
2326 | *error = 0; | 2657 | exception->error_code = 0; |
2327 | return vaddr; | 2658 | return vaddr; |
2328 | } | 2659 | } |
2329 | 2660 | ||
2661 | static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, | ||
2662 | u32 access, | ||
2663 | struct x86_exception *exception) | ||
2664 | { | ||
2665 | if (exception) | ||
2666 | exception->error_code = 0; | ||
2667 | return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); | ||
2668 | } | ||
2669 | |||
2330 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | 2670 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, |
2331 | u32 error_code) | 2671 | u32 error_code, bool prefault) |
2332 | { | 2672 | { |
2333 | gfn_t gfn; | 2673 | gfn_t gfn; |
2334 | int r; | 2674 | int r; |
@@ -2344,17 +2684,68 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | |||
2344 | gfn = gva >> PAGE_SHIFT; | 2684 | gfn = gva >> PAGE_SHIFT; |
2345 | 2685 | ||
2346 | return nonpaging_map(vcpu, gva & PAGE_MASK, | 2686 | return nonpaging_map(vcpu, gva & PAGE_MASK, |
2347 | error_code & PFERR_WRITE_MASK, gfn); | 2687 | error_code & PFERR_WRITE_MASK, gfn, prefault); |
2688 | } | ||
2689 | |||
2690 | static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) | ||
2691 | { | ||
2692 | struct kvm_arch_async_pf arch; | ||
2693 | |||
2694 | arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; | ||
2695 | arch.gfn = gfn; | ||
2696 | arch.direct_map = vcpu->arch.mmu.direct_map; | ||
2697 | arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); | ||
2698 | |||
2699 | return kvm_setup_async_pf(vcpu, gva, gfn, &arch); | ||
2700 | } | ||
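The async-PF token above packs a rolling per-vCPU counter and the vCPU id into a single word, (apf.id++ << 12) | vcpu_id, so the low 12 bits identify the vCPU and the upper bits order outstanding faults. A self-contained sketch of that packing and unpacking (the helper name is illustrative, not a kernel function):

#include <stdio.h>
#include <stdint.h>

static uint32_t apf_make_token(uint32_t *next_id, uint32_t vcpu_id)
{
	/* low 12 bits: vcpu id; upper bits: rolling per-vCPU sequence number */
	return ((*next_id)++ << 12) | (vcpu_id & 0xfff);
}

int main(void)
{
	uint32_t id = 7;			/* stands in for the per-vCPU apf.id counter */
	uint32_t token = apf_make_token(&id, 3);

	printf("token=%#x vcpu=%u seq=%u\n", token, token & 0xfff, token >> 12);
	return 0;
}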
2701 | |||
2702 | static bool can_do_async_pf(struct kvm_vcpu *vcpu) | ||
2703 | { | ||
2704 | if (unlikely(!irqchip_in_kernel(vcpu->kvm) || | ||
2705 | kvm_event_needs_reinjection(vcpu))) | ||
2706 | return false; | ||
2707 | |||
2708 | return kvm_x86_ops->interrupt_allowed(vcpu); | ||
2709 | } | ||
2710 | |||
2711 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | ||
2712 | gva_t gva, pfn_t *pfn, bool write, bool *writable) | ||
2713 | { | ||
2714 | bool async; | ||
2715 | |||
2716 | *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); | ||
2717 | |||
2718 | if (!async) | ||
2719 | return false; /* *pfn has correct page already */ | ||
2720 | |||
2721 | put_page(pfn_to_page(*pfn)); | ||
2722 | |||
2723 | if (!prefault && can_do_async_pf(vcpu)) { | ||
2724 | trace_kvm_try_async_get_page(gva, gfn); | ||
2725 | if (kvm_find_async_pf_gfn(vcpu, gfn)) { | ||
2726 | trace_kvm_async_pf_doublefault(gva, gfn); | ||
2727 | kvm_make_request(KVM_REQ_APF_HALT, vcpu); | ||
2728 | return true; | ||
2729 | } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) | ||
2730 | return true; | ||
2731 | } | ||
2732 | |||
2733 | *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); | ||
2734 | |||
2735 | return false; | ||
2348 | } | 2736 | } |
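try_async_pf() above has three outcomes: the pfn resolves without blocking (returns false with a usable pfn), an async page fault is queued or an APF halt is requested (returns true, so the fault handler bails out and the guest resumes), or async delivery is not possible and the lookup is retried synchronously (returns false after a possibly blocking gfn_to_pfn_prot()). A condensed sketch of that decision structure, with the KVM calls replaced by stand-in predicates (all names below are illustrative):

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the real lookups and checks. */
static bool lookup_would_block(void)	{ return true; }
static bool async_pf_allowed(void)	{ return true; }
static bool queue_async_fault(void)	{ return true; }

/* Returns true when the caller should bail out and let the guest run. */
static bool try_fault_async(bool prefault)
{
	if (!lookup_would_block())
		return false;		/* page already resolved */

	if (!prefault && async_pf_allowed() && queue_async_fault())
		return true;		/* fault will be completed asynchronously */

	return false;			/* fall back to a blocking lookup */
}

int main(void)
{
	printf("bail out early: %d\n", try_fault_async(false));
	return 0;
}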
2349 | 2737 | ||
2350 | static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | 2738 | static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, |
2351 | u32 error_code) | 2739 | bool prefault) |
2352 | { | 2740 | { |
2353 | pfn_t pfn; | 2741 | pfn_t pfn; |
2354 | int r; | 2742 | int r; |
2355 | int level; | 2743 | int level; |
2744 | int force_pt_level; | ||
2356 | gfn_t gfn = gpa >> PAGE_SHIFT; | 2745 | gfn_t gfn = gpa >> PAGE_SHIFT; |
2357 | unsigned long mmu_seq; | 2746 | unsigned long mmu_seq; |
2747 | int write = error_code & PFERR_WRITE_MASK; | ||
2748 | bool map_writable; | ||
2358 | 2749 | ||
2359 | ASSERT(vcpu); | 2750 | ASSERT(vcpu); |
2360 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 2751 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
@@ -2363,21 +2754,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
2363 | if (r) | 2754 | if (r) |
2364 | return r; | 2755 | return r; |
2365 | 2756 | ||
2366 | level = mapping_level(vcpu, gfn); | 2757 | force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); |
2367 | 2758 | if (likely(!force_pt_level)) { | |
2368 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); | 2759 | level = mapping_level(vcpu, gfn); |
2760 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); | ||
2761 | } else | ||
2762 | level = PT_PAGE_TABLE_LEVEL; | ||
2369 | 2763 | ||
2370 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2764 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
2371 | smp_rmb(); | 2765 | smp_rmb(); |
2372 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2766 | |
2767 | if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) | ||
2768 | return 0; | ||
2769 | |||
2770 | /* mmio */ | ||
2373 | if (is_error_pfn(pfn)) | 2771 | if (is_error_pfn(pfn)) |
2374 | return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); | 2772 | return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); |
2375 | spin_lock(&vcpu->kvm->mmu_lock); | 2773 | spin_lock(&vcpu->kvm->mmu_lock); |
2376 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 2774 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
2377 | goto out_unlock; | 2775 | goto out_unlock; |
2378 | kvm_mmu_free_some_pages(vcpu); | 2776 | kvm_mmu_free_some_pages(vcpu); |
2379 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, | 2777 | if (likely(!force_pt_level)) |
2380 | level, gfn, pfn); | 2778 | transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); |
2779 | r = __direct_map(vcpu, gpa, write, map_writable, | ||
2780 | level, gfn, pfn, prefault); | ||
2381 | spin_unlock(&vcpu->kvm->mmu_lock); | 2781 | spin_unlock(&vcpu->kvm->mmu_lock); |
2382 | 2782 | ||
2383 | return r; | 2783 | return r; |
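When a large mapping is permitted, the faulting gfn is rounded down to the first frame of the huge page with gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1). A tiny standalone example of that rounding for a level-2 (2 MiB) mapping, assuming 4 KiB base pages and 9 bits per level (constants local to this sketch):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT		12
#define HPAGE_SHIFT(level)	(PAGE_SHIFT + 9 * ((level) - 1))
#define PAGES_PER_HPAGE(level)	(1ULL << (HPAGE_SHIFT(level) - PAGE_SHIFT))

int main(void)
{
	uint64_t gpa = 0x12345678ULL;
	uint64_t gfn = gpa >> PAGE_SHIFT;
	int level = 2;						/* 2 MiB mapping */
	uint64_t aligned = gfn & ~(PAGES_PER_HPAGE(level) - 1);

	printf("gfn=%#llx aligned=%#llx (%llu base pages per huge page)\n",
	       (unsigned long long)gfn, (unsigned long long)aligned,
	       (unsigned long long)PAGES_PER_HPAGE(level));
	return 0;
}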
@@ -2393,10 +2793,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu) | |||
2393 | mmu_free_roots(vcpu); | 2793 | mmu_free_roots(vcpu); |
2394 | } | 2794 | } |
2395 | 2795 | ||
2396 | static int nonpaging_init_context(struct kvm_vcpu *vcpu) | 2796 | static int nonpaging_init_context(struct kvm_vcpu *vcpu, |
2797 | struct kvm_mmu *context) | ||
2397 | { | 2798 | { |
2398 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
2399 | |||
2400 | context->new_cr3 = nonpaging_new_cr3; | 2799 | context->new_cr3 = nonpaging_new_cr3; |
2401 | context->page_fault = nonpaging_page_fault; | 2800 | context->page_fault = nonpaging_page_fault; |
2402 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 2801 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
@@ -2404,9 +2803,12 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) | |||
2404 | context->prefetch_page = nonpaging_prefetch_page; | 2803 | context->prefetch_page = nonpaging_prefetch_page; |
2405 | context->sync_page = nonpaging_sync_page; | 2804 | context->sync_page = nonpaging_sync_page; |
2406 | context->invlpg = nonpaging_invlpg; | 2805 | context->invlpg = nonpaging_invlpg; |
2806 | context->update_pte = nonpaging_update_pte; | ||
2407 | context->root_level = 0; | 2807 | context->root_level = 0; |
2408 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 2808 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
2409 | context->root_hpa = INVALID_PAGE; | 2809 | context->root_hpa = INVALID_PAGE; |
2810 | context->direct_map = true; | ||
2811 | context->nx = false; | ||
2410 | return 0; | 2812 | return 0; |
2411 | } | 2813 | } |
2412 | 2814 | ||
@@ -2418,15 +2820,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | |||
2418 | 2820 | ||
2419 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | 2821 | static void paging_new_cr3(struct kvm_vcpu *vcpu) |
2420 | { | 2822 | { |
2421 | pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); | 2823 | pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu)); |
2422 | mmu_free_roots(vcpu); | 2824 | mmu_free_roots(vcpu); |
2423 | } | 2825 | } |
2424 | 2826 | ||
2827 | static unsigned long get_cr3(struct kvm_vcpu *vcpu) | ||
2828 | { | ||
2829 | return kvm_read_cr3(vcpu); | ||
2830 | } | ||
2831 | |||
2425 | static void inject_page_fault(struct kvm_vcpu *vcpu, | 2832 | static void inject_page_fault(struct kvm_vcpu *vcpu, |
2426 | u64 addr, | 2833 | struct x86_exception *fault) |
2427 | u32 err_code) | ||
2428 | { | 2834 | { |
2429 | kvm_inject_page_fault(vcpu, addr, err_code); | 2835 | vcpu->arch.mmu.inject_page_fault(vcpu, fault); |
2430 | } | 2836 | } |
2431 | 2837 | ||
2432 | static void paging_free(struct kvm_vcpu *vcpu) | 2838 | static void paging_free(struct kvm_vcpu *vcpu) |
@@ -2434,12 +2840,12 @@ static void paging_free(struct kvm_vcpu *vcpu) | |||
2434 | nonpaging_free(vcpu); | 2840 | nonpaging_free(vcpu); |
2435 | } | 2841 | } |
2436 | 2842 | ||
2437 | static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) | 2843 | static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) |
2438 | { | 2844 | { |
2439 | int bit7; | 2845 | int bit7; |
2440 | 2846 | ||
2441 | bit7 = (gpte >> 7) & 1; | 2847 | bit7 = (gpte >> 7) & 1; |
2442 | return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; | 2848 | return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; |
2443 | } | 2849 | } |
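is_rsvd_bits_set() consults a mask table indexed by the entry's bit 7 (the PSE/PAT position) and the paging level, and flags any guest PTE that sets a bit the CPU treats as reserved. A minimal self-contained sketch of the same check, with a rsvd_bits() helper like the kernel's; the table values below assume a 40-bit MAXPHYADDR purely for illustration:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Build a mask with bits s..e set (inclusive). */
static uint64_t rsvd_bits(int s, int e)
{
	return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
	uint64_t rsvd_bits_mask[2][4] = { { 0 } };
	uint64_t gpte = (1ULL << 45) | 0x1;	/* present entry with a too-high physical bit */
	int level = 1;
	int bit7 = (gpte >> 7) & 1;
	bool bad;

	/* pretend physical bits 40..51 are beyond MAXPHYADDR and therefore reserved */
	rsvd_bits_mask[0][0] = rsvd_bits(40, 51);
	rsvd_bits_mask[1][0] = rsvd_bits(40, 51);

	bad = (gpte & rsvd_bits_mask[bit7][level - 1]) != 0;
	printf("reserved bit set: %s\n", bad ? "yes" : "no");
	return 0;
}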
2444 | 2850 | ||
2445 | #define PTTYPE 64 | 2851 | #define PTTYPE 64 |
@@ -2450,13 +2856,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) | |||
2450 | #include "paging_tmpl.h" | 2856 | #include "paging_tmpl.h" |
2451 | #undef PTTYPE | 2857 | #undef PTTYPE |
2452 | 2858 | ||
2453 | static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) | 2859 | static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, |
2860 | struct kvm_mmu *context, | ||
2861 | int level) | ||
2454 | { | 2862 | { |
2455 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
2456 | int maxphyaddr = cpuid_maxphyaddr(vcpu); | 2863 | int maxphyaddr = cpuid_maxphyaddr(vcpu); |
2457 | u64 exb_bit_rsvd = 0; | 2864 | u64 exb_bit_rsvd = 0; |
2458 | 2865 | ||
2459 | if (!is_nx(vcpu)) | 2866 | if (!context->nx) |
2460 | exb_bit_rsvd = rsvd_bits(63, 63); | 2867 | exb_bit_rsvd = rsvd_bits(63, 63); |
2461 | switch (level) { | 2868 | switch (level) { |
2462 | case PT32_ROOT_LEVEL: | 2869 | case PT32_ROOT_LEVEL: |
@@ -2511,9 +2918,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) | |||
2511 | } | 2918 | } |
2512 | } | 2919 | } |
2513 | 2920 | ||
2514 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | 2921 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, |
2922 | struct kvm_mmu *context, | ||
2923 | int level) | ||
2515 | { | 2924 | { |
2516 | struct kvm_mmu *context = &vcpu->arch.mmu; | 2925 | context->nx = is_nx(vcpu); |
2926 | |||
2927 | reset_rsvds_bits_mask(vcpu, context, level); | ||
2517 | 2928 | ||
2518 | ASSERT(is_pae(vcpu)); | 2929 | ASSERT(is_pae(vcpu)); |
2519 | context->new_cr3 = paging_new_cr3; | 2930 | context->new_cr3 = paging_new_cr3; |
@@ -2522,24 +2933,28 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | |||
2522 | context->prefetch_page = paging64_prefetch_page; | 2933 | context->prefetch_page = paging64_prefetch_page; |
2523 | context->sync_page = paging64_sync_page; | 2934 | context->sync_page = paging64_sync_page; |
2524 | context->invlpg = paging64_invlpg; | 2935 | context->invlpg = paging64_invlpg; |
2936 | context->update_pte = paging64_update_pte; | ||
2525 | context->free = paging_free; | 2937 | context->free = paging_free; |
2526 | context->root_level = level; | 2938 | context->root_level = level; |
2527 | context->shadow_root_level = level; | 2939 | context->shadow_root_level = level; |
2528 | context->root_hpa = INVALID_PAGE; | 2940 | context->root_hpa = INVALID_PAGE; |
2941 | context->direct_map = false; | ||
2529 | return 0; | 2942 | return 0; |
2530 | } | 2943 | } |
2531 | 2944 | ||
2532 | static int paging64_init_context(struct kvm_vcpu *vcpu) | 2945 | static int paging64_init_context(struct kvm_vcpu *vcpu, |
2946 | struct kvm_mmu *context) | ||
2533 | { | 2947 | { |
2534 | reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); | 2948 | return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); |
2535 | return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); | ||
2536 | } | 2949 | } |
2537 | 2950 | ||
2538 | static int paging32_init_context(struct kvm_vcpu *vcpu) | 2951 | static int paging32_init_context(struct kvm_vcpu *vcpu, |
2952 | struct kvm_mmu *context) | ||
2539 | { | 2953 | { |
2540 | struct kvm_mmu *context = &vcpu->arch.mmu; | 2954 | context->nx = false; |
2955 | |||
2956 | reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); | ||
2541 | 2957 | ||
2542 | reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); | ||
2543 | context->new_cr3 = paging_new_cr3; | 2958 | context->new_cr3 = paging_new_cr3; |
2544 | context->page_fault = paging32_page_fault; | 2959 | context->page_fault = paging32_page_fault; |
2545 | context->gva_to_gpa = paging32_gva_to_gpa; | 2960 | context->gva_to_gpa = paging32_gva_to_gpa; |
@@ -2547,44 +2962,57 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) | |||
2547 | context->prefetch_page = paging32_prefetch_page; | 2962 | context->prefetch_page = paging32_prefetch_page; |
2548 | context->sync_page = paging32_sync_page; | 2963 | context->sync_page = paging32_sync_page; |
2549 | context->invlpg = paging32_invlpg; | 2964 | context->invlpg = paging32_invlpg; |
2965 | context->update_pte = paging32_update_pte; | ||
2550 | context->root_level = PT32_ROOT_LEVEL; | 2966 | context->root_level = PT32_ROOT_LEVEL; |
2551 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 2967 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
2552 | context->root_hpa = INVALID_PAGE; | 2968 | context->root_hpa = INVALID_PAGE; |
2969 | context->direct_map = false; | ||
2553 | return 0; | 2970 | return 0; |
2554 | } | 2971 | } |
2555 | 2972 | ||
2556 | static int paging32E_init_context(struct kvm_vcpu *vcpu) | 2973 | static int paging32E_init_context(struct kvm_vcpu *vcpu, |
2974 | struct kvm_mmu *context) | ||
2557 | { | 2975 | { |
2558 | reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); | 2976 | return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); |
2559 | return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); | ||
2560 | } | 2977 | } |
2561 | 2978 | ||
2562 | static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | 2979 | static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) |
2563 | { | 2980 | { |
2564 | struct kvm_mmu *context = &vcpu->arch.mmu; | 2981 | struct kvm_mmu *context = vcpu->arch.walk_mmu; |
2565 | 2982 | ||
2983 | context->base_role.word = 0; | ||
2566 | context->new_cr3 = nonpaging_new_cr3; | 2984 | context->new_cr3 = nonpaging_new_cr3; |
2567 | context->page_fault = tdp_page_fault; | 2985 | context->page_fault = tdp_page_fault; |
2568 | context->free = nonpaging_free; | 2986 | context->free = nonpaging_free; |
2569 | context->prefetch_page = nonpaging_prefetch_page; | 2987 | context->prefetch_page = nonpaging_prefetch_page; |
2570 | context->sync_page = nonpaging_sync_page; | 2988 | context->sync_page = nonpaging_sync_page; |
2571 | context->invlpg = nonpaging_invlpg; | 2989 | context->invlpg = nonpaging_invlpg; |
2990 | context->update_pte = nonpaging_update_pte; | ||
2572 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); | 2991 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); |
2573 | context->root_hpa = INVALID_PAGE; | 2992 | context->root_hpa = INVALID_PAGE; |
2993 | context->direct_map = true; | ||
2994 | context->set_cr3 = kvm_x86_ops->set_tdp_cr3; | ||
2995 | context->get_cr3 = get_cr3; | ||
2996 | context->inject_page_fault = kvm_inject_page_fault; | ||
2997 | context->nx = is_nx(vcpu); | ||
2574 | 2998 | ||
2575 | if (!is_paging(vcpu)) { | 2999 | if (!is_paging(vcpu)) { |
3000 | context->nx = false; | ||
2576 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 3001 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
2577 | context->root_level = 0; | 3002 | context->root_level = 0; |
2578 | } else if (is_long_mode(vcpu)) { | 3003 | } else if (is_long_mode(vcpu)) { |
2579 | reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); | 3004 | context->nx = is_nx(vcpu); |
3005 | reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); | ||
2580 | context->gva_to_gpa = paging64_gva_to_gpa; | 3006 | context->gva_to_gpa = paging64_gva_to_gpa; |
2581 | context->root_level = PT64_ROOT_LEVEL; | 3007 | context->root_level = PT64_ROOT_LEVEL; |
2582 | } else if (is_pae(vcpu)) { | 3008 | } else if (is_pae(vcpu)) { |
2583 | reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); | 3009 | context->nx = is_nx(vcpu); |
3010 | reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); | ||
2584 | context->gva_to_gpa = paging64_gva_to_gpa; | 3011 | context->gva_to_gpa = paging64_gva_to_gpa; |
2585 | context->root_level = PT32E_ROOT_LEVEL; | 3012 | context->root_level = PT32E_ROOT_LEVEL; |
2586 | } else { | 3013 | } else { |
2587 | reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); | 3014 | context->nx = false; |
3015 | reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); | ||
2588 | context->gva_to_gpa = paging32_gva_to_gpa; | 3016 | context->gva_to_gpa = paging32_gva_to_gpa; |
2589 | context->root_level = PT32_ROOT_LEVEL; | 3017 | context->root_level = PT32_ROOT_LEVEL; |
2590 | } | 3018 | } |
@@ -2592,33 +3020,81 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
2592 | return 0; | 3020 | return 0; |
2593 | } | 3021 | } |
2594 | 3022 | ||
2595 | static int init_kvm_softmmu(struct kvm_vcpu *vcpu) | 3023 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) |
2596 | { | 3024 | { |
2597 | int r; | 3025 | int r; |
2598 | |||
2599 | ASSERT(vcpu); | 3026 | ASSERT(vcpu); |
2600 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 3027 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
2601 | 3028 | ||
2602 | if (!is_paging(vcpu)) | 3029 | if (!is_paging(vcpu)) |
2603 | r = nonpaging_init_context(vcpu); | 3030 | r = nonpaging_init_context(vcpu, context); |
2604 | else if (is_long_mode(vcpu)) | 3031 | else if (is_long_mode(vcpu)) |
2605 | r = paging64_init_context(vcpu); | 3032 | r = paging64_init_context(vcpu, context); |
2606 | else if (is_pae(vcpu)) | 3033 | else if (is_pae(vcpu)) |
2607 | r = paging32E_init_context(vcpu); | 3034 | r = paging32E_init_context(vcpu, context); |
2608 | else | 3035 | else |
2609 | r = paging32_init_context(vcpu); | 3036 | r = paging32_init_context(vcpu, context); |
2610 | 3037 | ||
2611 | vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); | 3038 | vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); |
2612 | vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); | 3039 | vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); |
2613 | 3040 | ||
2614 | return r; | 3041 | return r; |
2615 | } | 3042 | } |
3043 | EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); | ||
2616 | 3044 | ||
2617 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | 3045 | static int init_kvm_softmmu(struct kvm_vcpu *vcpu) |
2618 | { | 3046 | { |
2619 | vcpu->arch.update_pte.pfn = bad_pfn; | 3047 | int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); |
2620 | 3048 | ||
2621 | if (tdp_enabled) | 3049 | vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; |
3050 | vcpu->arch.walk_mmu->get_cr3 = get_cr3; | ||
3051 | vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; | ||
3052 | |||
3053 | return r; | ||
3054 | } | ||
3055 | |||
3056 | static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) | ||
3057 | { | ||
3058 | struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; | ||
3059 | |||
3060 | g_context->get_cr3 = get_cr3; | ||
3061 | g_context->inject_page_fault = kvm_inject_page_fault; | ||
3062 | |||
3063 | /* | ||
3064 | * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The | ||
3065 | * translation of l2_gpa to l1_gpa addresses is done using the | ||
3066 | * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa | ||
3067 | * functions between mmu and nested_mmu are swapped. | ||
3068 | */ | ||
3069 | if (!is_paging(vcpu)) { | ||
3070 | g_context->nx = false; | ||
3071 | g_context->root_level = 0; | ||
3072 | g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; | ||
3073 | } else if (is_long_mode(vcpu)) { | ||
3074 | g_context->nx = is_nx(vcpu); | ||
3075 | reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); | ||
3076 | g_context->root_level = PT64_ROOT_LEVEL; | ||
3077 | g_context->gva_to_gpa = paging64_gva_to_gpa_nested; | ||
3078 | } else if (is_pae(vcpu)) { | ||
3079 | g_context->nx = is_nx(vcpu); | ||
3080 | reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); | ||
3081 | g_context->root_level = PT32E_ROOT_LEVEL; | ||
3082 | g_context->gva_to_gpa = paging64_gva_to_gpa_nested; | ||
3083 | } else { | ||
3084 | g_context->nx = false; | ||
3085 | reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); | ||
3086 | g_context->root_level = PT32_ROOT_LEVEL; | ||
3087 | g_context->gva_to_gpa = paging32_gva_to_gpa_nested; | ||
3088 | } | ||
3089 | |||
3090 | return 0; | ||
3091 | } | ||
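The comment in init_kvm_nested_mmu() describes a two-stage walk: arch.mmu.gva_to_gpa turns an L2 virtual address into an L2 guest-physical address, and nested_mmu.translate_gpa then maps that L2 gpa to an L1 gpa. A schematic, self-contained sketch of composing the two stages through function pointers (both translation functions here are trivial stand-ins, not the real walkers):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t gva_t;
typedef uint64_t gpa_t;

/* Stage 1 stand-in: L2 gva -> L2 gpa (really a walk of the L2 page tables). */
static gpa_t l2_gva_to_l2_gpa(gva_t gva)	{ return gva & ~0xfffULL; }

/* Stage 2 stand-in: L2 gpa -> L1 gpa (really a walk of the nested tables). */
static gpa_t l2_gpa_to_l1_gpa(gpa_t gpa)	{ return gpa + 0x40000000ULL; }

struct mmu_ctx {
	gpa_t (*gva_to_gpa)(gva_t);		/* plays the role of arch.mmu.gva_to_gpa */
	gpa_t (*translate_gpa)(gpa_t);		/* plays the role of nested_mmu.translate_gpa */
};

int main(void)
{
	struct mmu_ctx ctx = { l2_gva_to_l2_gpa, l2_gpa_to_l1_gpa };
	gva_t gva = 0x7f0000123456ULL;
	gpa_t l2_gpa = ctx.gva_to_gpa(gva);
	gpa_t l1_gpa = ctx.translate_gpa(l2_gpa);

	printf("l2 gva %#llx -> l2 gpa %#llx -> l1 gpa %#llx\n",
	       (unsigned long long)gva, (unsigned long long)l2_gpa,
	       (unsigned long long)l1_gpa);
	return 0;
}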
3092 | |||
3093 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | ||
3094 | { | ||
3095 | if (mmu_is_nested(vcpu)) | ||
3096 | return init_kvm_nested_mmu(vcpu); | ||
3097 | else if (tdp_enabled) | ||
2622 | return init_kvm_tdp_mmu(vcpu); | 3098 | return init_kvm_tdp_mmu(vcpu); |
2623 | else | 3099 | else |
2624 | return init_kvm_softmmu(vcpu); | 3100 | return init_kvm_softmmu(vcpu); |
@@ -2653,7 +3129,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) | |||
2653 | if (r) | 3129 | if (r) |
2654 | goto out; | 3130 | goto out; |
2655 | /* set_cr3() should ensure TLB has been flushed */ | 3131 | /* set_cr3() should ensure TLB has been flushed */ |
2656 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); | 3132 | vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); |
2657 | out: | 3133 | out: |
2658 | return r; | 3134 | return r; |
2659 | } | 3135 | } |
@@ -2663,6 +3139,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) | |||
2663 | { | 3139 | { |
2664 | mmu_free_roots(vcpu); | 3140 | mmu_free_roots(vcpu); |
2665 | } | 3141 | } |
3142 | EXPORT_SYMBOL_GPL(kvm_mmu_unload); | ||
2666 | 3143 | ||
2667 | static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | 3144 | static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, |
2668 | struct kvm_mmu_page *sp, | 3145 | struct kvm_mmu_page *sp, |
@@ -2686,8 +3163,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | |||
2686 | } | 3163 | } |
2687 | 3164 | ||
2688 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | 3165 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, |
2689 | struct kvm_mmu_page *sp, | 3166 | struct kvm_mmu_page *sp, u64 *spte, |
2690 | u64 *spte, | ||
2691 | const void *new) | 3167 | const void *new) |
2692 | { | 3168 | { |
2693 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { | 3169 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { |
@@ -2695,14 +3171,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | |||
2695 | return; | 3171 | return; |
2696 | } | 3172 | } |
2697 | 3173 | ||
2698 | if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) | ||
2699 | return; | ||
2700 | |||
2701 | ++vcpu->kvm->stat.mmu_pte_updated; | 3174 | ++vcpu->kvm->stat.mmu_pte_updated; |
2702 | if (!sp->role.cr4_pae) | 3175 | vcpu->arch.mmu.update_pte(vcpu, sp, spte, new); |
2703 | paging32_update_pte(vcpu, sp, spte, new); | ||
2704 | else | ||
2705 | paging64_update_pte(vcpu, sp, spte, new); | ||
2706 | } | 3176 | } |
2707 | 3177 | ||
2708 | static bool need_remote_flush(u64 old, u64 new) | 3178 | static bool need_remote_flush(u64 old, u64 new) |
@@ -2737,28 +3207,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) | |||
2737 | return !!(spte && (*spte & shadow_accessed_mask)); | 3207 | return !!(spte && (*spte & shadow_accessed_mask)); |
2738 | } | 3208 | } |
2739 | 3209 | ||
2740 | static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
2741 | u64 gpte) | ||
2742 | { | ||
2743 | gfn_t gfn; | ||
2744 | pfn_t pfn; | ||
2745 | |||
2746 | if (!is_present_gpte(gpte)) | ||
2747 | return; | ||
2748 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
2749 | |||
2750 | vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; | ||
2751 | smp_rmb(); | ||
2752 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | ||
2753 | |||
2754 | if (is_error_pfn(pfn)) { | ||
2755 | kvm_release_pfn_clean(pfn); | ||
2756 | return; | ||
2757 | } | ||
2758 | vcpu->arch.update_pte.gfn = gfn; | ||
2759 | vcpu->arch.update_pte.pfn = pfn; | ||
2760 | } | ||
2761 | |||
2762 | static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) | 3210 | static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) |
2763 | { | 3211 | { |
2764 | u64 *spte = vcpu->arch.last_pte_updated; | 3212 | u64 *spte = vcpu->arch.last_pte_updated; |
@@ -2780,21 +3228,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2780 | struct kvm_mmu_page *sp; | 3228 | struct kvm_mmu_page *sp; |
2781 | struct hlist_node *node; | 3229 | struct hlist_node *node; |
2782 | LIST_HEAD(invalid_list); | 3230 | LIST_HEAD(invalid_list); |
2783 | u64 entry, gentry; | 3231 | u64 entry, gentry, *spte; |
2784 | u64 *spte; | 3232 | unsigned pte_size, page_offset, misaligned, quadrant, offset; |
2785 | unsigned offset = offset_in_page(gpa); | 3233 | int level, npte, invlpg_counter, r, flooded = 0; |
2786 | unsigned pte_size; | ||
2787 | unsigned page_offset; | ||
2788 | unsigned misaligned; | ||
2789 | unsigned quadrant; | ||
2790 | int level; | ||
2791 | int flooded = 0; | ||
2792 | int npte; | ||
2793 | int r; | ||
2794 | int invlpg_counter; | ||
2795 | bool remote_flush, local_flush, zap_page; | 3234 | bool remote_flush, local_flush, zap_page; |
2796 | 3235 | ||
2797 | zap_page = remote_flush = local_flush = false; | 3236 | zap_page = remote_flush = local_flush = false; |
3237 | offset = offset_in_page(gpa); | ||
2798 | 3238 | ||
2799 | pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); | 3239 | pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); |
2800 | 3240 | ||
@@ -2802,9 +3242,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2802 | 3242 | ||
2803 | /* | 3243 | /* |
2804 | * Assume that the pte write on a page table of the same type | 3244 | * Assume that the pte write on a page table of the same type |
2805 | * as the current vcpu paging mode. This is nearly always true | 3245 | * as the current vcpu paging mode since we update the sptes only |
2806 | * (might be false while changing modes). Note it is verified later | 3246 | * when they have the same mode. |
2807 | * by update_pte(). | ||
2808 | */ | 3247 | */ |
2809 | if ((is_pae(vcpu) && bytes == 4) || !new) { | 3248 | if ((is_pae(vcpu) && bytes == 4) || !new) { |
2810 | /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ | 3249 | /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ |
@@ -2830,15 +3269,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2830 | break; | 3269 | break; |
2831 | } | 3270 | } |
2832 | 3271 | ||
2833 | mmu_guess_page_from_pte_write(vcpu, gpa, gentry); | ||
2834 | spin_lock(&vcpu->kvm->mmu_lock); | 3272 | spin_lock(&vcpu->kvm->mmu_lock); |
2835 | if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) | 3273 | if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) |
2836 | gentry = 0; | 3274 | gentry = 0; |
2837 | kvm_mmu_access_page(vcpu, gfn); | ||
2838 | kvm_mmu_free_some_pages(vcpu); | 3275 | kvm_mmu_free_some_pages(vcpu); |
2839 | ++vcpu->kvm->stat.mmu_pte_write; | 3276 | ++vcpu->kvm->stat.mmu_pte_write; |
2840 | kvm_mmu_audit(vcpu, "pre pte write"); | 3277 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); |
2841 | if (guest_initiated) { | 3278 | if (guest_initiated) { |
3279 | kvm_mmu_access_page(vcpu, gfn); | ||
2842 | if (gfn == vcpu->arch.last_pt_write_gfn | 3280 | if (gfn == vcpu->arch.last_pt_write_gfn |
2843 | && !last_updated_pte_accessed(vcpu)) { | 3281 | && !last_updated_pte_accessed(vcpu)) { |
2844 | ++vcpu->arch.last_pt_write_count; | 3282 | ++vcpu->arch.last_pt_write_count; |
@@ -2910,12 +3348,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2910 | } | 3348 | } |
2911 | mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); | 3349 | mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); |
2912 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | 3350 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); |
2913 | kvm_mmu_audit(vcpu, "post pte write"); | 3351 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); |
2914 | spin_unlock(&vcpu->kvm->mmu_lock); | 3352 | spin_unlock(&vcpu->kvm->mmu_lock); |
2915 | if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { | ||
2916 | kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); | ||
2917 | vcpu->arch.update_pte.pfn = bad_pfn; | ||
2918 | } | ||
2919 | } | 3353 | } |
2920 | 3354 | ||
2921 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | 3355 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) |
@@ -2923,7 +3357,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | |||
2923 | gpa_t gpa; | 3357 | gpa_t gpa; |
2924 | int r; | 3358 | int r; |
2925 | 3359 | ||
2926 | if (tdp_enabled) | 3360 | if (vcpu->arch.mmu.direct_map) |
2927 | return 0; | 3361 | return 0; |
2928 | 3362 | ||
2929 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); | 3363 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); |
@@ -2937,29 +3371,27 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); | |||
2937 | 3371 | ||
2938 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | 3372 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) |
2939 | { | 3373 | { |
2940 | int free_pages; | ||
2941 | LIST_HEAD(invalid_list); | 3374 | LIST_HEAD(invalid_list); |
2942 | 3375 | ||
2943 | free_pages = vcpu->kvm->arch.n_free_mmu_pages; | 3376 | while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && |
2944 | while (free_pages < KVM_REFILL_PAGES && | ||
2945 | !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { | 3377 | !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { |
2946 | struct kvm_mmu_page *sp; | 3378 | struct kvm_mmu_page *sp; |
2947 | 3379 | ||
2948 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, | 3380 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, |
2949 | struct kvm_mmu_page, link); | 3381 | struct kvm_mmu_page, link); |
2950 | free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, | 3382 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); |
2951 | &invalid_list); | 3383 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); |
2952 | ++vcpu->kvm->stat.mmu_recycled; | 3384 | ++vcpu->kvm->stat.mmu_recycled; |
2953 | } | 3385 | } |
2954 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | ||
2955 | } | 3386 | } |
2956 | 3387 | ||
2957 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | 3388 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, |
3389 | void *insn, int insn_len) | ||
2958 | { | 3390 | { |
2959 | int r; | 3391 | int r; |
2960 | enum emulation_result er; | 3392 | enum emulation_result er; |
2961 | 3393 | ||
2962 | r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); | 3394 | r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); |
2963 | if (r < 0) | 3395 | if (r < 0) |
2964 | goto out; | 3396 | goto out; |
2965 | 3397 | ||
@@ -2972,7 +3404,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | |||
2972 | if (r) | 3404 | if (r) |
2973 | goto out; | 3405 | goto out; |
2974 | 3406 | ||
2975 | er = emulate_instruction(vcpu, cr2, error_code, 0); | 3407 | er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len); |
2976 | 3408 | ||
2977 | switch (er) { | 3409 | switch (er) { |
2978 | case EMULATE_DONE: | 3410 | case EMULATE_DONE: |
@@ -3013,6 +3445,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); | |||
3013 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | 3445 | static void free_mmu_pages(struct kvm_vcpu *vcpu) |
3014 | { | 3446 | { |
3015 | free_page((unsigned long)vcpu->arch.mmu.pae_root); | 3447 | free_page((unsigned long)vcpu->arch.mmu.pae_root); |
3448 | if (vcpu->arch.mmu.lm_root != NULL) | ||
3449 | free_page((unsigned long)vcpu->arch.mmu.lm_root); | ||
3016 | } | 3450 | } |
3017 | 3451 | ||
3018 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | 3452 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) |
@@ -3054,15 +3488,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) | |||
3054 | return init_kvm_mmu(vcpu); | 3488 | return init_kvm_mmu(vcpu); |
3055 | } | 3489 | } |
3056 | 3490 | ||
3057 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | ||
3058 | { | ||
3059 | ASSERT(vcpu); | ||
3060 | |||
3061 | destroy_kvm_mmu(vcpu); | ||
3062 | free_mmu_pages(vcpu); | ||
3063 | mmu_free_memory_caches(vcpu); | ||
3064 | } | ||
3065 | |||
3066 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | 3491 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) |
3067 | { | 3492 | { |
3068 | struct kvm_mmu_page *sp; | 3493 | struct kvm_mmu_page *sp; |
@@ -3075,10 +3500,22 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
3075 | continue; | 3500 | continue; |
3076 | 3501 | ||
3077 | pt = sp->spt; | 3502 | pt = sp->spt; |
3078 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | 3503 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { |
3504 | if (!is_shadow_present_pte(pt[i]) || | ||
3505 | !is_last_spte(pt[i], sp->role.level)) | ||
3506 | continue; | ||
3507 | |||
3508 | if (is_large_pte(pt[i])) { | ||
3509 | drop_spte(kvm, &pt[i], | ||
3510 | shadow_trap_nonpresent_pte); | ||
3511 | --kvm->stat.lpages; | ||
3512 | continue; | ||
3513 | } | ||
3514 | |||
3079 | /* avoid RMW */ | 3515 | /* avoid RMW */ |
3080 | if (is_writable_pte(pt[i])) | 3516 | if (is_writable_pte(pt[i])) |
3081 | pt[i] &= ~PT_WRITABLE_MASK; | 3517 | update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); |
3518 | } | ||
3082 | } | 3519 | } |
3083 | kvm_flush_remote_tlbs(kvm); | 3520 | kvm_flush_remote_tlbs(kvm); |
3084 | } | 3521 | } |
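kvm_mmu_slot_remove_write_access() now skips non-present and non-leaf entries, drops large sptes outright, and merely clears the writable bit on ordinary leaf sptes. The core operation is the bit clear pt[i] & ~PT_WRITABLE_MASK; a trivial standalone illustration (bit 1 is the x86 writable bit; the rest is a sketch, not kernel code):

#include <stdio.h>
#include <stdint.h>

#define PT_PRESENT_MASK		(1ULL << 0)
#define PT_WRITABLE_MASK	(1ULL << 1)

int main(void)
{
	uint64_t spte = 0x12345000ULL | PT_PRESENT_MASK | PT_WRITABLE_MASK;
	uint64_t ro = spte & ~PT_WRITABLE_MASK;	/* the value update_spte() writes back */

	printf("before=%#llx after=%#llx\n",
	       (unsigned long long)spte, (unsigned long long)ro);
	return 0;
}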
@@ -3108,27 +3545,27 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, | |||
3108 | return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); | 3545 | return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); |
3109 | } | 3546 | } |
3110 | 3547 | ||
3111 | static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) | 3548 | static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) |
3112 | { | 3549 | { |
3113 | struct kvm *kvm; | 3550 | struct kvm *kvm; |
3114 | struct kvm *kvm_freed = NULL; | 3551 | struct kvm *kvm_freed = NULL; |
3115 | int cache_count = 0; | 3552 | int nr_to_scan = sc->nr_to_scan; |
3553 | |||
3554 | if (nr_to_scan == 0) | ||
3555 | goto out; | ||
3116 | 3556 | ||
3117 | spin_lock(&kvm_lock); | 3557 | raw_spin_lock(&kvm_lock); |
3118 | 3558 | ||
3119 | list_for_each_entry(kvm, &vm_list, vm_list) { | 3559 | list_for_each_entry(kvm, &vm_list, vm_list) { |
3120 | int npages, idx, freed_pages; | 3560 | int idx, freed_pages; |
3121 | LIST_HEAD(invalid_list); | 3561 | LIST_HEAD(invalid_list); |
3122 | 3562 | ||
3123 | idx = srcu_read_lock(&kvm->srcu); | 3563 | idx = srcu_read_lock(&kvm->srcu); |
3124 | spin_lock(&kvm->mmu_lock); | 3564 | spin_lock(&kvm->mmu_lock); |
3125 | npages = kvm->arch.n_alloc_mmu_pages - | 3565 | if (!kvm_freed && nr_to_scan > 0 && |
3126 | kvm->arch.n_free_mmu_pages; | 3566 | kvm->arch.n_used_mmu_pages > 0) { |
3127 | cache_count += npages; | ||
3128 | if (!kvm_freed && nr_to_scan > 0 && npages > 0) { | ||
3129 | freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, | 3567 | freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, |
3130 | &invalid_list); | 3568 | &invalid_list); |
3131 | cache_count -= freed_pages; | ||
3132 | kvm_freed = kvm; | 3569 | kvm_freed = kvm; |
3133 | } | 3570 | } |
3134 | nr_to_scan--; | 3571 | nr_to_scan--; |
@@ -3140,9 +3577,10 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) | |||
3140 | if (kvm_freed) | 3577 | if (kvm_freed) |
3141 | list_move_tail(&kvm_freed->vm_list, &vm_list); | 3578 | list_move_tail(&kvm_freed->vm_list, &vm_list); |
3142 | 3579 | ||
3143 | spin_unlock(&kvm_lock); | 3580 | raw_spin_unlock(&kvm_lock); |
3144 | 3581 | ||
3145 | return cache_count; | 3582 | out: |
3583 | return percpu_counter_read_positive(&kvm_total_used_mmu_pages); | ||
3146 | } | 3584 | } |
3147 | 3585 | ||
3148 | static struct shrinker mmu_shrinker = { | 3586 | static struct shrinker mmu_shrinker = { |
@@ -3160,12 +3598,6 @@ static void mmu_destroy_caches(void) | |||
3160 | kmem_cache_destroy(mmu_page_header_cache); | 3598 | kmem_cache_destroy(mmu_page_header_cache); |
3161 | } | 3599 | } |
3162 | 3600 | ||
3163 | void kvm_mmu_module_exit(void) | ||
3164 | { | ||
3165 | mmu_destroy_caches(); | ||
3166 | unregister_shrinker(&mmu_shrinker); | ||
3167 | } | ||
3168 | |||
3169 | int kvm_mmu_module_init(void) | 3601 | int kvm_mmu_module_init(void) |
3170 | { | 3602 | { |
3171 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", | 3603 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", |
@@ -3185,6 +3617,9 @@ int kvm_mmu_module_init(void) | |||
3185 | if (!mmu_page_header_cache) | 3617 | if (!mmu_page_header_cache) |
3186 | goto nomem; | 3618 | goto nomem; |
3187 | 3619 | ||
3620 | if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) | ||
3621 | goto nomem; | ||
3622 | |||
3188 | register_shrinker(&mmu_shrinker); | 3623 | register_shrinker(&mmu_shrinker); |
3189 | 3624 | ||
3190 | return 0; | 3625 | return 0; |
@@ -3259,7 +3694,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, | |||
3259 | 3694 | ||
3260 | static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) | 3695 | static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) |
3261 | { | 3696 | { |
3262 | (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); | 3697 | (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu)); |
3263 | return 1; | 3698 | return 1; |
3264 | } | 3699 | } |
3265 | 3700 | ||
@@ -3355,271 +3790,25 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) | |||
3355 | } | 3790 | } |
3356 | EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); | 3791 | EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); |
3357 | 3792 | ||
3358 | #ifdef AUDIT | 3793 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) |
3359 | |||
3360 | static const char *audit_msg; | ||
3361 | |||
3362 | static gva_t canonicalize(gva_t gva) | ||
3363 | { | ||
3364 | #ifdef CONFIG_X86_64 | ||
3365 | gva = (long long)(gva << 16) >> 16; | ||
3366 | #endif | ||
3367 | return gva; | ||
3368 | } | ||
3369 | |||
3370 | |||
3371 | typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); | ||
3372 | |||
3373 | static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, | ||
3374 | inspect_spte_fn fn) | ||
3375 | { | ||
3376 | int i; | ||
3377 | |||
3378 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
3379 | u64 ent = sp->spt[i]; | ||
3380 | |||
3381 | if (is_shadow_present_pte(ent)) { | ||
3382 | if (!is_last_spte(ent, sp->role.level)) { | ||
3383 | struct kvm_mmu_page *child; | ||
3384 | child = page_header(ent & PT64_BASE_ADDR_MASK); | ||
3385 | __mmu_spte_walk(kvm, child, fn); | ||
3386 | } else | ||
3387 | fn(kvm, &sp->spt[i]); | ||
3388 | } | ||
3389 | } | ||
3390 | } | ||
3391 | |||
3392 | static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) | ||
3393 | { | ||
3394 | int i; | ||
3395 | struct kvm_mmu_page *sp; | ||
3396 | |||
3397 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
3398 | return; | ||
3399 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
3400 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
3401 | sp = page_header(root); | ||
3402 | __mmu_spte_walk(vcpu->kvm, sp, fn); | ||
3403 | return; | ||
3404 | } | ||
3405 | for (i = 0; i < 4; ++i) { | ||
3406 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
3407 | |||
3408 | if (root && VALID_PAGE(root)) { | ||
3409 | root &= PT64_BASE_ADDR_MASK; | ||
3410 | sp = page_header(root); | ||
3411 | __mmu_spte_walk(vcpu->kvm, sp, fn); | ||
3412 | } | ||
3413 | } | ||
3414 | return; | ||
3415 | } | ||
3416 | |||
3417 | static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | ||
3418 | gva_t va, int level) | ||
3419 | { | ||
3420 | u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); | ||
3421 | int i; | ||
3422 | gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); | ||
3423 | |||
3424 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { | ||
3425 | u64 ent = pt[i]; | ||
3426 | |||
3427 | if (ent == shadow_trap_nonpresent_pte) | ||
3428 | continue; | ||
3429 | |||
3430 | va = canonicalize(va); | ||
3431 | if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) | ||
3432 | audit_mappings_page(vcpu, ent, va, level - 1); | ||
3433 | else { | ||
3434 | gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); | ||
3435 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
3436 | pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); | ||
3437 | hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; | ||
3438 | |||
3439 | if (is_error_pfn(pfn)) { | ||
3440 | kvm_release_pfn_clean(pfn); | ||
3441 | continue; | ||
3442 | } | ||
3443 | |||
3444 | if (is_shadow_present_pte(ent) | ||
3445 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | ||
3446 | printk(KERN_ERR "xx audit error: (%s) levels %d" | ||
3447 | " gva %lx gpa %llx hpa %llx ent %llx %d\n", | ||
3448 | audit_msg, vcpu->arch.mmu.root_level, | ||
3449 | va, gpa, hpa, ent, | ||
3450 | is_shadow_present_pte(ent)); | ||
3451 | else if (ent == shadow_notrap_nonpresent_pte | ||
3452 | && !is_error_hpa(hpa)) | ||
3453 | printk(KERN_ERR "audit: (%s) notrap shadow," | ||
3454 | " valid guest gva %lx\n", audit_msg, va); | ||
3455 | kvm_release_pfn_clean(pfn); | ||
3456 | |||
3457 | } | ||
3458 | } | ||
3459 | } | ||
3460 | |||
3461 | static void audit_mappings(struct kvm_vcpu *vcpu) | ||
3462 | { | ||
3463 | unsigned i; | ||
3464 | |||
3465 | if (vcpu->arch.mmu.root_level == 4) | ||
3466 | audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); | ||
3467 | else | ||
3468 | for (i = 0; i < 4; ++i) | ||
3469 | if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) | ||
3470 | audit_mappings_page(vcpu, | ||
3471 | vcpu->arch.mmu.pae_root[i], | ||
3472 | i << 30, | ||
3473 | 2); | ||
3474 | } | ||
3475 | |||
3476 | static int count_rmaps(struct kvm_vcpu *vcpu) | ||
3477 | { | ||
3478 | struct kvm *kvm = vcpu->kvm; | ||
3479 | struct kvm_memslots *slots; | ||
3480 | int nmaps = 0; | ||
3481 | int i, j, k, idx; | ||
3482 | |||
3483 | idx = srcu_read_lock(&kvm->srcu); | ||
3484 | slots = kvm_memslots(kvm); | ||
3485 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
3486 | struct kvm_memory_slot *m = &slots->memslots[i]; | ||
3487 | struct kvm_rmap_desc *d; | ||
3488 | |||
3489 | for (j = 0; j < m->npages; ++j) { | ||
3490 | unsigned long *rmapp = &m->rmap[j]; | ||
3491 | |||
3492 | if (!*rmapp) | ||
3493 | continue; | ||
3494 | if (!(*rmapp & 1)) { | ||
3495 | ++nmaps; | ||
3496 | continue; | ||
3497 | } | ||
3498 | d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | ||
3499 | while (d) { | ||
3500 | for (k = 0; k < RMAP_EXT; ++k) | ||
3501 | if (d->sptes[k]) | ||
3502 | ++nmaps; | ||
3503 | else | ||
3504 | break; | ||
3505 | d = d->more; | ||
3506 | } | ||
3507 | } | ||
3508 | } | ||
3509 | srcu_read_unlock(&kvm->srcu, idx); | ||
3510 | return nmaps; | ||
3511 | } | ||
3512 | |||
3513 | void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) | ||
3514 | { | ||
3515 | unsigned long *rmapp; | ||
3516 | struct kvm_mmu_page *rev_sp; | ||
3517 | gfn_t gfn; | ||
3518 | |||
3519 | if (is_writable_pte(*sptep)) { | ||
3520 | rev_sp = page_header(__pa(sptep)); | ||
3521 | gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); | ||
3522 | |||
3523 | if (!gfn_to_memslot(kvm, gfn)) { | ||
3524 | if (!printk_ratelimit()) | ||
3525 | return; | ||
3526 | printk(KERN_ERR "%s: no memslot for gfn %ld\n", | ||
3527 | audit_msg, gfn); | ||
3528 | printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", | ||
3529 | audit_msg, (long int)(sptep - rev_sp->spt), | ||
3530 | rev_sp->gfn); | ||
3531 | dump_stack(); | ||
3532 | return; | ||
3533 | } | ||
3534 | |||
3535 | rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); | ||
3536 | if (!*rmapp) { | ||
3537 | if (!printk_ratelimit()) | ||
3538 | return; | ||
3539 | printk(KERN_ERR "%s: no rmap for writable spte %llx\n", | ||
3540 | audit_msg, *sptep); | ||
3541 | dump_stack(); | ||
3542 | } | ||
3543 | } | ||
3544 | |||
3545 | } | ||
3546 | |||
3547 | void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) | ||
3548 | { | ||
3549 | mmu_spte_walk(vcpu, inspect_spte_has_rmap); | ||
3550 | } | ||
3551 | |||
3552 | static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) | ||
3553 | { | 3794 | { |
3554 | struct kvm_mmu_page *sp; | 3795 | ASSERT(vcpu); |
3555 | int i; | ||
3556 | |||
3557 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | ||
3558 | u64 *pt = sp->spt; | ||
3559 | |||
3560 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) | ||
3561 | continue; | ||
3562 | |||
3563 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
3564 | u64 ent = pt[i]; | ||
3565 | |||
3566 | if (!(ent & PT_PRESENT_MASK)) | ||
3567 | continue; | ||
3568 | if (!is_writable_pte(ent)) | ||
3569 | continue; | ||
3570 | inspect_spte_has_rmap(vcpu->kvm, &pt[i]); | ||
3571 | } | ||
3572 | } | ||
3573 | return; | ||
3574 | } | ||
3575 | 3796 | ||
3576 | static void audit_rmap(struct kvm_vcpu *vcpu) | 3797 | destroy_kvm_mmu(vcpu); |
3577 | { | 3798 | free_mmu_pages(vcpu); |
3578 | check_writable_mappings_rmap(vcpu); | 3799 | mmu_free_memory_caches(vcpu); |
3579 | count_rmaps(vcpu); | ||
3580 | } | 3800 | } |
3581 | 3801 | ||
3582 | static void audit_write_protection(struct kvm_vcpu *vcpu) | 3802 | #ifdef CONFIG_KVM_MMU_AUDIT |
3583 | { | 3803 | #include "mmu_audit.c" |
3584 | struct kvm_mmu_page *sp; | 3804 | #else |
3585 | struct kvm_memory_slot *slot; | 3805 | static void mmu_audit_disable(void) { } |
3586 | unsigned long *rmapp; | 3806 | #endif |
3587 | u64 *spte; | ||
3588 | gfn_t gfn; | ||
3589 | |||
3590 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | ||
3591 | if (sp->role.direct) | ||
3592 | continue; | ||
3593 | if (sp->unsync) | ||
3594 | continue; | ||
3595 | |||
3596 | slot = gfn_to_memslot(vcpu->kvm, sp->gfn); | ||
3597 | rmapp = &slot->rmap[gfn - slot->base_gfn]; | ||
3598 | |||
3599 | spte = rmap_next(vcpu->kvm, rmapp, NULL); | ||
3600 | while (spte) { | ||
3601 | if (is_writable_pte(*spte)) | ||
3602 | printk(KERN_ERR "%s: (%s) shadow page has " | ||
3603 | "writable mappings: gfn %lx role %x\n", | ||
3604 | __func__, audit_msg, sp->gfn, | ||
3605 | sp->role.word); | ||
3606 | spte = rmap_next(vcpu->kvm, rmapp, spte); | ||
3607 | } | ||
3608 | } | ||
3609 | } | ||
3610 | 3807 | ||
3611 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) | 3808 | void kvm_mmu_module_exit(void) |
3612 | { | 3809 | { |
3613 | int olddbg = dbg; | 3810 | mmu_destroy_caches(); |
3614 | 3811 | percpu_counter_destroy(&kvm_total_used_mmu_pages); | |
3615 | dbg = 0; | 3812 | unregister_shrinker(&mmu_shrinker); |
3616 | audit_msg = msg; | 3813 | mmu_audit_disable(); |
3617 | audit_rmap(vcpu); | ||
3618 | audit_write_protection(vcpu); | ||
3619 | if (strcmp("pre pte write", audit_msg) != 0) | ||
3620 | audit_mappings(vcpu); | ||
3621 | audit_writable_sptes_have_rmaps(vcpu); | ||
3622 | dbg = olddbg; | ||
3623 | } | 3814 | } |
3624 | |||
3625 | #endif | ||