author     Linus Torvalds <torvalds@linux-foundation.org>   2012-07-24 15:01:20 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-07-24 15:01:20 -0400
commit     5fecc9d8f59e765c2a48379dd7c6f5cf88c7d75a (patch)
tree       d1fc25d9650d3ac24591bba6f5e2e7a1afc54796 /arch/x86/kvm/mmu.c
parent     3c4cfadef6a1665d9cd02a543782d03d3e6740c6 (diff)
parent     1a577b72475d161b6677c05abe57301362023bb2 (diff)
Merge tag 'kvm-3.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Avi Kivity:
"Highlights include
- full big real mode emulation on pre-Westmere Intel hosts (can be
disabled with emulate_invalid_guest_state=0)
- relatively small ppc and s390 updates
- PCID/INVPCID support in guests
- EOI avoidance (3.6 guests should perform better on 3.6 hosts on
interrupt-intensive workloads)
- Lockless write faults during live migration
- EPT accessed/dirty bits support for new Intel processors"
Fix up conflicts in:
- Documentation/virtual/kvm/api.txt:
Stupid subchapter numbering, added next to each other.
- arch/powerpc/kvm/booke_interrupts.S:
PPC asm changes clashing with the KVM fixes
- arch/s390/include/asm/sigp.h, arch/s390/kvm/sigp.c:
Duplicated commits through the kvm tree and the s390 tree, with
subsequent edits in the KVM tree.
* tag 'kvm-3.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (93 commits)
KVM: fix race with level interrupts
x86, hyper: fix build with !CONFIG_KVM_GUEST
Revert "apic: fix kvm build on UP without IOAPIC"
KVM guest: switch to apic_set_eoi_write, apic_write
apic: add apic_set_eoi_write for PV use
KVM: VMX: Implement PCID/INVPCID for guests with EPT
KVM: Add x86_hyper_kvm to complete detect_hypervisor_platform check
KVM: PPC: Critical interrupt emulation support
KVM: PPC: e500mc: Fix tlbilx emulation for 64-bit guests
KVM: PPC64: booke: Set interrupt computation mode for 64-bit host
KVM: PPC: bookehv: Add ESR flag to Data Storage Interrupt
KVM: PPC: bookehv64: Add support for std/ld emulation.
booke: Added crit/mc exception handler for e500v2
booke/bookehv: Add host crit-watchdog exception support
KVM: MMU: document mmu-lock and fast page fault
KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint
KVM: MMU: trace fast page fault
KVM: MMU: fast path of handling guest page fault
KVM: MMU: introduce SPTE_MMU_WRITEABLE bit
KVM: MMU: fold tlb flush judgement into mmu_spte_update
...
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--  arch/x86/kvm/mmu.c | 359
1 file changed, 261 insertions(+), 98 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 57e168e27b5b..01ca00423938 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -90,7 +90,7 @@ module_param(dbg, bool, 0644);
 
 #define PTE_PREFETCH_NUM 8
 
-#define PT_FIRST_AVAIL_BITS_SHIFT 9
+#define PT_FIRST_AVAIL_BITS_SHIFT 10
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
 #define PT64_LEVEL_BITS 9
@@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);
 #define CREATE_TRACE_POINTS
 #include "mmutrace.h"
 
 #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
@@ -188,6 +189,7 @@ static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
+static void mmu_free_roots(struct kvm_vcpu *vcpu);
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 {
@@ -444,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
 }
 #endif
 
+static bool spte_is_locklessly_modifiable(u64 spte)
+{
+        return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+}
+
 static bool spte_has_volatile_bits(u64 spte)
 {
+        /*
+         * Always atomically update the spte if it can be updated
+         * out of mmu-lock: this ensures the dirty bit is not lost
+         * and also gives us a stable is_writable_pte() so that no
+         * tlb flush is missed.
+         */
+        if (spte_is_locklessly_modifiable(spte))
+                return true;
+
         if (!shadow_accessed_mask)
                 return false;
 
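An aside on the new predicate above: an spte counts as lockless-modifiable only when both software bits, SPTE_HOST_WRITEABLE and SPTE_MMU_WRITEABLE, are set. A standalone, userspace-compilable sketch of that check (illustration only, not part of the patch; the bit positions follow the PT_FIRST_AVAIL_BITS_SHIFT change at the top of this diff):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* bits 10 and 11 are the software-available bits used by this series */
#define SPTE_HOST_WRITEABLE     (1ULL << 10)    /* host page is writable          */
#define SPTE_MMU_WRITEABLE      (1ULL << 11)    /* mmu may make the spte writable */

static bool spte_is_locklessly_modifiable(uint64_t spte)
{
        /* true only when *both* software-writable bits are present */
        return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
}

int main(void)
{
        printf("%d\n", spte_is_locklessly_modifiable(SPTE_HOST_WRITEABLE));     /* 0 */
        printf("%d\n", spte_is_locklessly_modifiable(SPTE_HOST_WRITEABLE |
                                                     SPTE_MMU_WRITEABLE));      /* 1 */
        return 0;
}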
@@ -478,34 +494,47 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
 
 /* Rules for using mmu_spte_update:
  * Update the state bits, it means the mapped pfn is not changed.
+ *
+ * Whenever we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs.  Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB; the return value indicates this
+ * case.
  */
-static void mmu_spte_update(u64 *sptep, u64 new_spte)
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 {
-        u64 mask, old_spte = *sptep;
+        u64 old_spte = *sptep;
+        bool ret = false;
 
         WARN_ON(!is_rmap_spte(new_spte));
 
-        if (!is_shadow_present_pte(old_spte))
-                return mmu_spte_set(sptep, new_spte);
-
-        new_spte |= old_spte & shadow_dirty_mask;
-
-        mask = shadow_accessed_mask;
-        if (is_writable_pte(old_spte))
-                mask |= shadow_dirty_mask;
+        if (!is_shadow_present_pte(old_spte)) {
+                mmu_spte_set(sptep, new_spte);
+                return ret;
+        }
 
-        if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
+        if (!spte_has_volatile_bits(old_spte))
                 __update_clear_spte_fast(sptep, new_spte);
         else
                 old_spte = __update_clear_spte_slow(sptep, new_spte);
 
+        /*
+         * Updating the spte out of mmu-lock is safe, since we always
+         * atomically update it; see the comments in
+         * spte_has_volatile_bits().
+         */
+        if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
+                ret = true;
+
         if (!shadow_accessed_mask)
-                return;
+                return ret;
 
         if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
         if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+
+        return ret;
 }
 
 /*
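The bool return value introduced above is what lets callers fold the TLB-flush decision into the spte update itself; the set_spte() and spte_write_protect() hunks later in this diff consume it in exactly that way. A standalone model of the contract (illustration only, not kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_WRITABLE_MASK (1ULL << 1)    /* W bit of an x86 PTE */

/* model: the update reports whether a writable spte became read-only */
static bool mmu_spte_update_model(uint64_t *sptep, uint64_t new_spte)
{
        uint64_t old_spte = *sptep;

        *sptep = new_spte;
        return (old_spte & PT_WRITABLE_MASK) && !(new_spte & PT_WRITABLE_MASK);
}

int main(void)
{
        uint64_t spte = 0x1000 | PT_WRITABLE_MASK;
        bool flush = false;

        /* write-protect the spte; remember that a remote TLB flush is due */
        flush |= mmu_spte_update_model(&spte, spte & ~PT_WRITABLE_MASK);
        if (flush)
                printf("flush remote TLBs\n");  /* printed: the W bit was dropped */
        return 0;
}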
@@ -652,8 +681,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
                                 mmu_page_header_cache);
 }
 
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
-                                    size_t size)
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 {
         void *p;
 
@@ -664,8 +692,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
 
 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
 {
-        return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
-                                      sizeof(struct pte_list_desc));
+        return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
 }
 
 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
@@ -1051,35 +1078,82 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
                 rmap_remove(kvm, sptep);
 }
 
-static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
+
+static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
+{
+        if (is_large_pte(*sptep)) {
+                WARN_ON(page_header(__pa(sptep))->role.level ==
+                        PT_PAGE_TABLE_LEVEL);
+                drop_spte(kvm, sptep);
+                --kvm->stat.lpages;
+                return true;
+        }
+
+        return false;
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+        if (__drop_large_spte(vcpu->kvm, sptep))
+                kvm_flush_remote_tlbs(vcpu->kvm);
+}
+
+/*
+ * Write-protect the specified @sptep.  @pt_protect indicates whether the
+ * spte write-protection is caused by protecting the shadow page table.
+ * @flush indicates whether the TLB needs to be flushed.
+ *
+ * Note: write protection is different for dirty logging and for spte
+ * protection:
+ * - for dirty logging, the spte can be set to writable at any time if
+ *   its dirty bitmap is properly set.
+ * - for spte protection, the spte can be writable only after unsync-ing
+ *   the shadow page.
+ *
+ * Return true if the spte is dropped.
+ */
+static bool
+spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
+{
+        u64 spte = *sptep;
+
+        if (!is_writable_pte(spte) &&
+              !(pt_protect && spte_is_locklessly_modifiable(spte)))
+                return false;
+
+        rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+
+        if (__drop_large_spte(kvm, sptep)) {
+                *flush |= true;
+                return true;
+        }
+
+        if (pt_protect)
+                spte &= ~SPTE_MMU_WRITEABLE;
+        spte = spte & ~PT_WRITABLE_MASK;
+
+        *flush |= mmu_spte_update(sptep, spte);
+        return false;
+}
+
+static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
+                                 int level, bool pt_protect)
 {
         u64 *sptep;
         struct rmap_iterator iter;
-        int write_protected = 0;
+        bool flush = false;
 
         for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
                 BUG_ON(!(*sptep & PT_PRESENT_MASK));
-                rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
-
-                if (!is_writable_pte(*sptep)) {
-                        sptep = rmap_get_next(&iter);
-                        continue;
-                }
-
-                if (level == PT_PAGE_TABLE_LEVEL) {
-                        mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
-                        sptep = rmap_get_next(&iter);
-                } else {
-                        BUG_ON(!is_large_pte(*sptep));
-                        drop_spte(kvm, sptep);
-                        --kvm->stat.lpages;
+                if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
                         sptep = rmap_get_first(*rmapp, &iter);
+                        continue;
                 }
 
-                write_protected = 1;
+                sptep = rmap_get_next(&iter);
         }
 
-        return write_protected;
+        return flush;
 }
 
 /**
@@ -1100,26 +1174,26 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 
         while (mask) {
                 rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
-                __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
+                __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
 
                 /* clear the first set bit */
                 mask &= mask - 1;
         }
 }
 
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
         struct kvm_memory_slot *slot;
         unsigned long *rmapp;
         int i;
-        int write_protected = 0;
+        bool write_protected = false;
 
         slot = gfn_to_memslot(kvm, gfn);
 
         for (i = PT_PAGE_TABLE_LEVEL;
              i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
                 rmapp = __gfn_to_rmap(gfn, i, slot);
-                write_protected |= __rmap_write_protect(kvm, rmapp, i);
+                write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
         }
 
         return write_protected;
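The extra argument threaded through __rmap_write_protect() above is the pt_protect flag of spte_write_protect(): dirty logging (kvm_mmu_write_protect_pt_masked()) passes false and leaves SPTE_MMU_WRITEABLE set, so the fast page fault path may later restore the W bit without taking mmu_lock, while shadow-page protection (rmap_write_protect()) passes true and clears that bit as well. A standalone model of the distinction (illustration only):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_WRITABLE_MASK        (1ULL << 1)
#define SPTE_MMU_WRITEABLE      (1ULL << 11)

/* model of the write-protect step in spte_write_protect() */
static uint64_t write_protect_model(uint64_t spte, bool pt_protect)
{
        if (pt_protect)
                spte &= ~SPTE_MMU_WRITEABLE;    /* forbid lockless re-enable */
        return spte & ~PT_WRITABLE_MASK;        /* always drop the W bit     */
}

int main(void)
{
        uint64_t spte = PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;

        /* dirty logging: the fast page fault path may flip W back on */
        printf("%#llx\n", (unsigned long long)write_protect_model(spte, false));
        /* shadow-page protection: lockless restore is no longer allowed */
        printf("%#llx\n", (unsigned long long)write_protect_model(spte, true));
        return 0;
}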
@@ -1238,11 +1312,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
                            unsigned long data)
 {
         u64 *sptep;
-        struct rmap_iterator iter;
+        struct rmap_iterator uninitialized_var(iter);
         int young = 0;
 
         /*
-         * Emulate the accessed bit for EPT, by checking if this page has
+         * In the absence of EPT Access and Dirty Bits support,
+         * emulate the accessed bit for EPT, by checking if this page has
          * an EPT mapping, and clearing it if it does. On the next access,
          * a new EPT mapping will be established.
          * This has some overhead, but not as much as the cost of swapping
@@ -1253,11 +1328,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
         for (sptep = rmap_get_first(*rmapp, &iter); sptep;
              sptep = rmap_get_next(&iter)) {
-                BUG_ON(!(*sptep & PT_PRESENT_MASK));
+                BUG_ON(!is_shadow_present_pte(*sptep));
 
-                if (*sptep & PT_ACCESSED_MASK) {
+                if (*sptep & shadow_accessed_mask) {
                         young = 1;
-                        clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep);
+                        clear_bit((ffs(shadow_accessed_mask) - 1),
+                                  (unsigned long *)sptep);
                 }
         }
 
@@ -1281,9 +1357,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
         for (sptep = rmap_get_first(*rmapp, &iter); sptep;
              sptep = rmap_get_next(&iter)) {
-                BUG_ON(!(*sptep & PT_PRESENT_MASK));
+                BUG_ON(!is_shadow_present_pte(*sptep));
 
-                if (*sptep & PT_ACCESSED_MASK) {
+                if (*sptep & shadow_accessed_mask) {
                         young = 1;
                         break;
                 }
@@ -1401,12 +1477,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
                                                u64 *parent_pte, int direct)
 {
         struct kvm_mmu_page *sp;
-        sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
-                                    sizeof *sp);
-        sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+        sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+        sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
         if (!direct)
-                sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
-                                                  PAGE_SIZE);
+                sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
         bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
@@ -1701,7 +1775,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 
         kvm_mmu_pages_init(parent, &parents, &pages);
         while (mmu_unsync_walk(parent, &pages)) {
-                int protected = 0;
+                bool protected = false;
 
                 for_each_sp(pages, sp, parents, i)
                         protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
@@ -1866,15 +1940,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
         mmu_spte_set(sptep, spte);
 }
 
-static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
-{
-        if (is_large_pte(*sptep)) {
-                drop_spte(vcpu->kvm, sptep);
-                --vcpu->kvm->stat.lpages;
-                kvm_flush_remote_tlbs(vcpu->kvm);
-        }
-}
-
 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                                  unsigned direct_access)
 {
@@ -2243,7 +2308,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                     gfn_t gfn, pfn_t pfn, bool speculative,
                     bool can_unsync, bool host_writable)
 {
-        u64 spte, entry = *sptep;
+        u64 spte;
         int ret = 0;
 
         if (set_mmio_spte(sptep, gfn, pfn, pte_access))
@@ -2257,8 +2322,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                 spte |= shadow_x_mask;
         else
                 spte |= shadow_nx_mask;
+
         if (pte_access & ACC_USER_MASK)
                 spte |= shadow_user_mask;
+
         if (level > PT_PAGE_TABLE_LEVEL)
                 spte |= PT_PAGE_SIZE_MASK;
         if (tdp_enabled)
@@ -2283,7 +2350,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                         goto done;
                 }
 
-                spte |= PT_WRITABLE_MASK;
+                spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
 
                 if (!vcpu->arch.mmu.direct_map
                     && !(pte_access & ACC_WRITE_MASK)) {
@@ -2312,8 +2379,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                                          __func__, gfn);
                                 ret = 1;
                                 pte_access &= ~ACC_WRITE_MASK;
-                                if (is_writable_pte(spte))
-                                        spte &= ~PT_WRITABLE_MASK;
+                                spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
                         }
                 }
 
@@ -2321,14 +2387,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                 mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-        mmu_spte_update(sptep, spte);
-        /*
-         * If we overwrite a writable spte with a read-only one we
-         * should flush remote TLBs. Otherwise rmap_write_protect
-         * will find a read-only spte, even though the writable spte
-         * might be cached on a CPU's TLB.
-         */
-        if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+        if (mmu_spte_update(sptep, spte))
                 kvm_flush_remote_tlbs(vcpu->kvm);
 done:
         return ret;
@@ -2403,6 +2462,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
+        mmu_free_roots(vcpu);
 }
 
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2625,18 +2685,116 @@ exit:
         return ret;
 }
 
+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
+{
+        /*
+         * #PF can be fast only if the shadow page table is present and it
+         * is caused by write-protect; that means we just need to change the
+         * W bit of the spte, which can be done out of mmu-lock.
+         */
+        if (!(error_code & PFERR_PRESENT_MASK) ||
+              !(error_code & PFERR_WRITE_MASK))
+                return false;
+
+        return true;
+}
+
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
+{
+        struct kvm_mmu_page *sp = page_header(__pa(sptep));
+        gfn_t gfn;
+
+        WARN_ON(!sp->role.direct);
+
+        /*
+         * The gfn of a direct spte is stable since it is calculated
+         * from sp->gfn.
+         */
+        gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+        if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+                mark_page_dirty(vcpu->kvm, gfn);
+
+        return true;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu access the same address again.
+ * - false: let the real page fault path fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+                            u32 error_code)
+{
+        struct kvm_shadow_walk_iterator iterator;
+        bool ret = false;
+        u64 spte = 0ull;
+
+        if (!page_fault_can_be_fast(vcpu, error_code))
+                return false;
+
+        walk_shadow_page_lockless_begin(vcpu);
+        for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+                if (!is_shadow_present_pte(spte) || iterator.level < level)
+                        break;
+
+        /*
+         * If the mapping has been changed, let the vcpu fault on the
+         * same address again.
+         */
+        if (!is_rmap_spte(spte)) {
+                ret = true;
+                goto exit;
+        }
+
+        if (!is_last_spte(spte, level))
+                goto exit;
+
+        /*
+         * Check if it is a spurious fault caused by a lazily flushed TLB.
+         *
+         * Need not check the access of upper level table entries since
+         * they are always ACC_ALL.
+         */
+        if (is_writable_pte(spte)) {
+                ret = true;
+                goto exit;
+        }
+
+        /*
+         * Currently, to simplify the code, only an spte write-protected
+         * by dirty logging can be fast-fixed.
+         */
+        if (!spte_is_locklessly_modifiable(spte))
+                goto exit;
+
+        /*
+         * Currently, fast page fault only works for direct mapping since
+         * the gfn is not stable for an indirect shadow page.
+         * See Documentation/virtual/kvm/locking.txt for more detail.
+         */
+        ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
+exit:
+        trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
+                              spte, ret);
+        walk_shadow_page_lockless_end(vcpu);
+
+        return ret;
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                          gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
-                         bool prefault)
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+                         gfn_t gfn, bool prefault)
 {
         int r;
         int level;
         int force_pt_level;
         pfn_t pfn;
         unsigned long mmu_seq;
-        bool map_writable;
+        bool map_writable, write = error_code & PFERR_WRITE_MASK;
 
         force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
         if (likely(!force_pt_level)) {
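The core of the lockless fast path added above is the cmpxchg64() in fast_pf_fix_direct_spte(): the W bit is re-armed only if the spte still holds the value observed during the lockless walk; otherwise the fault falls back to the regular, mmu_lock-protected page fault path. A standalone sketch of that pattern (illustration only; the GCC builtin __sync_val_compare_and_swap stands in for the kernel's cmpxchg64):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_WRITABLE_MASK (1ULL << 1)    /* W bit of an x86 PTE */

/*
 * Set the W bit only if *sptep still equals the value read outside the
 * lock.  A concurrent update makes the compare-and-swap fail, and the
 * caller then takes the slow (locked) page fault path instead.
 */
static bool fast_fix_writable(uint64_t *sptep, uint64_t seen)
{
        return __sync_val_compare_and_swap(sptep, seen,
                                           seen | PT_WRITABLE_MASK) == seen;
}

int main(void)
{
        uint64_t spte = 0x1000;         /* read-only spte seen during the walk */

        printf("%d\n", fast_fix_writable(&spte, 0x1000));       /* 1: fixed    */
        printf("%d\n", fast_fix_writable(&spte, 0x1000));       /* 0: it moved */
        return 0;
}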
@@ -2653,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
         } else
                 level = PT_PAGE_TABLE_LEVEL;
 
+        if (fast_page_fault(vcpu, v, level, error_code))
+                return 0;
+
         mmu_seq = vcpu->kvm->mmu_notifier_seq;
         smp_rmb();
 
@@ -3041,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
         gfn = gva >> PAGE_SHIFT;
 
         return nonpaging_map(vcpu, gva & PAGE_MASK,
-                             error_code & PFERR_WRITE_MASK, gfn, prefault);
+                             error_code, gfn, prefault);
 }
 
 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3121,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
         } else
                 level = PT_PAGE_TABLE_LEVEL;
 
+        if (fast_page_fault(vcpu, gpa, level, error_code))
+                return 0;
+
         mmu_seq = vcpu->kvm->mmu_notifier_seq;
         smp_rmb();
 
@@ -3885,6 +4049,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
         struct kvm_mmu_page *sp;
+        bool flush = false;
 
         list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
                 int i;
@@ -3899,16 +4064,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
                             !is_last_spte(pt[i], sp->role.level))
                                 continue;
 
-                        if (is_large_pte(pt[i])) {
-                                drop_spte(kvm, &pt[i]);
-                                --kvm->stat.lpages;
-                                continue;
-                        }
-
-                        /* avoid RMW */
-                        if (is_writable_pte(pt[i]))
-                                mmu_spte_update(&pt[i],
-                                                pt[i] & ~PT_WRITABLE_MASK);
+                        spte_write_protect(kvm, &pt[i], &flush, false);
                 }
         }
         kvm_flush_remote_tlbs(kvm);
@@ -3945,7 +4101,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
         struct kvm *kvm;
-        struct kvm *kvm_freed = NULL;
         int nr_to_scan = sc->nr_to_scan;
 
         if (nr_to_scan == 0)
@@ -3957,22 +4112,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
                 int idx;
                 LIST_HEAD(invalid_list);
 
+                /*
+                 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
+                 * here. We may skip a VM instance erroneously, but we do not
+                 * want to shrink a VM that only started to populate its MMU
+                 * anyway.
+                 */
+                if (kvm->arch.n_used_mmu_pages > 0) {
+                        if (!nr_to_scan--)
+                                break;
+                        continue;
+                }
+
                 idx = srcu_read_lock(&kvm->srcu);
                 spin_lock(&kvm->mmu_lock);
-                if (!kvm_freed && nr_to_scan > 0 &&
-                    kvm->arch.n_used_mmu_pages > 0) {
-                        kvm_mmu_remove_some_alloc_mmu_pages(kvm,
-                                                            &invalid_list);
-                        kvm_freed = kvm;
-                }
-                nr_to_scan--;
 
+                kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
                 spin_unlock(&kvm->mmu_lock);
                 srcu_read_unlock(&kvm->srcu, idx);
+
+                list_move_tail(&kvm->vm_list, &vm_list);
+                break;
         }
-        if (kvm_freed)
-                list_move_tail(&kvm_freed->vm_list, &vm_list);
 
         raw_spin_unlock(&kvm_lock);
 