diff options
author | Avi Kivity <avi@qumranet.com> | 2007-01-05 19:36:43 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.osdl.org> | 2007-01-06 02:55:24 -0500 |
commit | cea0f0e7ea54753c3265dc77f605a6dad1912cfc (patch) | |
tree | e0a3e64b45fe83f1f0ae89556e1f6fcf92f07185 /drivers/kvm/paging_tmpl.h | |
parent | 25c0de2cc6c26cb99553c2444936a7951c120c09 (diff) |
[PATCH] KVM: MMU: Shadow page table caching
Define a hashtable for caching shadow page tables. Look up the cache on
context switch (cr3 change) or during page faults.
The key to the cache is a combination of
- the guest page table frame number
- the number of paging levels in the guest
* we can cache real mode, 32-bit mode, pae, and long mode page
tables simultaneously. this is useful for smp bootup.
- the guest page table level
* some kernels use a page as both a page table and a page directory. this
allows multiple shadow pages to exist for that page, one per level
- the "quadrant"
* 32-bit mode page tables span 4MB, whereas a shadow page table spans
2MB. similarly, a 32-bit page directory spans 4GB, while a shadow
page directory spans 1GB. the quadrant allows caching up to 4 shadow page
tables for one guest page in one level.
- a "metaphysical" bit
* for real mode, and for pse pages, there is no guest page table, so set
the bit to avoid write protecting the page.
Signed-off-by: Avi Kivity <avi@qumranet.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers/kvm/paging_tmpl.h')
-rw-r--r-- | drivers/kvm/paging_tmpl.h | 62 |
1 files changed, 53 insertions, 9 deletions
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 11cac9ddf26a..f7cce443ca6f 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h | |||
@@ -32,6 +32,11 @@ | |||
32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | 32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | 33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) |
34 | #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK | 34 | #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK |
35 | #ifdef CONFIG_X86_64 | ||
36 | #define PT_MAX_FULL_LEVELS 4 | ||
37 | #else | ||
38 | #define PT_MAX_FULL_LEVELS 2 | ||
39 | #endif | ||
35 | #elif PTTYPE == 32 | 40 | #elif PTTYPE == 32 |
36 | #define pt_element_t u32 | 41 | #define pt_element_t u32 |
37 | #define guest_walker guest_walker32 | 42 | #define guest_walker guest_walker32 |
@@ -42,6 +47,7 @@ | |||
42 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | 47 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
43 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | 48 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) |
44 | #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK | 49 | #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK |
50 | #define PT_MAX_FULL_LEVELS 2 | ||
45 | #else | 51 | #else |
46 | #error Invalid PTTYPE value | 52 | #error Invalid PTTYPE value |
47 | #endif | 53 | #endif |
@@ -52,7 +58,7 @@ | |||
52 | */ | 58 | */ |
53 | struct guest_walker { | 59 | struct guest_walker { |
54 | int level; | 60 | int level; |
55 | gfn_t table_gfn; | 61 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; |
56 | pt_element_t *table; | 62 | pt_element_t *table; |
57 | pt_element_t *ptep; | 63 | pt_element_t *ptep; |
58 | pt_element_t inherited_ar; | 64 | pt_element_t inherited_ar; |
@@ -68,7 +74,9 @@ static void FNAME(walk_addr)(struct guest_walker *walker, | |||
68 | struct kvm_memory_slot *slot; | 74 | struct kvm_memory_slot *slot; |
69 | pt_element_t *ptep; | 75 | pt_element_t *ptep; |
70 | pt_element_t root; | 76 | pt_element_t root; |
77 | gfn_t table_gfn; | ||
71 | 78 | ||
79 | pgprintk("%s: addr %lx\n", __FUNCTION__, addr); | ||
72 | walker->level = vcpu->mmu.root_level; | 80 | walker->level = vcpu->mmu.root_level; |
73 | walker->table = NULL; | 81 | walker->table = NULL; |
74 | root = vcpu->cr3; | 82 | root = vcpu->cr3; |
@@ -81,8 +89,11 @@ static void FNAME(walk_addr)(struct guest_walker *walker, | |||
81 | --walker->level; | 89 | --walker->level; |
82 | } | 90 | } |
83 | #endif | 91 | #endif |
84 | walker->table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | 92 | table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
85 | slot = gfn_to_memslot(vcpu->kvm, walker->table_gfn); | 93 | walker->table_gfn[walker->level - 1] = table_gfn; |
94 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
95 | walker->level - 1, table_gfn); | ||
96 | slot = gfn_to_memslot(vcpu->kvm, table_gfn); | ||
86 | hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); | 97 | hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); |
87 | walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); | 98 | walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); |
88 | 99 | ||
@@ -111,12 +122,15 @@ static void FNAME(walk_addr)(struct guest_walker *walker, | |||
111 | 122 | ||
112 | if (walker->level != 3 || is_long_mode(vcpu)) | 123 | if (walker->level != 3 || is_long_mode(vcpu)) |
113 | walker->inherited_ar &= walker->table[index]; | 124 | walker->inherited_ar &= walker->table[index]; |
114 | walker->table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | 125 | table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; |
115 | paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK); | 126 | paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK); |
116 | kunmap_atomic(walker->table, KM_USER0); | 127 | kunmap_atomic(walker->table, KM_USER0); |
117 | walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), | 128 | walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), |
118 | KM_USER0); | 129 | KM_USER0); |
119 | --walker->level; | 130 | --walker->level; |
131 | walker->table_gfn[walker->level - 1 ] = table_gfn; | ||
132 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
133 | walker->level - 1, table_gfn); | ||
120 | } | 134 | } |
121 | walker->ptep = ptep; | 135 | walker->ptep = ptep; |
122 | } | 136 | } |
@@ -181,6 +195,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
181 | u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index; | 195 | u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index; |
182 | struct kvm_mmu_page *shadow_page; | 196 | struct kvm_mmu_page *shadow_page; |
183 | u64 shadow_pte; | 197 | u64 shadow_pte; |
198 | int metaphysical; | ||
199 | gfn_t table_gfn; | ||
184 | 200 | ||
185 | if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { | 201 | if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { |
186 | if (level == PT_PAGE_TABLE_LEVEL) | 202 | if (level == PT_PAGE_TABLE_LEVEL) |
@@ -205,7 +221,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
205 | return shadow_ent; | 221 | return shadow_ent; |
206 | } | 222 | } |
207 | 223 | ||
208 | shadow_page = kvm_mmu_alloc_page(vcpu, shadow_ent); | 224 | if (level - 1 == PT_PAGE_TABLE_LEVEL |
225 | && walker->level == PT_DIRECTORY_LEVEL) { | ||
226 | metaphysical = 1; | ||
227 | table_gfn = (*guest_ent & PT_BASE_ADDR_MASK) | ||
228 | >> PAGE_SHIFT; | ||
229 | } else { | ||
230 | metaphysical = 0; | ||
231 | table_gfn = walker->table_gfn[level - 2]; | ||
232 | } | ||
233 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, | ||
234 | metaphysical, shadow_ent); | ||
209 | if (!shadow_page) | 235 | if (!shadow_page) |
210 | return ERR_PTR(-ENOMEM); | 236 | return ERR_PTR(-ENOMEM); |
211 | shadow_addr = shadow_page->page_hpa; | 237 | shadow_addr = shadow_page->page_hpa; |
@@ -227,7 +253,8 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, | |||
227 | u64 *shadow_ent, | 253 | u64 *shadow_ent, |
228 | struct guest_walker *walker, | 254 | struct guest_walker *walker, |
229 | gva_t addr, | 255 | gva_t addr, |
230 | int user) | 256 | int user, |
257 | int *write_pt) | ||
231 | { | 258 | { |
232 | pt_element_t *guest_ent; | 259 | pt_element_t *guest_ent; |
233 | int writable_shadow; | 260 | int writable_shadow; |
@@ -264,6 +291,12 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, | |||
264 | } | 291 | } |
265 | 292 | ||
266 | gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | 293 | gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
294 | if (kvm_mmu_lookup_page(vcpu, gfn)) { | ||
295 | pgprintk("%s: found shadow page for %lx, marking ro\n", | ||
296 | __FUNCTION__, gfn); | ||
297 | *write_pt = 1; | ||
298 | return 0; | ||
299 | } | ||
267 | mark_page_dirty(vcpu->kvm, gfn); | 300 | mark_page_dirty(vcpu->kvm, gfn); |
268 | *shadow_ent |= PT_WRITABLE_MASK; | 301 | *shadow_ent |= PT_WRITABLE_MASK; |
269 | *guest_ent |= PT_DIRTY_MASK; | 302 | *guest_ent |= PT_DIRTY_MASK; |
@@ -294,7 +327,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
294 | struct guest_walker walker; | 327 | struct guest_walker walker; |
295 | u64 *shadow_pte; | 328 | u64 *shadow_pte; |
296 | int fixed; | 329 | int fixed; |
330 | int write_pt = 0; | ||
297 | 331 | ||
332 | pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); | ||
298 | /* | 333 | /* |
299 | * Look up the shadow pte for the faulting address. | 334 | * Look up the shadow pte for the faulting address. |
300 | */ | 335 | */ |
@@ -302,6 +337,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
302 | FNAME(walk_addr)(&walker, vcpu, addr); | 337 | FNAME(walk_addr)(&walker, vcpu, addr); |
303 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker); | 338 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker); |
304 | if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */ | 339 | if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */ |
340 | printk("%s: oom\n", __FUNCTION__); | ||
305 | nonpaging_flush(vcpu); | 341 | nonpaging_flush(vcpu); |
306 | FNAME(release_walker)(&walker); | 342 | FNAME(release_walker)(&walker); |
307 | continue; | 343 | continue; |
@@ -313,20 +349,27 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
313 | * The page is not mapped by the guest. Let the guest handle it. | 349 | * The page is not mapped by the guest. Let the guest handle it. |
314 | */ | 350 | */ |
315 | if (!shadow_pte) { | 351 | if (!shadow_pte) { |
352 | pgprintk("%s: not mapped\n", __FUNCTION__); | ||
316 | inject_page_fault(vcpu, addr, error_code); | 353 | inject_page_fault(vcpu, addr, error_code); |
317 | FNAME(release_walker)(&walker); | 354 | FNAME(release_walker)(&walker); |
318 | return 0; | 355 | return 0; |
319 | } | 356 | } |
320 | 357 | ||
358 | pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__, | ||
359 | shadow_pte, *shadow_pte); | ||
360 | |||
321 | /* | 361 | /* |
322 | * Update the shadow pte. | 362 | * Update the shadow pte. |
323 | */ | 363 | */ |
324 | if (write_fault) | 364 | if (write_fault) |
325 | fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, | 365 | fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, |
326 | user_fault); | 366 | user_fault, &write_pt); |
327 | else | 367 | else |
328 | fixed = fix_read_pf(shadow_pte); | 368 | fixed = fix_read_pf(shadow_pte); |
329 | 369 | ||
370 | pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__, | ||
371 | shadow_pte, *shadow_pte); | ||
372 | |||
330 | FNAME(release_walker)(&walker); | 373 | FNAME(release_walker)(&walker); |
331 | 374 | ||
332 | /* | 375 | /* |
@@ -344,14 +387,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
344 | /* | 387 | /* |
345 | * pte not present, guest page fault. | 388 | * pte not present, guest page fault. |
346 | */ | 389 | */ |
347 | if (pte_present && !fixed) { | 390 | if (pte_present && !fixed && !write_pt) { |
348 | inject_page_fault(vcpu, addr, error_code); | 391 | inject_page_fault(vcpu, addr, error_code); |
349 | return 0; | 392 | return 0; |
350 | } | 393 | } |
351 | 394 | ||
352 | ++kvm_stat.pf_fixed; | 395 | ++kvm_stat.pf_fixed; |
353 | 396 | ||
354 | return 0; | 397 | return write_pt; |
355 | } | 398 | } |
356 | 399 | ||
357 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | 400 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) |
@@ -395,3 +438,4 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | |||
395 | #undef PT_PTE_COPY_MASK | 438 | #undef PT_PTE_COPY_MASK |
396 | #undef PT_NON_PTE_COPY_MASK | 439 | #undef PT_NON_PTE_COPY_MASK |
397 | #undef PT_DIR_BASE_ADDR_MASK | 440 | #undef PT_DIR_BASE_ADDR_MASK |
441 | #undef PT_MAX_FULL_LEVELS | ||