diff options
Diffstat (limited to 'arch/x86/kvm/paging_tmpl.h')
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 461 |
1 files changed, 461 insertions, 0 deletions
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h new file mode 100644 index 000000000000..56b88f7e83ef --- /dev/null +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -0,0 +1,461 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * MMU support | ||
8 | * | ||
9 | * Copyright (C) 2006 Qumranet, Inc. | ||
10 | * | ||
11 | * Authors: | ||
12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * | ||
15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
16 | * the COPYING file in the top-level directory. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * We need the mmu code to access both 32-bit and 64-bit guest ptes, | ||
22 | * so the code in this file is compiled twice, once per pte size. | ||
23 | */ | ||
24 | |||
25 | #if PTTYPE == 64 | ||
26 | #define pt_element_t u64 | ||
27 | #define guest_walker guest_walker64 | ||
28 | #define FNAME(name) paging##64_##name | ||
29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | ||
30 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK | ||
31 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | ||
34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS | ||
35 | #ifdef CONFIG_X86_64 | ||
36 | #define PT_MAX_FULL_LEVELS 4 | ||
37 | #define CMPXCHG cmpxchg | ||
38 | #else | ||
39 | #define CMPXCHG cmpxchg64 | ||
40 | #define PT_MAX_FULL_LEVELS 2 | ||
41 | #endif | ||
42 | #elif PTTYPE == 32 | ||
43 | #define pt_element_t u32 | ||
44 | #define guest_walker guest_walker32 | ||
45 | #define FNAME(name) paging##32_##name | ||
46 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK | ||
47 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK | ||
48 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | ||
49 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
50 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | ||
51 | #define PT_LEVEL_BITS PT32_LEVEL_BITS | ||
52 | #define PT_MAX_FULL_LEVELS 2 | ||
53 | #define CMPXCHG cmpxchg | ||
54 | #else | ||
55 | #error Invalid PTTYPE value | ||
56 | #endif | ||
57 | |||
58 | #define gpte_to_gfn FNAME(gpte_to_gfn) | ||
59 | #define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) | ||
60 | |||
61 | /* | ||
62 | * The guest_walker structure emulates the behavior of the hardware page | ||
63 | * table walker. | ||
64 | */ | ||
65 | struct guest_walker { | ||
66 | int level; | ||
67 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; | ||
68 | pt_element_t ptes[PT_MAX_FULL_LEVELS]; | ||
69 | gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; | ||
70 | unsigned pt_access; | ||
71 | unsigned pte_access; | ||
72 | gfn_t gfn; | ||
73 | u32 error_code; | ||
74 | }; | ||
75 | |||
76 | static gfn_t gpte_to_gfn(pt_element_t gpte) | ||
77 | { | ||
78 | return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
79 | } | ||
80 | |||
81 | static gfn_t gpte_to_gfn_pde(pt_element_t gpte) | ||
82 | { | ||
83 | return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
84 | } | ||
85 | |||
86 | static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, | ||
87 | gfn_t table_gfn, unsigned index, | ||
88 | pt_element_t orig_pte, pt_element_t new_pte) | ||
89 | { | ||
90 | pt_element_t ret; | ||
91 | pt_element_t *table; | ||
92 | struct page *page; | ||
93 | |||
94 | page = gfn_to_page(kvm, table_gfn); | ||
95 | table = kmap_atomic(page, KM_USER0); | ||
96 | |||
97 | ret = CMPXCHG(&table[index], orig_pte, new_pte); | ||
98 | |||
99 | kunmap_atomic(table, KM_USER0); | ||
100 | |||
101 | kvm_release_page_dirty(page); | ||
102 | |||
103 | return (ret != orig_pte); | ||
104 | } | ||
105 | |||
106 | static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) | ||
107 | { | ||
108 | unsigned access; | ||
109 | |||
110 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; | ||
111 | #if PTTYPE == 64 | ||
112 | if (is_nx(vcpu)) | ||
113 | access &= ~(gpte >> PT64_NX_SHIFT); | ||
114 | #endif | ||
115 | return access; | ||
116 | } | ||
117 | |||
118 | /* | ||
119 | * Fetch a guest pte for a guest virtual address | ||
120 | */ | ||
121 | static int FNAME(walk_addr)(struct guest_walker *walker, | ||
122 | struct kvm_vcpu *vcpu, gva_t addr, | ||
123 | int write_fault, int user_fault, int fetch_fault) | ||
124 | { | ||
125 | pt_element_t pte; | ||
126 | gfn_t table_gfn; | ||
127 | unsigned index, pt_access, pte_access; | ||
128 | gpa_t pte_gpa; | ||
129 | |||
130 | pgprintk("%s: addr %lx\n", __FUNCTION__, addr); | ||
131 | walk: | ||
132 | walker->level = vcpu->arch.mmu.root_level; | ||
133 | pte = vcpu->arch.cr3; | ||
134 | #if PTTYPE == 64 | ||
135 | if (!is_long_mode(vcpu)) { | ||
136 | pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; | ||
137 | if (!is_present_pte(pte)) | ||
138 | goto not_present; | ||
139 | --walker->level; | ||
140 | } | ||
141 | #endif | ||
142 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || | ||
143 | (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); | ||
144 | |||
145 | pt_access = ACC_ALL; | ||
146 | |||
147 | for (;;) { | ||
148 | index = PT_INDEX(addr, walker->level); | ||
149 | |||
150 | table_gfn = gpte_to_gfn(pte); | ||
151 | pte_gpa = gfn_to_gpa(table_gfn); | ||
152 | pte_gpa += index * sizeof(pt_element_t); | ||
153 | walker->table_gfn[walker->level - 1] = table_gfn; | ||
154 | walker->pte_gpa[walker->level - 1] = pte_gpa; | ||
155 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
156 | walker->level - 1, table_gfn); | ||
157 | |||
158 | kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); | ||
159 | |||
160 | if (!is_present_pte(pte)) | ||
161 | goto not_present; | ||
162 | |||
163 | if (write_fault && !is_writeble_pte(pte)) | ||
164 | if (user_fault || is_write_protection(vcpu)) | ||
165 | goto access_error; | ||
166 | |||
167 | if (user_fault && !(pte & PT_USER_MASK)) | ||
168 | goto access_error; | ||
169 | |||
170 | #if PTTYPE == 64 | ||
171 | if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) | ||
172 | goto access_error; | ||
173 | #endif | ||
174 | |||
175 | if (!(pte & PT_ACCESSED_MASK)) { | ||
176 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
177 | if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, | ||
178 | index, pte, pte|PT_ACCESSED_MASK)) | ||
179 | goto walk; | ||
180 | pte |= PT_ACCESSED_MASK; | ||
181 | } | ||
182 | |||
183 | pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); | ||
184 | |||
185 | walker->ptes[walker->level - 1] = pte; | ||
186 | |||
187 | if (walker->level == PT_PAGE_TABLE_LEVEL) { | ||
188 | walker->gfn = gpte_to_gfn(pte); | ||
189 | break; | ||
190 | } | ||
191 | |||
192 | if (walker->level == PT_DIRECTORY_LEVEL | ||
193 | && (pte & PT_PAGE_SIZE_MASK) | ||
194 | && (PTTYPE == 64 || is_pse(vcpu))) { | ||
195 | walker->gfn = gpte_to_gfn_pde(pte); | ||
196 | walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); | ||
197 | if (PTTYPE == 32 && is_cpuid_PSE36()) | ||
198 | walker->gfn += pse36_gfn_delta(pte); | ||
199 | break; | ||
200 | } | ||
201 | |||
202 | pt_access = pte_access; | ||
203 | --walker->level; | ||
204 | } | ||
205 | |||
206 | if (write_fault && !is_dirty_pte(pte)) { | ||
207 | bool ret; | ||
208 | |||
209 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
210 | ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, | ||
211 | pte|PT_DIRTY_MASK); | ||
212 | if (ret) | ||
213 | goto walk; | ||
214 | pte |= PT_DIRTY_MASK; | ||
215 | kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); | ||
216 | walker->ptes[walker->level - 1] = pte; | ||
217 | } | ||
218 | |||
219 | walker->pt_access = pt_access; | ||
220 | walker->pte_access = pte_access; | ||
221 | pgprintk("%s: pte %llx pte_access %x pt_access %x\n", | ||
222 | __FUNCTION__, (u64)pte, pt_access, pte_access); | ||
223 | return 1; | ||
224 | |||
225 | not_present: | ||
226 | walker->error_code = 0; | ||
227 | goto err; | ||
228 | |||
229 | access_error: | ||
230 | walker->error_code = PFERR_PRESENT_MASK; | ||
231 | |||
232 | err: | ||
233 | if (write_fault) | ||
234 | walker->error_code |= PFERR_WRITE_MASK; | ||
235 | if (user_fault) | ||
236 | walker->error_code |= PFERR_USER_MASK; | ||
237 | if (fetch_fault) | ||
238 | walker->error_code |= PFERR_FETCH_MASK; | ||
239 | return 0; | ||
240 | } | ||
241 | |||
242 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | ||
243 | u64 *spte, const void *pte, int bytes, | ||
244 | int offset_in_pte) | ||
245 | { | ||
246 | pt_element_t gpte; | ||
247 | unsigned pte_access; | ||
248 | |||
249 | gpte = *(const pt_element_t *)pte; | ||
250 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { | ||
251 | if (!offset_in_pte && !is_present_pte(gpte)) | ||
252 | set_shadow_pte(spte, shadow_notrap_nonpresent_pte); | ||
253 | return; | ||
254 | } | ||
255 | if (bytes < sizeof(pt_element_t)) | ||
256 | return; | ||
257 | pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); | ||
258 | pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); | ||
259 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, | ||
260 | gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte)); | ||
261 | } | ||
262 | |||
263 | /* | ||
264 | * Fetch a shadow pte for a specific level in the paging hierarchy. | ||
265 | */ | ||
266 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | ||
267 | struct guest_walker *walker, | ||
268 | int user_fault, int write_fault, int *ptwrite) | ||
269 | { | ||
270 | hpa_t shadow_addr; | ||
271 | int level; | ||
272 | u64 *shadow_ent; | ||
273 | unsigned access = walker->pt_access; | ||
274 | |||
275 | if (!is_present_pte(walker->ptes[walker->level - 1])) | ||
276 | return NULL; | ||
277 | |||
278 | shadow_addr = vcpu->arch.mmu.root_hpa; | ||
279 | level = vcpu->arch.mmu.shadow_root_level; | ||
280 | if (level == PT32E_ROOT_LEVEL) { | ||
281 | shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | ||
282 | shadow_addr &= PT64_BASE_ADDR_MASK; | ||
283 | --level; | ||
284 | } | ||
285 | |||
286 | for (; ; level--) { | ||
287 | u32 index = SHADOW_PT_INDEX(addr, level); | ||
288 | struct kvm_mmu_page *shadow_page; | ||
289 | u64 shadow_pte; | ||
290 | int metaphysical; | ||
291 | gfn_t table_gfn; | ||
292 | bool new_page = 0; | ||
293 | |||
294 | shadow_ent = ((u64 *)__va(shadow_addr)) + index; | ||
295 | if (is_shadow_present_pte(*shadow_ent)) { | ||
296 | if (level == PT_PAGE_TABLE_LEVEL) | ||
297 | break; | ||
298 | shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; | ||
299 | continue; | ||
300 | } | ||
301 | |||
302 | if (level == PT_PAGE_TABLE_LEVEL) | ||
303 | break; | ||
304 | |||
305 | if (level - 1 == PT_PAGE_TABLE_LEVEL | ||
306 | && walker->level == PT_DIRECTORY_LEVEL) { | ||
307 | metaphysical = 1; | ||
308 | if (!is_dirty_pte(walker->ptes[level - 1])) | ||
309 | access &= ~ACC_WRITE_MASK; | ||
310 | table_gfn = gpte_to_gfn(walker->ptes[level - 1]); | ||
311 | } else { | ||
312 | metaphysical = 0; | ||
313 | table_gfn = walker->table_gfn[level - 2]; | ||
314 | } | ||
315 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, | ||
316 | metaphysical, access, | ||
317 | shadow_ent, &new_page); | ||
318 | if (new_page && !metaphysical) { | ||
319 | pt_element_t curr_pte; | ||
320 | kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2], | ||
321 | &curr_pte, sizeof(curr_pte)); | ||
322 | if (curr_pte != walker->ptes[level - 2]) | ||
323 | return NULL; | ||
324 | } | ||
325 | shadow_addr = __pa(shadow_page->spt); | ||
326 | shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | ||
327 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
328 | *shadow_ent = shadow_pte; | ||
329 | } | ||
330 | |||
331 | mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, | ||
332 | user_fault, write_fault, | ||
333 | walker->ptes[walker->level-1] & PT_DIRTY_MASK, | ||
334 | ptwrite, walker->gfn); | ||
335 | |||
336 | return shadow_ent; | ||
337 | } | ||
338 | |||
339 | /* | ||
340 | * Page fault handler. There are several causes for a page fault: | ||
341 | * - there is no shadow pte for the guest pte | ||
342 | * - write access through a shadow pte marked read only so that we can set | ||
343 | * the dirty bit | ||
344 | * - write access to a shadow pte marked read only so we can update the page | ||
345 | * dirty bitmap, when userspace requests it | ||
346 | * - mmio access; in this case we will never install a present shadow pte | ||
347 | * - normal guest page fault due to the guest pte marked not present, not | ||
348 | * writable, or not executable | ||
349 | * | ||
350 | * Returns: 1 if we need to emulate the instruction, 0 otherwise, or | ||
351 | * a negative value on error. | ||
352 | */ | ||
353 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | ||
354 | u32 error_code) | ||
355 | { | ||
356 | int write_fault = error_code & PFERR_WRITE_MASK; | ||
357 | int user_fault = error_code & PFERR_USER_MASK; | ||
358 | int fetch_fault = error_code & PFERR_FETCH_MASK; | ||
359 | struct guest_walker walker; | ||
360 | u64 *shadow_pte; | ||
361 | int write_pt = 0; | ||
362 | int r; | ||
363 | |||
364 | pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); | ||
365 | kvm_mmu_audit(vcpu, "pre page fault"); | ||
366 | |||
367 | r = mmu_topup_memory_caches(vcpu); | ||
368 | if (r) | ||
369 | return r; | ||
370 | |||
371 | /* | ||
372 | * Look up the shadow pte for the faulting address. | ||
373 | */ | ||
374 | r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, | ||
375 | fetch_fault); | ||
376 | |||
377 | /* | ||
378 | * The page is not mapped by the guest. Let the guest handle it. | ||
379 | */ | ||
380 | if (!r) { | ||
381 | pgprintk("%s: guest page fault\n", __FUNCTION__); | ||
382 | inject_page_fault(vcpu, addr, walker.error_code); | ||
383 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | ||
384 | return 0; | ||
385 | } | ||
386 | |||
387 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | ||
388 | &write_pt); | ||
389 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, | ||
390 | shadow_pte, *shadow_pte, write_pt); | ||
391 | |||
392 | if (!write_pt) | ||
393 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | ||
394 | |||
395 | /* | ||
396 | * mmio: emulate if accessible, otherwise its a guest fault. | ||
397 | */ | ||
398 | if (shadow_pte && is_io_pte(*shadow_pte)) | ||
399 | return 1; | ||
400 | |||
401 | ++vcpu->stat.pf_fixed; | ||
402 | kvm_mmu_audit(vcpu, "post page fault (fixed)"); | ||
403 | |||
404 | return write_pt; | ||
405 | } | ||
406 | |||
407 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
408 | { | ||
409 | struct guest_walker walker; | ||
410 | gpa_t gpa = UNMAPPED_GVA; | ||
411 | int r; | ||
412 | |||
413 | r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); | ||
414 | |||
415 | if (r) { | ||
416 | gpa = gfn_to_gpa(walker.gfn); | ||
417 | gpa |= vaddr & ~PAGE_MASK; | ||
418 | } | ||
419 | |||
420 | return gpa; | ||
421 | } | ||
422 | |||
423 | static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | ||
424 | struct kvm_mmu_page *sp) | ||
425 | { | ||
426 | int i, offset = 0; | ||
427 | pt_element_t *gpt; | ||
428 | struct page *page; | ||
429 | |||
430 | if (sp->role.metaphysical | ||
431 | || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { | ||
432 | nonpaging_prefetch_page(vcpu, sp); | ||
433 | return; | ||
434 | } | ||
435 | |||
436 | if (PTTYPE == 32) | ||
437 | offset = sp->role.quadrant << PT64_LEVEL_BITS; | ||
438 | page = gfn_to_page(vcpu->kvm, sp->gfn); | ||
439 | gpt = kmap_atomic(page, KM_USER0); | ||
440 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
441 | if (is_present_pte(gpt[offset + i])) | ||
442 | sp->spt[i] = shadow_trap_nonpresent_pte; | ||
443 | else | ||
444 | sp->spt[i] = shadow_notrap_nonpresent_pte; | ||
445 | kunmap_atomic(gpt, KM_USER0); | ||
446 | kvm_release_page_clean(page); | ||
447 | } | ||
448 | |||
449 | #undef pt_element_t | ||
450 | #undef guest_walker | ||
451 | #undef FNAME | ||
452 | #undef PT_BASE_ADDR_MASK | ||
453 | #undef PT_INDEX | ||
454 | #undef SHADOW_PT_INDEX | ||
455 | #undef PT_LEVEL_MASK | ||
456 | #undef PT_DIR_BASE_ADDR_MASK | ||
457 | #undef PT_LEVEL_BITS | ||
458 | #undef PT_MAX_FULL_LEVELS | ||
459 | #undef gpte_to_gfn | ||
460 | #undef gpte_to_gfn_pde | ||
461 | #undef CMPXCHG | ||