| author | Jeremy Fitzhardinge <jeremy@xensource.com> | 2007-07-17 21:37:05 -0400 |
|---|---|---|
| committer | Jeremy Fitzhardinge <jeremy@goop.org> | 2007-07-18 11:47:43 -0400 |
| commit | f4f97b3ea90130520afb478cbc2918be2b6587b8 (patch) | |
| tree | 1aeebe3230b4a7eef0630eec148927c1adf340a5 /arch/i386/xen/mmu.c | |
| parent | c85b04c3749507546f6d5868976e4793e35c2ec0 (diff) | |
xen: Complete pagetable pinning
Xen requires all active pagetables to be marked read-only. When the
base of the pagetable is loaded into %cr3, the hypervisor validates
the entire pagetable and only allows the load to proceed if it all
checks out.
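For reference, a cr3 load under Xen is not a plain register write but an
mmuext hypercall, roughly along the lines of the sketch below (an
illustration of the interface, not code from this patch; the function
name is made up):

	/* Simplified sketch: hand the new pagetable base to Xen, which
	 * validates every entry before making it the active pagetable. */
	static void xen_write_cr3_sketch(unsigned long cr3)
	{
		struct mmuext_op op;

		op.cmd = MMUEXT_NEW_BASEPTR;
		op.arg1.mfn = pfn_to_mfn(PFN_DOWN(cr3));

		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
			BUG();
	}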
This is pretty slow, so to mitigate this cost Xen has a notion of
pinned pagetables. Pinned pagetables are pagetables which are
considered to be active even if no processor's cr3 is pointing to it.
This means that the pagetable must remain read-only, and all updates to
it are validated by the hypervisor.  This makes context switches much
cheaper, because the hypervisor doesn't need to revalidate the
pagetable each time.
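In terms of the hypercall interface, pinning a (non-PAE) pagetable
amounts to something like the following sketch, which mirrors the
pre-multicall xen_pgd_pin() that this patch replaces with a batched
version (the helper name here is illustrative):

	/* Sketch: pin a 2-level pagetable so Xen treats it as active.
	 * The code in this patch issues the same op via the multicall API. */
	static void pin_pgd_sketch(pgd_t *pgd)
	{
		struct mmuext_op op;

		op.cmd = MMUEXT_PIN_L2_TABLE;	/* MMUEXT_PIN_L3_TABLE under PAE */
		op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));

		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
			BUG();
	}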
This also adds a new paravirt hook which is called during setup once
the zones and memory allocator have been initialized. When the
init_mm pagetable is first built, the struct page array does not yet
exist, and so there's nowhere to put the init_mm pagetable's PG_pinned
flags. Once the zones are initialized and the struct page array
exists, we can set the PG_pinned flags for those pages.
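Concretely, that fix-up is just a walk of init_mm's pagetable setting
PG_pinned on every pagetable page; this is essentially what the new
xen_mark_init_mm_pinned() added in this patch does:

	/* Walk init_mm's pagetable and record that each pagetable page
	 * is already pinned by Xen. */
	static __init int mark_pinned(struct page *page, unsigned flags)
	{
		SetPagePinned(page);
		return 0;
	}

	void __init xen_mark_init_mm_pinned(void)
	{
		pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
	}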
This patch also adds Xen support for pte pages allocated out of
highmem (highpte) by implementing xen_kmap_atomic_pte.
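xen_kmap_atomic_pte itself lands outside this diffstat (which is
limited to mmu.c), but the idea is roughly the sketch below: when the
pte page belongs to a pinned pagetable, the kmap alias must be
read-only so it doesn't violate Xen's write-protection.  Treat this as
an illustration rather than the exact code added elsewhere in the
series:

	/* Rough sketch (not from mmu.c): kmap a pte page, using a read-only
	 * mapping if the page belongs to a pinned pagetable. */
	static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
	{
		pgprot_t prot = PAGE_KERNEL;

		if (PagePinned(page))
			prot = PAGE_KERNEL_RO;

		return kmap_atomic_prot(page, type, prot);
	}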
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Zach Amsden <zach@vmware.com>
Diffstat (limited to 'arch/i386/xen/mmu.c')
-rw-r--r--	arch/i386/xen/mmu.c	260
1 file changed, 170 insertions(+), 90 deletions(-)
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
index de16cb5f55ca..53501ce2d15c 100644
--- a/arch/i386/xen/mmu.c
+++ b/arch/i386/xen/mmu.c
@@ -38,19 +38,22 @@
  *
  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  */
+#include <linux/highmem.h>
 #include <linux/bug.h>
 #include <linux/sched.h>

 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/paravirt.h>

 #include <asm/xen/hypercall.h>
-#include <asm/paravirt.h>
+#include <asm/xen/hypervisor.h>

 #include <xen/page.h>
 #include <xen/interface/xen.h>

+#include "multicalls.h"
 #include "mmu.h"

 xmaddr_t arbitrary_virt_to_machine(unsigned long address)
@@ -92,16 +95,6 @@ void make_lowmem_page_readwrite(void *vaddr)
 }


-void xen_set_pte(pte_t *ptep, pte_t pte)
-{
-	struct mmu_update u;
-
-	u.ptr = virt_to_machine(ptep).maddr;
-	u.val = pte_val_ma(pte);
-	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
-		BUG();
-}
-
 void xen_set_pmd(pmd_t *ptr, pmd_t val)
 {
 	struct mmu_update u;
@@ -112,18 +105,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
 		BUG();
 }

-#ifdef CONFIG_X86_PAE
-void xen_set_pud(pud_t *ptr, pud_t val)
-{
-	struct mmu_update u;
-
-	u.ptr = virt_to_machine(ptr).maddr;
-	u.val = pud_val_ma(val);
-	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
-		BUG();
-}
-#endif
-
 /*
  * Associate a virtual page frame with a given physical page frame
  * and protection flags for that frame.
@@ -170,6 +151,23 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 }

 #ifdef CONFIG_X86_PAE
+void xen_set_pud(pud_t *ptr, pud_t val)
+{
+	struct mmu_update u;
+
+	u.ptr = virt_to_machine(ptr).maddr;
+	u.val = pud_val_ma(val);
+	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
+		BUG();
+}
+
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+	ptep->pte_high = pte.pte_high;
+	smp_wmb();
+	ptep->pte_low = pte.pte_low;
+}
+
 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
 	set_64bit((u64 *)ptep, pte_val_ma(pte));
@@ -239,6 +237,11 @@ pgd_t xen_make_pgd(unsigned long long pgd)
 	return (pgd_t){ pgd };
 }
 #else /* !PAE */
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+	*ptep = pte;
+}
+
 unsigned long xen_pte_val(pte_t pte)
 {
 	unsigned long ret = pte.pte_low;
@@ -249,13 +252,6 @@ unsigned long xen_pte_val(pte_t pte)
 	return ret;
 }

-unsigned long xen_pmd_val(pmd_t pmd)
-{
-	/* a BUG here is a lot easier to track down than a NULL eip */
-	BUG();
-	return 0;
-}
-
 unsigned long xen_pgd_val(pgd_t pgd)
 {
 	unsigned long ret = pgd.pgd;
@@ -272,13 +268,6 @@ pte_t xen_make_pte(unsigned long pte)
 	return (pte_t){ pte };
 }

-pmd_t xen_make_pmd(unsigned long pmd)
-{
-	/* a BUG here is a lot easier to track down than a NULL eip */
-	BUG();
-	return __pmd(0);
-}
-
 pgd_t xen_make_pgd(unsigned long pgd)
 {
 	if (pgd & _PAGE_PRESENT)
@@ -290,108 +279,199 @@ pgd_t xen_make_pgd(unsigned long pgd)



-static void pgd_walk_set_prot(void *pt, pgprot_t flags)
-{
-	unsigned long pfn = PFN_DOWN(__pa(pt));
-
-	if (HYPERVISOR_update_va_mapping((unsigned long)pt,
-					 pfn_pte(pfn, flags), 0) < 0)
-		BUG();
-}
-
-static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
+/*
+  (Yet another) pagetable walker.  This one is intended for pinning a
+  pagetable.  This means that it walks a pagetable and calls the
+  callback function on each page it finds making up the page table,
+  at every level.  It walks the entire pagetable, but it only bothers
+  pinning pte pages which are below pte_limit.  In the normal case
+  this will be TASK_SIZE, but at boot we need to pin up to
+  FIXADDR_TOP.  But the important bit is that we don't pin beyond
+  there, because then we start getting into Xen's ptes.
+*/
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+		    unsigned long limit)
 {
 	pgd_t *pgd = pgd_base;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	int g, u, m;
+	int flush = 0;
+	unsigned long addr = 0;
+	unsigned long pgd_next;
+
+	BUG_ON(limit > FIXADDR_TOP);

 	if (xen_feature(XENFEAT_auto_translated_physmap))
-		return;
+		return 0;
+
+	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+		pud_t *pud;
+		unsigned long pud_limit, pud_next;

-	for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
-		if (pgd_none(*pgd))
+		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
+
+		if (!pgd_val(*pgd))
 			continue;
+
 		pud = pud_offset(pgd, 0);

 		if (PTRS_PER_PUD > 1) /* not folded */
-			pgd_walk_set_prot(pud, flags);
+			flush |= (*func)(virt_to_page(pud), 0);
+
+		for (; addr != pud_limit; pud++, addr = pud_next) {
+			pmd_t *pmd;
+			unsigned long pmd_limit;
+
+			pud_next = pud_addr_end(addr, pud_limit);
+
+			if (pud_next < limit)
+				pmd_limit = pud_next;
+			else
+				pmd_limit = limit;

-		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
 			if (pud_none(*pud))
 				continue;
+
 			pmd = pmd_offset(pud, 0);

 			if (PTRS_PER_PMD > 1) /* not folded */
-				pgd_walk_set_prot(pmd, flags);
+				flush |= (*func)(virt_to_page(pmd), 0);
+
+			for (; addr != pmd_limit; pmd++) {
+				addr += (PAGE_SIZE * PTRS_PER_PTE);
+				if ((pmd_limit-1) < (addr-1)) {
+					addr = pmd_limit;
+					break;
+				}

-			for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
 				if (pmd_none(*pmd))
 					continue;

-				/* This can get called before mem_map
-				   is set up, so we assume nothing is
-				   highmem at that point. */
-				if (mem_map == NULL ||
-				    !PageHighMem(pmd_page(*pmd))) {
-					pte = pte_offset_kernel(pmd, 0);
-					pgd_walk_set_prot(pte, flags);
-				}
+				flush |= (*func)(pmd_page(*pmd), 0);
 			}
 		}
 	}

-	if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
-					 pfn_pte(PFN_DOWN(__pa(pgd_base)),
-						 flags),
-					 UVMF_TLB_FLUSH) < 0)
-		BUG();
+	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+
+	return flush;
 }

+static int pin_page(struct page *page, unsigned flags)
+{
+	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
+	int flush;
+
+	if (pgfl)
+		flush = 0;		/* already pinned */
+	else if (PageHighMem(page))
+		/* kmaps need flushing if we found an unpinned
+		   highpage */
+		flush = 1;
+	else {
+		void *pt = lowmem_page_address(page);
+		unsigned long pfn = page_to_pfn(page);
+		struct multicall_space mcs = __xen_mc_entry(0);
+
+		flush = 0;
+
+		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+					pfn_pte(pfn, PAGE_KERNEL_RO),
+					flags);
+	}
+
+	return flush;
+}

-/* This is called just after a mm has been duplicated from its parent,
-   but it has not been used yet.  We need to make sure that its
-   pagetable is all read-only, and can be pinned. */
+/* This is called just after a mm has been created, but it has not
+   been used yet.  We need to make sure that its pagetable is all
+   read-only, and can be pinned. */
 void xen_pgd_pin(pgd_t *pgd)
 {
-	struct mmuext_op op;
+	struct multicall_space mcs;
+	struct mmuext_op *op;

-	pgd_walk(pgd, PAGE_KERNEL_RO);
+	xen_mc_batch();

-#if defined(CONFIG_X86_PAE)
-	op.cmd = MMUEXT_PIN_L3_TABLE;
+	if (pgd_walk(pgd, pin_page, TASK_SIZE))
+		kmap_flush_unused();
+
+	mcs = __xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+
+#ifdef CONFIG_X86_PAE
+	op->cmd = MMUEXT_PIN_L3_TABLE;
 #else
-	op.cmd = MMUEXT_PIN_L2_TABLE;
+	op->cmd = MMUEXT_PIN_L2_TABLE;
 #endif
-	op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
-		BUG();
+	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(0);
 }

-/* Release a pagetables pages back as normal RW */
-void xen_pgd_unpin(pgd_t *pgd)
+/* The init_mm pagetable is really pinned as soon as its created, but
+   that's before we have page structures to store the bits.  So do all
+   the book-keeping now. */
+static __init int mark_pinned(struct page *page, unsigned flags)
 {
-	struct mmuext_op op;
+	SetPagePinned(page);
+	return 0;
+}

-	op.cmd = MMUEXT_UNPIN_TABLE;
-	op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+void __init xen_mark_init_mm_pinned(void)
+{
+	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
+}

-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
-		BUG();
+static int unpin_page(struct page *page, unsigned flags)
+{
+	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);

-	pgd_walk(pgd, PAGE_KERNEL);
+	if (pgfl && !PageHighMem(page)) {
+		void *pt = lowmem_page_address(page);
+		unsigned long pfn = page_to_pfn(page);
+		struct multicall_space mcs = __xen_mc_entry(0);
+
+		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+					pfn_pte(pfn, PAGE_KERNEL),
+					flags);
+	}
+
+	return 0;		/* never need to flush on unpin */
 }

+/* Release a pagetables pages back as normal RW */
+static void xen_pgd_unpin(pgd_t *pgd)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	xen_mc_batch();
+
+	mcs = __xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = MMUEXT_UNPIN_TABLE;
+	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	pgd_walk(pgd, unpin_page, TASK_SIZE);
+
+	xen_mc_issue(0);
+}

 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
+	spin_lock(&next->page_table_lock);
 	xen_pgd_pin(next->pgd);
+	spin_unlock(&next->page_table_lock);
 }

 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
+	spin_lock(&mm->page_table_lock);
 	xen_pgd_pin(mm->pgd);
+	spin_unlock(&mm->page_table_lock);
 }

 void xen_exit_mmap(struct mm_struct *mm)