path: root/arch/i386/xen/mmu.c
author	Jeremy Fitzhardinge <jeremy@xensource.com>	2007-07-17 21:37:05 -0400
committer	Jeremy Fitzhardinge <jeremy@goop.org>	2007-07-18 11:47:43 -0400
commit	f4f97b3ea90130520afb478cbc2918be2b6587b8 (patch)
tree	1aeebe3230b4a7eef0630eec148927c1adf340a5 /arch/i386/xen/mmu.c
parent	c85b04c3749507546f6d5868976e4793e35c2ec0 (diff)
xen: Complete pagetable pinning
Xen requires all active pagetables to be marked read-only.  When the base of the
pagetable is loaded into %cr3, the hypervisor validates the entire pagetable and
only allows the load to proceed if it all checks out.

This is pretty slow, so to mitigate this cost Xen has a notion of pinned
pagetables.  Pinned pagetables are pagetables which are considered to be active
even if no processor's cr3 is pointing to it.  This means that it must remain
read-only and all updates are validated by the hypervisor.  This makes context
switches much cheaper, because the hypervisor doesn't need to revalidate the
pagetable each time.

This also adds a new paravirt hook which is called during setup once the zones
and memory allocator have been initialized.  When the init_mm pagetable is first
built, the struct page array does not yet exist, and so there's nowhere to put
the init_mm pagetable's PG_pinned flags.  Once the zones are initialized and the
struct page array exists, we can set the PG_pinned flags for those pages.

This patch also adds the Xen support for pte pages allocated out of highmem
(highpte) by implementing xen_kmap_atomic_pte.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Zach Amsden <zach@vmware.com>
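As a rough illustration of the pinning operation described above, here is a
minimal, unbatched sketch using only the calls visible in the diff below (the
helper name pin_pgd_sketch is hypothetical; the patch itself routes the
operation through the multicall machinery in xen_pgd_pin):

	/* Hypothetical helper: pin a pagetable root so Xen validates it once
	   and then keeps every page making it up read-only. */
	static void pin_pgd_sketch(pgd_t *pgd)
	{
		struct mmuext_op op;

		op.cmd = MMUEXT_PIN_L2_TABLE;	/* MMUEXT_PIN_L3_TABLE under PAE */
		op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));

		/* The hypervisor walks and validates the whole pagetable here;
		   afterwards all updates to it go through Xen. */
		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
			BUG();
	}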
Diffstat (limited to 'arch/i386/xen/mmu.c')
-rw-r--r--	arch/i386/xen/mmu.c	260
1 files changed, 170 insertions, 90 deletions
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
index de16cb5f55ca..53501ce2d15c 100644
--- a/arch/i386/xen/mmu.c
+++ b/arch/i386/xen/mmu.c
@@ -38,19 +38,22 @@
  *
  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  */
+#include <linux/highmem.h>
 #include <linux/bug.h>
 #include <linux/sched.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/paravirt.h>
 
 #include <asm/xen/hypercall.h>
-#include <asm/paravirt.h>
+#include <asm/xen/hypervisor.h>
 
 #include <xen/page.h>
 #include <xen/interface/xen.h>
 
+#include "multicalls.h"
 #include "mmu.h"
 
 xmaddr_t arbitrary_virt_to_machine(unsigned long address)
@@ -92,16 +95,6 @@ void make_lowmem_page_readwrite(void *vaddr)
 }
 
 
-void xen_set_pte(pte_t *ptep, pte_t pte)
-{
-	struct mmu_update u;
-
-	u.ptr = virt_to_machine(ptep).maddr;
-	u.val = pte_val_ma(pte);
-	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
-		BUG();
-}
-
 void xen_set_pmd(pmd_t *ptr, pmd_t val)
 {
 	struct mmu_update u;
@@ -112,18 +105,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
 		BUG();
 }
 
-#ifdef CONFIG_X86_PAE
-void xen_set_pud(pud_t *ptr, pud_t val)
-{
-	struct mmu_update u;
-
-	u.ptr = virt_to_machine(ptr).maddr;
-	u.val = pud_val_ma(val);
-	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
-		BUG();
-}
-#endif
-
 /*
  * Associate a virtual page frame with a given physical page frame
  * and protection flags for that frame.
@@ -170,6 +151,23 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 }
 
 #ifdef CONFIG_X86_PAE
+void xen_set_pud(pud_t *ptr, pud_t val)
+{
+	struct mmu_update u;
+
+	u.ptr = virt_to_machine(ptr).maddr;
+	u.val = pud_val_ma(val);
+	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
+		BUG();
+}
+
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+	ptep->pte_high = pte.pte_high;
+	smp_wmb();
+	ptep->pte_low = pte.pte_low;
+}
+
 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
 	set_64bit((u64 *)ptep, pte_val_ma(pte));
@@ -239,6 +237,11 @@ pgd_t xen_make_pgd(unsigned long long pgd)
 	return (pgd_t){ pgd };
 }
 #else /* !PAE */
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+	*ptep = pte;
+}
+
 unsigned long xen_pte_val(pte_t pte)
 {
 	unsigned long ret = pte.pte_low;
@@ -249,13 +252,6 @@ unsigned long xen_pte_val(pte_t pte)
 	return ret;
 }
 
-unsigned long xen_pmd_val(pmd_t pmd)
-{
-	/* a BUG here is a lot easier to track down than a NULL eip */
-	BUG();
-	return 0;
-}
-
 unsigned long xen_pgd_val(pgd_t pgd)
 {
 	unsigned long ret = pgd.pgd;
@@ -272,13 +268,6 @@ pte_t xen_make_pte(unsigned long pte)
 	return (pte_t){ pte };
 }
 
-pmd_t xen_make_pmd(unsigned long pmd)
-{
-	/* a BUG here is a lot easier to track down than a NULL eip */
-	BUG();
-	return __pmd(0);
-}
-
 pgd_t xen_make_pgd(unsigned long pgd)
 {
 	if (pgd & _PAGE_PRESENT)
@@ -290,108 +279,199 @@ pgd_t xen_make_pgd(unsigned long pgd)
 
 
 
-static void pgd_walk_set_prot(void *pt, pgprot_t flags)
-{
-	unsigned long pfn = PFN_DOWN(__pa(pt));
-
-	if (HYPERVISOR_update_va_mapping((unsigned long)pt,
-					 pfn_pte(pfn, flags), 0) < 0)
-		BUG();
-}
-
-static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
+/*
+  (Yet another) pagetable walker.  This one is intended for pinning a
+  pagetable.  This means that it walks a pagetable and calls the
+  callback function on each page it finds making up the page table,
+  at every level.  It walks the entire pagetable, but it only bothers
+  pinning pte pages which are below pte_limit.  In the normal case
+  this will be TASK_SIZE, but at boot we need to pin up to
+  FIXADDR_TOP.  But the important bit is that we don't pin beyond
+  there, because then we start getting into Xen's ptes.
+*/
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+		    unsigned long limit)
 {
 	pgd_t *pgd = pgd_base;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	int g, u, m;
+	int flush = 0;
+	unsigned long addr = 0;
+	unsigned long pgd_next;
+
+	BUG_ON(limit > FIXADDR_TOP);
 
 	if (xen_feature(XENFEAT_auto_translated_physmap))
-		return;
+		return 0;
+
+	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+		pud_t *pud;
+		unsigned long pud_limit, pud_next;
 
-	for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
-		if (pgd_none(*pgd))
+		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
+
+		if (!pgd_val(*pgd))
 			continue;
+
 		pud = pud_offset(pgd, 0);
 
 		if (PTRS_PER_PUD > 1) /* not folded */
-			pgd_walk_set_prot(pud, flags);
+			flush |= (*func)(virt_to_page(pud), 0);
+
+		for (; addr != pud_limit; pud++, addr = pud_next) {
+			pmd_t *pmd;
+			unsigned long pmd_limit;
+
+			pud_next = pud_addr_end(addr, pud_limit);
+
+			if (pud_next < limit)
+				pmd_limit = pud_next;
+			else
+				pmd_limit = limit;
 
-		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
 			if (pud_none(*pud))
 				continue;
+
 			pmd = pmd_offset(pud, 0);
 
 			if (PTRS_PER_PMD > 1) /* not folded */
-				pgd_walk_set_prot(pmd, flags);
+				flush |= (*func)(virt_to_page(pmd), 0);
+
+			for (; addr != pmd_limit; pmd++) {
+				addr += (PAGE_SIZE * PTRS_PER_PTE);
+				if ((pmd_limit-1) < (addr-1)) {
+					addr = pmd_limit;
+					break;
+				}
 
-			for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
 				if (pmd_none(*pmd))
 					continue;
 
-				/* This can get called before mem_map
-				   is set up, so we assume nothing is
-				   highmem at that point. */
-				if (mem_map == NULL ||
-				    !PageHighMem(pmd_page(*pmd))) {
-					pte = pte_offset_kernel(pmd, 0);
-					pgd_walk_set_prot(pte, flags);
-				}
+				flush |= (*func)(pmd_page(*pmd), 0);
 			}
 		}
 	}
 
-	if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
-					 pfn_pte(PFN_DOWN(__pa(pgd_base)),
-						 flags),
-					 UVMF_TLB_FLUSH) < 0)
-		BUG();
+	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+
+	return flush;
 }
 
+static int pin_page(struct page *page, unsigned flags)
+{
+	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
+	int flush;
+
+	if (pgfl)
+		flush = 0;		/* already pinned */
+	else if (PageHighMem(page))
+		/* kmaps need flushing if we found an unpinned
+		   highpage */
+		flush = 1;
+	else {
+		void *pt = lowmem_page_address(page);
+		unsigned long pfn = page_to_pfn(page);
+		struct multicall_space mcs = __xen_mc_entry(0);
+
+		flush = 0;
+
+		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+					pfn_pte(pfn, PAGE_KERNEL_RO),
+					flags);
+	}
+
+	return flush;
+}
 
-/* This is called just after a mm has been duplicated from its parent,
-   but it has not been used yet.  We need to make sure that its
-   pagetable is all read-only, and can be pinned. */
+/* This is called just after a mm has been created, but it has not
+   been used yet.  We need to make sure that its pagetable is all
+   read-only, and can be pinned. */
 void xen_pgd_pin(pgd_t *pgd)
 {
-	struct mmuext_op op;
+	struct multicall_space mcs;
+	struct mmuext_op *op;
 
-	pgd_walk(pgd, PAGE_KERNEL_RO);
+	xen_mc_batch();
 
-#if defined(CONFIG_X86_PAE)
-	op.cmd = MMUEXT_PIN_L3_TABLE;
+	if (pgd_walk(pgd, pin_page, TASK_SIZE))
+		kmap_flush_unused();
+
+	mcs = __xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+
+#ifdef CONFIG_X86_PAE
+	op->cmd = MMUEXT_PIN_L3_TABLE;
 #else
-	op.cmd = MMUEXT_PIN_L2_TABLE;
+	op->cmd = MMUEXT_PIN_L2_TABLE;
 #endif
-	op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
-		BUG();
+	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(0);
 }
 
-/* Release a pagetables pages back as normal RW */
-void xen_pgd_unpin(pgd_t *pgd)
+/* The init_mm pagetable is really pinned as soon as its created, but
+   that's before we have page structures to store the bits.  So do all
+   the book-keeping now. */
+static __init int mark_pinned(struct page *page, unsigned flags)
 {
-	struct mmuext_op op;
+	SetPagePinned(page);
+	return 0;
+}
 
-	op.cmd = MMUEXT_UNPIN_TABLE;
-	op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+void __init xen_mark_init_mm_pinned(void)
+{
+	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
+}
 
-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
-		BUG();
+static int unpin_page(struct page *page, unsigned flags)
+{
+	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
 
-	pgd_walk(pgd, PAGE_KERNEL);
+	if (pgfl && !PageHighMem(page)) {
+		void *pt = lowmem_page_address(page);
+		unsigned long pfn = page_to_pfn(page);
+		struct multicall_space mcs = __xen_mc_entry(0);
+
+		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+					pfn_pte(pfn, PAGE_KERNEL),
+					flags);
+	}
+
+	return 0;		/* never need to flush on unpin */
 }
 
+/* Release a pagetables pages back as normal RW */
+static void xen_pgd_unpin(pgd_t *pgd)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	xen_mc_batch();
+
+	mcs = __xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = MMUEXT_UNPIN_TABLE;
+	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	pgd_walk(pgd, unpin_page, TASK_SIZE);
+
+	xen_mc_issue(0);
+}
 
 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
+	spin_lock(&next->page_table_lock);
 	xen_pgd_pin(next->pgd);
+	spin_unlock(&next->page_table_lock);
 }
 
 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
+	spin_lock(&mm->page_table_lock);
 	xen_pgd_pin(mm->pgd);
+	spin_unlock(&mm->page_table_lock);
 }
 
 void xen_exit_mmap(struct mm_struct *mm)