author	Jeremy Fitzhardinge <jeremy@xensource.com>	2007-10-16 14:51:30 -0400
committer	Jeremy Fitzhardinge <jeremy@goop.org>	2007-10-16 14:51:30 -0400
commit	74260714c56de4f967fcb2f17a8656bc574b75be (patch)
tree	f02bcd991285a20a543fae69f916577c8447b8f4
parent	9f79991d4186089e228274196413572cc000143b (diff)
xen: lock pte pages while pinning/unpinning
When a pagetable is created, it is made globally visible in the rmap prio tree before it is pinned via arch_dup_mmap(), and it remains in the rmap tree while it is unpinned with arch_exit_mmap(). This means that other CPUs may race with the pinning/unpinning process, and see a pte page in the window between when it is marked read-only and when it is actually pinned, causing any pte updates to fail with write-protect faults.

As a result, all pte pages must be properly locked, and only unlocked once the pinning/unpinning process has finished.

In order to avoid taking spinlocks for the whole pagetable, which may overflow the PREEMPT_BITS portion of the preempt counter, each pte page is locked and pinned individually, and then the whole pagetable is pinned last.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Hugh Dickens <hugh@veritas.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andi Kleen <ak@suse.de>
Cc: Keir Fraser <keir@xensource.com>
Cc: Jan Beulich <jbeulich@novell.com>
-rw-r--r--	arch/x86/xen/enlighten.c	 30
-rw-r--r--	arch/x86/xen/mmu.c	113
-rw-r--r--	mm/Kconfig	  1
3 files changed, 103 insertions(+), 41 deletions(-)
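For orientation, here is a minimal sketch of the per-pte-page flow this patch introduces, pieced together from the helpers it adds to arch/x86/xen/mmu.c (lock_pte, xen_do_pin, do_unlock, and the existing multicall batching). The wrapper function below is hypothetical and only for illustration; it is not part of the patch, and highmem and error handling are omitted.

/* Illustrative only: roughly what pin_page() does for a PT_PTE page
 * after this patch.  The helpers used here (lock_pte, xen_do_pin,
 * do_unlock, __xen_mc_entry, xen_mc_callback) are defined in
 * arch/x86/xen/mmu.c; this wrapper itself is hypothetical. */
static void pin_one_pte_page(struct page *page)
{
	void *pt = lowmem_page_address(page);
	unsigned long pfn = page_to_pfn(page);
	struct multicall_space mcs = __xen_mc_entry(0);
	spinlock_t *ptl;

	/* Take the split pte lock so no other CPU can update ptes in
	   this page while it is being made read-only and pinned. */
	ptl = lock_pte(page);

	/* Queue the read-only remapping and the L1 pin in the current
	   multicall batch; nothing is issued to the hypervisor yet. */
	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
				pfn_pte(pfn, PAGE_KERNEL_RO), 0);
	xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

	/* The pte lock is only dropped once the whole batch has been
	   issued, so updaters never see an RO-but-not-yet-pinned page. */
	if (ptl)
		xen_mc_callback(do_unlock, ptl);
}

Unpinning follows the mirror sequence in unpin_page(): unpin first, then restore the mapping to read-write, again under the pte lock.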
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index e334bf7cb327..4186cb6a7f5a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -666,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
 }
 
+static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op op;
+	op.cmd = level;
+	op.arg1.mfn = pfn_to_mfn(pfn);
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
 /* This needs to make sure the new pte page is pinned iff its being
    attached to a pinned pagetable. */
 static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
@@ -675,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
 	if (PagePinned(virt_to_page(mm->pgd))) {
 		SetPagePinned(page);
 
-		if (!PageHighMem(page))
+		if (!PageHighMem(page)) {
 			make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-		else
+			pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+		} else
 			/* make sure there are no stray mappings of
 			   this page */
 			kmap_flush_unused();
@@ -690,8 +700,10 @@ static void xen_release_pt(u32 pfn)
 	struct page *page = pfn_to_page(pfn);
 
 	if (PagePinned(page)) {
-		if (!PageHighMem(page))
+		if (!PageHighMem(page)) {
+			pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
 			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+		}
 	}
 }
 
@@ -806,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 	/* Actually pin the pagetable down, but we can't set PG_pinned
 	   yet because the page structures don't exist yet. */
 	{
-		struct mmuext_op op;
+		unsigned level;
+
 #ifdef CONFIG_X86_PAE
-		op.cmd = MMUEXT_PIN_L3_TABLE;
+		level = MMUEXT_PIN_L3_TABLE;
 #else
-		op.cmd = MMUEXT_PIN_L3_TABLE;
+		level = MMUEXT_PIN_L2_TABLE;
 #endif
-		op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
-		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-			BUG();
+
+		pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
 	}
 }
 
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 72f08ab43a4d..b2e32f9d0071 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -303,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
 }
 #endif	/* CONFIG_X86_PAE */
 
-
+enum pt_level {
+	PT_PGD,
+	PT_PUD,
+	PT_PMD,
+	PT_PTE
+};
 
 /*
   (Yet another) pagetable walker.  This one is intended for pinning a
@@ -315,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
   FIXADDR_TOP.  But the important bit is that we don't pin beyond
   there, because then we start getting into Xen's ptes.
 */
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
 		    unsigned long limit)
 {
 	pgd_t *pgd = pgd_base;
@@ -340,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 		pud = pud_offset(pgd, 0);
 
 		if (PTRS_PER_PUD > 1) /* not folded */
-			flush |= (*func)(virt_to_page(pud), 0);
+			flush |= (*func)(virt_to_page(pud), PT_PUD);
 
 		for (; addr != pud_limit; pud++, addr = pud_next) {
 			pmd_t *pmd;
@@ -359,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 			pmd = pmd_offset(pud, 0);
 
 			if (PTRS_PER_PMD > 1) /* not folded */
-				flush |= (*func)(virt_to_page(pmd), 0);
+				flush |= (*func)(virt_to_page(pmd), PT_PMD);
 
 			for (; addr != pmd_limit; pmd++) {
 				addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -371,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 				if (pmd_none(*pmd))
 					continue;
 
-				flush |= (*func)(pmd_page(*pmd), 0);
+				flush |= (*func)(pmd_page(*pmd), PT_PTE);
 			}
 		}
 	}
 
-	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
 
 	return flush;
 }
 
-static int pin_page(struct page *page, unsigned flags)
+static spinlock_t *lock_pte(struct page *page)
+{
+	spinlock_t *ptl = NULL;
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+	ptl = __pte_lockptr(page);
+	spin_lock(ptl);
+#endif
+
+	return ptl;
+}
+
+static void do_unlock(void *v)
+{
+	spinlock_t *ptl = v;
+	spin_unlock(ptl);
+}
+
+static void xen_do_pin(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	mcs = __xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+	op->cmd = level;
+	op->arg1.mfn = pfn_to_mfn(pfn);
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+}
+
+static int pin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
 	int flush;
@@ -396,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
 		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl;
 
 		flush = 0;
 
+		ptl = NULL;
+		if (level == PT_PTE)
+			ptl = lock_pte(page);
+
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL_RO),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (level == PT_PTE)
+			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
+
+		if (ptl) {
+			/* Queue a deferred unlock for when this batch
+			   is completed. */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return flush;
@@ -412,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
    read-only, and can be pinned. */
 void xen_pgd_pin(pgd_t *pgd)
 {
-	struct multicall_space mcs;
-	struct mmuext_op *op;
+	unsigned level;
 
 	xen_mc_batch();
 
@@ -424,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
 		xen_mc_batch();
 	}
 
-	mcs = __xen_mc_entry(sizeof(*op));
-	op = mcs.args;
-
 #ifdef CONFIG_X86_PAE
-	op->cmd = MMUEXT_PIN_L3_TABLE;
+	level = MMUEXT_PIN_L3_TABLE;
 #else
-	op->cmd = MMUEXT_PIN_L2_TABLE;
+	level = MMUEXT_PIN_L2_TABLE;
 #endif
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_do_pin(level, PFN_DOWN(__pa(pgd)));
 
 	xen_mc_issue(0);
 }
@@ -441,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
 /* The init_mm pagetable is really pinned as soon as its created, but
    that's before we have page structures to store the bits.  So do all
    the book-keeping now. */
-static __init int mark_pinned(struct page *page, unsigned flags)
+static __init int mark_pinned(struct page *page, enum pt_level level)
 {
 	SetPagePinned(page);
 	return 0;
@@ -452,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
 	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
 }
 
-static int unpin_page(struct page *page, unsigned flags)
+static int unpin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
 
 	if (pgfl && !PageHighMem(page)) {
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
-		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl = NULL;
+		struct multicall_space mcs;
+
+		if (level == PT_PTE) {
+			ptl = lock_pte(page);
+
+			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+		}
+
+		mcs = __xen_mc_entry(0);
 
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (ptl) {
+			/* unlock when batch completed */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return 0;		/* never need to flush on unpin */
@@ -472,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
 /* Release a pagetables pages back as normal RW */
 static void xen_pgd_unpin(pgd_t *pgd)
 {
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-
 	xen_mc_batch();
 
-	mcs = __xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = MMUEXT_UNPIN_TABLE;
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
 	pgd_walk(pgd, unpin_page, TASK_SIZE);
 
@@ -585,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
 	/* pgd may not be pinned in the error exit path of execve */
 	if (PagePinned(virt_to_page(mm->pgd)))
 		xen_pgd_unpin(mm->pgd);
+
 	spin_unlock(&mm->page_table_lock);
 }
diff --git a/mm/Kconfig b/mm/Kconfig
index a7609cbcb00d..e24d348083c3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,7 +137,6 @@ config SPLIT_PTLOCK_CPUS
 	int
 	default "4096" if ARM && !CPU_CACHE_VIPT
 	default "4096" if PARISC && !PA20
-	default "4096" if XEN
 	default "4"
 
 #