author	Jeremy Fitzhardinge <jeremy@xensource.com>	2007-10-16 14:51:30 -0400
committer	Jeremy Fitzhardinge <jeremy@goop.org>	2007-10-16 14:51:30 -0400
commit	74260714c56de4f967fcb2f17a8656bc574b75be (patch)
tree	f02bcd991285a20a543fae69f916577c8447b8f4
parent	9f79991d4186089e228274196413572cc000143b (diff)
xen: lock pte pages while pinning/unpinning
When a pagetable is created, it is made globally visible in the rmap
prio tree before it is pinned via arch_dup_mmap(), and remains in the
rmap tree while it is unpinned with arch_exit_mmap().
This means that other CPUs may race with the pinning/unpinning
process, and see a pte page between the time it is marked RO and the
time it is actually pinned, causing any pte updates in that window to
fail with write-protect faults.
As a result, all pte pages must be properly locked, and only unlocked
once the pinning/unpinning process has finished.
In order to avoid taking spinlocks for the whole pagetable - which may
overflow the PREEMPT_BITS portion of the preempt counter - this patch locks
and pins each pte page individually, and then finally pins the whole pagetable.
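In outline, pinning a pte page now takes that page's split pte lock, queues
the RO remapping and the pin hypercall in the same multicall batch, and only
drops the lock once the batch has completed.  A simplified sketch of the new
pin_page() path from the mmu.c hunk below (the PG_pinned bookkeeping, the
highmem case and the flush accounting are elided):

static int pin_page(struct page *page, enum pt_level level)
{
	unsigned long pfn = page_to_pfn(page);
	void *pt = lowmem_page_address(page);
	struct multicall_space mcs = __xen_mc_entry(0);
	spinlock_t *ptl = NULL;

	if (level == PT_PTE)
		ptl = lock_pte(page);	/* split pte lock, if configured */

	/* queue the RO remapping; only the pgd needs a TLB flush */
	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
				pfn_pte(pfn, PAGE_KERNEL_RO),
				level == PT_PGD ? UVMF_TLB_FLUSH : 0);

	if (level == PT_PTE)
		xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);	/* pin while still locked */

	if (ptl)
		/* unlock only once the whole batch has run */
		xen_mc_callback(do_unlock, ptl);

	return 0;
}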
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Hugh Dickens <hugh@veritas.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andi Kleen <ak@suse.de>
Cc: Keir Fraser <keir@xensource.com>
Cc: Jan Beulich <jbeulich@novell.com>
-rw-r--r--	arch/x86/xen/enlighten.c	30
-rw-r--r--	arch/x86/xen/mmu.c	113
-rw-r--r--	mm/Kconfig	1
3 files changed, 103 insertions, 41 deletions
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index e334bf7cb327..4186cb6a7f5a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -666,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
 }
 
+static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op op;
+	op.cmd = level;
+	op.arg1.mfn = pfn_to_mfn(pfn);
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
 /* This needs to make sure the new pte page is pinned iff its being
    attached to a pinned pagetable. */
 static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
@@ -675,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
 	if (PagePinned(virt_to_page(mm->pgd))) {
 		SetPagePinned(page);
 
-		if (!PageHighMem(page))
+		if (!PageHighMem(page)) {
 			make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-		else
+			pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+		} else
 			/* make sure there are no stray mappings of
 			   this page */
 			kmap_flush_unused();
@@ -690,8 +700,10 @@ static void xen_release_pt(u32 pfn)
 	struct page *page = pfn_to_page(pfn);
 
 	if (PagePinned(page)) {
-		if (!PageHighMem(page))
+		if (!PageHighMem(page)) {
+			pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
 			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+		}
 	}
 }
 
@@ -806,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 	/* Actually pin the pagetable down, but we can't set PG_pinned
 	   yet because the page structures don't exist yet. */
 	{
-		struct mmuext_op op;
+		unsigned level;
+
 #ifdef CONFIG_X86_PAE
-		op.cmd = MMUEXT_PIN_L3_TABLE;
+		level = MMUEXT_PIN_L3_TABLE;
 #else
-		op.cmd = MMUEXT_PIN_L3_TABLE;
+		level = MMUEXT_PIN_L2_TABLE;
 #endif
-		op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
-		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-			BUG();
+
+		pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
 	}
 }
 
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 72f08ab43a4d..b2e32f9d0071 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -303,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
 }
 #endif /* CONFIG_X86_PAE */
 
-
+enum pt_level {
+	PT_PGD,
+	PT_PUD,
+	PT_PMD,
+	PT_PTE
+};
 
 /*
   (Yet another) pagetable walker.  This one is intended for pinning a
@@ -315,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
   FIXADDR_TOP.  But the important bit is that we don't pin beyond
   there, because then we start getting into Xen's ptes.
 */
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
 		    unsigned long limit)
 {
 	pgd_t *pgd = pgd_base;
@@ -340,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 		pud = pud_offset(pgd, 0);
 
 		if (PTRS_PER_PUD > 1) /* not folded */
-			flush |= (*func)(virt_to_page(pud), 0);
+			flush |= (*func)(virt_to_page(pud), PT_PUD);
 
 		for (; addr != pud_limit; pud++, addr = pud_next) {
 			pmd_t *pmd;
@@ -359,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 			pmd = pmd_offset(pud, 0);
 
 			if (PTRS_PER_PMD > 1) /* not folded */
-				flush |= (*func)(virt_to_page(pmd), 0);
+				flush |= (*func)(virt_to_page(pmd), PT_PMD);
 
 			for (; addr != pmd_limit; pmd++) {
 				addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -371,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 				if (pmd_none(*pmd))
 					continue;
 
-				flush |= (*func)(pmd_page(*pmd), 0);
+				flush |= (*func)(pmd_page(*pmd), PT_PTE);
 			}
 		}
 	}
 
-	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
 
 	return flush;
 }
 
-static int pin_page(struct page *page, unsigned flags)
+static spinlock_t *lock_pte(struct page *page)
+{
+	spinlock_t *ptl = NULL;
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+	ptl = __pte_lockptr(page);
+	spin_lock(ptl);
+#endif
+
+	return ptl;
+}
+
+static void do_unlock(void *v)
+{
+	spinlock_t *ptl = v;
+	spin_unlock(ptl);
+}
+
+static void xen_do_pin(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	mcs = __xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+	op->cmd = level;
+	op->arg1.mfn = pfn_to_mfn(pfn);
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+}
+
+static int pin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
 	int flush;
@@ -396,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
 		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl;
 
 		flush = 0;
 
+		ptl = NULL;
+		if (level == PT_PTE)
+			ptl = lock_pte(page);
+
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL_RO),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (level == PT_PTE)
+			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
+
+		if (ptl) {
+			/* Queue a deferred unlock for when this batch
+			   is completed. */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return flush;
@@ -412,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
    read-only, and can be pinned. */
 void xen_pgd_pin(pgd_t *pgd)
 {
-	struct multicall_space mcs;
-	struct mmuext_op *op;
+	unsigned level;
 
 	xen_mc_batch();
 
@@ -424,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
 		xen_mc_batch();
 	}
 
-	mcs = __xen_mc_entry(sizeof(*op));
-	op = mcs.args;
-
 #ifdef CONFIG_X86_PAE
-	op->cmd = MMUEXT_PIN_L3_TABLE;
+	level = MMUEXT_PIN_L3_TABLE;
 #else
-	op->cmd = MMUEXT_PIN_L2_TABLE;
+	level = MMUEXT_PIN_L2_TABLE;
 #endif
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_do_pin(level, PFN_DOWN(__pa(pgd)));
 
 	xen_mc_issue(0);
 }
@@ -441,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
 /* The init_mm pagetable is really pinned as soon as its created, but
    that's before we have page structures to store the bits.  So do all
    the book-keeping now. */
-static __init int mark_pinned(struct page *page, unsigned flags)
+static __init int mark_pinned(struct page *page, enum pt_level level)
 {
 	SetPagePinned(page);
 	return 0;
@@ -452,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
 	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
 }
 
-static int unpin_page(struct page *page, unsigned flags)
+static int unpin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
 
 	if (pgfl && !PageHighMem(page)) {
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
-		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl = NULL;
+		struct multicall_space mcs;
+
+		if (level == PT_PTE) {
+			ptl = lock_pte(page);
+
+			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+		}
+
+		mcs = __xen_mc_entry(0);
 
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (ptl) {
+			/* unlock when batch completed */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return 0;		/* never need to flush on unpin */
@@ -472,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
 /* Release a pagetables pages back as normal RW */
 static void xen_pgd_unpin(pgd_t *pgd)
 {
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-
 	xen_mc_batch();
 
-	mcs = __xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = MMUEXT_UNPIN_TABLE;
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
 	pgd_walk(pgd, unpin_page, TASK_SIZE);
 
@@ -585,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
 	/* pgd may not be pinned in the error exit path of execve */
 	if (PagePinned(virt_to_page(mm->pgd)))
 		xen_pgd_unpin(mm->pgd);
+
 	spin_unlock(&mm->page_table_lock);
 }
diff --git a/mm/Kconfig b/mm/Kconfig
index a7609cbcb00d..e24d348083c3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,7 +137,6 @@ config SPLIT_PTLOCK_CPUS
 	int
 	default "4096" if ARM && !CPU_CACHE_VIPT
 	default "4096" if PARISC && !PA20
-	default "4096" if XEN
 	default "4"
 
 #