| Field | Value | Date |
|---|---|---|
| author | Jeremy Fitzhardinge <jeremy@xensource.com> | 2007-10-16 14:51:30 -0400 |
| committer | Jeremy Fitzhardinge <jeremy@goop.org> | 2007-10-16 14:51:30 -0400 |
| commit | 74260714c56de4f967fcb2f17a8656bc574b75be | |
| tree | f02bcd991285a20a543fae69f916577c8447b8f4 | |
| parent | 9f79991d4186089e228274196413572cc000143b | |
xen: lock pte pages while pinning/unpinning
When a pagetable is created, it is made globally visible in the rmap
prio tree before it is pinned via arch_dup_mmap(), and remains in the
rmap tree while it is unpinned with arch_exit_mmap().
This means that other CPUs may race with the pinning/unpinning
process and see a pte page in the window between when it is marked RO
and when it is actually pinned, causing any pte updates in that window
to fail with write-protect faults.
As a result, all pte pages must be properly locked, and only unlocked
once the pinning/unpinning process has finished.
In order to avoid taking spinlocks for the whole pagetable - which
could overflow the PREEMPT_BITS portion of the preempt counter - this
patch locks and pins each pte page individually, and only then pins the
whole pagetable.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Hugh Dickens <hugh@veritas.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andi Kleen <ak@suse.de>
Cc: Keir Fraser <keir@xensource.com>
Cc: Jan Beulich <jbeulich@novell.com>
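The per-pte-page sequence described above can be modelled with a small
user-space sketch. This is illustrative only, not the kernel code: the
pthread mutex stands in for the split pte lock, and the batch_* helpers
stand in for the Xen multicall batch and the deferred do_unlock()
callback that appear in the mmu.c changes below; every name in it
(struct pte_page, pin_pte_page(), batch_issue(), ...) is a made-up
stand-in rather than a kernel or Xen API.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy model of one pte page: a lock plus the two states the patch cares
   about (read-only and pinned). */
struct pte_page {
	pthread_mutex_t ptl;	/* stands in for the split pte lock */
	bool readonly;
	bool pinned;
};

/* A "batch" here is just a list of deferred unlocks, mimicking
   xen_mc_callback(do_unlock, ptl) running after the multicall is issued. */
#define MAX_DEFERRED 64
static pthread_mutex_t *deferred[MAX_DEFERRED];
static int n_deferred;

static void batch_defer_unlock(pthread_mutex_t *ptl)
{
	deferred[n_deferred++] = ptl;
}

static void batch_issue(void)
{
	/* The queued "hypercalls" would be flushed here; only afterwards
	   are the pte locks dropped. */
	for (int i = 0; i < n_deferred; i++)
		pthread_mutex_unlock(deferred[i]);
	n_deferred = 0;
}

static void pin_pte_page(struct pte_page *pg)
{
	pthread_mutex_lock(&pg->ptl);	/* block concurrent pte updates */
	pg->readonly = true;		/* "queue MULTI_update_va_mapping(RO)" */
	pg->pinned = true;		/* "queue MMUEXT_PIN_L1_TABLE" */
	batch_defer_unlock(&pg->ptl);	/* unlock only once the batch is done */
}

int main(void)
{
	struct pte_page pg = { .ptl = PTHREAD_MUTEX_INITIALIZER };

	pin_pte_page(&pg);	/* pgd_walk() would do this for each pte page */
	batch_issue();		/* issue the batch, then run deferred unlocks */

	printf("readonly=%d pinned=%d\n", pg.readonly, pg.pinned);
	return 0;
}
```

The property the sketch preserves is the one the message argues for: the
pte lock is held from before the page is made read-only until after the
pin has been issued, so no other CPU can update ptes on that page during
the vulnerable window.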
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | arch/x86/xen/enlighten.c | 30 |
| -rw-r--r-- | arch/x86/xen/mmu.c | 113 |
| -rw-r--r-- | mm/Kconfig | 1 |

3 files changed, 103 insertions, 41 deletions
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index e334bf7cb327..4186cb6a7f5a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -666,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
 }
 
+static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op op;
+	op.cmd = level;
+	op.arg1.mfn = pfn_to_mfn(pfn);
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
 /* This needs to make sure the new pte page is pinned iff its being
    attached to a pinned pagetable. */
 static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
@@ -675,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
 	if (PagePinned(virt_to_page(mm->pgd))) {
 		SetPagePinned(page);
 
-		if (!PageHighMem(page))
+		if (!PageHighMem(page)) {
 			make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-		else
+			pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+		} else
 			/* make sure there are no stray mappings of
 			   this page */
 			kmap_flush_unused();
@@ -690,8 +700,10 @@ static void xen_release_pt(u32 pfn)
 	struct page *page = pfn_to_page(pfn);
 
 	if (PagePinned(page)) {
-		if (!PageHighMem(page))
+		if (!PageHighMem(page)) {
+			pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
 			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+		}
 	}
 }
 
@@ -806,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 	/* Actually pin the pagetable down, but we can't set PG_pinned
 	   yet because the page structures don't exist yet. */
 	{
-		struct mmuext_op op;
+		unsigned level;
+
 #ifdef CONFIG_X86_PAE
-		op.cmd = MMUEXT_PIN_L3_TABLE;
+		level = MMUEXT_PIN_L3_TABLE;
 #else
-		op.cmd = MMUEXT_PIN_L3_TABLE;
+		level = MMUEXT_PIN_L2_TABLE;
 #endif
-		op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
-		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-			BUG();
+
+		pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
 	}
 }
 
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 72f08ab43a4d..b2e32f9d0071 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -303,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
 }
 #endif /* CONFIG_X86_PAE */
 
-
+enum pt_level {
+	PT_PGD,
+	PT_PUD,
+	PT_PMD,
+	PT_PTE
+};
 
 /*
   (Yet another) pagetable walker. This one is intended for pinning a
@@ -315,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
   FIXADDR_TOP. But the important bit is that we don't pin beyond
   there, because then we start getting into Xen's ptes.
 */
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
 		    unsigned long limit)
 {
 	pgd_t *pgd = pgd_base;
@@ -340,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 		pud = pud_offset(pgd, 0);
 
 		if (PTRS_PER_PUD > 1) /* not folded */
-			flush |= (*func)(virt_to_page(pud), 0);
+			flush |= (*func)(virt_to_page(pud), PT_PUD);
 
 		for (; addr != pud_limit; pud++, addr = pud_next) {
 			pmd_t *pmd;
@@ -359,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 			pmd = pmd_offset(pud, 0);
 
 			if (PTRS_PER_PMD > 1) /* not folded */
-				flush |= (*func)(virt_to_page(pmd), 0);
+				flush |= (*func)(virt_to_page(pmd), PT_PMD);
 
 			for (; addr != pmd_limit; pmd++) {
 				addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -371,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 				if (pmd_none(*pmd))
 					continue;
 
-				flush |= (*func)(pmd_page(*pmd), 0);
+				flush |= (*func)(pmd_page(*pmd), PT_PTE);
 			}
 		}
 	}
 
-	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
 
 	return flush;
 }
 
-static int pin_page(struct page *page, unsigned flags)
+static spinlock_t *lock_pte(struct page *page)
+{
+	spinlock_t *ptl = NULL;
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+	ptl = __pte_lockptr(page);
+	spin_lock(ptl);
+#endif
+
+	return ptl;
+}
+
+static void do_unlock(void *v)
+{
+	spinlock_t *ptl = v;
+	spin_unlock(ptl);
+}
+
+static void xen_do_pin(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	mcs = __xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+	op->cmd = level;
+	op->arg1.mfn = pfn_to_mfn(pfn);
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+}
+
+static int pin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
 	int flush;
@@ -396,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
 		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl;
 
 		flush = 0;
 
+		ptl = NULL;
+		if (level == PT_PTE)
+			ptl = lock_pte(page);
+
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL_RO),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (level == PT_PTE)
+			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
+
+		if (ptl) {
+			/* Queue a deferred unlock for when this batch
+			   is completed. */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return flush;
@@ -412,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
    read-only, and can be pinned. */
 void xen_pgd_pin(pgd_t *pgd)
 {
-	struct multicall_space mcs;
-	struct mmuext_op *op;
+	unsigned level;
 
 	xen_mc_batch();
 
@@ -424,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
 		xen_mc_batch();
 	}
 
-	mcs = __xen_mc_entry(sizeof(*op));
-	op = mcs.args;
-
 #ifdef CONFIG_X86_PAE
-	op->cmd = MMUEXT_PIN_L3_TABLE;
+	level = MMUEXT_PIN_L3_TABLE;
 #else
-	op->cmd = MMUEXT_PIN_L2_TABLE;
+	level = MMUEXT_PIN_L2_TABLE;
 #endif
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_do_pin(level, PFN_DOWN(__pa(pgd)));
 
 	xen_mc_issue(0);
 }
@@ -441,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
 /* The init_mm pagetable is really pinned as soon as its created, but
    that's before we have page structures to store the bits. So do all
    the book-keeping now. */
-static __init int mark_pinned(struct page *page, unsigned flags)
+static __init int mark_pinned(struct page *page, enum pt_level level)
 {
 	SetPagePinned(page);
 	return 0;
@@ -452,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
 	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
 }
 
-static int unpin_page(struct page *page, unsigned flags)
+static int unpin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
 
 	if (pgfl && !PageHighMem(page)) {
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
-		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl = NULL;
+		struct multicall_space mcs;
+
+		if (level == PT_PTE) {
+			ptl = lock_pte(page);
+
+			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+		}
+
+		mcs = __xen_mc_entry(0);
 
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (ptl) {
+			/* unlock when batch completed */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return 0; /* never need to flush on unpin */
@@ -472,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
 /* Release a pagetables pages back as normal RW */
 static void xen_pgd_unpin(pgd_t *pgd)
 {
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-
 	xen_mc_batch();
 
-	mcs = __xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = MMUEXT_UNPIN_TABLE;
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
 	pgd_walk(pgd, unpin_page, TASK_SIZE);
 
@@ -585,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
 	/* pgd may not be pinned in the error exit path of execve */
 	if (PagePinned(virt_to_page(mm->pgd)))
 		xen_pgd_unpin(mm->pgd);
+
 	spin_unlock(&mm->page_table_lock);
 }
diff --git a/mm/Kconfig b/mm/Kconfig
index a7609cbcb00d..e24d348083c3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,7 +137,6 @@ config SPLIT_PTLOCK_CPUS
 	int
 	default "4096" if ARM && !CPU_CACHE_VIPT
 	default "4096" if PARISC && !PA20
-	default "4096" if XEN
 	default "4"
 
 #
