author     Jeremy Fitzhardinge <jeremy@goop.org>     2008-07-08 18:07:06 -0400
committer  Ingo Molnar <mingo@elte.hu>               2008-07-16 05:03:59 -0400
commit     5deb30d194d28b6bf7dacfb758267a51bf7c5b78
tree       fab33e09b8ea65a6e7144cf8b487dba539fdc15d  /arch/x86/xen/mmu.c
parent     a8fc1089e49caa5dca346dfacb5c84abf9a22a0c
xen: rework pgd_walk to deal with 32/64 bit

Rewrite pgd_walk to deal with 64-bit address spaces.  There are two
notable features of the 64-bit layout:

 1. The virtual address is only 48 bits wide, with the upper 16 bits
    being sign extension; kernel addresses are negative and userspace
    is positive.

 2. The Xen hypervisor mapping occupies the negative-most part of the
    address space, just above the sign-extension hole.

Point 1 means that we can't easily use raw addresses when traversing
the space, since we would have to deal with sign extension.  The
rewrite therefore expresses everything in terms of pgd/pud/pmd indices,
so we never need to worry about the exact layout of the virtual address
space.  This approach works equally well on 32-bit.
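
As a purely illustrative aside (not part of the patch): the index
arithmetic the walker now relies on can be sketched as a standalone C
program, assuming the usual 4-level x86-64 layout of 9 index bits per
level above a 12-bit page offset; the macro names below only mirror the
kernel's and are defined locally.

    #include <stdint.h>
    #include <stdio.h>

    /* Each level contributes 9 bits of index above the 12-bit offset. */
    #define PGDIR_SHIFT 39
    #define PUD_SHIFT   30
    #define PMD_SHIFT   21
    #define PTRS_PER    512

    static unsigned idx(uint64_t va, int shift)
    {
            /* Masking with PTRS_PER-1 yields a small, always-positive
               index, so the sign-extended "negative" kernel addresses
               need no special handling. */
            return (va >> shift) & (PTRS_PER - 1);
    }

    int main(void)
    {
            uint64_t user   = 0x00007f1234567000ULL; /* positive: user   */
            uint64_t kernel = 0xffff880000100000ULL; /* negative: kernel */

            printf("user:   pgd %u pud %u pmd %u\n",
                   idx(user, PGDIR_SHIFT), idx(user, PUD_SHIFT),
                   idx(user, PMD_SHIFT));
            printf("kernel: pgd %u pud %u pmd %u\n",
                   idx(kernel, PGDIR_SHIFT), idx(kernel, PUD_SHIFT),
                   idx(kernel, PMD_SHIFT));
            return 0;
    }

Compiled with any C compiler, this prints pgd index 254 for the user
address and 272 for the kernel one; the walker below compares such
indices directly instead of comparing raw addresses.
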
To deal with point 2, the walk assumes the hole lies between the
uppermost userspace address and PAGE_OFFSET.  On 64-bit this skips the
Xen mapping hole; on 32-bit the hole is zero-sized.  In all cases, the
uppermost kernel address walked is FIXADDR_TOP.

A side-effect of this patch is that the upper boundary is now handled
properly, which exposed a long-standing 32-bit bug: the kernel pmd page
was never pinned.  The kernel pmd is not shared, so it must be pinned
explicitly, even though the kernel ptes are shared and need no separate
pinning.
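
For a concrete feel of which entry that is, here is a tiny standalone
sketch (again illustrative only, not patch code), assuming the default
32-bit 3G/1G split where TASK_SIZE is 0xC0000000: each PAE top-level
entry covers 1 GiB, so the slot holding the kernel pmd is index 3.

    #include <stdio.h>

    /* PAE: four top-level entries, each mapping 1 GiB of virtual space. */
    #define PAE_PGDIR_SHIFT   30
    #define PAE_PTRS_PER_PGD   4
    #define TASK_SIZE_3G      0xC0000000UL

    int main(void)
    {
            unsigned slot = (TASK_SIZE_3G >> PAE_PGDIR_SHIFT)
                            & (PAE_PTRS_PER_PGD - 1);
            printf("kernel pmd is referenced from pgd slot %u of %u\n",
                   slot, PAE_PTRS_PER_PGD);
            return 0;
    }

That slot's pmd is the one the new CONFIG_X86_PAE hunks below address
explicitly via pgd_index(TASK_SIZE).
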
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/xen/mmu.c')
-rw-r--r--   arch/x86/xen/mmu.c   115
 1 file changed, 75 insertions(+), 40 deletions(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index eb31ed291b93..046c1f23dd6e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -44,6 +44,7 @@
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
+#include <asm/fixmap.h>
 #include <asm/mmu_context.h>
 #include <asm/paravirt.h>
 #include <asm/linkage.h>
@@ -491,77 +492,103 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
 #endif  /* PAGETABLE_LEVELS == 4 */
 
 /*
-  (Yet another) pagetable walker.  This one is intended for pinning a
-  pagetable.  This means that it walks a pagetable and calls the
-  callback function on each page it finds making up the page table,
-  at every level.  It walks the entire pagetable, but it only bothers
-  pinning pte pages which are below pte_limit.  In the normal case
-  this will be TASK_SIZE, but at boot we need to pin up to
-  FIXADDR_TOP.  But the important bit is that we don't pin beyond
-  there, because then we start getting into Xen's ptes.
- */
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
+ * (Yet another) pagetable walker.  This one is intended for pinning a
+ * pagetable.  This means that it walks a pagetable and calls the
+ * callback function on each page it finds making up the page table,
+ * at every level.  It walks the entire pagetable, but it only bothers
+ * pinning pte pages which are below limit.  In the normal case this
+ * will be STACK_TOP_MAX, but at boot we need to pin up to
+ * FIXADDR_TOP.
+ *
+ * For 32-bit the important bit is that we don't pin beyond there,
+ * because then we start getting into Xen's ptes.
+ *
+ * For 64-bit, we must skip the Xen hole in the middle of the address
+ * space, just after the big x86-64 virtual hole.
+ */
+static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
                     unsigned long limit)
 {
-        pgd_t *pgd = pgd_base;
         int flush = 0;
-        unsigned long addr = 0;
-        unsigned long pgd_next;
+        unsigned hole_low, hole_high;
+        unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
+        unsigned pgdidx, pudidx, pmdidx;
 
-        BUG_ON(limit > FIXADDR_TOP);
+        /* The limit is the last byte to be touched */
+        limit--;
+        BUG_ON(limit >= FIXADDR_TOP);
 
         if (xen_feature(XENFEAT_auto_translated_physmap))
                 return 0;
 
-        for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+        /*
+         * 64-bit has a great big hole in the middle of the address
+         * space, which contains the Xen mappings.  On 32-bit these
+         * will end up making a zero-sized hole and so is a no-op.
+         */
+        hole_low = pgd_index(STACK_TOP_MAX + PGDIR_SIZE - 1);
+        hole_high = pgd_index(PAGE_OFFSET);
+
+        pgdidx_limit = pgd_index(limit);
+#if PTRS_PER_PUD > 1
+        pudidx_limit = pud_index(limit);
+#else
+        pudidx_limit = 0;
+#endif
+#if PTRS_PER_PMD > 1
+        pmdidx_limit = pmd_index(limit);
+#else
+        pmdidx_limit = 0;
+#endif
+
+        flush |= (*func)(virt_to_page(pgd), PT_PGD);
+
+        for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
                 pud_t *pud;
-                unsigned long pud_limit, pud_next;
 
-                pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
+                if (pgdidx >= hole_low && pgdidx < hole_high)
+                        continue;
 
-                if (!pgd_val(*pgd))
+                if (!pgd_val(pgd[pgdidx]))
                         continue;
 
-                pud = pud_offset(pgd, 0);
+                pud = pud_offset(&pgd[pgdidx], 0);
 
                 if (PTRS_PER_PUD > 1) /* not folded */
                         flush |= (*func)(virt_to_page(pud), PT_PUD);
 
-                for (; addr != pud_limit; pud++, addr = pud_next) {
+                for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
                         pmd_t *pmd;
-                        unsigned long pmd_limit;
 
-                        pud_next = pud_addr_end(addr, pud_limit);
-
-                        if (pud_next < limit)
-                                pmd_limit = pud_next;
-                        else
-                                pmd_limit = limit;
+                        if (pgdidx == pgdidx_limit &&
+                            pudidx > pudidx_limit)
+                                goto out;
 
-                        if (pud_none(*pud))
+                        if (pud_none(pud[pudidx]))
                                 continue;
 
-                        pmd = pmd_offset(pud, 0);
+                        pmd = pmd_offset(&pud[pudidx], 0);
 
                         if (PTRS_PER_PMD > 1) /* not folded */
                                 flush |= (*func)(virt_to_page(pmd), PT_PMD);
 
-                        for (; addr != pmd_limit; pmd++) {
-                                addr += (PAGE_SIZE * PTRS_PER_PTE);
-                                if ((pmd_limit-1) < (addr-1)) {
-                                        addr = pmd_limit;
-                                        break;
-                                }
+                        for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
+                                struct page *pte;
+
+                                if (pgdidx == pgdidx_limit &&
+                                    pudidx == pudidx_limit &&
+                                    pmdidx > pmdidx_limit)
+                                        goto out;
 
-                                if (pmd_none(*pmd))
+                                if (pmd_none(pmd[pmdidx]))
                                         continue;
 
-                                flush |= (*func)(pmd_page(*pmd), PT_PTE);
+                                pte = pmd_page(pmd[pmdidx]);
+                                flush |= (*func)(pte, PT_PTE);
                         }
                 }
         }
-
-        flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
+out:
 
         return flush;
 }
@@ -650,6 +677,11 @@ void xen_pgd_pin(pgd_t *pgd)
                 xen_mc_batch();
         }
 
+#ifdef CONFIG_X86_PAE
+        /* Need to make sure unshared kernel PMD is pinnable */
+        pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+#endif
+
         xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
         xen_mc_issue(0);
 }
@@ -731,6 +763,10 @@ static void xen_pgd_unpin(pgd_t *pgd)
 
         xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
+#ifdef CONFIG_X86_PAE
+        /* Need to make sure unshared kernel PMD is unpinned */
+        pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+#endif
         pgd_walk(pgd, unpin_page, TASK_SIZE);
 
         xen_mc_issue(0);
@@ -750,7 +786,6 @@ void xen_mm_unpin_all(void)
         list_for_each_entry(page, &pgd_list, lru) {
                 if (PageSavePinned(page)) {
                         BUG_ON(!PagePinned(page));
-                        printk("unpinning pinned %p\n", page_address(page));
                         xen_pgd_unpin((pgd_t *)page_address(page));
                         ClearPageSavePinned(page);
                 }