author     Jeremy Fitzhardinge <jeremy@goop.org>    2008-08-19 16:32:51 -0400
committer  Ingo Molnar <mingo@elte.hu>              2008-08-20 06:40:08 -0400
commit     11ad93e59d114f4b218873f1c93261be725d2e22 (patch)
tree       00cceefe48f6fbd01607cfbb7285145321ebf1f7 /arch
parent     63d3a75d6f1fcf2f33e6abbe84e1f428c3586152 (diff)
xen: clarify locking used when pinning a pagetable.
Add some comments explaining the locking and pinning algorithm when
using split pte locks. Also implement a minor optimisation of not
pinning the PTE when not using split pte locks.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Xen-devel <xen-devel@lists.xensource.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch')
-rw-r--r--  arch/x86/xen/mmu.c  41
1 file changed, 35 insertions(+), 6 deletions(-)
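Before the diff itself, a quick orientation on the ordering the new comments describe: hold the PTE page's lock from the moment the page is made read-only until the pin hypercall is issued, so no writer can observe the "RO but not yet pinned" window in which Xen would refuse the write instead of emulating it. The following is a stand-alone C sketch of that ordering only; the lock_pte()/make_readonly()/xen_pin() names are illustrative stubs invented for this sketch, not the kernel's or Xen's real interfaces.

/*
 * Stand-alone model of the pin ordering under split pte locks.
 * All function names below are stubs for illustration only.
 */
#include <stdio.h>

#define USE_SPLIT_PTE_LOCKS 1	/* stand-in for the kernel's split-ptlock config */

static void lock_pte(int pte)       { printf("lock pte page %d\n", pte); }
static void unlock_pte(int pte)     { printf("unlock pte page %d\n", pte); }
static void make_readonly(int pte)  { printf("mark pte page %d RO\n", pte); }
static void xen_pin(int pte)        { printf("pin pte page %d\n", pte); }

/*
 * Pin one PTE page.  The lock is held across RO-marking and pinning so
 * no writer can hit the window where the page is RO but not yet pinned.
 */
static void pin_pte_page(int pte)
{
	if (USE_SPLIT_PTE_LOCKS) {
		lock_pte(pte);
		make_readonly(pte);
		xen_pin(pte);		/* split locks: pin each PTE page individually */
		unlock_pte(pte);	/* the real code defers the unlock to batch completion */
	} else {
		/* one overall pagetable lock is already held by the caller,
		   so the PTE page need not be pinned independently */
		make_readonly(pte);
	}
}

int main(void)
{
	for (int pte = 0; pte < 3; pte++)
		pin_pte_page(pte);
	return 0;
}

Unpinning does the converse under the same lock: the unpin is issued first, and only then may the page become writable again, so concurrent updaters never see a partially-pinned page.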
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index aa37469da696..d3752b6ce6e6 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -590,8 +590,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
 	pmdidx_limit = 0;
 #endif
 
-	flush |= (*func)(virt_to_page(pgd), PT_PGD);
-
 	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
 		pud_t *pud;
 
@@ -637,7 +635,11 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
 			}
 		}
 	}
+
 out:
+	/* Do the top level last, so that the callbacks can use it as
+	   a cue to do final things like tlb flushes. */
+	flush |= (*func)(virt_to_page(pgd), PT_PGD);
 
 	return flush;
 }
@@ -691,6 +693,26 @@ static int pin_page(struct page *page, enum pt_level level)
 
 		flush = 0;
 
+		/*
+		 * We need to hold the pagetable lock between the time
+		 * we make the pagetable RO and when we actually pin
+		 * it. If we don't, then other users may come in and
+		 * attempt to update the pagetable by writing it,
+		 * which will fail because the memory is RO but not
+		 * pinned, so Xen won't do the trap'n'emulate.
+		 *
+		 * If we're using split pte locks, we can't hold the
+		 * entire pagetable's worth of locks during the
+		 * traverse, because we may wrap the preempt count (8
+		 * bits). The solution is to mark RO and pin each PTE
+		 * page while holding the lock. This means the number
+		 * of locks we end up holding is never more than a
+		 * batch size (~32 entries, at present).
+		 *
+		 * If we're not using split pte locks, we needn't pin
+		 * the PTE pages independently, because we're
+		 * protected by the overall pagetable lock.
+		 */
 		ptl = NULL;
 		if (level == PT_PTE)
 			ptl = lock_pte(page);
@@ -699,10 +721,9 @@ static int pin_page(struct page *page, enum pt_level level)
 					pfn_pte(pfn, PAGE_KERNEL_RO),
 					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 
-		if (level == PT_PTE)
+		if (ptl) {
 			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
 
-		if (ptl) {
 			/* Queue a deferred unlock for when this batch
 			   is completed. */
 			xen_mc_callback(do_unlock, ptl);
@@ -796,10 +817,18 @@ static int unpin_page(struct page *page, enum pt_level level)
 		spinlock_t *ptl = NULL;
 		struct multicall_space mcs;
 
+		/*
+		 * Do the converse to pin_page. If we're using split
+		 * pte locks, we must be holding the lock for while
+		 * the pte page is unpinned but still RO to prevent
+		 * concurrent updates from seeing it in this
+		 * partially-pinned state.
+		 */
 		if (level == PT_PTE) {
 			ptl = lock_pte(page);
 
-			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+			if (ptl)
+				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
 		}
 
 		mcs = __xen_mc_entry(0);
@@ -837,7 +866,7 @@ static void xen_pgd_unpin(pgd_t *pgd)
 
 #ifdef CONFIG_X86_PAE
 	/* Need to make sure unshared kernel PMD is unpinned */
-	pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+	unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
 #endif
 
 	pgd_walk(pgd, unpin_page, USER_LIMIT);