diff options
author | Jeremy Fitzhardinge <jeremy@goop.org> | 2008-10-08 16:01:39 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-10-09 08:25:19 -0400 |
commit | eefb47f6a1e855653d275cb90592a3587ea93a09 (patch) | |
tree | f6b5b611a8900d975874ab0ac9e7f5df935ef862 /arch/x86 | |
parent | d19c8e516e0a17e049bcfbe96f86e040254ddf14 (diff) |
xen: use spin_lock_nest_lock when pinning a pagetable
When pinning/unpinning a pagetable with split pte locks, we can end up
holding multiple pte locks at once (we need to hold the locks while
there's a pending batched hypercall affecting the pte page). Because
all the pte locks are in the same lock class, lockdep thinks that
we're potentially taking a lock recursively.
This warning is spurious because we always take the pte locks while
holding mm->page_table_lock. Lockdep now has spin_lock_nest_lock() to
express this kind of dominant-lock use; use it here so that lockdep
knows what's going on.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/xen/mmu.c | 74 | ||||
-rw-r--r-- | arch/x86/xen/mmu.h | 3 |
2 files changed, 48 insertions, 29 deletions
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 64e58681767e..ae173f6edd8b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -651,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) | |||
651 | * For 64-bit, we must skip the Xen hole in the middle of the address | 651 | * For 64-bit, we must skip the Xen hole in the middle of the address |
652 | * space, just after the big x86-64 virtual hole. | 652 | * space, just after the big x86-64 virtual hole. |
653 | */ | 653 | */ |
654 | static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | 654 | static int xen_pgd_walk(struct mm_struct *mm, |
655 | int (*func)(struct mm_struct *mm, struct page *, | ||
656 | enum pt_level), | ||
655 | unsigned long limit) | 657 | unsigned long limit) |
656 | { | 658 | { |
659 | pgd_t *pgd = mm->pgd; | ||
657 | int flush = 0; | 660 | int flush = 0; |
658 | unsigned hole_low, hole_high; | 661 | unsigned hole_low, hole_high; |
659 | unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; | 662 | unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; |
@@ -698,7 +701,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | |||
698 | pud = pud_offset(&pgd[pgdidx], 0); | 701 | pud = pud_offset(&pgd[pgdidx], 0); |
699 | 702 | ||
700 | if (PTRS_PER_PUD > 1) /* not folded */ | 703 | if (PTRS_PER_PUD > 1) /* not folded */ |
701 | flush |= (*func)(virt_to_page(pud), PT_PUD); | 704 | flush |= (*func)(mm, virt_to_page(pud), PT_PUD); |
702 | 705 | ||
703 | for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { | 706 | for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { |
704 | pmd_t *pmd; | 707 | pmd_t *pmd; |
@@ -713,7 +716,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | |||
713 | pmd = pmd_offset(&pud[pudidx], 0); | 716 | pmd = pmd_offset(&pud[pudidx], 0); |
714 | 717 | ||
715 | if (PTRS_PER_PMD > 1) /* not folded */ | 718 | if (PTRS_PER_PMD > 1) /* not folded */ |
716 | flush |= (*func)(virt_to_page(pmd), PT_PMD); | 719 | flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); |
717 | 720 | ||
718 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { | 721 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { |
719 | struct page *pte; | 722 | struct page *pte; |
@@ -727,7 +730,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | |||
727 | continue; | 730 | continue; |
728 | 731 | ||
729 | pte = pmd_page(pmd[pmdidx]); | 732 | pte = pmd_page(pmd[pmdidx]); |
730 | flush |= (*func)(pte, PT_PTE); | 733 | flush |= (*func)(mm, pte, PT_PTE); |
731 | } | 734 | } |
732 | } | 735 | } |
733 | } | 736 | } |
@@ -735,20 +738,20 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | |||
735 | out: | 738 | out: |
736 | /* Do the top level last, so that the callbacks can use it as | 739 | /* Do the top level last, so that the callbacks can use it as |
737 | a cue to do final things like tlb flushes. */ | 740 | a cue to do final things like tlb flushes. */ |
738 | flush |= (*func)(virt_to_page(pgd), PT_PGD); | 741 | flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); |
739 | 742 | ||
740 | return flush; | 743 | return flush; |
741 | } | 744 | } |
742 | 745 | ||
743 | /* If we're using split pte locks, then take the page's lock and | 746 | /* If we're using split pte locks, then take the page's lock and |
744 | return a pointer to it. Otherwise return NULL. */ | 747 | return a pointer to it. Otherwise return NULL. */ |
745 | static spinlock_t *xen_pte_lock(struct page *page) | 748 | static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) |
746 | { | 749 | { |
747 | spinlock_t *ptl = NULL; | 750 | spinlock_t *ptl = NULL; |
748 | 751 | ||
749 | #if USE_SPLIT_PTLOCKS | 752 | #if USE_SPLIT_PTLOCKS |
750 | ptl = __pte_lockptr(page); | 753 | ptl = __pte_lockptr(page); |
751 | spin_lock(ptl); | 754 | spin_lock_nest_lock(ptl, &mm->page_table_lock); |
752 | #endif | 755 | #endif |
753 | 756 | ||
754 | return ptl; | 757 | return ptl; |
@@ -772,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn) | |||
772 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | 775 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); |
773 | } | 776 | } |
774 | 777 | ||
775 | static int xen_pin_page(struct page *page, enum pt_level level) | 778 | static int xen_pin_page(struct mm_struct *mm, struct page *page, |
779 | enum pt_level level) | ||
776 | { | 780 | { |
777 | unsigned pgfl = TestSetPagePinned(page); | 781 | unsigned pgfl = TestSetPagePinned(page); |
778 | int flush; | 782 | int flush; |
@@ -813,7 +817,7 @@ static int xen_pin_page(struct page *page, enum pt_level level) | |||
813 | */ | 817 | */ |
814 | ptl = NULL; | 818 | ptl = NULL; |
815 | if (level == PT_PTE) | 819 | if (level == PT_PTE) |
816 | ptl = xen_pte_lock(page); | 820 | ptl = xen_pte_lock(page, mm); |
817 | 821 | ||
818 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | 822 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, |
819 | pfn_pte(pfn, PAGE_KERNEL_RO), | 823 | pfn_pte(pfn, PAGE_KERNEL_RO), |
@@ -834,11 +838,11 @@ static int xen_pin_page(struct page *page, enum pt_level level) | |||
834 | /* This is called just after a mm has been created, but it has not | 838 | /* This is called just after a mm has been created, but it has not |
835 | been used yet. We need to make sure that its pagetable is all | 839 | been used yet. We need to make sure that its pagetable is all |
836 | read-only, and can be pinned. */ | 840 | read-only, and can be pinned. */ |
837 | void xen_pgd_pin(pgd_t *pgd) | 841 | static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) |
838 | { | 842 | { |
839 | xen_mc_batch(); | 843 | xen_mc_batch(); |
840 | 844 | ||
841 | if (xen_pgd_walk(pgd, xen_pin_page, USER_LIMIT)) { | 845 | if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) { |
842 | /* re-enable interrupts for kmap_flush_unused */ | 846 | /* re-enable interrupts for kmap_flush_unused */ |
843 | xen_mc_issue(0); | 847 | xen_mc_issue(0); |
844 | kmap_flush_unused(); | 848 | kmap_flush_unused(); |
@@ -852,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd) | |||
852 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); | 856 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); |
853 | 857 | ||
854 | if (user_pgd) { | 858 | if (user_pgd) { |
855 | xen_pin_page(virt_to_page(user_pgd), PT_PGD); | 859 | xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); |
856 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); | 860 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); |
857 | } | 861 | } |
858 | } | 862 | } |
859 | #else /* CONFIG_X86_32 */ | 863 | #else /* CONFIG_X86_32 */ |
860 | #ifdef CONFIG_X86_PAE | 864 | #ifdef CONFIG_X86_PAE |
861 | /* Need to make sure unshared kernel PMD is pinnable */ | 865 | /* Need to make sure unshared kernel PMD is pinnable */ |
862 | xen_pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); | 866 | xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), |
867 | PT_PMD); | ||
863 | #endif | 868 | #endif |
864 | xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); | 869 | xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); |
865 | #endif /* CONFIG_X86_64 */ | 870 | #endif /* CONFIG_X86_64 */ |
866 | xen_mc_issue(0); | 871 | xen_mc_issue(0); |
867 | } | 872 | } |
868 | 873 | ||
874 | static void xen_pgd_pin(struct mm_struct *mm) | ||
875 | { | ||
876 | __xen_pgd_pin(mm, mm->pgd); | ||
877 | } | ||
878 | |||
869 | /* | 879 | /* |
870 | * On save, we need to pin all pagetables to make sure they get their | 880 | * On save, we need to pin all pagetables to make sure they get their |
871 | * mfns turned into pfns. Search the list for any unpinned pgds and pin | 881 | * mfns turned into pfns. Search the list for any unpinned pgds and pin |
872 | * them (unpinned pgds are not currently in use, probably because the | 882 | * them (unpinned pgds are not currently in use, probably because the |
873 | * process is under construction or destruction). | 883 | * process is under construction or destruction). |
884 | * | ||
885 | * Expected to be called in stop_machine() ("equivalent to taking | ||
886 | * every spinlock in the system"), so the locking doesn't really | ||
887 | * matter all that much. | ||
874 | */ | 888 | */ |
875 | void xen_mm_pin_all(void) | 889 | void xen_mm_pin_all(void) |
876 | { | 890 | { |
@@ -881,7 +895,7 @@ void xen_mm_pin_all(void) | |||
881 | 895 | ||
882 | list_for_each_entry(page, &pgd_list, lru) { | 896 | list_for_each_entry(page, &pgd_list, lru) { |
883 | if (!PagePinned(page)) { | 897 | if (!PagePinned(page)) { |
884 | xen_pgd_pin((pgd_t *)page_address(page)); | 898 | __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); |
885 | SetPageSavePinned(page); | 899 | SetPageSavePinned(page); |
886 | } | 900 | } |
887 | } | 901 | } |
@@ -894,7 +908,8 @@ void xen_mm_pin_all(void) | |||
894 | * that's before we have page structures to store the bits. So do all | 908 | * that's before we have page structures to store the bits. So do all |
895 | * the book-keeping now. | 909 | * the book-keeping now. |
896 | */ | 910 | */ |
897 | static __init int xen_mark_pinned(struct page *page, enum pt_level level) | 911 | static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, |
912 | enum pt_level level) | ||
898 | { | 913 | { |
899 | SetPagePinned(page); | 914 | SetPagePinned(page); |
900 | return 0; | 915 | return 0; |
@@ -902,10 +917,11 @@ static __init int xen_mark_pinned(struct page *page, enum pt_level level) | |||
902 | 917 | ||
903 | void __init xen_mark_init_mm_pinned(void) | 918 | void __init xen_mark_init_mm_pinned(void) |
904 | { | 919 | { |
905 | xen_pgd_walk(init_mm.pgd, xen_mark_pinned, FIXADDR_TOP); | 920 | xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); |
906 | } | 921 | } |
907 | 922 | ||
908 | static int xen_unpin_page(struct page *page, enum pt_level level) | 923 | static int xen_unpin_page(struct mm_struct *mm, struct page *page, |
924 | enum pt_level level) | ||
909 | { | 925 | { |
910 | unsigned pgfl = TestClearPagePinned(page); | 926 | unsigned pgfl = TestClearPagePinned(page); |
911 | 927 | ||
@@ -923,7 +939,7 @@ static int xen_unpin_page(struct page *page, enum pt_level level) | |||
923 | * partially-pinned state. | 939 | * partially-pinned state. |
924 | */ | 940 | */ |
925 | if (level == PT_PTE) { | 941 | if (level == PT_PTE) { |
926 | ptl = xen_pte_lock(page); | 942 | ptl = xen_pte_lock(page, mm); |
927 | 943 | ||
928 | if (ptl) | 944 | if (ptl) |
929 | xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); | 945 | xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); |
@@ -945,7 +961,7 @@ static int xen_unpin_page(struct page *page, enum pt_level level) | |||
945 | } | 961 | } |
946 | 962 | ||
947 | /* Release a pagetables pages back as normal RW */ | 963 | /* Release a pagetables pages back as normal RW */ |
948 | static void xen_pgd_unpin(pgd_t *pgd) | 964 | static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) |
949 | { | 965 | { |
950 | xen_mc_batch(); | 966 | xen_mc_batch(); |
951 | 967 | ||
@@ -957,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd) | |||
957 | 973 | ||
958 | if (user_pgd) { | 974 | if (user_pgd) { |
959 | xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); | 975 | xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); |
960 | xen_unpin_page(virt_to_page(user_pgd), PT_PGD); | 976 | xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); |
961 | } | 977 | } |
962 | } | 978 | } |
963 | #endif | 979 | #endif |
964 | 980 | ||
965 | #ifdef CONFIG_X86_PAE | 981 | #ifdef CONFIG_X86_PAE |
966 | /* Need to make sure unshared kernel PMD is unpinned */ | 982 | /* Need to make sure unshared kernel PMD is unpinned */ |
967 | xen_unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); | 983 | xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), |
984 | PT_PMD); | ||
968 | #endif | 985 | #endif |
969 | 986 | ||
970 | xen_pgd_walk(pgd, xen_unpin_page, USER_LIMIT); | 987 | xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT); |
971 | 988 | ||
972 | xen_mc_issue(0); | 989 | xen_mc_issue(0); |
973 | } | 990 | } |
974 | 991 | ||
992 | static void xen_pgd_unpin(struct mm_struct *mm) | ||
993 | { | ||
994 | __xen_pgd_unpin(mm, mm->pgd); | ||
995 | } | ||
996 | |||
975 | /* | 997 | /* |
976 | * On resume, undo any pinning done at save, so that the rest of the | 998 | * On resume, undo any pinning done at save, so that the rest of the |
977 | * kernel doesn't see any unexpected pinned pagetables. | 999 | * kernel doesn't see any unexpected pinned pagetables. |
@@ -986,7 +1008,7 @@ void xen_mm_unpin_all(void) | |||
986 | list_for_each_entry(page, &pgd_list, lru) { | 1008 | list_for_each_entry(page, &pgd_list, lru) { |
987 | if (PageSavePinned(page)) { | 1009 | if (PageSavePinned(page)) { |
988 | BUG_ON(!PagePinned(page)); | 1010 | BUG_ON(!PagePinned(page)); |
989 | xen_pgd_unpin((pgd_t *)page_address(page)); | 1011 | __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); |
990 | ClearPageSavePinned(page); | 1012 | ClearPageSavePinned(page); |
991 | } | 1013 | } |
992 | } | 1014 | } |
@@ -997,14 +1019,14 @@ void xen_mm_unpin_all(void) | |||
997 | void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) | 1019 | void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) |
998 | { | 1020 | { |
999 | spin_lock(&next->page_table_lock); | 1021 | spin_lock(&next->page_table_lock); |
1000 | xen_pgd_pin(next->pgd); | 1022 | xen_pgd_pin(next); |
1001 | spin_unlock(&next->page_table_lock); | 1023 | spin_unlock(&next->page_table_lock); |
1002 | } | 1024 | } |
1003 | 1025 | ||
1004 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | 1026 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) |
1005 | { | 1027 | { |
1006 | spin_lock(&mm->page_table_lock); | 1028 | spin_lock(&mm->page_table_lock); |
1007 | xen_pgd_pin(mm->pgd); | 1029 | xen_pgd_pin(mm); |
1008 | spin_unlock(&mm->page_table_lock); | 1030 | spin_unlock(&mm->page_table_lock); |
1009 | } | 1031 | } |
1010 | 1032 | ||
@@ -1095,7 +1117,7 @@ void xen_exit_mmap(struct mm_struct *mm) | |||
1095 | 1117 | ||
1096 | /* pgd may not be pinned in the error exit path of execve */ | 1118 | /* pgd may not be pinned in the error exit path of execve */ |
1097 | if (xen_page_pinned(mm->pgd)) | 1119 | if (xen_page_pinned(mm->pgd)) |
1098 | xen_pgd_unpin(mm->pgd); | 1120 | xen_pgd_unpin(mm); |
1099 | 1121 | ||
1100 | spin_unlock(&mm->page_table_lock); | 1122 | spin_unlock(&mm->page_table_lock); |
1101 | } | 1123 | } |
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 0f59bd03f9e3..98d71659da5a 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h | |||
@@ -18,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); | |||
18 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); | 18 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); |
19 | void xen_exit_mmap(struct mm_struct *mm); | 19 | void xen_exit_mmap(struct mm_struct *mm); |
20 | 20 | ||
21 | void xen_pgd_pin(pgd_t *pgd); | ||
22 | //void xen_pgd_unpin(pgd_t *pgd); | ||
23 | |||
24 | pteval_t xen_pte_val(pte_t); | 21 | pteval_t xen_pte_val(pte_t); |
25 | pmdval_t xen_pmd_val(pmd_t); | 22 | pmdval_t xen_pmd_val(pmd_t); |
26 | pgdval_t xen_pgd_val(pgd_t); | 23 | pgdval_t xen_pgd_val(pgd_t); |