diff options
Diffstat (limited to 'arch/powerpc/mm/hugetlbpage.c')
-rw-r--r-- | arch/powerpc/mm/hugetlbpage.c | 299 |
1 files changed, 174 insertions, 125 deletions
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 4210549ac95e..834ca8eb38f2 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c | |||
@@ -21,6 +21,9 @@ | |||
21 | #include <asm/pgalloc.h> | 21 | #include <asm/pgalloc.h> |
22 | #include <asm/tlb.h> | 22 | #include <asm/tlb.h> |
23 | #include <asm/setup.h> | 23 | #include <asm/setup.h> |
24 | #include <asm/hugetlb.h> | ||
25 | |||
26 | #ifdef CONFIG_HUGETLB_PAGE | ||
24 | 27 | ||
25 | #define PAGE_SHIFT_64K 16 | 28 | #define PAGE_SHIFT_64K 16 |
26 | #define PAGE_SHIFT_16M 24 | 29 | #define PAGE_SHIFT_16M 24 |
@@ -100,68 +103,9 @@ int pgd_huge(pgd_t pgd) | |||
100 | } | 103 | } |
101 | #endif | 104 | #endif |
102 | 105 | ||
103 | /* | ||
104 | * We have 4 cases for pgds and pmds: | ||
105 | * (1) invalid (all zeroes) | ||
106 | * (2) pointer to next table, as normal; bottom 6 bits == 0 | ||
107 | * (3) leaf pte for huge page, bottom two bits != 00 | ||
108 | * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table | ||
109 | */ | ||
110 | pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) | ||
111 | { | ||
112 | pgd_t *pg; | ||
113 | pud_t *pu; | ||
114 | pmd_t *pm; | ||
115 | pte_t *ret_pte; | ||
116 | hugepd_t *hpdp = NULL; | ||
117 | unsigned pdshift = PGDIR_SHIFT; | ||
118 | |||
119 | if (shift) | ||
120 | *shift = 0; | ||
121 | |||
122 | pg = pgdir + pgd_index(ea); | ||
123 | |||
124 | if (pgd_huge(*pg)) { | ||
125 | ret_pte = (pte_t *) pg; | ||
126 | goto out; | ||
127 | } else if (is_hugepd(pg)) | ||
128 | hpdp = (hugepd_t *)pg; | ||
129 | else if (!pgd_none(*pg)) { | ||
130 | pdshift = PUD_SHIFT; | ||
131 | pu = pud_offset(pg, ea); | ||
132 | |||
133 | if (pud_huge(*pu)) { | ||
134 | ret_pte = (pte_t *) pu; | ||
135 | goto out; | ||
136 | } else if (is_hugepd(pu)) | ||
137 | hpdp = (hugepd_t *)pu; | ||
138 | else if (!pud_none(*pu)) { | ||
139 | pdshift = PMD_SHIFT; | ||
140 | pm = pmd_offset(pu, ea); | ||
141 | |||
142 | if (pmd_huge(*pm)) { | ||
143 | ret_pte = (pte_t *) pm; | ||
144 | goto out; | ||
145 | } else if (is_hugepd(pm)) | ||
146 | hpdp = (hugepd_t *)pm; | ||
147 | else if (!pmd_none(*pm)) | ||
148 | return pte_offset_kernel(pm, ea); | ||
149 | } | ||
150 | } | ||
151 | if (!hpdp) | ||
152 | return NULL; | ||
153 | |||
154 | ret_pte = hugepte_offset(hpdp, ea, pdshift); | ||
155 | pdshift = hugepd_shift(*hpdp); | ||
156 | out: | ||
157 | if (shift) | ||
158 | *shift = pdshift; | ||
159 | return ret_pte; | ||
160 | } | ||
161 | EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); | ||
162 | |||
163 | pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | 106 | pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) |
164 | { | 107 | { |
108 | /* Only called for hugetlbfs pages, hence can ignore THP */ | ||
165 | return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); | 109 | return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); |
166 | } | 110 | } |
167 | 111 | ||
@@ -736,11 +680,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | |||
736 | struct page *page; | 680 | struct page *page; |
737 | unsigned shift; | 681 | unsigned shift; |
738 | unsigned long mask; | 682 | unsigned long mask; |
739 | 683 | /* | |
684 | * Transparent hugepages are handled by generic code. We can skip them | ||
685 | * here. | ||
686 | */ | ||
740 | ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); | 687 | ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); |
741 | 688 | ||
742 | /* Verify it is a huge page else bail. */ | 689 | /* Verify it is a huge page else bail. */ |
743 | if (!ptep || !shift) | 690 | if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) |
744 | return ERR_PTR(-EINVAL); | 691 | return ERR_PTR(-EINVAL); |
745 | 692 | ||
746 | mask = (1UL << shift) - 1; | 693 | mask = (1UL << shift) - 1; |
@@ -759,69 +706,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, | |||
759 | return NULL; | 706 | return NULL; |
760 | } | 707 | } |
761 | 708 | ||
762 | int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, | ||
763 | unsigned long end, int write, struct page **pages, int *nr) | ||
764 | { | ||
765 | unsigned long mask; | ||
766 | unsigned long pte_end; | ||
767 | struct page *head, *page, *tail; | ||
768 | pte_t pte; | ||
769 | int refs; | ||
770 | |||
771 | pte_end = (addr + sz) & ~(sz-1); | ||
772 | if (pte_end < end) | ||
773 | end = pte_end; | ||
774 | |||
775 | pte = *ptep; | ||
776 | mask = _PAGE_PRESENT | _PAGE_USER; | ||
777 | if (write) | ||
778 | mask |= _PAGE_RW; | ||
779 | |||
780 | if ((pte_val(pte) & mask) != mask) | ||
781 | return 0; | ||
782 | |||
783 | /* hugepages are never "special" */ | ||
784 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
785 | |||
786 | refs = 0; | ||
787 | head = pte_page(pte); | ||
788 | |||
789 | page = head + ((addr & (sz-1)) >> PAGE_SHIFT); | ||
790 | tail = page; | ||
791 | do { | ||
792 | VM_BUG_ON(compound_head(page) != head); | ||
793 | pages[*nr] = page; | ||
794 | (*nr)++; | ||
795 | page++; | ||
796 | refs++; | ||
797 | } while (addr += PAGE_SIZE, addr != end); | ||
798 | |||
799 | if (!page_cache_add_speculative(head, refs)) { | ||
800 | *nr -= refs; | ||
801 | return 0; | ||
802 | } | ||
803 | |||
804 | if (unlikely(pte_val(pte) != pte_val(*ptep))) { | ||
805 | /* Could be optimized better */ | ||
806 | *nr -= refs; | ||
807 | while (refs--) | ||
808 | put_page(head); | ||
809 | return 0; | ||
810 | } | ||
811 | |||
812 | /* | ||
813 | * Any tail page need their mapcount reference taken before we | ||
814 | * return. | ||
815 | */ | ||
816 | while (refs--) { | ||
817 | if (PageTail(tail)) | ||
818 | get_huge_page_tail(tail); | ||
819 | tail++; | ||
820 | } | ||
821 | |||
822 | return 1; | ||
823 | } | ||
824 | |||
825 | static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, | 709 | static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, |
826 | unsigned long sz) | 710 | unsigned long sz) |
827 | { | 711 | { |
@@ -1038,3 +922,168 @@ void flush_dcache_icache_hugepage(struct page *page) | |||
1038 | } | 922 | } |
1039 | } | 923 | } |
1040 | } | 924 | } |
925 | |||
926 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
927 | |||
928 | /* | ||
929 | * We have 4 cases for pgds and pmds: | ||
930 | * (1) invalid (all zeroes) | ||
931 | * (2) pointer to next table, as normal; bottom 6 bits == 0 | ||
932 | * (3) leaf pte for huge page, bottom two bits != 00 | ||
933 | * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table | ||
934 | * | ||
935 | * So long as we atomically load page table pointers we are safe against teardown, | ||
936 | * we can follow the address down to the the page and take a ref on it. | ||
937 | */ | ||
938 | |||
939 | pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) | ||
940 | { | ||
941 | pgd_t pgd, *pgdp; | ||
942 | pud_t pud, *pudp; | ||
943 | pmd_t pmd, *pmdp; | ||
944 | pte_t *ret_pte; | ||
945 | hugepd_t *hpdp = NULL; | ||
946 | unsigned pdshift = PGDIR_SHIFT; | ||
947 | |||
948 | if (shift) | ||
949 | *shift = 0; | ||
950 | |||
951 | pgdp = pgdir + pgd_index(ea); | ||
952 | pgd = ACCESS_ONCE(*pgdp); | ||
953 | /* | ||
954 | * Always operate on the local stack value. This make sure the | ||
955 | * value don't get updated by a parallel THP split/collapse, | ||
956 | * page fault or a page unmap. The return pte_t * is still not | ||
957 | * stable. So should be checked there for above conditions. | ||
958 | */ | ||
959 | if (pgd_none(pgd)) | ||
960 | return NULL; | ||
961 | else if (pgd_huge(pgd)) { | ||
962 | ret_pte = (pte_t *) pgdp; | ||
963 | goto out; | ||
964 | } else if (is_hugepd(&pgd)) | ||
965 | hpdp = (hugepd_t *)&pgd; | ||
966 | else { | ||
967 | /* | ||
968 | * Even if we end up with an unmap, the pgtable will not | ||
969 | * be freed, because we do an rcu free and here we are | ||
970 | * irq disabled | ||
971 | */ | ||
972 | pdshift = PUD_SHIFT; | ||
973 | pudp = pud_offset(&pgd, ea); | ||
974 | pud = ACCESS_ONCE(*pudp); | ||
975 | |||
976 | if (pud_none(pud)) | ||
977 | return NULL; | ||
978 | else if (pud_huge(pud)) { | ||
979 | ret_pte = (pte_t *) pudp; | ||
980 | goto out; | ||
981 | } else if (is_hugepd(&pud)) | ||
982 | hpdp = (hugepd_t *)&pud; | ||
983 | else { | ||
984 | pdshift = PMD_SHIFT; | ||
985 | pmdp = pmd_offset(&pud, ea); | ||
986 | pmd = ACCESS_ONCE(*pmdp); | ||
987 | /* | ||
988 | * A hugepage collapse is captured by pmd_none, because | ||
989 | * it mark the pmd none and do a hpte invalidate. | ||
990 | * | ||
991 | * A hugepage split is captured by pmd_trans_splitting | ||
992 | * because we mark the pmd trans splitting and do a | ||
993 | * hpte invalidate | ||
994 | * | ||
995 | */ | ||
996 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | ||
997 | return NULL; | ||
998 | |||
999 | if (pmd_huge(pmd) || pmd_large(pmd)) { | ||
1000 | ret_pte = (pte_t *) pmdp; | ||
1001 | goto out; | ||
1002 | } else if (is_hugepd(&pmd)) | ||
1003 | hpdp = (hugepd_t *)&pmd; | ||
1004 | else | ||
1005 | return pte_offset_kernel(&pmd, ea); | ||
1006 | } | ||
1007 | } | ||
1008 | if (!hpdp) | ||
1009 | return NULL; | ||
1010 | |||
1011 | ret_pte = hugepte_offset(hpdp, ea, pdshift); | ||
1012 | pdshift = hugepd_shift(*hpdp); | ||
1013 | out: | ||
1014 | if (shift) | ||
1015 | *shift = pdshift; | ||
1016 | return ret_pte; | ||
1017 | } | ||
1018 | EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); | ||
1019 | |||
1020 | int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, | ||
1021 | unsigned long end, int write, struct page **pages, int *nr) | ||
1022 | { | ||
1023 | unsigned long mask; | ||
1024 | unsigned long pte_end; | ||
1025 | struct page *head, *page, *tail; | ||
1026 | pte_t pte; | ||
1027 | int refs; | ||
1028 | |||
1029 | pte_end = (addr + sz) & ~(sz-1); | ||
1030 | if (pte_end < end) | ||
1031 | end = pte_end; | ||
1032 | |||
1033 | pte = ACCESS_ONCE(*ptep); | ||
1034 | mask = _PAGE_PRESENT | _PAGE_USER; | ||
1035 | if (write) | ||
1036 | mask |= _PAGE_RW; | ||
1037 | |||
1038 | if ((pte_val(pte) & mask) != mask) | ||
1039 | return 0; | ||
1040 | |||
1041 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
1042 | /* | ||
1043 | * check for splitting here | ||
1044 | */ | ||
1045 | if (pmd_trans_splitting(pte_pmd(pte))) | ||
1046 | return 0; | ||
1047 | #endif | ||
1048 | |||
1049 | /* hugepages are never "special" */ | ||
1050 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
1051 | |||
1052 | refs = 0; | ||
1053 | head = pte_page(pte); | ||
1054 | |||
1055 | page = head + ((addr & (sz-1)) >> PAGE_SHIFT); | ||
1056 | tail = page; | ||
1057 | do { | ||
1058 | VM_BUG_ON(compound_head(page) != head); | ||
1059 | pages[*nr] = page; | ||
1060 | (*nr)++; | ||
1061 | page++; | ||
1062 | refs++; | ||
1063 | } while (addr += PAGE_SIZE, addr != end); | ||
1064 | |||
1065 | if (!page_cache_add_speculative(head, refs)) { | ||
1066 | *nr -= refs; | ||
1067 | return 0; | ||
1068 | } | ||
1069 | |||
1070 | if (unlikely(pte_val(pte) != pte_val(*ptep))) { | ||
1071 | /* Could be optimized better */ | ||
1072 | *nr -= refs; | ||
1073 | while (refs--) | ||
1074 | put_page(head); | ||
1075 | return 0; | ||
1076 | } | ||
1077 | |||
1078 | /* | ||
1079 | * Any tail page need their mapcount reference taken before we | ||
1080 | * return. | ||
1081 | */ | ||
1082 | while (refs--) { | ||
1083 | if (PageTail(tail)) | ||
1084 | get_huge_page_tail(tail); | ||
1085 | tail++; | ||
1086 | } | ||
1087 | |||
1088 | return 1; | ||
1089 | } | ||