author    David Gibson <david@gibson.dropbear.id.au>         2009-10-26 15:24:31 -0400
committer Benjamin Herrenschmidt <benh@kernel.crashing.org>  2009-10-30 02:20:58 -0400
commit    a4fe3ce7699bfe1bd88f816b55d42d8fe1dac655 (patch)
tree      b72c982ffbb9f05d78a952288d60c4dc2d31a4d9 /arch/powerpc/mm
parent    a0668cdc154e54bf0c85182e0535eea237d53146 (diff)
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different pagetable layout: that is, the bottom level table of pointers to hugepages is a different size, and may branch off from the normal page tables at a different level. Every hugepage-aware path that needs to walk the pagetables must therefore look up the hugepage size from the slice info first, and work out the correct way to walk the pagetables accordingly. Future hardware is likely to add more possible hugepage sizes, more layout options and more mess.

This patch, therefore, reworks the handling of hugepage pagetables to reduce this complexity. In the new scheme, instead of having to consult the slice mask, pagetable walking code can check a flag in the PGD/PUD/PMD entries to see where to branch off to hugepage pagetables, and the entry also contains the information (essentially the hugepage shift) necessary to then interpret that table without recourse to the slice mask. This scheme can be extended neatly to handle multiple levels of self-describing "special" hugepage pagetables, although for now we assume only one level exists.

This approach means that only the pagetable allocation path needs to know how the pagetables should be set out. All other (hugepage) pagetable walking paths can just interpret the structure as they go.

There already was a flag bit in PGD/PUD/PMD entries for hugepage directory pointers, but it was only used for debug. We alter that flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable pointer (normally it would be 1, since the pointer lies in the linear mapping). This means that asm pagetable walking can test for (and punt on) hugepage pointers with the same test that checks for unpopulated page directory entries (beq becomes bge), since hugepage pointers will always be positive, and normal pointers always negative.

While we're at it, we get rid of the confusing (and grep-defeating) #defining of hugepte_shift to be the same thing as mmu_huge_psizes.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
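To make the encoding concrete before diving into the diff: below is a minimal, hypothetical user-space sketch of the hugepd entry layout described above. The helper names (hugepd_t, is_hugepd(), hugepd_shift(), hugepd_page()) mirror the patch, but the value of HUGEPD_SHIFT_MASK and the linear-mapping base address are assumptions — their real definitions live in header changes that are not part of this diff, and the kernel code in the diff below is the authoritative version.

/*
 * Hypothetical, self-contained sketch of the hugepd encoding (64-bit host
 * assumed).  A hugepd entry stores the hugepte table pointer with its top
 * bit cleared, plus the hugepage shift in the low bits.
 */
#include <stdio.h>

typedef struct { unsigned long pd; } hugepd_t;

#define HUGEPD_SHIFT_MASK  0x3fUL                 /* assumed: low bits hold the shift */
#define KERNEL_LINEAR_BASE 0xc000000000000000UL   /* assumed linear mapping base */

/* An entry is a hugepd pointer when it is non-empty and its MSB is 0,
 * i.e. it compares as a positive signed value.  Normal page directory
 * pointers lie in the linear mapping and are therefore negative, which
 * is why the asm walkers can turn "beq" into "bge". */
static int is_hugepd(unsigned long pd_entry)
{
	return pd_entry != 0 && (long)pd_entry >= 0;
}

/* Clear the top bit of the (suitably aligned) table pointer and stash
 * the hugepage shift in the now-free low bits. */
static hugepd_t make_hugepd(void *hugepte_table, unsigned int pshift)
{
	hugepd_t hpd;

	hpd.pd = ((unsigned long)hugepte_table & ~0x8000000000000000UL) | pshift;
	return hpd;
}

static unsigned int hugepd_shift(hugepd_t hpd)
{
	return hpd.pd & HUGEPD_SHIFT_MASK;
}

static void *hugepd_page(hugepd_t hpd)
{
	return (void *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | KERNEL_LINEAR_BASE);
}

int main(void)
{
	/* Pretend this is a hugepte table for 16M pages (shift 24). */
	void *table = (void *)0xc000000012345000UL;
	hugepd_t hpd = make_hugepd(table, 24);

	printf("is_hugepd=%d shift=%u table=%p\n",
	       is_hugepd(hpd.pd), hugepd_shift(hpd), hugepd_page(hpd));
	return 0;
}

Run on a 64-bit host, the sketch reports the encoded entry as a hugepd, recovers the shift (24) and reconstructs the original table pointer — the same round trip the patch performs in __hugepte_alloc(), hugepd_shift() and hugepd_page() below.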
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/gup.c            149
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c   26
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c    473
-rw-r--r--  arch/powerpc/mm/init_64.c         10
4 files changed, 276 insertions, 382 deletions
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index bc122a120bf0..d7efdbf640c7 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -55,57 +55,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
55 return 1; 55 return 1;
56} 56}
57 57
58#ifdef CONFIG_HUGETLB_PAGE
59static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
60 unsigned long *addr, unsigned long end,
61 int write, struct page **pages, int *nr)
62{
63 unsigned long mask;
64 unsigned long pte_end;
65 struct page *head, *page;
66 pte_t pte;
67 int refs;
68
69 pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
70 if (pte_end < end)
71 end = pte_end;
72
73 pte = *ptep;
74 mask = _PAGE_PRESENT|_PAGE_USER;
75 if (write)
76 mask |= _PAGE_RW;
77 if ((pte_val(pte) & mask) != mask)
78 return 0;
79 /* hugepages are never "special" */
80 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
81
82 refs = 0;
83 head = pte_page(pte);
84 page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
85 do {
86 VM_BUG_ON(compound_head(page) != head);
87 pages[*nr] = page;
88 (*nr)++;
89 page++;
90 refs++;
91 } while (*addr += PAGE_SIZE, *addr != end);
92
93 if (!page_cache_add_speculative(head, refs)) {
94 *nr -= refs;
95 return 0;
96 }
97 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
98 /* Could be optimized better */
99 while (*nr) {
100 put_page(page);
101 (*nr)--;
102 }
103 }
104
105 return 1;
106}
107#endif /* CONFIG_HUGETLB_PAGE */
108
109static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, 58static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
110 int write, struct page **pages, int *nr) 59 int write, struct page **pages, int *nr)
111{ 60{
@@ -119,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
119 next = pmd_addr_end(addr, end); 68 next = pmd_addr_end(addr, end);
120 if (pmd_none(pmd)) 69 if (pmd_none(pmd))
121 return 0; 70 return 0;
122 if (!gup_pte_range(pmd, addr, next, write, pages, nr)) 71 if (is_hugepd(pmdp)) {
72 if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
73 addr, next, write, pages, nr))
74 return 0;
75 } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
123 return 0; 76 return 0;
124 } while (pmdp++, addr = next, addr != end); 77 } while (pmdp++, addr = next, addr != end);
125 78
@@ -139,7 +92,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
139 next = pud_addr_end(addr, end); 92 next = pud_addr_end(addr, end);
140 if (pud_none(pud)) 93 if (pud_none(pud))
141 return 0; 94 return 0;
142 if (!gup_pmd_range(pud, addr, next, write, pages, nr)) 95 if (is_hugepd(pudp)) {
96 if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
97 addr, next, write, pages, nr))
98 return 0;
99 } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
143 return 0; 100 return 0;
144 } while (pudp++, addr = next, addr != end); 101 } while (pudp++, addr = next, addr != end);
145 102
@@ -154,10 +111,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
154 unsigned long next; 111 unsigned long next;
155 pgd_t *pgdp; 112 pgd_t *pgdp;
156 int nr = 0; 113 int nr = 0;
157#ifdef CONFIG_PPC64
158 unsigned int shift;
159 int psize;
160#endif
161 114
162 pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); 115 pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
163 116
@@ -172,25 +125,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
172 125
173 pr_devel(" aligned: %lx .. %lx\n", start, end); 126 pr_devel(" aligned: %lx .. %lx\n", start, end);
174 127
175#ifdef CONFIG_HUGETLB_PAGE
176 /* We bail out on slice boundary crossing when hugetlb is
177 * enabled in order to not have to deal with two different
178 * page table formats
179 */
180 if (addr < SLICE_LOW_TOP) {
181 if (end > SLICE_LOW_TOP)
182 goto slow_irqon;
183
184 if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
185 GET_LOW_SLICE_INDEX(end - 1)))
186 goto slow_irqon;
187 } else {
188 if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
189 GET_HIGH_SLICE_INDEX(end - 1)))
190 goto slow_irqon;
191 }
192#endif /* CONFIG_HUGETLB_PAGE */
193
194 /* 128 /*
195 * XXX: batch / limit 'nr', to avoid large irq off latency 129 * XXX: batch / limit 'nr', to avoid large irq off latency
196 * needs some instrumenting to determine the common sizes used by 130 * needs some instrumenting to determine the common sizes used by
@@ -210,54 +144,23 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
210 */ 144 */
211 local_irq_disable(); 145 local_irq_disable();
212 146
213#ifdef CONFIG_PPC64 147 pgdp = pgd_offset(mm, addr);
214 /* Those bits are related to hugetlbfs implementation and only exist 148 do {
215 * on 64-bit for now 149 pgd_t pgd = *pgdp;
216 */ 150
217 psize = get_slice_psize(mm, addr); 151 pr_devel(" %016lx: normal pgd %p\n", addr,
218 shift = mmu_psize_defs[psize].shift; 152 (void *)pgd_val(pgd));
219#endif /* CONFIG_PPC64 */ 153 next = pgd_addr_end(addr, end);
220 154 if (pgd_none(pgd))
221#ifdef CONFIG_HUGETLB_PAGE 155 goto slow;
222 if (unlikely(mmu_huge_psizes[psize])) { 156 if (is_hugepd(pgdp)) {
223 pte_t *ptep; 157 if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
224 unsigned long a = addr; 158 addr, next, write, pages, &nr))
225 unsigned long sz = ((1UL) << shift);
226 struct hstate *hstate = size_to_hstate(sz);
227
228 BUG_ON(!hstate);
229 /*
230 * XXX: could be optimized to avoid hstate
231 * lookup entirely (just use shift)
232 */
233
234 do {
235 VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
236 ptep = huge_pte_offset(mm, a);
237 pr_devel(" %016lx: huge ptep %p\n", a, ptep);
238 if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
239 &nr))
240 goto slow;
241 } while (a != end);
242 } else
243#endif /* CONFIG_HUGETLB_PAGE */
244 {
245 pgdp = pgd_offset(mm, addr);
246 do {
247 pgd_t pgd = *pgdp;
248
249#ifdef CONFIG_PPC64
250 VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
251#endif
252 pr_devel(" %016lx: normal pgd %p\n", addr,
253 (void *)pgd_val(pgd));
254 next = pgd_addr_end(addr, end);
255 if (pgd_none(pgd))
256 goto slow;
257 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
258 goto slow; 159 goto slow;
259 } while (pgdp++, addr = next, addr != end); 160 } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
260 } 161 goto slow;
162 } while (pgdp++, addr = next, addr != end);
163
261 local_irq_enable(); 164 local_irq_enable();
262 165
263 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); 166 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 1ade7eb6ae00..485dcd197a61 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -891,6 +891,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
891 unsigned long vsid; 891 unsigned long vsid;
892 struct mm_struct *mm; 892 struct mm_struct *mm;
893 pte_t *ptep; 893 pte_t *ptep;
894 unsigned hugeshift;
894 const struct cpumask *tmp; 895 const struct cpumask *tmp;
895 int rc, user_region = 0, local = 0; 896 int rc, user_region = 0, local = 0;
896 int psize, ssize; 897 int psize, ssize;
@@ -943,30 +944,31 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
943 if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) 944 if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
944 local = 1; 945 local = 1;
945 946
946#ifdef CONFIG_HUGETLB_PAGE
947 /* Handle hugepage regions */
948 if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
949 DBG_LOW(" -> huge page !\n");
950 return hash_huge_page(mm, access, ea, vsid, local, trap);
951 }
952#endif /* CONFIG_HUGETLB_PAGE */
953
954#ifndef CONFIG_PPC_64K_PAGES 947#ifndef CONFIG_PPC_64K_PAGES
955 /* If we use 4K pages and our psize is not 4K, then we are hitting 948 /* If we use 4K pages and our psize is not 4K, then we might
956 * a special driver mapping, we need to align the address before 949 * be hitting a special driver mapping, and need to align the
957 * we fetch the PTE 950 * address before we fetch the PTE.
951 *
952 * It could also be a hugepage mapping, in which case this is
953 * not necessary, but it's not harmful, either.
958 */ 954 */
959 if (psize != MMU_PAGE_4K) 955 if (psize != MMU_PAGE_4K)
960 ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); 956 ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
961#endif /* CONFIG_PPC_64K_PAGES */ 957#endif /* CONFIG_PPC_64K_PAGES */
962 958
963 /* Get PTE and page size from page tables */ 959 /* Get PTE and page size from page tables */
964 ptep = find_linux_pte(pgdir, ea); 960 ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
965 if (ptep == NULL || !pte_present(*ptep)) { 961 if (ptep == NULL || !pte_present(*ptep)) {
966 DBG_LOW(" no PTE !\n"); 962 DBG_LOW(" no PTE !\n");
967 return 1; 963 return 1;
968 } 964 }
969 965
966#ifdef CONFIG_HUGETLB_PAGE
967 if (hugeshift)
968 return __hash_page_huge(ea, access, vsid, ptep, trap, local,
969 ssize, hugeshift, psize);
970#endif /* CONFIG_HUGETLB_PAGE */
971
970#ifndef CONFIG_PPC_64K_PAGES 972#ifndef CONFIG_PPC_64K_PAGES
971 DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); 973 DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
972#else 974#else
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 7230d7a4fbd9..95220a5dee58 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -40,25 +40,11 @@ static unsigned nr_gpages;
40/* Array of valid huge page sizes - non-zero value(hugepte_shift) is 40/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
41 * stored for the huge page sizes that are valid. 41 * stored for the huge page sizes that are valid.
42 */ 42 */
43unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ 43static unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
44
45#define hugepte_shift mmu_huge_psizes
46#define HUGEPTE_INDEX_SIZE(psize) (mmu_huge_psizes[(psize)])
47#define PTRS_PER_HUGEPTE(psize) (1 << mmu_huge_psizes[psize])
48
49#define HUGEPD_SHIFT(psize) (mmu_psize_to_shift(psize) \
50 + HUGEPTE_INDEX_SIZE(psize))
51#define HUGEPD_SIZE(psize) (1UL << HUGEPD_SHIFT(psize))
52#define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1))
53 44
54/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() 45/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
55 * will choke on pointers to hugepte tables, which is handy for 46 * will choke on pointers to hugepte tables, which is handy for
56 * catching screwups early. */ 47 * catching screwups early. */
57#define HUGEPD_OK 0x1
58
59typedef struct { unsigned long pd; } hugepd_t;
60
61#define hugepd_none(hpd) ((hpd).pd == 0)
62 48
63static inline int shift_to_mmu_psize(unsigned int shift) 49static inline int shift_to_mmu_psize(unsigned int shift)
64{ 50{
@@ -82,71 +68,126 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
82 BUG(); 68 BUG();
83} 69}
84 70
71#define hugepd_none(hpd) ((hpd).pd == 0)
72
85static inline pte_t *hugepd_page(hugepd_t hpd) 73static inline pte_t *hugepd_page(hugepd_t hpd)
86{ 74{
87 BUG_ON(!(hpd.pd & HUGEPD_OK)); 75 BUG_ON(!hugepd_ok(hpd));
88 return (pte_t *)(hpd.pd & ~HUGEPD_OK); 76 return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
77}
78
79static inline unsigned int hugepd_shift(hugepd_t hpd)
80{
81 return hpd.pd & HUGEPD_SHIFT_MASK;
89} 82}
90 83
91static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, 84static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
92 struct hstate *hstate)
93{ 85{
94 unsigned int shift = huge_page_shift(hstate); 86 unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
95 int psize = shift_to_mmu_psize(shift);
96 unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
97 pte_t *dir = hugepd_page(*hpdp); 87 pte_t *dir = hugepd_page(*hpdp);
98 88
99 return dir + idx; 89 return dir + idx;
100} 90}
101 91
92pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
93{
94 pgd_t *pg;
95 pud_t *pu;
96 pmd_t *pm;
97 hugepd_t *hpdp = NULL;
98 unsigned pdshift = PGDIR_SHIFT;
99
100 if (shift)
101 *shift = 0;
102
103 pg = pgdir + pgd_index(ea);
104 if (is_hugepd(pg)) {
105 hpdp = (hugepd_t *)pg;
106 } else if (!pgd_none(*pg)) {
107 pdshift = PUD_SHIFT;
108 pu = pud_offset(pg, ea);
109 if (is_hugepd(pu))
110 hpdp = (hugepd_t *)pu;
111 else if (!pud_none(*pu)) {
112 pdshift = PMD_SHIFT;
113 pm = pmd_offset(pu, ea);
114 if (is_hugepd(pm))
115 hpdp = (hugepd_t *)pm;
116 else if (!pmd_none(*pm)) {
117 return pte_offset_map(pm, ea);
118 }
119 }
120 }
121
122 if (!hpdp)
123 return NULL;
124
125 if (shift)
126 *shift = hugepd_shift(*hpdp);
127 return hugepte_offset(hpdp, ea, pdshift);
128}
129
130pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
131{
132 return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
133}
134
102static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, 135static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
103 unsigned long address, unsigned int psize) 136 unsigned long address, unsigned pdshift, unsigned pshift)
104{ 137{
105 pte_t *new = kmem_cache_zalloc(PGT_CACHE(hugepte_shift[psize]), 138 pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
106 GFP_KERNEL|__GFP_REPEAT); 139 GFP_KERNEL|__GFP_REPEAT);
107 140
141 BUG_ON(pshift > HUGEPD_SHIFT_MASK);
142 BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
143
108 if (! new) 144 if (! new)
109 return -ENOMEM; 145 return -ENOMEM;
110 146
111 spin_lock(&mm->page_table_lock); 147 spin_lock(&mm->page_table_lock);
112 if (!hugepd_none(*hpdp)) 148 if (!hugepd_none(*hpdp))
113 kmem_cache_free(PGT_CACHE(hugepte_shift[psize]), new); 149 kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
114 else 150 else
115 hpdp->pd = (unsigned long)new | HUGEPD_OK; 151 hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
116 spin_unlock(&mm->page_table_lock); 152 spin_unlock(&mm->page_table_lock);
117 return 0; 153 return 0;
118} 154}
119 155
120 156pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
121static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
122{ 157{
123 if (huge_page_shift(hstate) < PUD_SHIFT) 158 pgd_t *pg;
124 return pud_offset(pgd, addr); 159 pud_t *pu;
125 else 160 pmd_t *pm;
126 return (pud_t *) pgd; 161 hugepd_t *hpdp = NULL;
127} 162 unsigned pshift = __ffs(sz);
128static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, 163 unsigned pdshift = PGDIR_SHIFT;
129 struct hstate *hstate) 164
130{ 165 addr &= ~(sz-1);
131 if (huge_page_shift(hstate) < PUD_SHIFT) 166
132 return pud_alloc(mm, pgd, addr); 167 pg = pgd_offset(mm, addr);
133 else 168 if (pshift >= PUD_SHIFT) {
134 return (pud_t *) pgd; 169 hpdp = (hugepd_t *)pg;
135} 170 } else {
136static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate) 171 pdshift = PUD_SHIFT;
137{ 172 pu = pud_alloc(mm, pg, addr);
138 if (huge_page_shift(hstate) < PMD_SHIFT) 173 if (pshift >= PMD_SHIFT) {
139 return pmd_offset(pud, addr); 174 hpdp = (hugepd_t *)pu;
140 else 175 } else {
141 return (pmd_t *) pud; 176 pdshift = PMD_SHIFT;
142} 177 pm = pmd_alloc(mm, pu, addr);
143static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr, 178 hpdp = (hugepd_t *)pm;
144 struct hstate *hstate) 179 }
145{ 180 }
146 if (huge_page_shift(hstate) < PMD_SHIFT) 181
147 return pmd_alloc(mm, pud, addr); 182 if (!hpdp)
148 else 183 return NULL;
149 return (pmd_t *) pud; 184
185 BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
186
187 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
188 return NULL;
189
190 return hugepte_offset(hpdp, addr, pdshift);
150} 191}
151 192
152/* Build list of addresses of gigantic pages. This function is used in early 193/* Build list of addresses of gigantic pages. This function is used in early
@@ -180,92 +221,38 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
180 return 1; 221 return 1;
181} 222}
182 223
183
184/* Modelled after find_linux_pte() */
185pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
186{
187 pgd_t *pg;
188 pud_t *pu;
189 pmd_t *pm;
190
191 unsigned int psize;
192 unsigned int shift;
193 unsigned long sz;
194 struct hstate *hstate;
195 psize = get_slice_psize(mm, addr);
196 shift = mmu_psize_to_shift(psize);
197 sz = ((1UL) << shift);
198 hstate = size_to_hstate(sz);
199
200 addr &= hstate->mask;
201
202 pg = pgd_offset(mm, addr);
203 if (!pgd_none(*pg)) {
204 pu = hpud_offset(pg, addr, hstate);
205 if (!pud_none(*pu)) {
206 pm = hpmd_offset(pu, addr, hstate);
207 if (!pmd_none(*pm))
208 return hugepte_offset((hugepd_t *)pm, addr,
209 hstate);
210 }
211 }
212
213 return NULL;
214}
215
216pte_t *huge_pte_alloc(struct mm_struct *mm,
217 unsigned long addr, unsigned long sz)
218{
219 pgd_t *pg;
220 pud_t *pu;
221 pmd_t *pm;
222 hugepd_t *hpdp = NULL;
223 struct hstate *hstate;
224 unsigned int psize;
225 hstate = size_to_hstate(sz);
226
227 psize = get_slice_psize(mm, addr);
228 BUG_ON(!mmu_huge_psizes[psize]);
229
230 addr &= hstate->mask;
231
232 pg = pgd_offset(mm, addr);
233 pu = hpud_alloc(mm, pg, addr, hstate);
234
235 if (pu) {
236 pm = hpmd_alloc(mm, pu, addr, hstate);
237 if (pm)
238 hpdp = (hugepd_t *)pm;
239 }
240
241 if (! hpdp)
242 return NULL;
243
244 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
245 return NULL;
246
247 return hugepte_offset(hpdp, addr, hstate);
248}
249
250int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 224int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
251{ 225{
252 return 0; 226 return 0;
253} 227}
254 228
255static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp, 229static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
256 unsigned int psize) 230 unsigned long start, unsigned long end,
231 unsigned long floor, unsigned long ceiling)
257{ 232{
258 pte_t *hugepte = hugepd_page(*hpdp); 233 pte_t *hugepte = hugepd_page(*hpdp);
234 unsigned shift = hugepd_shift(*hpdp);
235 unsigned long pdmask = ~((1UL << pdshift) - 1);
236
237 start &= pdmask;
238 if (start < floor)
239 return;
240 if (ceiling) {
241 ceiling &= pdmask;
242 if (! ceiling)
243 return;
244 }
245 if (end - 1 > ceiling - 1)
246 return;
259 247
260 hpdp->pd = 0; 248 hpdp->pd = 0;
261 tlb->need_flush = 1; 249 tlb->need_flush = 1;
262 pgtable_free_tlb(tlb, hugepte, hugepte_shift[psize]); 250 pgtable_free_tlb(tlb, hugepte, pdshift - shift);
263} 251}
264 252
265static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 253static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
266 unsigned long addr, unsigned long end, 254 unsigned long addr, unsigned long end,
267 unsigned long floor, unsigned long ceiling, 255 unsigned long floor, unsigned long ceiling)
268 unsigned int psize)
269{ 256{
270 pmd_t *pmd; 257 pmd_t *pmd;
271 unsigned long next; 258 unsigned long next;
@@ -277,7 +264,8 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
277 next = pmd_addr_end(addr, end); 264 next = pmd_addr_end(addr, end);
278 if (pmd_none(*pmd)) 265 if (pmd_none(*pmd))
279 continue; 266 continue;
280 free_hugepte_range(tlb, (hugepd_t *)pmd, psize); 267 free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
268 addr, next, floor, ceiling);
281 } while (pmd++, addr = next, addr != end); 269 } while (pmd++, addr = next, addr != end);
282 270
283 start &= PUD_MASK; 271 start &= PUD_MASK;
@@ -303,23 +291,19 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
303 pud_t *pud; 291 pud_t *pud;
304 unsigned long next; 292 unsigned long next;
305 unsigned long start; 293 unsigned long start;
306 unsigned int shift;
307 unsigned int psize = get_slice_psize(tlb->mm, addr);
308 shift = mmu_psize_to_shift(psize);
309 294
310 start = addr; 295 start = addr;
311 pud = pud_offset(pgd, addr); 296 pud = pud_offset(pgd, addr);
312 do { 297 do {
313 next = pud_addr_end(addr, end); 298 next = pud_addr_end(addr, end);
314 if (shift < PMD_SHIFT) { 299 if (!is_hugepd(pud)) {
315 if (pud_none_or_clear_bad(pud)) 300 if (pud_none_or_clear_bad(pud))
316 continue; 301 continue;
317 hugetlb_free_pmd_range(tlb, pud, addr, next, floor, 302 hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
318 ceiling, psize); 303 ceiling);
319 } else { 304 } else {
320 if (pud_none(*pud)) 305 free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
321 continue; 306 addr, next, floor, ceiling);
322 free_hugepte_range(tlb, (hugepd_t *)pud, psize);
323 } 307 }
324 } while (pud++, addr = next, addr != end); 308 } while (pud++, addr = next, addr != end);
325 309
@@ -350,74 +334,34 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
350{ 334{
351 pgd_t *pgd; 335 pgd_t *pgd;
352 unsigned long next; 336 unsigned long next;
353 unsigned long start;
354 337
355 /* 338 /*
356 * Comments below take from the normal free_pgd_range(). They 339 * Because there are a number of different possible pagetable
357 * apply here too. The tests against HUGEPD_MASK below are 340 * layouts for hugepage ranges, we limit knowledge of how
358 * essential, because we *don't* test for this at the bottom 341 * things should be laid out to the allocation path
359 * level. Without them we'll attempt to free a hugepte table 342 * (huge_pte_alloc(), above). Everything else works out the
360 * when we unmap just part of it, even if there are other 343 * structure as it goes from information in the hugepd
361 * active mappings using it. 344 * pointers. That means that we can't here use the
362 * 345 * optimization used in the normal page free_pgd_range(), of
363 * The next few lines have given us lots of grief... 346 * checking whether we're actually covering a large enough
364 * 347 * range to have to do anything at the top level of the walk
365 * Why are we testing HUGEPD* at this top level? Because 348 * instead of at the bottom.
366 * often there will be no work to do at all, and we'd prefer
367 * not to go all the way down to the bottom just to discover
368 * that.
369 *
370 * Why all these "- 1"s? Because 0 represents both the bottom
371 * of the address space and the top of it (using -1 for the
372 * top wouldn't help much: the masks would do the wrong thing).
373 * The rule is that addr 0 and floor 0 refer to the bottom of
374 * the address space, but end 0 and ceiling 0 refer to the top
375 * Comparisons need to use "end - 1" and "ceiling - 1" (though
376 * that end 0 case should be mythical).
377 *
378 * Wherever addr is brought up or ceiling brought down, we
379 * must be careful to reject "the opposite 0" before it
380 * confuses the subsequent tests. But what about where end is
381 * brought down by HUGEPD_SIZE below? no, end can't go down to
382 * 0 there.
383 * 349 *
384 * Whereas we round start (addr) and ceiling down, by different 350 * To make sense of this, you should probably go read the big
385 * masks at different levels, in order to test whether a table 351 * block comment at the top of the normal free_pgd_range(),
386 * now has no other vmas using it, so can be freed, we don't 352 * too.
387 * bother to round floor or end up - the tests don't need that.
388 */ 353 */
389 unsigned int psize = get_slice_psize(tlb->mm, addr);
390
391 addr &= HUGEPD_MASK(psize);
392 if (addr < floor) {
393 addr += HUGEPD_SIZE(psize);
394 if (!addr)
395 return;
396 }
397 if (ceiling) {
398 ceiling &= HUGEPD_MASK(psize);
399 if (!ceiling)
400 return;
401 }
402 if (end - 1 > ceiling - 1)
403 end -= HUGEPD_SIZE(psize);
404 if (addr > end - 1)
405 return;
406 354
407 start = addr;
408 pgd = pgd_offset(tlb->mm, addr); 355 pgd = pgd_offset(tlb->mm, addr);
409 do { 356 do {
410 psize = get_slice_psize(tlb->mm, addr);
411 BUG_ON(!mmu_huge_psizes[psize]);
412 next = pgd_addr_end(addr, end); 357 next = pgd_addr_end(addr, end);
413 if (mmu_psize_to_shift(psize) < PUD_SHIFT) { 358 if (!is_hugepd(pgd)) {
414 if (pgd_none_or_clear_bad(pgd)) 359 if (pgd_none_or_clear_bad(pgd))
415 continue; 360 continue;
416 hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); 361 hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
417 } else { 362 } else {
418 if (pgd_none(*pgd)) 363 free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
419 continue; 364 addr, next, floor, ceiling);
420 free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
421 } 365 }
422 } while (pgd++, addr = next, addr != end); 366 } while (pgd++, addr = next, addr != end);
423} 367}
@@ -448,19 +392,19 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
448{ 392{
449 pte_t *ptep; 393 pte_t *ptep;
450 struct page *page; 394 struct page *page;
451 unsigned int mmu_psize = get_slice_psize(mm, address); 395 unsigned shift;
396 unsigned long mask;
397
398 ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
452 399
453 /* Verify it is a huge page else bail. */ 400 /* Verify it is a huge page else bail. */
454 if (!mmu_huge_psizes[mmu_psize]) 401 if (!ptep || !shift)
455 return ERR_PTR(-EINVAL); 402 return ERR_PTR(-EINVAL);
456 403
457 ptep = huge_pte_offset(mm, address); 404 mask = (1UL << shift) - 1;
458 page = pte_page(*ptep); 405 page = pte_page(*ptep);
459 if (page) { 406 if (page)
460 unsigned int shift = mmu_psize_to_shift(mmu_psize); 407 page += (address & mask) / PAGE_SIZE;
461 unsigned long sz = ((1UL) << shift);
462 page += (address % sz) / PAGE_SIZE;
463 }
464 408
465 return page; 409 return page;
466} 410}
@@ -483,6 +427,73 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
483 return NULL; 427 return NULL;
484} 428}
485 429
430static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
431 unsigned long end, int write, struct page **pages, int *nr)
432{
433 unsigned long mask;
434 unsigned long pte_end;
435 struct page *head, *page;
436 pte_t pte;
437 int refs;
438
439 pte_end = (addr + sz) & ~(sz-1);
440 if (pte_end < end)
441 end = pte_end;
442
443 pte = *ptep;
444 mask = _PAGE_PRESENT | _PAGE_USER;
445 if (write)
446 mask |= _PAGE_RW;
447
448 if ((pte_val(pte) & mask) != mask)
449 return 0;
450
451 /* hugepages are never "special" */
452 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
453
454 refs = 0;
455 head = pte_page(pte);
456
457 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
458 do {
459 VM_BUG_ON(compound_head(page) != head);
460 pages[*nr] = page;
461 (*nr)++;
462 page++;
463 refs++;
464 } while (addr += PAGE_SIZE, addr != end);
465
466 if (!page_cache_add_speculative(head, refs)) {
467 *nr -= refs;
468 return 0;
469 }
470
471 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
472 /* Could be optimized better */
473 while (*nr) {
474 put_page(page);
475 (*nr)--;
476 }
477 }
478
479 return 1;
480}
481
482int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
483 unsigned long addr, unsigned long end,
484 int write, struct page **pages, int *nr)
485{
486 pte_t *ptep;
487 unsigned long sz = 1UL << hugepd_shift(*hugepd);
488
489 ptep = hugepte_offset(hugepd, addr, pdshift);
490 do {
491 if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
492 return 0;
493 } while (ptep++, addr += sz, addr != end);
494
495 return 1;
496}
486 497
487unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 498unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
488 unsigned long len, unsigned long pgoff, 499 unsigned long len, unsigned long pgoff,
@@ -530,34 +541,20 @@ static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
530 return rflags; 541 return rflags;
531} 542}
532 543
533int hash_huge_page(struct mm_struct *mm, unsigned long access, 544int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
534 unsigned long ea, unsigned long vsid, int local, 545 pte_t *ptep, unsigned long trap, int local, int ssize,
535 unsigned long trap) 546 unsigned int shift, unsigned int mmu_psize)
536{ 547{
537 pte_t *ptep;
538 unsigned long old_pte, new_pte; 548 unsigned long old_pte, new_pte;
539 unsigned long va, rflags, pa, sz; 549 unsigned long va, rflags, pa, sz;
540 long slot; 550 long slot;
541 int err = 1; 551 int err = 1;
542 int ssize = user_segment_size(ea);
543 unsigned int mmu_psize;
544 int shift;
545 mmu_psize = get_slice_psize(mm, ea);
546 552
547 if (!mmu_huge_psizes[mmu_psize]) 553 BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
548 goto out;
549 ptep = huge_pte_offset(mm, ea);
550 554
551 /* Search the Linux page table for a match with va */ 555 /* Search the Linux page table for a match with va */
552 va = hpt_va(ea, vsid, ssize); 556 va = hpt_va(ea, vsid, ssize);
553 557
554 /*
555 * If no pte found or not present, send the problem up to
556 * do_page_fault
557 */
558 if (unlikely(!ptep || pte_none(*ptep)))
559 goto out;
560
561 /* 558 /*
562 * Check the user's access rights to the page. If access should be 559 * Check the user's access rights to the page. If access should be
563 * prevented then send the problem up to do_page_fault. 560 * prevented then send the problem up to do_page_fault.
@@ -588,7 +585,6 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
588 rflags = 0x2 | (!(new_pte & _PAGE_RW)); 585 rflags = 0x2 | (!(new_pte & _PAGE_RW));
589 /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ 586 /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
590 rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); 587 rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
591 shift = mmu_psize_to_shift(mmu_psize);
592 sz = ((1UL) << shift); 588 sz = ((1UL) << shift);
593 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) 589 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
594 /* No CPU has hugepages but lacks no execute, so we 590 /* No CPU has hugepages but lacks no execute, so we
@@ -672,6 +668,8 @@ repeat:
672 668
673static void __init set_huge_psize(int psize) 669static void __init set_huge_psize(int psize)
674{ 670{
671 unsigned pdshift;
672
675 /* Check that it is a page size supported by the hardware and 673 /* Check that it is a page size supported by the hardware and
676 * that it fits within pagetable limits. */ 674 * that it fits within pagetable limits. */
677 if (mmu_psize_defs[psize].shift && 675 if (mmu_psize_defs[psize].shift &&
@@ -686,29 +684,14 @@ static void __init set_huge_psize(int psize)
686 return; 684 return;
687 hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); 685 hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
688 686
689 switch (mmu_psize_defs[psize].shift) { 687 if (mmu_psize_defs[psize].shift < PMD_SHIFT)
690 case PAGE_SHIFT_64K: 688 pdshift = PMD_SHIFT;
691 /* We only allow 64k hpages with 4k base page, 689 else if (mmu_psize_defs[psize].shift < PUD_SHIFT)
692 * which was checked above, and always put them 690 pdshift = PUD_SHIFT;
693 * at the PMD */ 691 else
694 hugepte_shift[psize] = PMD_SHIFT; 692 pdshift = PGDIR_SHIFT;
695 break; 693 mmu_huge_psizes[psize] = pdshift - mmu_psize_defs[psize].shift;
696 case PAGE_SHIFT_16M: 694 }
697 /* 16M pages can be at two different levels
698 * of pagestables based on base page size */
699 if (PAGE_SHIFT == PAGE_SHIFT_64K)
700 hugepte_shift[psize] = PMD_SHIFT;
701 else /* 4k base page */
702 hugepte_shift[psize] = PUD_SHIFT;
703 break;
704 case PAGE_SHIFT_16G:
705 /* 16G pages are always at PGD level */
706 hugepte_shift[psize] = PGDIR_SHIFT;
707 break;
708 }
709 hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
710 } else
711 hugepte_shift[psize] = 0;
712} 695}
713 696
714static int __init hugepage_setup_sz(char *str) 697static int __init hugepage_setup_sz(char *str)
@@ -732,7 +715,7 @@ __setup("hugepagesz=", hugepage_setup_sz);
732 715
733static int __init hugetlbpage_init(void) 716static int __init hugetlbpage_init(void)
734{ 717{
735 unsigned int psize; 718 int psize;
736 719
737 if (!cpu_has_feature(CPU_FTR_16M_PAGE)) 720 if (!cpu_has_feature(CPU_FTR_16M_PAGE))
738 return -ENODEV; 721 return -ENODEV;
@@ -753,8 +736,8 @@ static int __init hugetlbpage_init(void)
753 736
754 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { 737 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
755 if (mmu_huge_psizes[psize]) { 738 if (mmu_huge_psizes[psize]) {
756 pgtable_cache_add(hugepte_shift[psize], NULL); 739 pgtable_cache_add(mmu_huge_psizes[psize], NULL);
757 if (!PGT_CACHE(hugepte_shift[psize])) 740 if (!PGT_CACHE(mmu_huge_psizes[psize]))
758 panic("hugetlbpage_init(): could not create " 741 panic("hugetlbpage_init(): could not create "
759 "pgtable cache for %d bit pagesize\n", 742 "pgtable cache for %d bit pagesize\n",
760 mmu_psize_to_shift(psize)); 743 mmu_psize_to_shift(psize));
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 82ac61dcd3af..776f28d02b6b 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -41,6 +41,7 @@
41#include <linux/module.h> 41#include <linux/module.h>
42#include <linux/poison.h> 42#include <linux/poison.h>
43#include <linux/lmb.h> 43#include <linux/lmb.h>
44#include <linux/hugetlb.h>
44 45
45#include <asm/pgalloc.h> 46#include <asm/pgalloc.h>
46#include <asm/page.h> 47#include <asm/page.h>
@@ -136,8 +137,13 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
136 137
137 /* When batching pgtable pointers for RCU freeing, we store 138 /* When batching pgtable pointers for RCU freeing, we store
138 * the index size in the low bits. Table alignment must be 139 * the index size in the low bits. Table alignment must be
139 * big enough to fit it */ 140 * big enough to fit it.
140 unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1; 141 *
142 * Likewise, hugeapge pagetable pointers contain a (different)
143 * shift value in the low bits. All tables must be aligned so
144 * as to leave enough 0 bits in the address to contain it. */
145 unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
146 HUGEPD_SHIFT_MASK + 1);
141 struct kmem_cache *new; 147 struct kmem_cache *new;
142 148
143 /* It would be nice if this was a BUILD_BUG_ON(), but at the 149 /* It would be nice if this was a BUILD_BUG_ON(), but at the