author    David Gibson <david@gibson.dropbear.id.au>    2009-10-26 15:24:31 -0400
committer Benjamin Herrenschmidt <benh@kernel.crashing.org>    2009-10-30 02:20:58 -0400
commit    a4fe3ce7699bfe1bd88f816b55d42d8fe1dac655
tree      b72c982ffbb9f05d78a952288d60c4dc2d31a4d9 /arch/powerpc/mm/gup.c
parent    a0668cdc154e54bf0c85182e0535eea237d53146
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different pagetable layout: that is, the bottom-level table of pointers to hugepages is a different size, and may branch off from the normal page tables at a different level. Every hugepage-aware path that needs to walk the pagetables must therefore look up the hugepage size from the slice info first, and work out the correct way to walk the pagetables accordingly. Future hardware is likely to add more possible hugepage sizes, more layout options and more mess.

This patch therefore reworks the handling of hugepage pagetables to reduce this complexity. In the new scheme, instead of having to consult the slice mask, pagetable walking code can check a flag in the PGD/PUD/PMD entries to see where to branch off to hugepage pagetables, and the entry also contains the information (essentially the hugepage shift) necessary to then interpret that table without recourse to the slice mask. This scheme can be extended neatly to handle multiple levels of self-describing "special" hugepage pagetables, although for now we assume only one level exists.

This approach means that only the pagetable allocation path needs to know how the pagetables should be set out. All other (hugepage) pagetable walking paths can just interpret the structure as they go.

There already was a flag bit in PGD/PUD/PMD entries for hugepage directory pointers, but it was only used for debug. We alter that flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable pointer (normally it would be 1, since the pointer lies in the linear mapping). This means that asm pagetable walking can test for (and punt on) hugepage pointers with the same test that checks for unpopulated page directory entries (beq becomes bge), since hugepage pointers will always be positive, and normal pointers always negative.

While we're at it, we get rid of the confusing (and grep-defeating) #defining of hugepte_shift to be the same thing as mmu_huge_psizes.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
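To make the new layout concrete, here is a small self-contained sketch of how such a self-describing entry could be decoded. The names, masks and exact bit layout below are invented for illustration and do not correspond to the real powerpc headers; the sketch only captures the two ideas described above: a clear MSB marks a pointer to a hugepage pagetable (so the signed comparison that already skips empty entries also skips hugepage entries), and the entry itself carries the hugepage shift, so the table can be interpreted without consulting the slice mask.

#include <stdbool.h>

/* Illustrative layout only -- not the actual powerpc definitions. */
#define EX_HUGEPD_SHIFT_MASK	0x3fUL	/* low bits hold the hugepage shift */
#define EX_KERNEL_VIRT_BASE	0x8000000000000000UL	/* assumed linear-map base */

typedef struct { unsigned long pd; } ex_hugepd_t;

/* Normal page-table pointers live in the kernel linear mapping, so their
 * MSB is set (negative when viewed as signed); a hugepage-directory entry
 * keeps the MSB clear.  Hence "beq becomes bge": one signed test catches
 * both empty and hugepage entries on the normal walking path. */
static bool ex_is_hugepd(unsigned long entry)
{
	return entry != 0 && (long)entry >= 0;
}

/* The entry is self-describing: the hugepage shift comes from the entry
 * itself, with no recourse to the slice mask. */
static unsigned int ex_hugepd_shift(ex_hugepd_t hpd)
{
	return hpd.pd & EX_HUGEPD_SHIFT_MASK;
}

/* Because the stored value has a clear MSB, the table's real kernel
 * address is recovered by restoring the (assumed) linear-mapping base. */
static void *ex_hugepd_table(ex_hugepd_t hpd)
{
	return (void *)((hpd.pd & ~EX_HUGEPD_SHIFT_MASK) | EX_KERNEL_VIRT_BASE);
}

With entries encoded this way, each level of the walker in the diff below only needs an is_hugepd()-style check before deciding whether to descend normally or hand the range off to the hugepage path.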
Diffstat (limited to 'arch/powerpc/mm/gup.c')
-rw-r--r--  arch/powerpc/mm/gup.c  149
1 file changed, 26 insertions(+), 123 deletions(-)
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index bc122a120bf0..d7efdbf640c7 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -55,57 +55,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 	return 1;
 }
 
-#ifdef CONFIG_HUGETLB_PAGE
-static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
-				 unsigned long *addr, unsigned long end,
-				 int write, struct page **pages, int *nr)
-{
-	unsigned long mask;
-	unsigned long pte_end;
-	struct page *head, *page;
-	pte_t pte;
-	int refs;
-
-	pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
-	if (pte_end < end)
-		end = pte_end;
-
-	pte = *ptep;
-	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
-		mask |= _PAGE_RW;
-	if ((pte_val(pte) & mask) != mask)
-		return 0;
-	/* hugepages are never "special" */
-	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-	refs = 0;
-	head = pte_page(pte);
-	page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
-	do {
-		VM_BUG_ON(compound_head(page) != head);
-		pages[*nr] = page;
-		(*nr)++;
-		page++;
-		refs++;
-	} while (*addr += PAGE_SIZE, *addr != end);
-
-	if (!page_cache_add_speculative(head, refs)) {
-		*nr -= refs;
-		return 0;
-	}
-	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-		/* Could be optimized better */
-		while (*nr) {
-			put_page(page);
-			(*nr)--;
-		}
-	}
-
-	return 1;
-}
-#endif /* CONFIG_HUGETLB_PAGE */
-
 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		int write, struct page **pages, int *nr)
 {
@@ -119,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none(pmd))
 			return 0;
-		if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+		if (is_hugepd(pmdp)) {
+			if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
+					addr, next, write, pages, nr))
+				return 0;
+		} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
 			return 0;
 	} while (pmdp++, addr = next, addr != end);
 
@@ -139,7 +92,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 		next = pud_addr_end(addr, end);
 		if (pud_none(pud))
 			return 0;
-		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+		if (is_hugepd(pudp)) {
+			if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
+					addr, next, write, pages, nr))
+				return 0;
+		} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
 			return 0;
 	} while (pudp++, addr = next, addr != end);
 
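The gup_hugepd() calls introduced in the two hunks above (and in the PGD-level hunk further down) hand the walk off to a helper that this patch adds outside gup.c, so only its role is sketched here. The sketch below is illustrative rather than the actual implementation: it assumes helper names such as hugepte_offset() and gup_hugepte(), and it simply reuses the per-PTE pinning job that the removed gup_huge_pte() used to perform.

/* Conceptual sketch, not the real code: walk the huge PTEs that a hugepage
 * directory covers within [addr, end), using the hugepage size encoded in
 * the entry itself rather than the slice mask. */
static int sketch_gup_hugepd(hugepd_t *hugepd, unsigned int pdshift,
			     unsigned long addr, unsigned long end,
			     int write, struct page **pages, int *nr)
{
	unsigned long sz = 1UL << hugepd_shift(*hugepd);	/* from the entry */
	unsigned long next;
	pte_t *ptep = hugepte_offset(hugepd, addr, pdshift);	/* assumed name */

	do {
		/* each huge PTE maps 'sz' bytes; stop at the request end */
		next = (addr + sz) & ~(sz - 1);
		if (next > end)
			next = end;
		/* pin every subpage of this huge PTE, much as the removed
		 * gup_huge_pte() did: check the present/user/write bits,
		 * take a speculative reference on the compound head, then
		 * re-check that the PTE did not change underneath us. */
		if (!gup_hugepte(ptep, sz, addr, next, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}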
@@ -154,10 +111,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	unsigned long next;
 	pgd_t *pgdp;
 	int nr = 0;
-#ifdef CONFIG_PPC64
-	unsigned int shift;
-	int psize;
-#endif
 
 	pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
 
@@ -172,25 +125,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 
 	pr_devel(" aligned: %lx .. %lx\n", start, end);
 
-#ifdef CONFIG_HUGETLB_PAGE
-	/* We bail out on slice boundary crossing when hugetlb is
-	 * enabled in order to not have to deal with two different
-	 * page table formats
-	 */
-	if (addr < SLICE_LOW_TOP) {
-		if (end > SLICE_LOW_TOP)
-			goto slow_irqon;
-
-		if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
-			     GET_LOW_SLICE_INDEX(end - 1)))
-			goto slow_irqon;
-	} else {
-		if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
-			     GET_HIGH_SLICE_INDEX(end - 1)))
-			goto slow_irqon;
-	}
-#endif /* CONFIG_HUGETLB_PAGE */
-
 	/*
 	 * XXX: batch / limit 'nr', to avoid large irq off latency
 	 * needs some instrumenting to determine the common sizes used by
@@ -210,54 +144,23 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	 */
 	local_irq_disable();
 
-#ifdef CONFIG_PPC64
-	/* Those bits are related to hugetlbfs implementation and only exist
-	 * on 64-bit for now
-	 */
-	psize = get_slice_psize(mm, addr);
-	shift = mmu_psize_defs[psize].shift;
-#endif /* CONFIG_PPC64 */
-
-#ifdef CONFIG_HUGETLB_PAGE
-	if (unlikely(mmu_huge_psizes[psize])) {
-		pte_t *ptep;
-		unsigned long a = addr;
-		unsigned long sz = ((1UL) << shift);
-		struct hstate *hstate = size_to_hstate(sz);
-
-		BUG_ON(!hstate);
-		/*
-		 * XXX: could be optimized to avoid hstate
-		 * lookup entirely (just use shift)
-		 */
-
-		do {
-			VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
-			ptep = huge_pte_offset(mm, a);
-			pr_devel(" %016lx: huge ptep %p\n", a, ptep);
-			if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
-						   &nr))
-				goto slow;
-		} while (a != end);
-	} else
-#endif /* CONFIG_HUGETLB_PAGE */
-	{
-		pgdp = pgd_offset(mm, addr);
-		do {
-			pgd_t pgd = *pgdp;
-
-#ifdef CONFIG_PPC64
-			VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
-#endif
-			pr_devel(" %016lx: normal pgd %p\n", addr,
-				 (void *)pgd_val(pgd));
-			next = pgd_addr_end(addr, end);
-			if (pgd_none(pgd))
-				goto slow;
-			if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
-				goto slow;
-		} while (pgdp++, addr = next, addr != end);
-	}
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		pr_devel(" %016lx: normal pgd %p\n", addr,
+			 (void *)pgd_val(pgd));
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (is_hugepd(pgdp)) {
+			if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
+					addr, next, write, pages, &nr))
+				goto slow;
+		} else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
 
 	local_irq_enable();
 
 	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);