path: root/arch/powerpc/mm
author	Becky Bruce <beckyb@kernel.crashing.org>	2011-06-28 05:54:48 -0400
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2011-09-19 19:19:40 -0400
commit	41151e77a4d96ea138cede6d84c955aa4769ce74 (patch)
tree	2d997b77b9adf406a2fd30326bff688577d2e64f /arch/powerpc/mm
parent	7df5659eefad9b6d457ccdee016bd78bd064cfc0 (diff)
powerpc: Hugetlb for BookE
Enable hugepages on Freescale BookE processors.  This allows the kernel to
use huge TLB entries to map pages, which can greatly reduce the number of
TLB misses and the amount of TLB thrashing experienced by applications with
large memory footprints.  Care should be taken when using this on FSL
processors, as the number of large TLB entries supported by the core is low
(16-64) on current processors.

The supported set of hugepage sizes include 4m, 16m, 64m, 256m, and 1g.
Page sizes larger than the max zone size are called "gigantic" pages and
must be allocated on the command line (and cannot be deallocated).

This is currently only fully implemented for Freescale 32-bit BookE
processors, but there is some infrastructure in the code for 64-bit BookE.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
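Gigantic pages are reserved purely from boot arguments: do_gpage_early_setup(),
added below in hugetlbpage.c, pairs each hugepagesz= value with the hugepages=
count that follows it.  As an illustration only (the sizes shown are examples,
not mandated by the patch), a command line reserving two 1g and four 256m
gigantic pages would look something like:

    hugepagesz=1g hugepages=2 hugepagesz=256m hugepages=4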
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/Makefile               |   1
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c        |   3
-rw-r--r--  arch/powerpc/mm/hugetlbpage-book3e.c   | 121
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c          | 379
-rw-r--r--  arch/powerpc/mm/init_32.c              |   9
-rw-r--r--  arch/powerpc/mm/mem.c                  |   5
-rw-r--r--  arch/powerpc/mm/mmu_context_nohash.c   |   5
-rw-r--r--  arch/powerpc/mm/pgtable.c              |   3
-rw-r--r--  arch/powerpc/mm/tlb_low_64e.S          |  24
-rw-r--r--  arch/powerpc/mm/tlb_nohash.c           |  46
10 files changed, 536 insertions, 60 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index bdca46e0838..991ee813d2a 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o
 ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-y += hugetlbpage.o
 obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o
+obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o
 endif
 obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 26b2872b3d0..1f8b2a05e3d 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -105,9 +105,6 @@ int mmu_kernel_ssize = MMU_SEGSIZE_256M;
 int mmu_highuser_ssize = MMU_SEGSIZE_256M;
 u16 mmu_slb_size = 64;
 EXPORT_SYMBOL_GPL(mmu_slb_size);
-#ifdef CONFIG_HUGETLB_PAGE
-unsigned int HPAGE_SHIFT;
-#endif
 #ifdef CONFIG_PPC_64K_PAGES
 int mmu_ci_restrictions;
 #endif
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
new file mode 100644
index 00000000000..1295b7c1cda
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -0,0 +1,121 @@
+/*
+ * PPC Huge TLB Page Support for Book3E MMU
+ *
+ * Copyright (C) 2009 David Gibson, IBM Corporation.
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
+ *
+ */
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+
+static inline int mmu_get_tsize(int psize)
+{
+	return mmu_psize_defs[psize].enc;
+}
+
+static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
+{
+	int found = 0;
+
+	mtspr(SPRN_MAS6, pid << 16);
+	if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) {
+		asm volatile(
+			"li %0,0\n"
+			"tlbsx. 0,%1\n"
+			"bne 1f\n"
+			"li %0,1\n"
+			"1:\n"
+			: "=&r"(found) : "r"(ea));
+	} else {
+		asm volatile(
+			"tlbsx 0,%1\n"
+			"mfspr %0,0x271\n"
+			"srwi %0,%0,31\n"
+			: "=&r"(found) : "r"(ea));
+	}
+
+	return found;
+}
+
+void book3e_hugetlb_preload(struct mm_struct *mm, unsigned long ea, pte_t pte)
+{
+	unsigned long mas1, mas2;
+	u64 mas7_3;
+	unsigned long psize, tsize, shift;
+	unsigned long flags;
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	int index, lz, ncams;
+	struct vm_area_struct *vma;
+#endif
+
+	if (unlikely(is_kernel_addr(ea)))
+		return;
+
+#ifdef CONFIG_MM_SLICES
+	psize = mmu_get_tsize(get_slice_psize(mm, ea));
+	tsize = mmu_get_psize(psize);
+	shift = mmu_psize_defs[psize].shift;
+#else
+	vma = find_vma(mm, ea);
+	psize = vma_mmu_pagesize(vma);	/* returns actual size in bytes */
+	asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (psize));
+	shift = 31 - lz;
+	tsize = 21 - lz;
+#endif
+
+	/*
+	 * We can't be interrupted while we're setting up the MAS
+	 * registers or after we've confirmed that no tlb exists.
+	 */
+	local_irq_save(flags);
+
+	if (unlikely(book3e_tlb_exists(ea, mm->context.id))) {
+		local_irq_restore(flags);
+		return;
+	}
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+	/* We have to use the CAM(TLB1) on FSL parts for hugepages */
+	index = __get_cpu_var(next_tlbcam_idx);
+	mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1));
+
+	/* Just round-robin the entries and wrap when we hit the end */
+	if (unlikely(index == ncams - 1))
+		__get_cpu_var(next_tlbcam_idx) = tlbcam_index;
+	else
+		__get_cpu_var(next_tlbcam_idx)++;
+#endif
+	mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize);
+	mas2 = ea & ~((1UL << shift) - 1);
+	mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
+	mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT;
+	mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK;
+	if (!pte_dirty(pte))
+		mas7_3 &= ~(MAS3_SW|MAS3_UW);
+
+	mtspr(SPRN_MAS1, mas1);
+	mtspr(SPRN_MAS2, mas2);
+
+	if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) {
+		mtspr(SPRN_MAS7_MAS3, mas7_3);
+	} else {
+		mtspr(SPRN_MAS7, upper_32_bits(mas7_3));
+		mtspr(SPRN_MAS3, lower_32_bits(mas7_3));
+	}
+
+	asm volatile ("tlbwe");
+
+	local_irq_restore(flags);
+}
+
+void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	struct hstate *hstate = hstate_file(vma->vm_file);
+	unsigned long tsize = huge_page_shift(hstate) - 10;
+
+	__flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, tsize, 0);
+
+}
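Both tsize computations in this new file reduce to "log2(page size) - 10":
the cntlzw sequence gives tsize = 21 - lz with shift = 31 - lz, and
flush_hugetlb_page() uses huge_page_shift(hstate) - 10 directly.  The
following standalone C sketch is not part of the patch (the helper name and
test values are made up for illustration); it only restates that arithmetic:

#include <assert.h>

/* Sketch only: mirrors "tsize = 21 - lz" and "huge_page_shift() - 10" above. */
static unsigned int example_tsize(unsigned long long size)
{
	unsigned int shift = 0;

	while ((1ULL << shift) < size)	/* log2 of a power-of-two size */
		shift++;
	return shift - 10;
}

int main(void)
{
	assert(example_tsize(4096) == 2);		/* 4k */
	assert(example_tsize(4ULL << 20) == 12);	/* 4m */
	assert(example_tsize(1ULL << 30) == 20);	/* 1g */
	return 0;
}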
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0b9a5c1901b..3a5f59dcbb3 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1,7 +1,8 @@
 /*
- * PPC64 (POWER4) Huge TLB Page Support for Kernel.
+ * PPC Huge TLB Page Support for Kernel.
  *
  * Copyright (C) 2003 David Gibson, IBM Corporation.
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
  *
  * Based on the IA-32 version:
  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
@@ -11,24 +12,39 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 #include <linux/hugetlb.h>
+#include <linux/of_fdt.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
+#include <asm/setup.h>
 
 #define PAGE_SHIFT_64K	16
 #define PAGE_SHIFT_16M	24
 #define PAGE_SHIFT_16G	34
 
-#define MAX_NUMBER_GPAGES	1024
+unsigned int HPAGE_SHIFT;
 
-/* Tracks the 16G pages after the device tree is scanned and before the
- * huge_boot_pages list is ready. */
-static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
+/*
+ * Tracks gpages after the device tree is scanned and before the
+ * huge_boot_pages list is ready.  On 64-bit implementations, this is
+ * just used to track 16G pages and so is a single array.  32-bit
+ * implementations may have more than one gpage size due to limitations
+ * of the memory allocators, so we need multiple arrays
+ */
+#ifdef CONFIG_PPC64
+#define MAX_NUMBER_GPAGES	1024
+static u64 gpage_freearray[MAX_NUMBER_GPAGES];
 static unsigned nr_gpages;
-
-/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
- * will choke on pointers to hugepte tables, which is handy for
- * catching screwups early. */
+#else
+#define MAX_NUMBER_GPAGES	128
+struct psize_gpages {
+	u64 gpage_list[MAX_NUMBER_GPAGES];
+	unsigned int nr_gpages;
+};
+static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
+#endif
 
 static inline int shift_to_mmu_psize(unsigned int shift)
 {
@@ -49,25 +65,6 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
 
 #define hugepd_none(hpd)	((hpd).pd == 0)
 
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
-	BUG_ON(!hugepd_ok(hpd));
-	return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
-	return hpd.pd & HUGEPD_SHIFT_MASK;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
-{
-	unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
-	pte_t *dir = hugepd_page(*hpdp);
-
-	return dir + idx;
-}
-
 pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
 {
 	pgd_t *pg;
@@ -93,7 +90,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
 			if (is_hugepd(pm))
 				hpdp = (hugepd_t *)pm;
 			else if (!pmd_none(*pm)) {
-				return pte_offset_map(pm, ea);
+				return pte_offset_kernel(pm, ea);
 			}
 		}
 	}
@@ -114,8 +111,18 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 			   unsigned long address, unsigned pdshift, unsigned pshift)
 {
-	pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
-				       GFP_KERNEL|__GFP_REPEAT);
+	struct kmem_cache *cachep;
+	pte_t *new;
+
+#ifdef CONFIG_PPC64
+	cachep = PGT_CACHE(pdshift - pshift);
+#else
+	int i;
+	int num_hugepd = 1 << (pshift - pdshift);
+	cachep = hugepte_cache;
+#endif
+
+	new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);
 
 	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
 	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
@@ -124,10 +131,31 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
+#ifdef CONFIG_PPC64
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
+		kmem_cache_free(cachep, new);
 	else
-		hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
+		hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+#else
+	/*
+	 * We have multiple higher-level entries that point to the same
+	 * actual pte location.  Fill in each as we go and backtrack on error.
+	 * We need all of these so the DTLB pgtable walk code can find the
+	 * right higher-level entry without knowing if it's a hugepage or not.
+	 */
+	for (i = 0; i < num_hugepd; i++, hpdp++) {
+		if (unlikely(!hugepd_none(*hpdp)))
+			break;
+		else
+			hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+	}
+	/* If we bailed from the for loop early, an error occurred, clean up */
+	if (i < num_hugepd) {
+		for (i = i - 1 ; i >= 0; i--, hpdp--)
+			hpdp->pd = 0;
+		kmem_cache_free(cachep, new);
+	}
+#endif
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
@@ -169,11 +197,132 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 	return hugepte_offset(hpdp, addr, pdshift);
 }
 
+#ifdef CONFIG_PPC32
 /* Build list of addresses of gigantic pages.  This function is used in early
  * boot before the buddy or bootmem allocator is setup.
  */
-void add_gpage(unsigned long addr, unsigned long page_size,
-	       unsigned long number_of_pages)
+void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
+{
+	unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
+	int i;
+
+	if (addr == 0)
+		return;
+
+	gpage_freearray[idx].nr_gpages = number_of_pages;
+
+	for (i = 0; i < number_of_pages; i++) {
+		gpage_freearray[idx].gpage_list[i] = addr;
+		addr += page_size;
+	}
+}
+
+/*
+ * Moves the gigantic page addresses from the temporary list to the
+ * huge_boot_pages list.
+ */
+int alloc_bootmem_huge_page(struct hstate *hstate)
+{
+	struct huge_bootmem_page *m;
+	int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT);
+	int nr_gpages = gpage_freearray[idx].nr_gpages;
+
+	if (nr_gpages == 0)
+		return 0;
+
+#ifdef CONFIG_HIGHMEM
+	/*
+	 * If gpages can be in highmem we can't use the trick of storing the
+	 * data structure in the page; allocate space for this
+	 */
+	m = alloc_bootmem(sizeof(struct huge_bootmem_page));
+	m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
+#else
+	m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
+#endif
+
+	list_add(&m->list, &huge_boot_pages);
+	gpage_freearray[idx].nr_gpages = nr_gpages;
+	gpage_freearray[idx].gpage_list[nr_gpages] = 0;
+	m->hstate = hstate;
+
+	return 1;
+}
+/*
+ * Scan the command line hugepagesz= options for gigantic pages; store those in
+ * a list that we use to allocate the memory once all options are parsed.
+ */
+
+unsigned long gpage_npages[MMU_PAGE_COUNT];
+
+static int __init do_gpage_early_setup(char *param, char *val)
+{
+	static phys_addr_t size;
+	unsigned long npages;
+
+	/*
+	 * The hugepagesz and hugepages cmdline options are interleaved.  We
+	 * use the size variable to keep track of whether or not this was done
+	 * properly and skip over instances where it is incorrect.  Other
+	 * command-line parsing code will issue warnings, so we don't need to.
+	 *
+	 */
+	if ((strcmp(param, "default_hugepagesz") == 0) ||
+	    (strcmp(param, "hugepagesz") == 0)) {
+		size = memparse(val, NULL);
+	} else if (strcmp(param, "hugepages") == 0) {
+		if (size != 0) {
+			if (sscanf(val, "%lu", &npages) <= 0)
+				npages = 0;
+			gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
+			size = 0;
+		}
+	}
+	return 0;
+}
+
+
+/*
+ * This function allocates physical space for pages that are larger than the
+ * buddy allocator can handle.  We want to allocate these in highmem because
+ * the amount of lowmem is limited.  This means that this function MUST be
+ * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
+ * allocator to grab highmem.
+ */
+void __init reserve_hugetlb_gpages(void)
+{
+	static __initdata char cmdline[COMMAND_LINE_SIZE];
+	phys_addr_t size, base;
+	int i;
+
+	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
+	parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup);
+
+	/*
+	 * Walk gpage list in reverse, allocating larger page sizes first.
+	 * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
+	 * When we reach the point in the list where pages are no longer
+	 * considered gpages, we're done.
+	 */
+	for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
+		if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
+			continue;
+		else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
+			break;
+
+		size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
+		base = memblock_alloc_base(size * gpage_npages[i], size,
+					   MEMBLOCK_ALLOC_ANYWHERE);
+		add_gpage(base, size, gpage_npages[i]);
+	}
+}
+
+#else /* PPC64 */
+
+/* Build list of addresses of gigantic pages.  This function is used in early
+ * boot before the buddy or bootmem allocator is setup.
+ */
+void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
 {
 	if (!addr)
 		return;
@@ -199,19 +348,79 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
 	m->hstate = hstate;
 	return 1;
 }
+#endif
 
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 {
 	return 0;
 }
 
+#ifdef CONFIG_PPC32
+#define HUGEPD_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
+
+struct hugepd_freelist {
+	struct rcu_head	rcu;
+	unsigned int index;
+	void *ptes[0];
+};
+
+static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
+
+static void hugepd_free_rcu_callback(struct rcu_head *head)
+{
+	struct hugepd_freelist *batch =
+		container_of(head, struct hugepd_freelist, rcu);
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		kmem_cache_free(hugepte_cache, batch->ptes[i]);
+
+	free_page((unsigned long)batch);
+}
+
+static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
+{
+	struct hugepd_freelist **batchp;
+
+	batchp = &__get_cpu_var(hugepd_freelist_cur);
+
+	if (atomic_read(&tlb->mm->mm_users) < 2 ||
+	    cpumask_equal(mm_cpumask(tlb->mm),
+			  cpumask_of(smp_processor_id()))) {
+		kmem_cache_free(hugepte_cache, hugepte);
+		return;
+	}
+
+	if (*batchp == NULL) {
+		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
+		(*batchp)->index = 0;
+	}
+
+	(*batchp)->ptes[(*batchp)->index++] = hugepte;
+	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
+		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
+		*batchp = NULL;
+	}
+}
+#endif
+
 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
 			      unsigned long start, unsigned long end,
 			      unsigned long floor, unsigned long ceiling)
 {
 	pte_t *hugepte = hugepd_page(*hpdp);
-	unsigned shift = hugepd_shift(*hpdp);
+	int i;
+
 	unsigned long pdmask = ~((1UL << pdshift) - 1);
+	unsigned int num_hugepd = 1;
+
+#ifdef CONFIG_PPC64
+	unsigned int shift = hugepd_shift(*hpdp);
+#else
+	/* Note: On 32-bit the hpdp may be the first of several */
+	num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
+#endif
 
 	start &= pdmask;
 	if (start < floor)
@@ -224,9 +433,15 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
 	if (end - 1 > ceiling - 1)
 		return;
 
-	hpdp->pd = 0;
+	for (i = 0; i < num_hugepd; i++, hpdp++)
+		hpdp->pd = 0;
+
 	tlb->need_flush = 1;
+#ifdef CONFIG_PPC64
 	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
+#else
+	hugepd_free(tlb, hugepte);
+#endif
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -331,18 +546,27 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 	 * too.
 	 */
 
-	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
+		pgd = pgd_offset(tlb->mm, addr);
 		if (!is_hugepd(pgd)) {
 			if (pgd_none_or_clear_bad(pgd))
 				continue;
 			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 		} else {
+#ifdef CONFIG_PPC32
+			/*
+			 * Increment next by the size of the huge mapping since
+			 * on 32-bit there may be more than one entry at the pgd
+			 * level for a single hugepage, but all of them point to
+			 * the same kmem cache that holds the hugepte.
+			 */
+			next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
+#endif
 			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
 					  addr, next, floor, ceiling);
 		}
-	} while (pgd++, addr = next, addr != end);
+	} while (addr = next, addr != end);
 }
 
 struct page *
@@ -466,17 +690,35 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 			unsigned long len, unsigned long pgoff,
 			unsigned long flags)
 {
+#ifdef CONFIG_MM_SLICES
 	struct hstate *hstate = hstate_file(file);
 	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 
 	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
+#else
+	return get_unmapped_area(file, addr, len, pgoff, flags);
+#endif
 }
 
 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 {
+#ifdef CONFIG_MM_SLICES
 	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
 
 	return 1UL << mmu_psize_to_shift(psize);
+#else
+	if (!is_vm_hugetlb_page(vma))
+		return PAGE_SIZE;
+
+	return huge_page_size(hstate_vma(vma));
+#endif
+}
+
+static inline bool is_power_of_4(unsigned long x)
+{
+	if (is_power_of_2(x))
+		return (__ilog2(x) % 2) ? false : true;
+	return false;
 }
 
 static int __init add_huge_page_size(unsigned long long size)
@@ -486,9 +728,14 @@ static int __init add_huge_page_size(unsigned long long size)
 
 	/* Check that it is a page size supported by the hardware and
 	 * that it fits within pagetable and slice limits. */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if ((size < PAGE_SIZE) || !is_power_of_4(size))
+		return -EINVAL;
+#else
 	if (!is_power_of_2(size)
 	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
 		return -EINVAL;
+#endif
 
 	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
 		return -EINVAL;
@@ -525,6 +772,46 @@ static int __init hugepage_setup_sz(char *str)
 }
 __setup("hugepagesz=", hugepage_setup_sz);
 
+#ifdef CONFIG_FSL_BOOKE
+struct kmem_cache *hugepte_cache;
+static int __init hugetlbpage_init(void)
+{
+	int psize;
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		unsigned shift;
+
+		if (!mmu_psize_defs[psize].shift)
+			continue;
+
+		shift = mmu_psize_to_shift(psize);
+
+		/* Don't treat normal page sizes as huge... */
+		if (shift != PAGE_SHIFT)
+			if (add_huge_page_size(1ULL << shift) < 0)
+				continue;
+	}
+
+	/*
+	 * Create a kmem cache for hugeptes.  The bottom bits in the pte have
+	 * size information encoded in them, so align them to allow this
+	 */
+	hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t),
+					  HUGEPD_SHIFT_MASK + 1, 0, NULL);
+	if (hugepte_cache == NULL)
+		panic("%s: Unable to create kmem cache for hugeptes\n",
+		      __func__);
+
+	/* Default hpage size = 4M */
+	if (mmu_psize_defs[MMU_PAGE_4M].shift)
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
+	else
+		panic("%s: Unable to set default huge page size\n", __func__);
+
+
+	return 0;
+}
+#else
 static int __init hugetlbpage_init(void)
 {
 	int psize;
@@ -567,15 +854,23 @@ static int __init hugetlbpage_init(void)
 
 	return 0;
 }
-
+#endif
 module_init(hugetlbpage_init);
 
 void flush_dcache_icache_hugepage(struct page *page)
 {
 	int i;
+	void *start;
 
 	BUG_ON(!PageCompound(page));
 
-	for (i = 0; i < (1UL << compound_order(page)); i++)
-		__flush_dcache_icache(page_address(page+i));
+	for (i = 0; i < (1UL << compound_order(page)); i++) {
+		if (!PageHighMem(page)) {
+			__flush_dcache_icache(page_address(page+i));
+		} else {
+			start = kmap_atomic(page+i, KM_PPC_SYNC_ICACHE);
+			__flush_dcache_icache(start);
+			kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+		}
+	}
 }
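One detail worth spelling out from the 32-bit paths above: because a hugepage
can be larger than the range covered by a single PGD entry, __hugepte_alloc()
and free_hugepd_range() operate on 1 << (pshift - pdshift) consecutive entries
that all point at the same hugepte.  The standalone C sketch below only
illustrates that arithmetic; the PGD span of 4m (shift 22) is an assumed
example value, not something taken from the patch:

#include <stdio.h>

#define EXAMPLE_PGDIR_SHIFT	22	/* assumed: each pgd entry maps 4m */

/* How many consecutive pgd entries one hugepage of 2^pshift bytes needs. */
static unsigned int entries_per_hugepage(unsigned int pshift)
{
	return pshift > EXAMPLE_PGDIR_SHIFT ?
		1u << (pshift - EXAMPLE_PGDIR_SHIFT) : 1;
}

int main(void)
{
	/* 4m, 16m, 64m, 256m, 1g hugepages */
	unsigned int shifts[] = { 22, 24, 26, 28, 30 };
	unsigned int i;

	for (i = 0; i < sizeof(shifts) / sizeof(shifts[0]); i++)
		printf("shift %u -> %u pgd entries\n",
		       shifts[i], entries_per_hugepage(shifts[i]));
	return 0;
}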
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index c77fef56dad..161cefde5c1 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -32,6 +32,8 @@
 #include <linux/pagemap.h>
 #include <linux/memblock.h>
 #include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
 
 #include <asm/pgalloc.h>
 #include <asm/prom.h>
@@ -44,6 +46,7 @@
 #include <asm/tlb.h>
 #include <asm/sections.h>
 #include <asm/system.h>
+#include <asm/hugetlb.h>
 
 #include "mmu_decl.h"
 
@@ -123,6 +126,12 @@ void __init MMU_init(void)
 	/* parse args from command line */
 	MMU_setup();
 
+	/*
+	 * Reserve gigantic pages for hugetlb.  This MUST occur before
+	 * lowmem_end_addr is initialized below.
+	 */
+	reserve_hugetlb_gpages();
+
 	if (memblock.memory.cnt > 1) {
 #ifndef CONFIG_WII
 		memblock.memory.cnt = 1;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index c781bbcf733..ad9cf49dfb8 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -548,4 +548,9 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		return;
 	hash_preload(vma->vm_mm, address, access, trap);
 #endif /* CONFIG_PPC_STD_MMU */
+#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
+	&& defined(CONFIG_HUGETLB_PAGE)
+	if (is_vm_hugetlb_page(vma))
+		book3e_hugetlb_preload(vma->vm_mm, address, *ptep);
+#endif
 }
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index 336807de550..5b63bd3da4a 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -292,6 +292,11 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm)
 	mm->context.id = MMU_NO_CONTEXT;
 	mm->context.active = 0;
 
+#ifdef CONFIG_PPC_MM_SLICES
+	if (slice_mm_new_context(mm))
+		slice_set_user_psize(mm, mmu_virtual_psize);
+#endif
+
 	return 0;
 }
 
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index af40c8768a7..214130a4edc 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -27,6 +27,7 @@
 #include <linux/init.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/hugetlb.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
@@ -212,7 +213,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 	entry = set_access_flags_filter(entry, vma, dirty);
 	changed = !pte_same(*(ptep), entry);
 	if (changed) {
-		if (!(vma->vm_flags & VM_HUGETLB))
+		if (!is_vm_hugetlb_page(vma))
 			assert_pte_locked(vma->vm_mm, address);
 		__ptep_set_access_flags(ptep, entry);
 		flush_tlb_page_nohash(vma, address);
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index 4ebb34bc01d..dc4a5f385e4 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -553,24 +553,24 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
 	rldicl	r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3
 	clrrdi	r10,r11,3
 	ldx	r15,r10,r15
-	cmpldi	cr0,r15,0
-	beq	virt_page_table_tlb_miss_fault
+	cmpdi	cr0,r15,0
+	bge	virt_page_table_tlb_miss_fault
 
 #ifndef CONFIG_PPC_64K_PAGES
 	/* Get to PUD entry */
 	rldicl	r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3
 	clrrdi	r10,r11,3
 	ldx	r15,r10,r15
-	cmpldi	cr0,r15,0
-	beq	virt_page_table_tlb_miss_fault
+	cmpdi	cr0,r15,0
+	bge	virt_page_table_tlb_miss_fault
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/* Get to PMD entry */
 	rldicl	r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3
 	clrrdi	r10,r11,3
 	ldx	r15,r10,r15
-	cmpldi	cr0,r15,0
-	beq	virt_page_table_tlb_miss_fault
+	cmpdi	cr0,r15,0
+	bge	virt_page_table_tlb_miss_fault
 
 	/* Ok, we're all right, we can now create a kernel translation for
 	 * a 4K or 64K page from r16 -> r15.
@@ -802,24 +802,24 @@ htw_tlb_miss:
 	rldicl	r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3
 	clrrdi	r10,r11,3
 	ldx	r15,r10,r15
-	cmpldi	cr0,r15,0
-	beq	htw_tlb_miss_fault
+	cmpdi	cr0,r15,0
+	bge	htw_tlb_miss_fault
 
 #ifndef CONFIG_PPC_64K_PAGES
 	/* Get to PUD entry */
 	rldicl	r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3
 	clrrdi	r10,r11,3
 	ldx	r15,r10,r15
-	cmpldi	cr0,r15,0
-	beq	htw_tlb_miss_fault
+	cmpdi	cr0,r15,0
+	bge	htw_tlb_miss_fault
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/* Get to PMD entry */
 	rldicl	r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3
 	clrrdi	r10,r11,3
 	ldx	r15,r10,r15
-	cmpldi	cr0,r15,0
-	beq	htw_tlb_miss_fault
+	cmpdi	cr0,r15,0
+	bge	htw_tlb_miss_fault
 
 	/* Ok, we're all right, we can now create an indirect entry for
 	 * a 1M or 256M page.
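The cmpldi/beq to cmpdi/bge changes above are what let these assembly
page-table walkers coexist with hugepd entries: a regular next-level pointer
is a kernel address with the top bit set, so it is negative when compared as
a signed value, while a hugepd entry is stored with PD_HUGE (that same top
bit) cleared, as the "& ~PD_HUGE" stores in hugetlbpage.c show.  Branching
when the entry is greater than or equal to zero therefore sends both empty
and huge entries to the miss/fault path instead of dereferencing them.  A
tiny sketch of the equivalent C test, with made-up example values:

#include <stdio.h>

/* Sketch only: the signed test the new cmpdi/bge sequence performs. */
static int take_fault_path(unsigned long long pd)
{
	return (long long)pd >= 0;	/* true for NULL and for hugepd entries */
}

int main(void)
{
	printf("empty   -> %d\n", take_fault_path(0x0ULL));			/* 1 */
	printf("pointer -> %d\n", take_fault_path(0xc000000000123000ULL));	/* 0 */
	printf("hugepd  -> %d\n", take_fault_path(0x0000000000123018ULL));	/* 1 */
	return 0;
}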
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index d32ec643c23..afc95c7304a 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -36,14 +36,49 @@
 #include <linux/spinlock.h>
 #include <linux/memblock.h>
 #include <linux/of_fdt.h>
+#include <linux/hugetlb.h>
 
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/code-patching.h>
+#include <asm/hugetlb.h>
 
 #include "mmu_decl.h"
 
-#ifdef CONFIG_PPC_BOOK3E
+/*
+ * This struct lists the sw-supported page sizes.  The hardware MMU may support
+ * other sizes not listed here.  The .ind field is only used on MMUs that have
+ * indirect page table entries.
+ */
+#ifdef CONFIG_PPC_BOOK3E_MMU
+#ifdef CONFIG_FSL_BOOKE
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+		.enc	= BOOK3E_PAGESZ_4K,
+	},
+	[MMU_PAGE_4M] = {
+		.shift	= 22,
+		.enc	= BOOK3E_PAGESZ_4M,
+	},
+	[MMU_PAGE_16M] = {
+		.shift	= 24,
+		.enc	= BOOK3E_PAGESZ_16M,
+	},
+	[MMU_PAGE_64M] = {
+		.shift	= 26,
+		.enc	= BOOK3E_PAGESZ_64M,
+	},
+	[MMU_PAGE_256M] = {
+		.shift	= 28,
+		.enc	= BOOK3E_PAGESZ_256M,
+	},
+	[MMU_PAGE_1G] = {
+		.shift	= 30,
+		.enc	= BOOK3E_PAGESZ_1GB,
+	},
+};
+#else
 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
 	[MMU_PAGE_4K] = {
 		.shift	= 12,
@@ -77,6 +112,8 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
 		.enc	= BOOK3E_PAGESZ_1GB,
 	},
 };
+#endif /* CONFIG_FSL_BOOKE */
+
 static inline int mmu_get_tsize(int psize)
 {
 	return mmu_psize_defs[psize].enc;
@@ -87,7 +124,7 @@ static inline int mmu_get_tsize(int psize)
 	/* This isn't used on !Book3E for now */
 	return 0;
 }
-#endif
+#endif /* CONFIG_PPC_BOOK3E_MMU */
 
 /* The variables below are currently only used on 64-bit Book3E
  * though this will probably be made common with other nohash
@@ -266,6 +303,11 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 {
+#ifdef CONFIG_HUGETLB_PAGE
+	if (is_vm_hugetlb_page(vma))
+		flush_hugetlb_page(vma, vmaddr);
+#endif
+
 	__flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
 			 mmu_get_tsize(mmu_virtual_psize), 0);
 }