aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/Makefile5
-rw-r--r--arch/powerpc/mm/gup.c149
-rw-r--r--arch/powerpc/mm/hash_utils_64.c40
-rw-r--r--arch/powerpc/mm/hugetlbpage-hash64.c139
-rw-r--r--arch/powerpc/mm/hugetlbpage.c783
-rw-r--r--arch/powerpc/mm/init_64.c76
-rw-r--r--arch/powerpc/mm/mem.c17
-rw-r--r--arch/powerpc/mm/mmu_context_hash64.c24
-rw-r--r--arch/powerpc/mm/pgtable.c25
-rw-r--r--arch/powerpc/mm/tlb_hash64.c8
10 files changed, 575 insertions, 691 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 6fb8fc8d2fea..ce68708bbad5 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -28,7 +28,10 @@ obj-$(CONFIG_44x) += 44x_mmu.o
28obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o 28obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o
29obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o 29obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
30obj-$(CONFIG_PPC_MM_SLICES) += slice.o 30obj-$(CONFIG_PPC_MM_SLICES) += slice.o
31obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 31ifeq ($(CONFIG_HUGETLB_PAGE),y)
32obj-y += hugetlbpage.o
33obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o
34endif
32obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o 35obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
33obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o 36obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
34obj-$(CONFIG_HIGHMEM) += highmem.o 37obj-$(CONFIG_HIGHMEM) += highmem.o
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index bc122a120bf0..d7efdbf640c7 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -55,57 +55,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
55 return 1; 55 return 1;
56} 56}
57 57
58#ifdef CONFIG_HUGETLB_PAGE
59static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
60 unsigned long *addr, unsigned long end,
61 int write, struct page **pages, int *nr)
62{
63 unsigned long mask;
64 unsigned long pte_end;
65 struct page *head, *page;
66 pte_t pte;
67 int refs;
68
69 pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
70 if (pte_end < end)
71 end = pte_end;
72
73 pte = *ptep;
74 mask = _PAGE_PRESENT|_PAGE_USER;
75 if (write)
76 mask |= _PAGE_RW;
77 if ((pte_val(pte) & mask) != mask)
78 return 0;
79 /* hugepages are never "special" */
80 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
81
82 refs = 0;
83 head = pte_page(pte);
84 page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
85 do {
86 VM_BUG_ON(compound_head(page) != head);
87 pages[*nr] = page;
88 (*nr)++;
89 page++;
90 refs++;
91 } while (*addr += PAGE_SIZE, *addr != end);
92
93 if (!page_cache_add_speculative(head, refs)) {
94 *nr -= refs;
95 return 0;
96 }
97 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
98 /* Could be optimized better */
99 while (*nr) {
100 put_page(page);
101 (*nr)--;
102 }
103 }
104
105 return 1;
106}
107#endif /* CONFIG_HUGETLB_PAGE */
108
109static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, 58static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
110 int write, struct page **pages, int *nr) 59 int write, struct page **pages, int *nr)
111{ 60{
@@ -119,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
119 next = pmd_addr_end(addr, end); 68 next = pmd_addr_end(addr, end);
120 if (pmd_none(pmd)) 69 if (pmd_none(pmd))
121 return 0; 70 return 0;
122 if (!gup_pte_range(pmd, addr, next, write, pages, nr)) 71 if (is_hugepd(pmdp)) {
72 if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
73 addr, next, write, pages, nr))
74 return 0;
75 } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
123 return 0; 76 return 0;
124 } while (pmdp++, addr = next, addr != end); 77 } while (pmdp++, addr = next, addr != end);
125 78
@@ -139,7 +92,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
139 next = pud_addr_end(addr, end); 92 next = pud_addr_end(addr, end);
140 if (pud_none(pud)) 93 if (pud_none(pud))
141 return 0; 94 return 0;
142 if (!gup_pmd_range(pud, addr, next, write, pages, nr)) 95 if (is_hugepd(pudp)) {
96 if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
97 addr, next, write, pages, nr))
98 return 0;
99 } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
143 return 0; 100 return 0;
144 } while (pudp++, addr = next, addr != end); 101 } while (pudp++, addr = next, addr != end);
145 102
@@ -154,10 +111,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
154 unsigned long next; 111 unsigned long next;
155 pgd_t *pgdp; 112 pgd_t *pgdp;
156 int nr = 0; 113 int nr = 0;
157#ifdef CONFIG_PPC64
158 unsigned int shift;
159 int psize;
160#endif
161 114
162 pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); 115 pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
163 116
@@ -172,25 +125,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
172 125
173 pr_devel(" aligned: %lx .. %lx\n", start, end); 126 pr_devel(" aligned: %lx .. %lx\n", start, end);
174 127
175#ifdef CONFIG_HUGETLB_PAGE
176 /* We bail out on slice boundary crossing when hugetlb is
177 * enabled in order to not have to deal with two different
178 * page table formats
179 */
180 if (addr < SLICE_LOW_TOP) {
181 if (end > SLICE_LOW_TOP)
182 goto slow_irqon;
183
184 if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
185 GET_LOW_SLICE_INDEX(end - 1)))
186 goto slow_irqon;
187 } else {
188 if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
189 GET_HIGH_SLICE_INDEX(end - 1)))
190 goto slow_irqon;
191 }
192#endif /* CONFIG_HUGETLB_PAGE */
193
194 /* 128 /*
195 * XXX: batch / limit 'nr', to avoid large irq off latency 129 * XXX: batch / limit 'nr', to avoid large irq off latency
196 * needs some instrumenting to determine the common sizes used by 130 * needs some instrumenting to determine the common sizes used by
@@ -210,54 +144,23 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
210 */ 144 */
211 local_irq_disable(); 145 local_irq_disable();
212 146
213#ifdef CONFIG_PPC64 147 pgdp = pgd_offset(mm, addr);
214 /* Those bits are related to hugetlbfs implementation and only exist 148 do {
215 * on 64-bit for now 149 pgd_t pgd = *pgdp;
216 */ 150
217 psize = get_slice_psize(mm, addr); 151 pr_devel(" %016lx: normal pgd %p\n", addr,
218 shift = mmu_psize_defs[psize].shift; 152 (void *)pgd_val(pgd));
219#endif /* CONFIG_PPC64 */ 153 next = pgd_addr_end(addr, end);
220 154 if (pgd_none(pgd))
221#ifdef CONFIG_HUGETLB_PAGE 155 goto slow;
222 if (unlikely(mmu_huge_psizes[psize])) { 156 if (is_hugepd(pgdp)) {
223 pte_t *ptep; 157 if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
224 unsigned long a = addr; 158 addr, next, write, pages, &nr))
225 unsigned long sz = ((1UL) << shift);
226 struct hstate *hstate = size_to_hstate(sz);
227
228 BUG_ON(!hstate);
229 /*
230 * XXX: could be optimized to avoid hstate
231 * lookup entirely (just use shift)
232 */
233
234 do {
235 VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
236 ptep = huge_pte_offset(mm, a);
237 pr_devel(" %016lx: huge ptep %p\n", a, ptep);
238 if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
239 &nr))
240 goto slow;
241 } while (a != end);
242 } else
243#endif /* CONFIG_HUGETLB_PAGE */
244 {
245 pgdp = pgd_offset(mm, addr);
246 do {
247 pgd_t pgd = *pgdp;
248
249#ifdef CONFIG_PPC64
250 VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
251#endif
252 pr_devel(" %016lx: normal pgd %p\n", addr,
253 (void *)pgd_val(pgd));
254 next = pgd_addr_end(addr, end);
255 if (pgd_none(pgd))
256 goto slow;
257 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
258 goto slow; 159 goto slow;
259 } while (pgdp++, addr = next, addr != end); 160 } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
260 } 161 goto slow;
162 } while (pgdp++, addr = next, addr != end);
163
261 local_irq_enable(); 164 local_irq_enable();
262 165
263 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); 166 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 1ade7eb6ae00..6810128aba30 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -92,6 +92,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
92struct hash_pte *htab_address; 92struct hash_pte *htab_address;
93unsigned long htab_size_bytes; 93unsigned long htab_size_bytes;
94unsigned long htab_hash_mask; 94unsigned long htab_hash_mask;
95EXPORT_SYMBOL_GPL(htab_hash_mask);
95int mmu_linear_psize = MMU_PAGE_4K; 96int mmu_linear_psize = MMU_PAGE_4K;
96int mmu_virtual_psize = MMU_PAGE_4K; 97int mmu_virtual_psize = MMU_PAGE_4K;
97int mmu_vmalloc_psize = MMU_PAGE_4K; 98int mmu_vmalloc_psize = MMU_PAGE_4K;
@@ -102,6 +103,7 @@ int mmu_io_psize = MMU_PAGE_4K;
102int mmu_kernel_ssize = MMU_SEGSIZE_256M; 103int mmu_kernel_ssize = MMU_SEGSIZE_256M;
103int mmu_highuser_ssize = MMU_SEGSIZE_256M; 104int mmu_highuser_ssize = MMU_SEGSIZE_256M;
104u16 mmu_slb_size = 64; 105u16 mmu_slb_size = 64;
106EXPORT_SYMBOL_GPL(mmu_slb_size);
105#ifdef CONFIG_HUGETLB_PAGE 107#ifdef CONFIG_HUGETLB_PAGE
106unsigned int HPAGE_SHIFT; 108unsigned int HPAGE_SHIFT;
107#endif 109#endif
@@ -481,16 +483,6 @@ static void __init htab_init_page_sizes(void)
481#ifdef CONFIG_HUGETLB_PAGE 483#ifdef CONFIG_HUGETLB_PAGE
482 /* Reserve 16G huge page memory sections for huge pages */ 484 /* Reserve 16G huge page memory sections for huge pages */
483 of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); 485 of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
484
485/* Set default large page size. Currently, we pick 16M or 1M depending
486 * on what is available
487 */
488 if (mmu_psize_defs[MMU_PAGE_16M].shift)
489 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
490 /* With 4k/4level pagetables, we can't (for now) cope with a
491 * huge page size < PMD_SIZE */
492 else if (mmu_psize_defs[MMU_PAGE_1M].shift)
493 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
494#endif /* CONFIG_HUGETLB_PAGE */ 486#endif /* CONFIG_HUGETLB_PAGE */
495} 487}
496 488
@@ -785,7 +777,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
785 /* page is dirty */ 777 /* page is dirty */
786 if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { 778 if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
787 if (trap == 0x400) { 779 if (trap == 0x400) {
788 __flush_dcache_icache(page_address(page)); 780 flush_dcache_icache_page(page);
789 set_bit(PG_arch_1, &page->flags); 781 set_bit(PG_arch_1, &page->flags);
790 } else 782 } else
791 pp |= HPTE_R_N; 783 pp |= HPTE_R_N;
@@ -891,6 +883,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
891 unsigned long vsid; 883 unsigned long vsid;
892 struct mm_struct *mm; 884 struct mm_struct *mm;
893 pte_t *ptep; 885 pte_t *ptep;
886 unsigned hugeshift;
894 const struct cpumask *tmp; 887 const struct cpumask *tmp;
895 int rc, user_region = 0, local = 0; 888 int rc, user_region = 0, local = 0;
896 int psize, ssize; 889 int psize, ssize;
@@ -943,30 +936,31 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
943 if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) 936 if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
944 local = 1; 937 local = 1;
945 938
946#ifdef CONFIG_HUGETLB_PAGE
947 /* Handle hugepage regions */
948 if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
949 DBG_LOW(" -> huge page !\n");
950 return hash_huge_page(mm, access, ea, vsid, local, trap);
951 }
952#endif /* CONFIG_HUGETLB_PAGE */
953
954#ifndef CONFIG_PPC_64K_PAGES 939#ifndef CONFIG_PPC_64K_PAGES
955 /* If we use 4K pages and our psize is not 4K, then we are hitting 940 /* If we use 4K pages and our psize is not 4K, then we might
956 * a special driver mapping, we need to align the address before 941 * be hitting a special driver mapping, and need to align the
957 * we fetch the PTE 942 * address before we fetch the PTE.
943 *
944 * It could also be a hugepage mapping, in which case this is
945 * not necessary, but it's not harmful, either.
958 */ 946 */
959 if (psize != MMU_PAGE_4K) 947 if (psize != MMU_PAGE_4K)
960 ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); 948 ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
961#endif /* CONFIG_PPC_64K_PAGES */ 949#endif /* CONFIG_PPC_64K_PAGES */
962 950
963 /* Get PTE and page size from page tables */ 951 /* Get PTE and page size from page tables */
964 ptep = find_linux_pte(pgdir, ea); 952 ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
965 if (ptep == NULL || !pte_present(*ptep)) { 953 if (ptep == NULL || !pte_present(*ptep)) {
966 DBG_LOW(" no PTE !\n"); 954 DBG_LOW(" no PTE !\n");
967 return 1; 955 return 1;
968 } 956 }
969 957
958#ifdef CONFIG_HUGETLB_PAGE
959 if (hugeshift)
960 return __hash_page_huge(ea, access, vsid, ptep, trap, local,
961 ssize, hugeshift, psize);
962#endif /* CONFIG_HUGETLB_PAGE */
963
970#ifndef CONFIG_PPC_64K_PAGES 964#ifndef CONFIG_PPC_64K_PAGES
971 DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); 965 DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
972#else 966#else
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
new file mode 100644
index 000000000000..199539882f92
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -0,0 +1,139 @@
1/*
2 * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
3 *
4 * Copyright (C) 2003 David Gibson, IBM Corporation.
5 *
6 * Based on the IA-32 version:
7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
8 */
9
10#include <linux/mm.h>
11#include <linux/hugetlb.h>
12#include <asm/pgtable.h>
13#include <asm/pgalloc.h>
14#include <asm/cacheflush.h>
15#include <asm/machdep.h>
16
17int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
18 pte_t *ptep, unsigned long trap, int local, int ssize,
19 unsigned int shift, unsigned int mmu_psize)
20{
21 unsigned long old_pte, new_pte;
22 unsigned long va, rflags, pa, sz;
23 long slot;
24 int err = 1;
25
26 BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
27
28 /* Search the Linux page table for a match with va */
29 va = hpt_va(ea, vsid, ssize);
30
31 /*
32 * Check the user's access rights to the page. If access should be
33 * prevented then send the problem up to do_page_fault.
34 */
35 if (unlikely(access & ~pte_val(*ptep)))
36 goto out;
37 /*
38 * At this point, we have a pte (old_pte) which can be used to build
39 * or update an HPTE. There are 2 cases:
40 *
41 * 1. There is a valid (present) pte with no associated HPTE (this is
42 * the most common case)
43 * 2. There is a valid (present) pte with an associated HPTE. The
44 * current values of the pp bits in the HPTE prevent access
45 * because we are doing software DIRTY bit management and the
46 * page is currently not DIRTY.
47 */
48
49
50 do {
51 old_pte = pte_val(*ptep);
52 if (old_pte & _PAGE_BUSY)
53 goto out;
54 new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
55 } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
56 old_pte, new_pte));
57
58 rflags = 0x2 | (!(new_pte & _PAGE_RW));
59 /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
60 rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
61 sz = ((1UL) << shift);
62 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
63 /* No CPU has hugepages but lacks no execute, so we
64 * don't need to worry about that case */
65 rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
66
67 /* Check if pte already has an hpte (case 2) */
68 if (unlikely(old_pte & _PAGE_HASHPTE)) {
69 /* There MIGHT be an HPTE for this pte */
70 unsigned long hash, slot;
71
72 hash = hpt_hash(va, shift, ssize);
73 if (old_pte & _PAGE_F_SECOND)
74 hash = ~hash;
75 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
76 slot += (old_pte & _PAGE_F_GIX) >> 12;
77
78 if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
79 ssize, local) == -1)
80 old_pte &= ~_PAGE_HPTEFLAGS;
81 }
82
83 if (likely(!(old_pte & _PAGE_HASHPTE))) {
84 unsigned long hash = hpt_hash(va, shift, ssize);
85 unsigned long hpte_group;
86
87 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
88
89repeat:
90 hpte_group = ((hash & htab_hash_mask) *
91 HPTES_PER_GROUP) & ~0x7UL;
92
93 /* clear HPTE slot informations in new PTE */
94#ifdef CONFIG_PPC_64K_PAGES
95 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
96#else
97 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
98#endif
99 /* Add in WIMG bits */
100 rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
101 _PAGE_COHERENT | _PAGE_GUARDED));
102
103 /* Insert into the hash table, primary slot */
104 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
105 mmu_psize, ssize);
106
107 /* Primary is full, try the secondary */
108 if (unlikely(slot == -1)) {
109 hpte_group = ((~hash & htab_hash_mask) *
110 HPTES_PER_GROUP) & ~0x7UL;
111 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
112 HPTE_V_SECONDARY,
113 mmu_psize, ssize);
114 if (slot == -1) {
115 if (mftb() & 0x1)
116 hpte_group = ((hash & htab_hash_mask) *
117 HPTES_PER_GROUP)&~0x7UL;
118
119 ppc_md.hpte_remove(hpte_group);
120 goto repeat;
121 }
122 }
123
124 if (unlikely(slot == -2))
125 panic("hash_huge_page: pte_insert failed\n");
126
127 new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
128 }
129
130 /*
131 * No need to use ldarx/stdcx here
132 */
133 *ptep = __pte(new_pte & ~_PAGE_BUSY);
134
135 err = 0;
136
137 out:
138 return err;
139}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 90df6ffe3a43..53b200abb025 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -7,29 +7,17 @@
7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> 7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
8 */ 8 */
9 9
10#include <linux/init.h>
11#include <linux/fs.h>
12#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/io.h>
13#include <linux/hugetlb.h> 12#include <linux/hugetlb.h>
14#include <linux/pagemap.h> 13#include <asm/pgtable.h>
15#include <linux/slab.h>
16#include <linux/err.h>
17#include <linux/sysctl.h>
18#include <asm/mman.h>
19#include <asm/pgalloc.h> 14#include <asm/pgalloc.h>
20#include <asm/tlb.h> 15#include <asm/tlb.h>
21#include <asm/tlbflush.h>
22#include <asm/mmu_context.h>
23#include <asm/machdep.h>
24#include <asm/cputable.h>
25#include <asm/spu.h>
26 16
27#define PAGE_SHIFT_64K 16 17#define PAGE_SHIFT_64K 16
28#define PAGE_SHIFT_16M 24 18#define PAGE_SHIFT_16M 24
29#define PAGE_SHIFT_16G 34 19#define PAGE_SHIFT_16G 34
30 20
31#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT)
32#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
33#define MAX_NUMBER_GPAGES 1024 21#define MAX_NUMBER_GPAGES 1024
34 22
35/* Tracks the 16G pages after the device tree is scanned and before the 23/* Tracks the 16G pages after the device tree is scanned and before the
@@ -37,53 +25,17 @@
37static unsigned long gpage_freearray[MAX_NUMBER_GPAGES]; 25static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
38static unsigned nr_gpages; 26static unsigned nr_gpages;
39 27
40/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
41 * stored for the huge page sizes that are valid.
42 */
43unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
44
45#define hugepte_shift mmu_huge_psizes
46#define PTRS_PER_HUGEPTE(psize) (1 << hugepte_shift[psize])
47#define HUGEPTE_TABLE_SIZE(psize) (sizeof(pte_t) << hugepte_shift[psize])
48
49#define HUGEPD_SHIFT(psize) (mmu_psize_to_shift(psize) \
50 + hugepte_shift[psize])
51#define HUGEPD_SIZE(psize) (1UL << HUGEPD_SHIFT(psize))
52#define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1))
53
54/* Subtract one from array size because we don't need a cache for 4K since
55 * is not a huge page size */
56#define HUGE_PGTABLE_INDEX(psize) (HUGEPTE_CACHE_NUM + psize - 1)
57#define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize])
58
59static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
60 [MMU_PAGE_64K] = "hugepte_cache_64K",
61 [MMU_PAGE_1M] = "hugepte_cache_1M",
62 [MMU_PAGE_16M] = "hugepte_cache_16M",
63 [MMU_PAGE_16G] = "hugepte_cache_16G",
64};
65
66/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() 28/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
67 * will choke on pointers to hugepte tables, which is handy for 29 * will choke on pointers to hugepte tables, which is handy for
68 * catching screwups early. */ 30 * catching screwups early. */
69#define HUGEPD_OK 0x1
70
71typedef struct { unsigned long pd; } hugepd_t;
72
73#define hugepd_none(hpd) ((hpd).pd == 0)
74 31
75static inline int shift_to_mmu_psize(unsigned int shift) 32static inline int shift_to_mmu_psize(unsigned int shift)
76{ 33{
77 switch (shift) { 34 int psize;
78#ifndef CONFIG_PPC_64K_PAGES 35
79 case PAGE_SHIFT_64K: 36 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
80 return MMU_PAGE_64K; 37 if (mmu_psize_defs[psize].shift == shift)
81#endif 38 return psize;
82 case PAGE_SHIFT_16M:
83 return MMU_PAGE_16M;
84 case PAGE_SHIFT_16G:
85 return MMU_PAGE_16G;
86 }
87 return -1; 39 return -1;
88} 40}
89 41
@@ -94,71 +46,126 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
94 BUG(); 46 BUG();
95} 47}
96 48
49#define hugepd_none(hpd) ((hpd).pd == 0)
50
97static inline pte_t *hugepd_page(hugepd_t hpd) 51static inline pte_t *hugepd_page(hugepd_t hpd)
98{ 52{
99 BUG_ON(!(hpd.pd & HUGEPD_OK)); 53 BUG_ON(!hugepd_ok(hpd));
100 return (pte_t *)(hpd.pd & ~HUGEPD_OK); 54 return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
101} 55}
102 56
103static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, 57static inline unsigned int hugepd_shift(hugepd_t hpd)
104 struct hstate *hstate)
105{ 58{
106 unsigned int shift = huge_page_shift(hstate); 59 return hpd.pd & HUGEPD_SHIFT_MASK;
107 int psize = shift_to_mmu_psize(shift); 60}
108 unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1)); 61
62static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
63{
64 unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
109 pte_t *dir = hugepd_page(*hpdp); 65 pte_t *dir = hugepd_page(*hpdp);
110 66
111 return dir + idx; 67 return dir + idx;
112} 68}
113 69
70pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
71{
72 pgd_t *pg;
73 pud_t *pu;
74 pmd_t *pm;
75 hugepd_t *hpdp = NULL;
76 unsigned pdshift = PGDIR_SHIFT;
77
78 if (shift)
79 *shift = 0;
80
81 pg = pgdir + pgd_index(ea);
82 if (is_hugepd(pg)) {
83 hpdp = (hugepd_t *)pg;
84 } else if (!pgd_none(*pg)) {
85 pdshift = PUD_SHIFT;
86 pu = pud_offset(pg, ea);
87 if (is_hugepd(pu))
88 hpdp = (hugepd_t *)pu;
89 else if (!pud_none(*pu)) {
90 pdshift = PMD_SHIFT;
91 pm = pmd_offset(pu, ea);
92 if (is_hugepd(pm))
93 hpdp = (hugepd_t *)pm;
94 else if (!pmd_none(*pm)) {
95 return pte_offset_map(pm, ea);
96 }
97 }
98 }
99
100 if (!hpdp)
101 return NULL;
102
103 if (shift)
104 *shift = hugepd_shift(*hpdp);
105 return hugepte_offset(hpdp, ea, pdshift);
106}
107
108pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
109{
110 return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
111}
112
114static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, 113static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
115 unsigned long address, unsigned int psize) 114 unsigned long address, unsigned pdshift, unsigned pshift)
116{ 115{
117 pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], 116 pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
118 GFP_KERNEL|__GFP_REPEAT); 117 GFP_KERNEL|__GFP_REPEAT);
118
119 BUG_ON(pshift > HUGEPD_SHIFT_MASK);
120 BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
119 121
120 if (! new) 122 if (! new)
121 return -ENOMEM; 123 return -ENOMEM;
122 124
123 spin_lock(&mm->page_table_lock); 125 spin_lock(&mm->page_table_lock);
124 if (!hugepd_none(*hpdp)) 126 if (!hugepd_none(*hpdp))
125 kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new); 127 kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
126 else 128 else
127 hpdp->pd = (unsigned long)new | HUGEPD_OK; 129 hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
128 spin_unlock(&mm->page_table_lock); 130 spin_unlock(&mm->page_table_lock);
129 return 0; 131 return 0;
130} 132}
131 133
132 134pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
133static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
134{ 135{
135 if (huge_page_shift(hstate) < PUD_SHIFT) 136 pgd_t *pg;
136 return pud_offset(pgd, addr); 137 pud_t *pu;
137 else 138 pmd_t *pm;
138 return (pud_t *) pgd; 139 hugepd_t *hpdp = NULL;
139} 140 unsigned pshift = __ffs(sz);
140static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, 141 unsigned pdshift = PGDIR_SHIFT;
141 struct hstate *hstate) 142
142{ 143 addr &= ~(sz-1);
143 if (huge_page_shift(hstate) < PUD_SHIFT) 144
144 return pud_alloc(mm, pgd, addr); 145 pg = pgd_offset(mm, addr);
145 else 146 if (pshift >= PUD_SHIFT) {
146 return (pud_t *) pgd; 147 hpdp = (hugepd_t *)pg;
147} 148 } else {
148static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate) 149 pdshift = PUD_SHIFT;
149{ 150 pu = pud_alloc(mm, pg, addr);
150 if (huge_page_shift(hstate) < PMD_SHIFT) 151 if (pshift >= PMD_SHIFT) {
151 return pmd_offset(pud, addr); 152 hpdp = (hugepd_t *)pu;
152 else 153 } else {
153 return (pmd_t *) pud; 154 pdshift = PMD_SHIFT;
154} 155 pm = pmd_alloc(mm, pu, addr);
155static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr, 156 hpdp = (hugepd_t *)pm;
156 struct hstate *hstate) 157 }
157{ 158 }
158 if (huge_page_shift(hstate) < PMD_SHIFT) 159
159 return pmd_alloc(mm, pud, addr); 160 if (!hpdp)
160 else 161 return NULL;
161 return (pmd_t *) pud; 162
163 BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
164
165 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
166 return NULL;
167
168 return hugepte_offset(hpdp, addr, pdshift);
162} 169}
163 170
164/* Build list of addresses of gigantic pages. This function is used in early 171/* Build list of addresses of gigantic pages. This function is used in early
@@ -192,94 +199,38 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
192 return 1; 199 return 1;
193} 200}
194 201
195
196/* Modelled after find_linux_pte() */
197pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
198{
199 pgd_t *pg;
200 pud_t *pu;
201 pmd_t *pm;
202
203 unsigned int psize;
204 unsigned int shift;
205 unsigned long sz;
206 struct hstate *hstate;
207 psize = get_slice_psize(mm, addr);
208 shift = mmu_psize_to_shift(psize);
209 sz = ((1UL) << shift);
210 hstate = size_to_hstate(sz);
211
212 addr &= hstate->mask;
213
214 pg = pgd_offset(mm, addr);
215 if (!pgd_none(*pg)) {
216 pu = hpud_offset(pg, addr, hstate);
217 if (!pud_none(*pu)) {
218 pm = hpmd_offset(pu, addr, hstate);
219 if (!pmd_none(*pm))
220 return hugepte_offset((hugepd_t *)pm, addr,
221 hstate);
222 }
223 }
224
225 return NULL;
226}
227
228pte_t *huge_pte_alloc(struct mm_struct *mm,
229 unsigned long addr, unsigned long sz)
230{
231 pgd_t *pg;
232 pud_t *pu;
233 pmd_t *pm;
234 hugepd_t *hpdp = NULL;
235 struct hstate *hstate;
236 unsigned int psize;
237 hstate = size_to_hstate(sz);
238
239 psize = get_slice_psize(mm, addr);
240 BUG_ON(!mmu_huge_psizes[psize]);
241
242 addr &= hstate->mask;
243
244 pg = pgd_offset(mm, addr);
245 pu = hpud_alloc(mm, pg, addr, hstate);
246
247 if (pu) {
248 pm = hpmd_alloc(mm, pu, addr, hstate);
249 if (pm)
250 hpdp = (hugepd_t *)pm;
251 }
252
253 if (! hpdp)
254 return NULL;
255
256 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
257 return NULL;
258
259 return hugepte_offset(hpdp, addr, hstate);
260}
261
262int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 202int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
263{ 203{
264 return 0; 204 return 0;
265} 205}
266 206
267static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp, 207static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
268 unsigned int psize) 208 unsigned long start, unsigned long end,
209 unsigned long floor, unsigned long ceiling)
269{ 210{
270 pte_t *hugepte = hugepd_page(*hpdp); 211 pte_t *hugepte = hugepd_page(*hpdp);
212 unsigned shift = hugepd_shift(*hpdp);
213 unsigned long pdmask = ~((1UL << pdshift) - 1);
214
215 start &= pdmask;
216 if (start < floor)
217 return;
218 if (ceiling) {
219 ceiling &= pdmask;
220 if (! ceiling)
221 return;
222 }
223 if (end - 1 > ceiling - 1)
224 return;
271 225
272 hpdp->pd = 0; 226 hpdp->pd = 0;
273 tlb->need_flush = 1; 227 tlb->need_flush = 1;
274 pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, 228 pgtable_free_tlb(tlb, hugepte, pdshift - shift);
275 HUGEPTE_CACHE_NUM+psize-1,
276 PGF_CACHENUM_MASK));
277} 229}
278 230
279static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 231static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
280 unsigned long addr, unsigned long end, 232 unsigned long addr, unsigned long end,
281 unsigned long floor, unsigned long ceiling, 233 unsigned long floor, unsigned long ceiling)
282 unsigned int psize)
283{ 234{
284 pmd_t *pmd; 235 pmd_t *pmd;
285 unsigned long next; 236 unsigned long next;
@@ -291,7 +242,8 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
291 next = pmd_addr_end(addr, end); 242 next = pmd_addr_end(addr, end);
292 if (pmd_none(*pmd)) 243 if (pmd_none(*pmd))
293 continue; 244 continue;
294 free_hugepte_range(tlb, (hugepd_t *)pmd, psize); 245 free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
246 addr, next, floor, ceiling);
295 } while (pmd++, addr = next, addr != end); 247 } while (pmd++, addr = next, addr != end);
296 248
297 start &= PUD_MASK; 249 start &= PUD_MASK;
@@ -317,23 +269,19 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
317 pud_t *pud; 269 pud_t *pud;
318 unsigned long next; 270 unsigned long next;
319 unsigned long start; 271 unsigned long start;
320 unsigned int shift;
321 unsigned int psize = get_slice_psize(tlb->mm, addr);
322 shift = mmu_psize_to_shift(psize);
323 272
324 start = addr; 273 start = addr;
325 pud = pud_offset(pgd, addr); 274 pud = pud_offset(pgd, addr);
326 do { 275 do {
327 next = pud_addr_end(addr, end); 276 next = pud_addr_end(addr, end);
328 if (shift < PMD_SHIFT) { 277 if (!is_hugepd(pud)) {
329 if (pud_none_or_clear_bad(pud)) 278 if (pud_none_or_clear_bad(pud))
330 continue; 279 continue;
331 hugetlb_free_pmd_range(tlb, pud, addr, next, floor, 280 hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
332 ceiling, psize); 281 ceiling);
333 } else { 282 } else {
334 if (pud_none(*pud)) 283 free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
335 continue; 284 addr, next, floor, ceiling);
336 free_hugepte_range(tlb, (hugepd_t *)pud, psize);
337 } 285 }
338 } while (pud++, addr = next, addr != end); 286 } while (pud++, addr = next, addr != end);
339 287
@@ -364,121 +312,56 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
364{ 312{
365 pgd_t *pgd; 313 pgd_t *pgd;
366 unsigned long next; 314 unsigned long next;
367 unsigned long start;
368 315
369 /* 316 /*
370 * Comments below take from the normal free_pgd_range(). They 317 * Because there are a number of different possible pagetable
371 * apply here too. The tests against HUGEPD_MASK below are 318 * layouts for hugepage ranges, we limit knowledge of how
372 * essential, because we *don't* test for this at the bottom 319 * things should be laid out to the allocation path
373 * level. Without them we'll attempt to free a hugepte table 320 * (huge_pte_alloc(), above). Everything else works out the
374 * when we unmap just part of it, even if there are other 321 * structure as it goes from information in the hugepd
375 * active mappings using it. 322 * pointers. That means that we can't here use the
376 * 323 * optimization used in the normal page free_pgd_range(), of
377 * The next few lines have given us lots of grief... 324 * checking whether we're actually covering a large enough
325 * range to have to do anything at the top level of the walk
326 * instead of at the bottom.
378 * 327 *
379 * Why are we testing HUGEPD* at this top level? Because 328 * To make sense of this, you should probably go read the big
380 * often there will be no work to do at all, and we'd prefer 329 * block comment at the top of the normal free_pgd_range(),
381 * not to go all the way down to the bottom just to discover 330 * too.
382 * that.
383 *
384 * Why all these "- 1"s? Because 0 represents both the bottom
385 * of the address space and the top of it (using -1 for the
386 * top wouldn't help much: the masks would do the wrong thing).
387 * The rule is that addr 0 and floor 0 refer to the bottom of
388 * the address space, but end 0 and ceiling 0 refer to the top
389 * Comparisons need to use "end - 1" and "ceiling - 1" (though
390 * that end 0 case should be mythical).
391 *
392 * Wherever addr is brought up or ceiling brought down, we
393 * must be careful to reject "the opposite 0" before it
394 * confuses the subsequent tests. But what about where end is
395 * brought down by HUGEPD_SIZE below? no, end can't go down to
396 * 0 there.
397 *
398 * Whereas we round start (addr) and ceiling down, by different
399 * masks at different levels, in order to test whether a table
400 * now has no other vmas using it, so can be freed, we don't
401 * bother to round floor or end up - the tests don't need that.
402 */ 331 */
403 unsigned int psize = get_slice_psize(tlb->mm, addr);
404
405 addr &= HUGEPD_MASK(psize);
406 if (addr < floor) {
407 addr += HUGEPD_SIZE(psize);
408 if (!addr)
409 return;
410 }
411 if (ceiling) {
412 ceiling &= HUGEPD_MASK(psize);
413 if (!ceiling)
414 return;
415 }
416 if (end - 1 > ceiling - 1)
417 end -= HUGEPD_SIZE(psize);
418 if (addr > end - 1)
419 return;
420 332
421 start = addr;
422 pgd = pgd_offset(tlb->mm, addr); 333 pgd = pgd_offset(tlb->mm, addr);
423 do { 334 do {
424 psize = get_slice_psize(tlb->mm, addr);
425 BUG_ON(!mmu_huge_psizes[psize]);
426 next = pgd_addr_end(addr, end); 335 next = pgd_addr_end(addr, end);
427 if (mmu_psize_to_shift(psize) < PUD_SHIFT) { 336 if (!is_hugepd(pgd)) {
428 if (pgd_none_or_clear_bad(pgd)) 337 if (pgd_none_or_clear_bad(pgd))
429 continue; 338 continue;
430 hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); 339 hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
431 } else { 340 } else {
432 if (pgd_none(*pgd)) 341 free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
433 continue; 342 addr, next, floor, ceiling);
434 free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
435 } 343 }
436 } while (pgd++, addr = next, addr != end); 344 } while (pgd++, addr = next, addr != end);
437} 345}
438 346
439void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
440 pte_t *ptep, pte_t pte)
441{
442 if (pte_present(*ptep)) {
443 /* We open-code pte_clear because we need to pass the right
444 * argument to hpte_need_flush (huge / !huge). Might not be
445 * necessary anymore if we make hpte_need_flush() get the
446 * page size from the slices
447 */
448 unsigned int psize = get_slice_psize(mm, addr);
449 unsigned int shift = mmu_psize_to_shift(psize);
450 unsigned long sz = ((1UL) << shift);
451 struct hstate *hstate = size_to_hstate(sz);
452 pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
453 }
454 *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
455}
456
457pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
458 pte_t *ptep)
459{
460 unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
461 return __pte(old);
462}
463
464struct page * 347struct page *
465follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) 348follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
466{ 349{
467 pte_t *ptep; 350 pte_t *ptep;
468 struct page *page; 351 struct page *page;
469 unsigned int mmu_psize = get_slice_psize(mm, address); 352 unsigned shift;
353 unsigned long mask;
354
355 ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
470 356
471 /* Verify it is a huge page else bail. */ 357 /* Verify it is a huge page else bail. */
472 if (!mmu_huge_psizes[mmu_psize]) 358 if (!ptep || !shift)
473 return ERR_PTR(-EINVAL); 359 return ERR_PTR(-EINVAL);
474 360
475 ptep = huge_pte_offset(mm, address); 361 mask = (1UL << shift) - 1;
476 page = pte_page(*ptep); 362 page = pte_page(*ptep);
477 if (page) { 363 if (page)
478 unsigned int shift = mmu_psize_to_shift(mmu_psize); 364 page += (address & mask) / PAGE_SIZE;
479 unsigned long sz = ((1UL) << shift);
480 page += (address % sz) / PAGE_SIZE;
481 }
482 365
483 return page; 366 return page;
484} 367}
@@ -501,6 +384,73 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
501 return NULL; 384 return NULL;
502} 385}
503 386
387static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
388 unsigned long end, int write, struct page **pages, int *nr)
389{
390 unsigned long mask;
391 unsigned long pte_end;
392 struct page *head, *page;
393 pte_t pte;
394 int refs;
395
396 pte_end = (addr + sz) & ~(sz-1);
397 if (pte_end < end)
398 end = pte_end;
399
400 pte = *ptep;
401 mask = _PAGE_PRESENT | _PAGE_USER;
402 if (write)
403 mask |= _PAGE_RW;
404
405 if ((pte_val(pte) & mask) != mask)
406 return 0;
407
408 /* hugepages are never "special" */
409 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
410
411 refs = 0;
412 head = pte_page(pte);
413
414 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
415 do {
416 VM_BUG_ON(compound_head(page) != head);
417 pages[*nr] = page;
418 (*nr)++;
419 page++;
420 refs++;
421 } while (addr += PAGE_SIZE, addr != end);
422
423 if (!page_cache_add_speculative(head, refs)) {
424 *nr -= refs;
425 return 0;
426 }
427
428 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
429 /* Could be optimized better */
430 while (*nr) {
431 put_page(page);
432 (*nr)--;
433 }
434 }
435
436 return 1;
437}
438
439int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
440 unsigned long addr, unsigned long end,
441 int write, struct page **pages, int *nr)
442{
443 pte_t *ptep;
444 unsigned long sz = 1UL << hugepd_shift(*hugepd);
445
446 ptep = hugepte_offset(hugepd, addr, pdshift);
447 do {
448 if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
449 return 0;
450 } while (ptep++, addr += sz, addr != end);
451
452 return 1;
453}
504 454
505unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 455unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
506 unsigned long len, unsigned long pgoff, 456 unsigned long len, unsigned long pgoff,
@@ -509,8 +459,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
509 struct hstate *hstate = hstate_file(file); 459 struct hstate *hstate = hstate_file(file);
510 int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); 460 int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
511 461
512 if (!mmu_huge_psizes[mmu_psize])
513 return -EINVAL;
514 return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); 462 return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
515} 463}
516 464
@@ -521,229 +469,46 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
521 return 1UL << mmu_psize_to_shift(psize); 469 return 1UL << mmu_psize_to_shift(psize);
522} 470}
523 471
524/* 472static int __init add_huge_page_size(unsigned long long size)
525 * Called by asm hashtable.S for doing lazy icache flush
526 */
527static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
528 pte_t pte, int trap, unsigned long sz)
529{ 473{
530 struct page *page; 474 int shift = __ffs(size);
531 int i; 475 int mmu_psize;
532
533 if (!pfn_valid(pte_pfn(pte)))
534 return rflags;
535
536 page = pte_page(pte);
537
538 /* page is dirty */
539 if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
540 if (trap == 0x400) {
541 for (i = 0; i < (sz / PAGE_SIZE); i++)
542 __flush_dcache_icache(page_address(page+i));
543 set_bit(PG_arch_1, &page->flags);
544 } else {
545 rflags |= HPTE_R_N;
546 }
547 }
548 return rflags;
549}
550 476
551int hash_huge_page(struct mm_struct *mm, unsigned long access, 477 /* Check that it is a page size supported by the hardware and
552 unsigned long ea, unsigned long vsid, int local, 478 * that it fits within pagetable and slice limits. */
553 unsigned long trap) 479 if (!is_power_of_2(size)
554{ 480 || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
555 pte_t *ptep; 481 return -EINVAL;
556 unsigned long old_pte, new_pte;
557 unsigned long va, rflags, pa, sz;
558 long slot;
559 int err = 1;
560 int ssize = user_segment_size(ea);
561 unsigned int mmu_psize;
562 int shift;
563 mmu_psize = get_slice_psize(mm, ea);
564
565 if (!mmu_huge_psizes[mmu_psize])
566 goto out;
567 ptep = huge_pte_offset(mm, ea);
568
569 /* Search the Linux page table for a match with va */
570 va = hpt_va(ea, vsid, ssize);
571 482
572 /* 483 if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
573 * If no pte found or not present, send the problem up to 484 return -EINVAL;
574 * do_page_fault
575 */
576 if (unlikely(!ptep || pte_none(*ptep)))
577 goto out;
578 485
579 /* 486#ifdef CONFIG_SPU_FS_64K_LS
580 * Check the user's access rights to the page. If access should be 487 /* Disable support for 64K huge pages when 64K SPU local store
581 * prevented then send the problem up to do_page_fault. 488 * support is enabled as the current implementation conflicts.
582 */
583 if (unlikely(access & ~pte_val(*ptep)))
584 goto out;
585 /*
586 * At this point, we have a pte (old_pte) which can be used to build
587 * or update an HPTE. There are 2 cases:
588 *
589 * 1. There is a valid (present) pte with no associated HPTE (this is
590 * the most common case)
591 * 2. There is a valid (present) pte with an associated HPTE. The
592 * current values of the pp bits in the HPTE prevent access
593 * because we are doing software DIRTY bit management and the
594 * page is currently not DIRTY.
595 */ 489 */
490 if (shift == PAGE_SHIFT_64K)
491 return -EINVAL;
492#endif /* CONFIG_SPU_FS_64K_LS */
596 493
494 BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
597 495
598 do { 496 /* Return if huge page size has already been setup */
599 old_pte = pte_val(*ptep); 497 if (size_to_hstate(size))
600 if (old_pte & _PAGE_BUSY) 498 return 0;
601 goto out;
602 new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
603 } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
604 old_pte, new_pte));
605
606 rflags = 0x2 | (!(new_pte & _PAGE_RW));
607 /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
608 rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
609 shift = mmu_psize_to_shift(mmu_psize);
610 sz = ((1UL) << shift);
611 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
612 /* No CPU has hugepages but lacks no execute, so we
613 * don't need to worry about that case */
614 rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
615 trap, sz);
616
617 /* Check if pte already has an hpte (case 2) */
618 if (unlikely(old_pte & _PAGE_HASHPTE)) {
619 /* There MIGHT be an HPTE for this pte */
620 unsigned long hash, slot;
621
622 hash = hpt_hash(va, shift, ssize);
623 if (old_pte & _PAGE_F_SECOND)
624 hash = ~hash;
625 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
626 slot += (old_pte & _PAGE_F_GIX) >> 12;
627
628 if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
629 ssize, local) == -1)
630 old_pte &= ~_PAGE_HPTEFLAGS;
631 }
632
633 if (likely(!(old_pte & _PAGE_HASHPTE))) {
634 unsigned long hash = hpt_hash(va, shift, ssize);
635 unsigned long hpte_group;
636
637 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
638
639repeat:
640 hpte_group = ((hash & htab_hash_mask) *
641 HPTES_PER_GROUP) & ~0x7UL;
642
643 /* clear HPTE slot informations in new PTE */
644#ifdef CONFIG_PPC_64K_PAGES
645 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
646#else
647 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
648#endif
649 /* Add in WIMG bits */
650 rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
651 _PAGE_COHERENT | _PAGE_GUARDED));
652
653 /* Insert into the hash table, primary slot */
654 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
655 mmu_psize, ssize);
656
657 /* Primary is full, try the secondary */
658 if (unlikely(slot == -1)) {
659 hpte_group = ((~hash & htab_hash_mask) *
660 HPTES_PER_GROUP) & ~0x7UL;
661 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
662 HPTE_V_SECONDARY,
663 mmu_psize, ssize);
664 if (slot == -1) {
665 if (mftb() & 0x1)
666 hpte_group = ((hash & htab_hash_mask) *
667 HPTES_PER_GROUP)&~0x7UL;
668
669 ppc_md.hpte_remove(hpte_group);
670 goto repeat;
671 }
672 }
673
674 if (unlikely(slot == -2))
675 panic("hash_huge_page: pte_insert failed\n");
676
677 new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
678 }
679
680 /*
681 * No need to use ldarx/stdcx here
682 */
683 *ptep = __pte(new_pte & ~_PAGE_BUSY);
684 499
685 err = 0; 500 hugetlb_add_hstate(shift - PAGE_SHIFT);
686 501
687 out: 502 return 0;
688 return err;
689}
690
691static void __init set_huge_psize(int psize)
692{
693 /* Check that it is a page size supported by the hardware and
694 * that it fits within pagetable limits. */
695 if (mmu_psize_defs[psize].shift &&
696 mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
697 (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
698 mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
699 mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
700 /* Return if huge page size has already been setup or is the
701 * same as the base page size. */
702 if (mmu_huge_psizes[psize] ||
703 mmu_psize_defs[psize].shift == PAGE_SHIFT)
704 return;
705 if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL))
706 return;
707 hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
708
709 switch (mmu_psize_defs[psize].shift) {
710 case PAGE_SHIFT_64K:
711 /* We only allow 64k hpages with 4k base page,
712 * which was checked above, and always put them
713 * at the PMD */
714 hugepte_shift[psize] = PMD_SHIFT;
715 break;
716 case PAGE_SHIFT_16M:
717 /* 16M pages can be at two different levels
718 * of pagestables based on base page size */
719 if (PAGE_SHIFT == PAGE_SHIFT_64K)
720 hugepte_shift[psize] = PMD_SHIFT;
721 else /* 4k base page */
722 hugepte_shift[psize] = PUD_SHIFT;
723 break;
724 case PAGE_SHIFT_16G:
725 /* 16G pages are always at PGD level */
726 hugepte_shift[psize] = PGDIR_SHIFT;
727 break;
728 }
729 hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
730 } else
731 hugepte_shift[psize] = 0;
732} 503}
733 504
734static int __init hugepage_setup_sz(char *str) 505static int __init hugepage_setup_sz(char *str)
735{ 506{
736 unsigned long long size; 507 unsigned long long size;
737 int mmu_psize;
738 int shift;
739 508
740 size = memparse(str, &str); 509 size = memparse(str, &str);
741 510
742 shift = __ffs(size); 511 if (add_huge_page_size(size) != 0)
743 mmu_psize = shift_to_mmu_psize(shift);
744 if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
745 set_huge_psize(mmu_psize);
746 else
747 printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); 512 printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
748 513
749 return 1; 514 return 1;
@@ -752,41 +517,55 @@ __setup("hugepagesz=", hugepage_setup_sz);
752 517
753static int __init hugetlbpage_init(void) 518static int __init hugetlbpage_init(void)
754{ 519{
755 unsigned int psize; 520 int psize;
756 521
757 if (!cpu_has_feature(CPU_FTR_16M_PAGE)) 522 if (!cpu_has_feature(CPU_FTR_16M_PAGE))
758 return -ENODEV; 523 return -ENODEV;
759 524
760 /* Add supported huge page sizes. Need to change HUGE_MAX_HSTATE 525 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
761 * and adjust PTE_NONCACHE_NUM if the number of supported huge page 526 unsigned shift;
762 * sizes changes. 527 unsigned pdshift;
763 */
764 set_huge_psize(MMU_PAGE_16M);
765 set_huge_psize(MMU_PAGE_16G);
766 528
767 /* Temporarily disable support for 64K huge pages when 64K SPU local 529 if (!mmu_psize_defs[psize].shift)
768 * store support is enabled as the current implementation conflicts. 530 continue;
769 */
770#ifndef CONFIG_SPU_FS_64K_LS
771 set_huge_psize(MMU_PAGE_64K);
772#endif
773 531
774 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { 532 shift = mmu_psize_to_shift(psize);
775 if (mmu_huge_psizes[psize]) { 533
776 pgtable_cache[HUGE_PGTABLE_INDEX(psize)] = 534 if (add_huge_page_size(1ULL << shift) < 0)
777 kmem_cache_create( 535 continue;
778 HUGEPTE_CACHE_NAME(psize), 536
779 HUGEPTE_TABLE_SIZE(psize), 537 if (shift < PMD_SHIFT)
780 HUGEPTE_TABLE_SIZE(psize), 538 pdshift = PMD_SHIFT;
781 0, 539 else if (shift < PUD_SHIFT)
782 NULL); 540 pdshift = PUD_SHIFT;
783 if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)]) 541 else
784 panic("hugetlbpage_init(): could not create %s"\ 542 pdshift = PGDIR_SHIFT;
785 "\n", HUGEPTE_CACHE_NAME(psize)); 543
786 } 544 pgtable_cache_add(pdshift - shift, NULL);
545 if (!PGT_CACHE(pdshift - shift))
546 panic("hugetlbpage_init(): could not create "
547 "pgtable cache for %d bit pagesize\n", shift);
787 } 548 }
788 549
550 /* Set default large page size. Currently, we pick 16M or 1M
551 * depending on what is available
552 */
553 if (mmu_psize_defs[MMU_PAGE_16M].shift)
554 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
555 else if (mmu_psize_defs[MMU_PAGE_1M].shift)
556 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
557
789 return 0; 558 return 0;
790} 559}
791 560
792module_init(hugetlbpage_init); 561module_init(hugetlbpage_init);
562
563void flush_dcache_icache_hugepage(struct page *page)
564{
565 int i;
566
567 BUG_ON(!PageCompound(page));
568
569 for (i = 0; i < (1UL << compound_order(page)); i++)
570 __flush_dcache_icache(page_address(page+i));
571}
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 335c578b9cc3..776f28d02b6b 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -41,6 +41,7 @@
41#include <linux/module.h> 41#include <linux/module.h>
42#include <linux/poison.h> 42#include <linux/poison.h>
43#include <linux/lmb.h> 43#include <linux/lmb.h>
44#include <linux/hugetlb.h>
44 45
45#include <asm/pgalloc.h> 46#include <asm/pgalloc.h>
46#include <asm/page.h> 47#include <asm/page.h>
@@ -119,30 +120,63 @@ static void pmd_ctor(void *addr)
119 memset(addr, 0, PMD_TABLE_SIZE); 120 memset(addr, 0, PMD_TABLE_SIZE);
120} 121}
121 122
122static const unsigned int pgtable_cache_size[2] = { 123struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
123 PGD_TABLE_SIZE, PMD_TABLE_SIZE 124
124}; 125/*
125static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { 126 * Create a kmem_cache() for pagetables. This is not used for PTE
126#ifdef CONFIG_PPC_64K_PAGES 127 * pages - they're linked to struct page, come from the normal free
127 "pgd_cache", "pmd_cache", 128 * pages pool and have a different entry size (see real_pte_t) to
128#else 129 * everything else. Caches created by this function are used for all
129 "pgd_cache", "pud_pmd_cache", 130 * the higher level pagetables, and for hugepage pagetables.
130#endif /* CONFIG_PPC_64K_PAGES */ 131 */
131}; 132void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
132 133{
133#ifdef CONFIG_HUGETLB_PAGE 134 char *name;
134/* Hugepages need an extra cache per hugepagesize, initialized in 135 unsigned long table_size = sizeof(void *) << shift;
135 * hugetlbpage.c. We can't put into the tables above, because HPAGE_SHIFT 136 unsigned long align = table_size;
136 * is not compile time constant. */ 137
137struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT]; 138 /* When batching pgtable pointers for RCU freeing, we store
138#else 139 * the index size in the low bits. Table alignment must be
139struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; 140 * big enough to fit it.
140#endif 141 *
142 * Likewise, hugepage pagetable pointers contain a (different)
143 * shift value in the low bits. All tables must be aligned so
144 * as to leave enough 0 bits in the address to contain it. */
145 unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
146 HUGEPD_SHIFT_MASK + 1);
147 struct kmem_cache *new;
148
149 /* It would be nice if this was a BUILD_BUG_ON(), but at the
150 * moment, gcc doesn't seem to recognize is_power_of_2 as a
151 * constant expression, so so much for that. */
152 BUG_ON(!is_power_of_2(minalign));
153 BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
154
155 if (PGT_CACHE(shift))
156 return; /* Already have a cache of this size */
157
158 align = max_t(unsigned long, align, minalign);
159 name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
160 new = kmem_cache_create(name, table_size, align, 0, ctor);
161 PGT_CACHE(shift) = new;
162
163 pr_debug("Allocated pgtable cache for order %d\n", shift);
164}
165
141 166
142void pgtable_cache_init(void) 167void pgtable_cache_init(void)
143{ 168{
144 pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); 169 pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
145 pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); 170 pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
171 if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
172 panic("Couldn't allocate pgtable caches");
173
174 /* In all current configs, when the PUD index exists it's the
175 * same size as either the pgd or pmd index. Verify that the
176 * initialization above has also created a PUD cache. This
177 * will need re-examination if we add new possibilities for
178 * the pagetable layout. */
179 BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
146} 180}
147 181
148#ifdef CONFIG_SPARSEMEM_VMEMMAP 182#ifdef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 59736317bf0e..b9b152558f9c 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -32,6 +32,7 @@
32#include <linux/pagemap.h> 32#include <linux/pagemap.h>
33#include <linux/suspend.h> 33#include <linux/suspend.h>
34#include <linux/lmb.h> 34#include <linux/lmb.h>
35#include <linux/hugetlb.h>
35 36
36#include <asm/pgalloc.h> 37#include <asm/pgalloc.h>
37#include <asm/prom.h> 38#include <asm/prom.h>
@@ -417,18 +418,26 @@ EXPORT_SYMBOL(flush_dcache_page);
417 418
418void flush_dcache_icache_page(struct page *page) 419void flush_dcache_icache_page(struct page *page)
419{ 420{
421#ifdef CONFIG_HUGETLB_PAGE
422 if (PageCompound(page)) {
423 flush_dcache_icache_hugepage(page);
424 return;
425 }
426#endif
420#ifdef CONFIG_BOOKE 427#ifdef CONFIG_BOOKE
421 void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE); 428 {
422 __flush_dcache_icache(start); 429 void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE);
423 kunmap_atomic(start, KM_PPC_SYNC_ICACHE); 430 __flush_dcache_icache(start);
431 kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
432 }
424#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64) 433#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64)
425 /* On 8xx there is no need to kmap since highmem is not supported */ 434 /* On 8xx there is no need to kmap since highmem is not supported */
426 __flush_dcache_icache(page_address(page)); 435 __flush_dcache_icache(page_address(page));
427#else 436#else
428 __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT); 437 __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
429#endif 438#endif
430
431} 439}
440
432void clear_user_page(void *page, unsigned long vaddr, struct page *pg) 441void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
433{ 442{
434 clear_page(page); 443 clear_page(page);
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index dbeb86ac90cd..b9e4cc2c2057 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -18,6 +18,7 @@
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/spinlock.h> 19#include <linux/spinlock.h>
20#include <linux/idr.h> 20#include <linux/idr.h>
21#include <linux/module.h>
21 22
22#include <asm/mmu_context.h> 23#include <asm/mmu_context.h>
23 24
@@ -32,7 +33,7 @@ static DEFINE_IDR(mmu_context_idr);
32#define NO_CONTEXT 0 33#define NO_CONTEXT 0
33#define MAX_CONTEXT ((1UL << 19) - 1) 34#define MAX_CONTEXT ((1UL << 19) - 1)
34 35
35int init_new_context(struct task_struct *tsk, struct mm_struct *mm) 36int __init_new_context(void)
36{ 37{
37 int index; 38 int index;
38 int err; 39 int err;
@@ -57,6 +58,18 @@ again:
57 return -ENOMEM; 58 return -ENOMEM;
58 } 59 }
59 60
61 return index;
62}
63EXPORT_SYMBOL_GPL(__init_new_context);
64
65int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
66{
67 int index;
68
69 index = __init_new_context();
70 if (index < 0)
71 return index;
72
60 /* The old code would re-promote on fork, we don't do that 73 /* The old code would re-promote on fork, we don't do that
61 * when using slices as it could cause problem promoting slices 74 * when using slices as it could cause problem promoting slices
62 * that have been forced down to 4K 75 * that have been forced down to 4K
@@ -68,11 +81,16 @@ again:
68 return 0; 81 return 0;
69} 82}
70 83
71void destroy_context(struct mm_struct *mm) 84void __destroy_context(int context_id)
72{ 85{
73 spin_lock(&mmu_context_lock); 86 spin_lock(&mmu_context_lock);
74 idr_remove(&mmu_context_idr, mm->context.id); 87 idr_remove(&mmu_context_idr, context_id);
75 spin_unlock(&mmu_context_lock); 88 spin_unlock(&mmu_context_lock);
89}
90EXPORT_SYMBOL_GPL(__destroy_context);
76 91
92void destroy_context(struct mm_struct *mm)
93{
94 __destroy_context(mm->context.id);
77 mm->context.id = NO_CONTEXT; 95 mm->context.id = NO_CONTEXT;
78} 96}
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 53040931de32..99df697c601a 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -49,12 +49,12 @@ struct pte_freelist_batch
49{ 49{
50 struct rcu_head rcu; 50 struct rcu_head rcu;
51 unsigned int index; 51 unsigned int index;
52 pgtable_free_t tables[0]; 52 unsigned long tables[0];
53}; 53};
54 54
55#define PTE_FREELIST_SIZE \ 55#define PTE_FREELIST_SIZE \
56 ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \ 56 ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
57 / sizeof(pgtable_free_t)) 57 / sizeof(unsigned long))
58 58
59static void pte_free_smp_sync(void *arg) 59static void pte_free_smp_sync(void *arg)
60{ 60{
@@ -64,13 +64,13 @@ static void pte_free_smp_sync(void *arg)
64/* This is only called when we are critically out of memory 64/* This is only called when we are critically out of memory
65 * (and fail to get a page in pte_free_tlb). 65 * (and fail to get a page in pte_free_tlb).
66 */ 66 */
67static void pgtable_free_now(pgtable_free_t pgf) 67static void pgtable_free_now(void *table, unsigned shift)
68{ 68{
69 pte_freelist_forced_free++; 69 pte_freelist_forced_free++;
70 70
71 smp_call_function(pte_free_smp_sync, NULL, 1); 71 smp_call_function(pte_free_smp_sync, NULL, 1);
72 72
73 pgtable_free(pgf); 73 pgtable_free(table, shift);
74} 74}
75 75
76static void pte_free_rcu_callback(struct rcu_head *head) 76static void pte_free_rcu_callback(struct rcu_head *head)
@@ -79,8 +79,12 @@ static void pte_free_rcu_callback(struct rcu_head *head)
79 container_of(head, struct pte_freelist_batch, rcu); 79 container_of(head, struct pte_freelist_batch, rcu);
80 unsigned int i; 80 unsigned int i;
81 81
82 for (i = 0; i < batch->index; i++) 82 for (i = 0; i < batch->index; i++) {
83 pgtable_free(batch->tables[i]); 83 void *table = (void *)(batch->tables[i] & ~MAX_PGTABLE_INDEX_SIZE);
84 unsigned shift = batch->tables[i] & MAX_PGTABLE_INDEX_SIZE;
85
86 pgtable_free(table, shift);
87 }
84 88
85 free_page((unsigned long)batch); 89 free_page((unsigned long)batch);
86} 90}
@@ -91,25 +95,28 @@ static void pte_free_submit(struct pte_freelist_batch *batch)
91 call_rcu(&batch->rcu, pte_free_rcu_callback); 95 call_rcu(&batch->rcu, pte_free_rcu_callback);
92} 96}
93 97
94void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) 98void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
95{ 99{
96 /* This is safe since tlb_gather_mmu has disabled preemption */ 100 /* This is safe since tlb_gather_mmu has disabled preemption */
97 struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); 101 struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
102 unsigned long pgf;
98 103
99 if (atomic_read(&tlb->mm->mm_users) < 2 || 104 if (atomic_read(&tlb->mm->mm_users) < 2 ||
100 cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){ 105 cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){
101 pgtable_free(pgf); 106 pgtable_free(table, shift);
102 return; 107 return;
103 } 108 }
104 109
105 if (*batchp == NULL) { 110 if (*batchp == NULL) {
106 *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); 111 *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
107 if (*batchp == NULL) { 112 if (*batchp == NULL) {
108 pgtable_free_now(pgf); 113 pgtable_free_now(table, shift);
109 return; 114 return;
110 } 115 }
111 (*batchp)->index = 0; 116 (*batchp)->index = 0;
112 } 117 }
118 BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
119 pgf = (unsigned long)table | shift;
113 (*batchp)->tables[(*batchp)->index++] = pgf; 120 (*batchp)->tables[(*batchp)->index++] = pgf;
114 if ((*batchp)->index == PTE_FREELIST_SIZE) { 121 if ((*batchp)->index == PTE_FREELIST_SIZE) {
115 pte_free_submit(*batchp); 122 pte_free_submit(*batchp);
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 2b2f35f6985e..282d9306361f 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -53,11 +53,6 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
53 53
54 i = batch->index; 54 i = batch->index;
55 55
56 /* We mask the address for the base page size. Huge pages will
57 * have applied their own masking already
58 */
59 addr &= PAGE_MASK;
60
61 /* Get page size (maybe move back to caller). 56 /* Get page size (maybe move back to caller).
62 * 57 *
63 * NOTE: when using special 64K mappings in 4K environment like 58 * NOTE: when using special 64K mappings in 4K environment like
@@ -75,6 +70,9 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
75 } else 70 } else
76 psize = pte_pagesize_index(mm, addr, pte); 71 psize = pte_pagesize_index(mm, addr, pte);
77 72
73 /* Mask the address for the correct page size */
74 addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
75
78 /* Build full vaddr */ 76 /* Build full vaddr */
79 if (!is_kernel_addr(addr)) { 77 if (!is_kernel_addr(addr)) {
80 ssize = user_segment_size(addr); 78 ssize = user_segment_size(addr);