Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/40x_mmu.c            |   6
-rw-r--r--  arch/powerpc/mm/44x_mmu.c            |   2
-rw-r--r--  arch/powerpc/mm/Makefile             |   5
-rw-r--r--  arch/powerpc/mm/dma-noncoherent.c    |   1
-rw-r--r--  arch/powerpc/mm/fault.c              |   8
-rw-r--r--  arch/powerpc/mm/fsl_booke_mmu.c      | 137
-rw-r--r--  arch/powerpc/mm/gup.c                | 149
-rw-r--r--  arch/powerpc/mm/hash_native_64.c     |  19
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c      |  58
-rw-r--r--  arch/powerpc/mm/hugetlbpage-hash64.c | 139
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c        | 793
-rw-r--r--  arch/powerpc/mm/init_32.c            |  12
-rw-r--r--  arch/powerpc/mm/init_64.c            |  77
-rw-r--r--  arch/powerpc/mm/mem.c                |  28
-rw-r--r--  arch/powerpc/mm/mmap_64.c            |   4
-rw-r--r--  arch/powerpc/mm/mmu_context_hash64.c |  35
-rw-r--r--  arch/powerpc/mm/mmu_context_nohash.c |  17
-rw-r--r--  arch/powerpc/mm/mmu_decl.h           |  28
-rw-r--r--  arch/powerpc/mm/numa.c               |  23
-rw-r--r--  arch/powerpc/mm/pgtable.c            |  26
-rw-r--r--  arch/powerpc/mm/pgtable_32.c         |  39
-rw-r--r--  arch/powerpc/mm/pgtable_64.c         |   1
-rw-r--r--  arch/powerpc/mm/ppc_mmu_32.c         |   4
-rw-r--r--  arch/powerpc/mm/subpage-prot.c       |  16
-rw-r--r--  arch/powerpc/mm/tlb_hash64.c         |  16
-rw-r--r--  arch/powerpc/mm/tlb_low_64e.S        |   2
-rw-r--r--  arch/powerpc/mm/tlb_nohash.c         |   6
27 files changed, 818 insertions, 833 deletions
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c
index f5e7b9ce63dd..65abfcfaaa9e 100644
--- a/arch/powerpc/mm/40x_mmu.c
+++ b/arch/powerpc/mm/40x_mmu.c
@@ -84,14 +84,14 @@ void __init MMU_init_hw(void)
84 * vectors and the kernel live in real-mode. 84 * vectors and the kernel live in real-mode.
85 */ 85 */
86 86
87 mtspr(SPRN_DCCR, 0xF0000000); /* 512 MB of data space at 0x0. */ 87 mtspr(SPRN_DCCR, 0xFFFF0000); /* 2GByte of data space at 0x0. */
88 mtspr(SPRN_ICCR, 0xF0000000); /* 512 MB of instr. space at 0x0. */ 88 mtspr(SPRN_ICCR, 0xFFFF0000); /* 2GByte of instr. space at 0x0. */
89} 89}
90 90
91#define LARGE_PAGE_SIZE_16M (1<<24) 91#define LARGE_PAGE_SIZE_16M (1<<24)
92#define LARGE_PAGE_SIZE_4M (1<<22) 92#define LARGE_PAGE_SIZE_4M (1<<22)
93 93
94unsigned long __init mmu_mapin_ram(void) 94unsigned long __init mmu_mapin_ram(unsigned long top)
95{ 95{
96 unsigned long v, s, mapped; 96 unsigned long v, s, mapped;
97 phys_addr_t p; 97 phys_addr_t p;
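
The 40x change above widens the cacheability map rather than the mapping itself: each bit of DCCR/ICCR marks one 128 MB region of the address space as cacheable, so 0xF0000000 (4 bits) covers 512 MB while 0xFFFF0000 (16 bits) covers 2 GB, matching the updated comments. A minimal stand-alone sketch of that arithmetic (plain C; only the 128 MB-per-bit granularity is taken from the 40x register layout, the helper and its name are illustrative):

#include <stdio.h>
#include <stdint.h>

/* Each set bit in DCCR/ICCR marks one 128 MB region as cacheable. */
static unsigned long cacheable_mbytes(uint32_t ccr)
{
    return (unsigned long)__builtin_popcount(ccr) * 128;
}

int main(void)
{
    /* Old value: 4 bits -> 512 MB.  New value: 16 bits -> 2 GB. */
    printf("0xF0000000 covers %lu MB\n", cacheable_mbytes(0xF0000000));
    printf("0xFFFF0000 covers %lu MB\n", cacheable_mbytes(0xFFFF0000));
    return 0;
}
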
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
index 98052ac96580..3986264b0993 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -88,7 +88,7 @@ void __init MMU_init_hw(void)
88 flush_instruction_cache(); 88 flush_instruction_cache();
89} 89}
90 90
91unsigned long __init mmu_mapin_ram(void) 91unsigned long __init mmu_mapin_ram(unsigned long top)
92{ 92{
93 unsigned long addr; 93 unsigned long addr;
94 94
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 6fb8fc8d2fea..ce68708bbad5 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -28,7 +28,10 @@ obj-$(CONFIG_44x) += 44x_mmu.o
28obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o 28obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o
29obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o 29obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
30obj-$(CONFIG_PPC_MM_SLICES) += slice.o 30obj-$(CONFIG_PPC_MM_SLICES) += slice.o
31obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 31ifeq ($(CONFIG_HUGETLB_PAGE),y)
32obj-y += hugetlbpage.o
33obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o
34endif
32obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o 35obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
33obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o 36obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
34obj-$(CONFIG_HIGHMEM) += highmem.o 37obj-$(CONFIG_HIGHMEM) += highmem.o
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index 36692f5c9a76..757c0bed9a91 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/sched.h> 25#include <linux/sched.h>
26#include <linux/slab.h>
26#include <linux/kernel.h> 27#include <linux/kernel.h>
27#include <linux/errno.h> 28#include <linux/errno.h>
28#include <linux/string.h> 29#include <linux/string.h>
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index e7dae82c1285..26fb6b990b0a 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -40,7 +40,7 @@
40#include <asm/uaccess.h> 40#include <asm/uaccess.h>
41#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
42#include <asm/siginfo.h> 42#include <asm/siginfo.h>
43 43#include <mm/mmu_decl.h>
44 44
45#ifdef CONFIG_KPROBES 45#ifdef CONFIG_KPROBES
46static inline int notify_page_fault(struct pt_regs *regs) 46static inline int notify_page_fault(struct pt_regs *regs)
@@ -246,6 +246,12 @@ good_area:
246 goto bad_area; 246 goto bad_area;
247#endif /* CONFIG_6xx */ 247#endif /* CONFIG_6xx */
248#if defined(CONFIG_8xx) 248#if defined(CONFIG_8xx)
249 /* 8xx sometimes need to load a invalid/non-present TLBs.
250 * These must be invalidated separately as linux mm don't.
251 */
252 if (error_code & 0x40000000) /* no translation? */
253 _tlbil_va(address, 0, 0, 0);
254
249 /* The MPC8xx seems to always set 0x80000000, which is 255 /* The MPC8xx seems to always set 0x80000000, which is
250 * "undefined". Of those that can be set, this is the only 256 * "undefined". Of those that can be set, this is the only
251 * one which seems bad. 257 * one which seems bad.
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index dc93e95b256e..1ed6b52f3031 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -54,26 +54,35 @@
54 54
55#include "mmu_decl.h" 55#include "mmu_decl.h"
56 56
57extern void loadcam_entry(unsigned int index);
58unsigned int tlbcam_index; 57unsigned int tlbcam_index;
59static unsigned long cam[CONFIG_LOWMEM_CAM_NUM];
60 58
61#define NUM_TLBCAMS (16) 59#define NUM_TLBCAMS (64)
62 60
63#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) 61#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS)
64#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" 62#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS"
65#endif 63#endif
66 64
67struct tlbcam TLBCAM[NUM_TLBCAMS]; 65struct tlbcam {
66 u32 MAS0;
67 u32 MAS1;
68 unsigned long MAS2;
69 u32 MAS3;
70 u32 MAS7;
71} TLBCAM[NUM_TLBCAMS];
68 72
69struct tlbcamrange { 73struct tlbcamrange {
70 unsigned long start; 74 unsigned long start;
71 unsigned long limit; 75 unsigned long limit;
72 phys_addr_t phys; 76 phys_addr_t phys;
73} tlbcam_addrs[NUM_TLBCAMS]; 77} tlbcam_addrs[NUM_TLBCAMS];
74 78
75extern unsigned int tlbcam_index; 79extern unsigned int tlbcam_index;
76 80
81unsigned long tlbcam_sz(int idx)
82{
83 return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1;
84}
85
77/* 86/*
78 * Return PA for this VA if it is mapped by a CAM, or 0 87 * Return PA for this VA if it is mapped by a CAM, or 0
79 */ 88 */
@@ -94,23 +103,36 @@ unsigned long p_mapped_by_tlbcam(phys_addr_t pa)
94 int b; 103 int b;
95 for (b = 0; b < tlbcam_index; ++b) 104 for (b = 0; b < tlbcam_index; ++b)
96 if (pa >= tlbcam_addrs[b].phys 105 if (pa >= tlbcam_addrs[b].phys
97 && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) 106 && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start)
98 +tlbcam_addrs[b].phys) 107 +tlbcam_addrs[b].phys)
99 return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); 108 return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys);
100 return 0; 109 return 0;
101} 110}
102 111
112void loadcam_entry(int idx)
113{
114 mtspr(SPRN_MAS0, TLBCAM[idx].MAS0);
115 mtspr(SPRN_MAS1, TLBCAM[idx].MAS1);
116 mtspr(SPRN_MAS2, TLBCAM[idx].MAS2);
117 mtspr(SPRN_MAS3, TLBCAM[idx].MAS3);
118
119 if (mmu_has_feature(MMU_FTR_BIG_PHYS))
120 mtspr(SPRN_MAS7, TLBCAM[idx].MAS7);
121
122 asm volatile("isync;tlbwe;isync" : : : "memory");
123}
124
103/* 125/*
104 * Set up one of the I/D BAT (block address translation) register pairs. 126 * Set up one of the I/D BAT (block address translation) register pairs.
105 * The parameters are not checked; in particular size must be a power 127 * The parameters are not checked; in particular size must be a power
106 * of 4 between 4k and 256M. 128 * of 4 between 4k and 256M.
107 */ 129 */
108void settlbcam(int index, unsigned long virt, phys_addr_t phys, 130static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
109 unsigned int size, int flags, unsigned int pid) 131 unsigned long size, unsigned long flags, unsigned int pid)
110{ 132{
111 unsigned int tsize, lz; 133 unsigned int tsize, lz;
112 134
113 asm ("cntlzw %0,%1" : "=r" (lz) : "r" (size)); 135 asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (size));
114 tsize = 21 - lz; 136 tsize = 21 - lz;
115 137
116#ifdef CONFIG_SMP 138#ifdef CONFIG_SMP
@@ -128,18 +150,15 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys,
128 TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; 150 TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0;
129 TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; 151 TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0;
130 152
131 TLBCAM[index].MAS3 = (phys & PAGE_MASK) | MAS3_SX | MAS3_SR; 153 TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR;
132 TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); 154 TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0);
155 if (mmu_has_feature(MMU_FTR_BIG_PHYS))
156 TLBCAM[index].MAS7 = (u64)phys >> 32;
133 157
134#ifndef CONFIG_KGDB /* want user access for breakpoints */
135 if (flags & _PAGE_USER) { 158 if (flags & _PAGE_USER) {
136 TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; 159 TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
137 TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); 160 TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
138 } 161 }
139#else
140 TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
141 TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
142#endif
143 162
144 tlbcam_addrs[index].start = virt; 163 tlbcam_addrs[index].start = virt;
145 tlbcam_addrs[index].limit = virt + size - 1; 164 tlbcam_addrs[index].limit = virt + size - 1;
@@ -148,27 +167,44 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys,
148 loadcam_entry(index); 167 loadcam_entry(index);
149} 168}
150 169
151void invalidate_tlbcam_entry(int index) 170unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)
152{
153 TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index);
154 TLBCAM[index].MAS1 = ~MAS1_VALID;
155
156 loadcam_entry(index);
157}
158
159unsigned long __init mmu_mapin_ram(void)
160{ 171{
172 int i;
161 unsigned long virt = PAGE_OFFSET; 173 unsigned long virt = PAGE_OFFSET;
162 phys_addr_t phys = memstart_addr; 174 phys_addr_t phys = memstart_addr;
175 unsigned long amount_mapped = 0;
176 unsigned long max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xf;
177
178 /* Convert (4^max) kB to (2^max) bytes */
179 max_cam = max_cam * 2 + 10;
180
181 /* Calculate CAM values */
182 for (i = 0; ram && i < max_cam_idx; i++) {
183 unsigned int camsize = __ilog2(ram) & ~1U;
184 unsigned int align = __ffs(virt | phys) & ~1U;
185 unsigned long cam_sz;
186
187 if (camsize > align)
188 camsize = align;
189 if (camsize > max_cam)
190 camsize = max_cam;
191
192 cam_sz = 1UL << camsize;
193 settlbcam(i, virt, phys, cam_sz, PAGE_KERNEL_X, 0);
163 194
164 while (tlbcam_index < ARRAY_SIZE(cam) && cam[tlbcam_index]) { 195 ram -= cam_sz;
165 settlbcam(tlbcam_index, virt, phys, cam[tlbcam_index], PAGE_KERNEL_X, 0); 196 amount_mapped += cam_sz;
166 virt += cam[tlbcam_index]; 197 virt += cam_sz;
167 phys += cam[tlbcam_index]; 198 phys += cam_sz;
168 tlbcam_index++;
169 } 199 }
200 tlbcam_index = i;
201
202 return amount_mapped;
203}
170 204
171 return virt - PAGE_OFFSET; 205unsigned long __init mmu_mapin_ram(unsigned long top)
206{
207 return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1;
172} 208}
173 209
174/* 210/*
@@ -179,46 +215,21 @@ void __init MMU_init_hw(void)
179 flush_instruction_cache(); 215 flush_instruction_cache();
180} 216}
181 217
182void __init 218void __init adjust_total_lowmem(void)
183adjust_total_lowmem(void)
184{ 219{
185 phys_addr_t ram; 220 unsigned long ram;
186 unsigned int max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xff;
187 char buf[ARRAY_SIZE(cam) * 5 + 1], *p = buf;
188 int i; 221 int i;
189 unsigned long virt = PAGE_OFFSET & 0xffffffffUL;
190 unsigned long phys = memstart_addr & 0xffffffffUL;
191
192 /* Convert (4^max) kB to (2^max) bytes */
193 max_cam = max_cam * 2 + 10;
194 222
195 /* adjust lowmem size to __max_low_memory */ 223 /* adjust lowmem size to __max_low_memory */
196 ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); 224 ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
197 225
198 /* Calculate CAM values */ 226 __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM);
199 __max_low_memory = 0;
200 for (i = 0; ram && i < ARRAY_SIZE(cam); i++) {
201 unsigned int camsize = __ilog2(ram) & ~1U;
202 unsigned int align = __ffs(virt | phys) & ~1U;
203 227
204 if (camsize > align) 228 pr_info("Memory CAM mapping: ");
205 camsize = align; 229 for (i = 0; i < tlbcam_index - 1; i++)
206 if (camsize > max_cam) 230 pr_cont("%lu/", tlbcam_sz(i) >> 20);
207 camsize = max_cam; 231 pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20,
208
209 cam[i] = 1UL << camsize;
210 ram -= cam[i];
211 __max_low_memory += cam[i];
212 virt += cam[i];
213 phys += cam[i];
214
215 p += sprintf(p, "%lu/", cam[i] >> 20);
216 }
217 for (; i < ARRAY_SIZE(cam); i++)
218 p += sprintf(p, "0/");
219 p[-1] = '\0';
220
221 pr_info("Memory CAM mapping: %s Mb, residual: %dMb\n", buf,
222 (unsigned int)((total_lowmem - __max_low_memory) >> 20)); 232 (unsigned int)((total_lowmem - __max_low_memory) >> 20));
233
223 __initial_memory_limit_addr = memstart_addr + __max_low_memory; 234 __initial_memory_limit_addr = memstart_addr + __max_low_memory;
224} 235}
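
The sizing loop that moved into map_mem_in_cams() above picks, for each CAM entry, the largest even power-of-two size that does not exceed the remaining RAM, the alignment of the current virtual/physical addresses, or the hardware maximum from TLB1CFG (whose max-size field is a power of 4 in KB, hence the "max_cam * 2 + 10" conversion to a byte shift). A small userspace model of that selection, with compiler builtins standing in for the kernel's __ilog2()/__ffs() and an invented TLB1CFG field value (64-bit host assumed):

#include <stdio.h>

/* Userspace stand-ins for the kernel helpers (64-bit host assumed). */
static unsigned int ilog2_ul(unsigned long x) { return 63 - __builtin_clzl(x); }
static unsigned int ffs_ul(unsigned long x)   { return __builtin_ctzl(x); }

/* Pick the shift for one CAM entry, mirroring the loop in map_mem_in_cams(). */
static unsigned int pick_camsize(unsigned long ram, unsigned long virt,
                                 unsigned long phys, unsigned int max_cam)
{
    unsigned int camsize = ilog2_ul(ram) & ~1U;       /* even power of two */
    unsigned int align   = ffs_ul(virt | phys) & ~1U; /* address alignment */

    if (camsize > align)
        camsize = align;
    if (camsize > max_cam)
        camsize = max_cam;
    return camsize;
}

int main(void)
{
    /* Invented TLB1CFG max-size field of 7: 4^7 KB == 2^24 bytes == 16 MB. */
    unsigned int max_cam = 7 * 2 + 10;

    /* 768 MB of RAM, kernel virtual base 0xc0000000, physical base 0. */
    unsigned int shift = pick_camsize(768UL << 20, 0xc0000000UL, 0, max_cam);

    printf("first CAM entry: 2^%u bytes (%lu MB)\n", shift, (1UL << shift) >> 20);
    return 0;
}
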
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index bc122a120bf0..d7efdbf640c7 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -55,57 +55,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
55 return 1; 55 return 1;
56} 56}
57 57
58#ifdef CONFIG_HUGETLB_PAGE
59static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
60 unsigned long *addr, unsigned long end,
61 int write, struct page **pages, int *nr)
62{
63 unsigned long mask;
64 unsigned long pte_end;
65 struct page *head, *page;
66 pte_t pte;
67 int refs;
68
69 pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
70 if (pte_end < end)
71 end = pte_end;
72
73 pte = *ptep;
74 mask = _PAGE_PRESENT|_PAGE_USER;
75 if (write)
76 mask |= _PAGE_RW;
77 if ((pte_val(pte) & mask) != mask)
78 return 0;
79 /* hugepages are never "special" */
80 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
81
82 refs = 0;
83 head = pte_page(pte);
84 page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
85 do {
86 VM_BUG_ON(compound_head(page) != head);
87 pages[*nr] = page;
88 (*nr)++;
89 page++;
90 refs++;
91 } while (*addr += PAGE_SIZE, *addr != end);
92
93 if (!page_cache_add_speculative(head, refs)) {
94 *nr -= refs;
95 return 0;
96 }
97 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
98 /* Could be optimized better */
99 while (*nr) {
100 put_page(page);
101 (*nr)--;
102 }
103 }
104
105 return 1;
106}
107#endif /* CONFIG_HUGETLB_PAGE */
108
109static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, 58static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
110 int write, struct page **pages, int *nr) 59 int write, struct page **pages, int *nr)
111{ 60{
@@ -119,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
119 next = pmd_addr_end(addr, end); 68 next = pmd_addr_end(addr, end);
120 if (pmd_none(pmd)) 69 if (pmd_none(pmd))
121 return 0; 70 return 0;
122 if (!gup_pte_range(pmd, addr, next, write, pages, nr)) 71 if (is_hugepd(pmdp)) {
72 if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
73 addr, next, write, pages, nr))
74 return 0;
75 } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
123 return 0; 76 return 0;
124 } while (pmdp++, addr = next, addr != end); 77 } while (pmdp++, addr = next, addr != end);
125 78
@@ -139,7 +92,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
139 next = pud_addr_end(addr, end); 92 next = pud_addr_end(addr, end);
140 if (pud_none(pud)) 93 if (pud_none(pud))
141 return 0; 94 return 0;
142 if (!gup_pmd_range(pud, addr, next, write, pages, nr)) 95 if (is_hugepd(pudp)) {
96 if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
97 addr, next, write, pages, nr))
98 return 0;
99 } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
143 return 0; 100 return 0;
144 } while (pudp++, addr = next, addr != end); 101 } while (pudp++, addr = next, addr != end);
145 102
@@ -154,10 +111,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
154 unsigned long next; 111 unsigned long next;
155 pgd_t *pgdp; 112 pgd_t *pgdp;
156 int nr = 0; 113 int nr = 0;
157#ifdef CONFIG_PPC64
158 unsigned int shift;
159 int psize;
160#endif
161 114
162 pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); 115 pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
163 116
@@ -172,25 +125,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
172 125
173 pr_devel(" aligned: %lx .. %lx\n", start, end); 126 pr_devel(" aligned: %lx .. %lx\n", start, end);
174 127
175#ifdef CONFIG_HUGETLB_PAGE
176 /* We bail out on slice boundary crossing when hugetlb is
177 * enabled in order to not have to deal with two different
178 * page table formats
179 */
180 if (addr < SLICE_LOW_TOP) {
181 if (end > SLICE_LOW_TOP)
182 goto slow_irqon;
183
184 if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
185 GET_LOW_SLICE_INDEX(end - 1)))
186 goto slow_irqon;
187 } else {
188 if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
189 GET_HIGH_SLICE_INDEX(end - 1)))
190 goto slow_irqon;
191 }
192#endif /* CONFIG_HUGETLB_PAGE */
193
194 /* 128 /*
195 * XXX: batch / limit 'nr', to avoid large irq off latency 129 * XXX: batch / limit 'nr', to avoid large irq off latency
196 * needs some instrumenting to determine the common sizes used by 130 * needs some instrumenting to determine the common sizes used by
@@ -210,54 +144,23 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
210 */ 144 */
211 local_irq_disable(); 145 local_irq_disable();
212 146
213#ifdef CONFIG_PPC64 147 pgdp = pgd_offset(mm, addr);
214 /* Those bits are related to hugetlbfs implementation and only exist 148 do {
215 * on 64-bit for now 149 pgd_t pgd = *pgdp;
216 */ 150
217 psize = get_slice_psize(mm, addr); 151 pr_devel(" %016lx: normal pgd %p\n", addr,
218 shift = mmu_psize_defs[psize].shift; 152 (void *)pgd_val(pgd));
219#endif /* CONFIG_PPC64 */ 153 next = pgd_addr_end(addr, end);
220 154 if (pgd_none(pgd))
221#ifdef CONFIG_HUGETLB_PAGE 155 goto slow;
222 if (unlikely(mmu_huge_psizes[psize])) { 156 if (is_hugepd(pgdp)) {
223 pte_t *ptep; 157 if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
224 unsigned long a = addr; 158 addr, next, write, pages, &nr))
225 unsigned long sz = ((1UL) << shift);
226 struct hstate *hstate = size_to_hstate(sz);
227
228 BUG_ON(!hstate);
229 /*
230 * XXX: could be optimized to avoid hstate
231 * lookup entirely (just use shift)
232 */
233
234 do {
235 VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
236 ptep = huge_pte_offset(mm, a);
237 pr_devel(" %016lx: huge ptep %p\n", a, ptep);
238 if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
239 &nr))
240 goto slow;
241 } while (a != end);
242 } else
243#endif /* CONFIG_HUGETLB_PAGE */
244 {
245 pgdp = pgd_offset(mm, addr);
246 do {
247 pgd_t pgd = *pgdp;
248
249#ifdef CONFIG_PPC64
250 VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
251#endif
252 pr_devel(" %016lx: normal pgd %p\n", addr,
253 (void *)pgd_val(pgd));
254 next = pgd_addr_end(addr, end);
255 if (pgd_none(pgd))
256 goto slow;
257 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
258 goto slow; 159 goto slow;
259 } while (pgdp++, addr = next, addr != end); 160 } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
260 } 161 goto slow;
162 } while (pgdp++, addr = next, addr != end);
163
261 local_irq_enable(); 164 local_irq_enable();
262 165
263 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); 166 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
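
The rewritten walk above drops the slice-psize special casing: every pgd/pud/pmd slot is simply tested with is_hugepd(), and a hugepd entry packs the address of a hugepte table together with its page shift in one word (the allocator in hugetlbpage.c stores "pointer | pshift", and hugepd_shift()/hugepd_page() split them apart again). A stand-alone toy of that packing scheme; the 6-bit mask and the alignment used here are assumptions, and the real kernel additionally fixes up the kernel-linear-map high bits:

#include <stdio.h>
#include <stdlib.h>

#define SHIFT_MASK 0x3fUL   /* assumed width of the packed shift field */

typedef struct { unsigned long pd; } hugepd_t;

/* Pack an aligned hugepte-table pointer together with its page shift. */
static hugepd_t hugepd_pack(void *table, unsigned int pshift)
{
    hugepd_t hpd = { ((unsigned long)table & ~SHIFT_MASK) | pshift };
    return hpd;
}

static unsigned int hugepd_shift(hugepd_t hpd) { return hpd.pd & SHIFT_MASK; }

static void *hugepd_table(hugepd_t hpd)
{
    return (void *)(hpd.pd & ~SHIFT_MASK);
}

int main(void)
{
    /* 64-byte alignment keeps the low bits free for the shift. */
    void *table = aligned_alloc(64, 4096);
    if (!table)
        return 1;

    hugepd_t hpd = hugepd_pack(table, 24);   /* 24 == shift of a 16 MB page */
    printf("shift=%u table=%p (original %p)\n",
           hugepd_shift(hpd), hugepd_table(hpd), table);
    free(table);
    return 0;
}
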
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 056d23a1b105..784a400e0781 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -37,7 +37,7 @@
37 37
38#define HPTE_LOCK_BIT 3 38#define HPTE_LOCK_BIT 3
39 39
40static DEFINE_SPINLOCK(native_tlbie_lock); 40static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
41 41
42static inline void __tlbie(unsigned long va, int psize, int ssize) 42static inline void __tlbie(unsigned long va, int psize, int ssize)
43{ 43{
@@ -104,7 +104,7 @@ static inline void tlbie(unsigned long va, int psize, int ssize, int local)
104 if (use_local) 104 if (use_local)
105 use_local = mmu_psize_defs[psize].tlbiel; 105 use_local = mmu_psize_defs[psize].tlbiel;
106 if (lock_tlbie && !use_local) 106 if (lock_tlbie && !use_local)
107 spin_lock(&native_tlbie_lock); 107 raw_spin_lock(&native_tlbie_lock);
108 asm volatile("ptesync": : :"memory"); 108 asm volatile("ptesync": : :"memory");
109 if (use_local) { 109 if (use_local) {
110 __tlbiel(va, psize, ssize); 110 __tlbiel(va, psize, ssize);
@@ -114,7 +114,7 @@ static inline void tlbie(unsigned long va, int psize, int ssize, int local)
114 asm volatile("eieio; tlbsync; ptesync": : :"memory"); 114 asm volatile("eieio; tlbsync; ptesync": : :"memory");
115 } 115 }
116 if (lock_tlbie && !use_local) 116 if (lock_tlbie && !use_local)
117 spin_unlock(&native_tlbie_lock); 117 raw_spin_unlock(&native_tlbie_lock);
118} 118}
119 119
120static inline void native_lock_hpte(struct hash_pte *hptep) 120static inline void native_lock_hpte(struct hash_pte *hptep)
@@ -122,7 +122,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
122 unsigned long *word = &hptep->v; 122 unsigned long *word = &hptep->v;
123 123
124 while (1) { 124 while (1) {
125 if (!test_and_set_bit(HPTE_LOCK_BIT, word)) 125 if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
126 break; 126 break;
127 while(test_bit(HPTE_LOCK_BIT, word)) 127 while(test_bit(HPTE_LOCK_BIT, word))
128 cpu_relax(); 128 cpu_relax();
@@ -133,8 +133,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep)
133{ 133{
134 unsigned long *word = &hptep->v; 134 unsigned long *word = &hptep->v;
135 135
136 asm volatile("lwsync":::"memory"); 136 clear_bit_unlock(HPTE_LOCK_BIT, word);
137 clear_bit(HPTE_LOCK_BIT, word);
138} 137}
139 138
140static long native_hpte_insert(unsigned long hpte_group, unsigned long va, 139static long native_hpte_insert(unsigned long hpte_group, unsigned long va,
@@ -434,7 +433,7 @@ static void native_hpte_clear(void)
434 /* we take the tlbie lock and hold it. Some hardware will 433 /* we take the tlbie lock and hold it. Some hardware will
435 * deadlock if we try to tlbie from two processors at once. 434 * deadlock if we try to tlbie from two processors at once.
436 */ 435 */
437 spin_lock(&native_tlbie_lock); 436 raw_spin_lock(&native_tlbie_lock);
438 437
439 slots = pteg_count * HPTES_PER_GROUP; 438 slots = pteg_count * HPTES_PER_GROUP;
440 439
@@ -458,7 +457,7 @@ static void native_hpte_clear(void)
458 } 457 }
459 458
460 asm volatile("eieio; tlbsync; ptesync":::"memory"); 459 asm volatile("eieio; tlbsync; ptesync":::"memory");
461 spin_unlock(&native_tlbie_lock); 460 raw_spin_unlock(&native_tlbie_lock);
462 local_irq_restore(flags); 461 local_irq_restore(flags);
463} 462}
464 463
@@ -521,7 +520,7 @@ static void native_flush_hash_range(unsigned long number, int local)
521 int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); 520 int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
522 521
523 if (lock_tlbie) 522 if (lock_tlbie)
524 spin_lock(&native_tlbie_lock); 523 raw_spin_lock(&native_tlbie_lock);
525 524
526 asm volatile("ptesync":::"memory"); 525 asm volatile("ptesync":::"memory");
527 for (i = 0; i < number; i++) { 526 for (i = 0; i < number; i++) {
@@ -536,7 +535,7 @@ static void native_flush_hash_range(unsigned long number, int local)
536 asm volatile("eieio; tlbsync; ptesync":::"memory"); 535 asm volatile("eieio; tlbsync; ptesync":::"memory");
537 536
538 if (lock_tlbie) 537 if (lock_tlbie)
539 spin_unlock(&native_tlbie_lock); 538 raw_spin_unlock(&native_tlbie_lock);
540 } 539 }
541 540
542 local_irq_restore(flags); 541 local_irq_restore(flags);
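
Two locking cleanups run through hash_native_64.c above: the HPTE lock now uses test_and_set_bit_lock()/clear_bit_unlock(), whose built-in acquire/release ordering makes the explicit lwsync before clear_bit() unnecessary, and native_tlbie_lock becomes a raw spinlock, presumably so it remains a true busy-wait lock even where ordinary spinlocks can sleep (e.g. under PREEMPT_RT). A userspace sketch of the same acquire/release bit-lock idea, written with GCC atomic builtins rather than the kernel bitops:

#include <stdio.h>

#define HPTE_LOCK_BIT 3   /* same bit number the HPTE code uses */

/* Acquire: atomically set the bit; ordering keeps later accesses after it. */
static void bit_lock(unsigned long *word)
{
    while (__atomic_fetch_or(word, 1UL << HPTE_LOCK_BIT,
                             __ATOMIC_ACQUIRE) & (1UL << HPTE_LOCK_BIT))
        ;   /* spin until the previous value had the bit clear */
}

/* Release: clear the bit; ordering keeps earlier accesses before it. */
static void bit_unlock(unsigned long *word)
{
    __atomic_fetch_and(word, ~(1UL << HPTE_LOCK_BIT), __ATOMIC_RELEASE);
}

int main(void)
{
    unsigned long hpte_v = 0;   /* stand-in for hptep->v */

    bit_lock(&hpte_v);
    printf("locked:   0x%lx\n", hpte_v);
    bit_unlock(&hpte_v);
    printf("unlocked: 0x%lx\n", hpte_v);
    return 0;
}
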
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 1ade7eb6ae00..3ecdcec0a39e 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -92,6 +92,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
92struct hash_pte *htab_address; 92struct hash_pte *htab_address;
93unsigned long htab_size_bytes; 93unsigned long htab_size_bytes;
94unsigned long htab_hash_mask; 94unsigned long htab_hash_mask;
95EXPORT_SYMBOL_GPL(htab_hash_mask);
95int mmu_linear_psize = MMU_PAGE_4K; 96int mmu_linear_psize = MMU_PAGE_4K;
96int mmu_virtual_psize = MMU_PAGE_4K; 97int mmu_virtual_psize = MMU_PAGE_4K;
97int mmu_vmalloc_psize = MMU_PAGE_4K; 98int mmu_vmalloc_psize = MMU_PAGE_4K;
@@ -102,6 +103,7 @@ int mmu_io_psize = MMU_PAGE_4K;
102int mmu_kernel_ssize = MMU_SEGSIZE_256M; 103int mmu_kernel_ssize = MMU_SEGSIZE_256M;
103int mmu_highuser_ssize = MMU_SEGSIZE_256M; 104int mmu_highuser_ssize = MMU_SEGSIZE_256M;
104u16 mmu_slb_size = 64; 105u16 mmu_slb_size = 64;
106EXPORT_SYMBOL_GPL(mmu_slb_size);
105#ifdef CONFIG_HUGETLB_PAGE 107#ifdef CONFIG_HUGETLB_PAGE
106unsigned int HPAGE_SHIFT; 108unsigned int HPAGE_SHIFT;
107#endif 109#endif
@@ -338,7 +340,7 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,
338 else 340 else
339 def->tlbiel = 0; 341 def->tlbiel = 0;
340 342
341 DBG(" %d: shift=%02x, sllp=%04x, avpnm=%08x, " 343 DBG(" %d: shift=%02x, sllp=%04lx, avpnm=%08lx, "
342 "tlbiel=%d, penc=%d\n", 344 "tlbiel=%d, penc=%d\n",
343 idx, shift, def->sllp, def->avpnm, def->tlbiel, 345 idx, shift, def->sllp, def->avpnm, def->tlbiel,
344 def->penc); 346 def->penc);
@@ -481,16 +483,6 @@ static void __init htab_init_page_sizes(void)
481#ifdef CONFIG_HUGETLB_PAGE 483#ifdef CONFIG_HUGETLB_PAGE
482 /* Reserve 16G huge page memory sections for huge pages */ 484 /* Reserve 16G huge page memory sections for huge pages */
483 of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); 485 of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
484
485/* Set default large page size. Currently, we pick 16M or 1M depending
486 * on what is available
487 */
488 if (mmu_psize_defs[MMU_PAGE_16M].shift)
489 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
490 /* With 4k/4level pagetables, we can't (for now) cope with a
491 * huge page size < PMD_SIZE */
492 else if (mmu_psize_defs[MMU_PAGE_1M].shift)
493 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
494#endif /* CONFIG_HUGETLB_PAGE */ 486#endif /* CONFIG_HUGETLB_PAGE */
495} 487}
496 488
@@ -671,7 +663,7 @@ static void __init htab_initialize(void)
671 base = (unsigned long)__va(lmb.memory.region[i].base); 663 base = (unsigned long)__va(lmb.memory.region[i].base);
672 size = lmb.memory.region[i].size; 664 size = lmb.memory.region[i].size;
673 665
674 DBG("creating mapping for region: %lx..%lx (prot: %x)\n", 666 DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
675 base, size, prot); 667 base, size, prot);
676 668
677#ifdef CONFIG_U3_DART 669#ifdef CONFIG_U3_DART
@@ -785,7 +777,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
785 /* page is dirty */ 777 /* page is dirty */
786 if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { 778 if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
787 if (trap == 0x400) { 779 if (trap == 0x400) {
788 __flush_dcache_icache(page_address(page)); 780 flush_dcache_icache_page(page);
789 set_bit(PG_arch_1, &page->flags); 781 set_bit(PG_arch_1, &page->flags);
790 } else 782 } else
791 pp |= HPTE_R_N; 783 pp |= HPTE_R_N;
@@ -843,9 +835,9 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
843 * Result is 0: full permissions, _PAGE_RW: read-only, 835 * Result is 0: full permissions, _PAGE_RW: read-only,
844 * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access. 836 * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access.
845 */ 837 */
846static int subpage_protection(pgd_t *pgdir, unsigned long ea) 838static int subpage_protection(struct mm_struct *mm, unsigned long ea)
847{ 839{
848 struct subpage_prot_table *spt = pgd_subpage_prot(pgdir); 840 struct subpage_prot_table *spt = &mm->context.spt;
849 u32 spp = 0; 841 u32 spp = 0;
850 u32 **sbpm, *sbpp; 842 u32 **sbpm, *sbpp;
851 843
@@ -873,7 +865,7 @@ static int subpage_protection(pgd_t *pgdir, unsigned long ea)
873} 865}
874 866
875#else /* CONFIG_PPC_SUBPAGE_PROT */ 867#else /* CONFIG_PPC_SUBPAGE_PROT */
876static inline int subpage_protection(pgd_t *pgdir, unsigned long ea) 868static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
877{ 869{
878 return 0; 870 return 0;
879} 871}
@@ -887,10 +879,11 @@ static inline int subpage_protection(pgd_t *pgdir, unsigned long ea)
887 */ 879 */
888int hash_page(unsigned long ea, unsigned long access, unsigned long trap) 880int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
889{ 881{
890 void *pgdir; 882 pgd_t *pgdir;
891 unsigned long vsid; 883 unsigned long vsid;
892 struct mm_struct *mm; 884 struct mm_struct *mm;
893 pte_t *ptep; 885 pte_t *ptep;
886 unsigned hugeshift;
894 const struct cpumask *tmp; 887 const struct cpumask *tmp;
895 int rc, user_region = 0, local = 0; 888 int rc, user_region = 0, local = 0;
896 int psize, ssize; 889 int psize, ssize;
@@ -943,30 +936,31 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
943 if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) 936 if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
944 local = 1; 937 local = 1;
945 938
946#ifdef CONFIG_HUGETLB_PAGE
947 /* Handle hugepage regions */
948 if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
949 DBG_LOW(" -> huge page !\n");
950 return hash_huge_page(mm, access, ea, vsid, local, trap);
951 }
952#endif /* CONFIG_HUGETLB_PAGE */
953
954#ifndef CONFIG_PPC_64K_PAGES 939#ifndef CONFIG_PPC_64K_PAGES
955 /* If we use 4K pages and our psize is not 4K, then we are hitting 940 /* If we use 4K pages and our psize is not 4K, then we might
956 * a special driver mapping, we need to align the address before 941 * be hitting a special driver mapping, and need to align the
957 * we fetch the PTE 942 * address before we fetch the PTE.
943 *
944 * It could also be a hugepage mapping, in which case this is
945 * not necessary, but it's not harmful, either.
958 */ 946 */
959 if (psize != MMU_PAGE_4K) 947 if (psize != MMU_PAGE_4K)
960 ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); 948 ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
961#endif /* CONFIG_PPC_64K_PAGES */ 949#endif /* CONFIG_PPC_64K_PAGES */
962 950
963 /* Get PTE and page size from page tables */ 951 /* Get PTE and page size from page tables */
964 ptep = find_linux_pte(pgdir, ea); 952 ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
965 if (ptep == NULL || !pte_present(*ptep)) { 953 if (ptep == NULL || !pte_present(*ptep)) {
966 DBG_LOW(" no PTE !\n"); 954 DBG_LOW(" no PTE !\n");
967 return 1; 955 return 1;
968 } 956 }
969 957
958#ifdef CONFIG_HUGETLB_PAGE
959 if (hugeshift)
960 return __hash_page_huge(ea, access, vsid, ptep, trap, local,
961 ssize, hugeshift, psize);
962#endif /* CONFIG_HUGETLB_PAGE */
963
970#ifndef CONFIG_PPC_64K_PAGES 964#ifndef CONFIG_PPC_64K_PAGES
971 DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); 965 DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
972#else 966#else
@@ -1031,7 +1025,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
1031 else 1025 else
1032#endif /* CONFIG_PPC_HAS_HASH_64K */ 1026#endif /* CONFIG_PPC_HAS_HASH_64K */
1033 { 1027 {
1034 int spp = subpage_protection(pgdir, ea); 1028 int spp = subpage_protection(mm, ea);
1035 if (access & spp) 1029 if (access & spp)
1036 rc = -2; 1030 rc = -2;
1037 else 1031 else
@@ -1121,7 +1115,7 @@ void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int ssize,
1121{ 1115{
1122 unsigned long hash, index, shift, hidx, slot; 1116 unsigned long hash, index, shift, hidx, slot;
1123 1117
1124 DBG_LOW("flush_hash_page(va=%016x)\n", va); 1118 DBG_LOW("flush_hash_page(va=%016lx)\n", va);
1125 pte_iterate_hashed_subpages(pte, psize, va, index, shift) { 1119 pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
1126 hash = hpt_hash(va, shift, ssize); 1120 hash = hpt_hash(va, shift, ssize);
1127 hidx = __rpte_to_hidx(pte, index); 1121 hidx = __rpte_to_hidx(pte, index);
@@ -1129,7 +1123,7 @@ void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int ssize,
1129 hash = ~hash; 1123 hash = ~hash;
1130 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; 1124 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
1131 slot += hidx & _PTEIDX_GROUP_IX; 1125 slot += hidx & _PTEIDX_GROUP_IX;
1132 DBG_LOW(" sub %d: hash=%x, hidx=%x\n", index, slot, hidx); 1126 DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
1133 ppc_md.hpte_invalidate(slot, va, psize, ssize, local); 1127 ppc_md.hpte_invalidate(slot, va, psize, ssize, local);
1134 } pte_iterate_hashed_end(); 1128 } pte_iterate_hashed_end();
1135} 1129}
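
With the hugepage path folded into the generic fault handler above, hash_page() makes a single find_linux_pte_or_hugepte() lookup that returns a PTE pointer and, via an out parameter, the hugepage shift (0 for an ordinary page), then branches on that shift. A tiny stand-alone model of that calling convention; the table layout and names below are invented purely to illustrate the pattern:

#include <stdio.h>

#define BASE_PAGE_SHIFT 12   /* invented: an ordinary 4 KB base page */

/* Toy page-table entry: a frame number plus the shift of its mapping. */
struct toy_pte {
    unsigned long pfn;
    unsigned int  shift;
};

/*
 * Return the entry and report the hugepage shift through *shift, the way
 * find_linux_pte_or_hugepte() does; 0 means "ordinary base-page mapping".
 */
static struct toy_pte *toy_lookup(struct toy_pte *table, unsigned long idx,
                                  unsigned int *shift)
{
    struct toy_pte *pte = &table[idx];

    if (shift)
        *shift = (pte->shift == BASE_PAGE_SHIFT) ? 0 : pte->shift;
    return pte;
}

int main(void)
{
    struct toy_pte table[2] = { { 0x1234, 12 }, { 0x40, 24 } }; /* 4 KB, 16 MB */
    unsigned int hugeshift;

    for (unsigned long i = 0; i < 2; i++) {
        struct toy_pte *pte = toy_lookup(table, i, &hugeshift);

        if (hugeshift)
            printf("entry %lu: hugepage, shift %u\n", i, hugeshift);
        else
            printf("entry %lu: normal page, pfn 0x%lx\n", i, pte->pfn);
    }
    return 0;
}
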
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
new file mode 100644
index 000000000000..199539882f92
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -0,0 +1,139 @@
1/*
2 * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
3 *
4 * Copyright (C) 2003 David Gibson, IBM Corporation.
5 *
6 * Based on the IA-32 version:
7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
8 */
9
10#include <linux/mm.h>
11#include <linux/hugetlb.h>
12#include <asm/pgtable.h>
13#include <asm/pgalloc.h>
14#include <asm/cacheflush.h>
15#include <asm/machdep.h>
16
17int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
18 pte_t *ptep, unsigned long trap, int local, int ssize,
19 unsigned int shift, unsigned int mmu_psize)
20{
21 unsigned long old_pte, new_pte;
22 unsigned long va, rflags, pa, sz;
23 long slot;
24 int err = 1;
25
26 BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
27
28 /* Search the Linux page table for a match with va */
29 va = hpt_va(ea, vsid, ssize);
30
31 /*
32 * Check the user's access rights to the page. If access should be
33 * prevented then send the problem up to do_page_fault.
34 */
35 if (unlikely(access & ~pte_val(*ptep)))
36 goto out;
37 /*
38 * At this point, we have a pte (old_pte) which can be used to build
39 * or update an HPTE. There are 2 cases:
40 *
41 * 1. There is a valid (present) pte with no associated HPTE (this is
42 * the most common case)
43 * 2. There is a valid (present) pte with an associated HPTE. The
44 * current values of the pp bits in the HPTE prevent access
45 * because we are doing software DIRTY bit management and the
46 * page is currently not DIRTY.
47 */
48
49
50 do {
51 old_pte = pte_val(*ptep);
52 if (old_pte & _PAGE_BUSY)
53 goto out;
54 new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
55 } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
56 old_pte, new_pte));
57
58 rflags = 0x2 | (!(new_pte & _PAGE_RW));
59 /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
60 rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
61 sz = ((1UL) << shift);
62 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
63 /* No CPU has hugepages but lacks no execute, so we
64 * don't need to worry about that case */
65 rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
66
67 /* Check if pte already has an hpte (case 2) */
68 if (unlikely(old_pte & _PAGE_HASHPTE)) {
69 /* There MIGHT be an HPTE for this pte */
70 unsigned long hash, slot;
71
72 hash = hpt_hash(va, shift, ssize);
73 if (old_pte & _PAGE_F_SECOND)
74 hash = ~hash;
75 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
76 slot += (old_pte & _PAGE_F_GIX) >> 12;
77
78 if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
79 ssize, local) == -1)
80 old_pte &= ~_PAGE_HPTEFLAGS;
81 }
82
83 if (likely(!(old_pte & _PAGE_HASHPTE))) {
84 unsigned long hash = hpt_hash(va, shift, ssize);
85 unsigned long hpte_group;
86
87 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
88
89repeat:
90 hpte_group = ((hash & htab_hash_mask) *
91 HPTES_PER_GROUP) & ~0x7UL;
92
93 /* clear HPTE slot informations in new PTE */
94#ifdef CONFIG_PPC_64K_PAGES
95 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
96#else
97 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
98#endif
99 /* Add in WIMG bits */
100 rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
101 _PAGE_COHERENT | _PAGE_GUARDED));
102
103 /* Insert into the hash table, primary slot */
104 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
105 mmu_psize, ssize);
106
107 /* Primary is full, try the secondary */
108 if (unlikely(slot == -1)) {
109 hpte_group = ((~hash & htab_hash_mask) *
110 HPTES_PER_GROUP) & ~0x7UL;
111 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
112 HPTE_V_SECONDARY,
113 mmu_psize, ssize);
114 if (slot == -1) {
115 if (mftb() & 0x1)
116 hpte_group = ((hash & htab_hash_mask) *
117 HPTES_PER_GROUP)&~0x7UL;
118
119 ppc_md.hpte_remove(hpte_group);
120 goto repeat;
121 }
122 }
123
124 if (unlikely(slot == -2))
125 panic("hash_huge_page: pte_insert failed\n");
126
127 new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
128 }
129
130 /*
131 * No need to use ldarx/stdcx here
132 */
133 *ptep = __pte(new_pte & ~_PAGE_BUSY);
134
135 err = 0;
136
137 out:
138 return err;
139}
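
The new __hash_page_huge() above follows the standard HPT insertion dance: hash the VA, try the primary group of 8 slots at (hash & htab_hash_mask) * HPTES_PER_GROUP, fall back to the secondary group derived from ~hash, and record the chosen group and 3-bit slot index in the PTE's _PAGE_F_SECOND/_PAGE_F_GIX bits so a later update or invalidate can find the HPTE again. The group arithmetic in isolation, using an invented hash value and table mask:

#include <stdio.h>

#define HPTES_PER_GROUP 8UL

int main(void)
{
    /* Invented values: a hash table with 2^17 groups and an arbitrary hash. */
    unsigned long htab_hash_mask = (1UL << 17) - 1;
    unsigned long hash = 0x2345678UL;

    /* Primary group: 8 consecutive slots, 8-slot aligned. */
    unsigned long primary = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;

    /* Secondary group comes from the complemented hash. */
    unsigned long secondary = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;

    printf("primary group starts at slot   %lu\n", primary);
    printf("secondary group starts at slot %lu\n", secondary);

    /* If the insert landed in slot 5 of the primary group, the PTE only
     * needs the 3-bit group index (plus a "secondary" flag) to find it. */
    unsigned long gix = 5;
    printf("HPTE to update/invalidate later: slot %lu\n", primary + gix);
    return 0;
}
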
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 90df6ffe3a43..9bb249c3046e 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -7,29 +7,18 @@
7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> 7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
8 */ 8 */
9 9
10#include <linux/init.h>
11#include <linux/fs.h>
12#include <linux/mm.h> 10#include <linux/mm.h>
13#include <linux/hugetlb.h> 11#include <linux/io.h>
14#include <linux/pagemap.h>
15#include <linux/slab.h> 12#include <linux/slab.h>
16#include <linux/err.h> 13#include <linux/hugetlb.h>
17#include <linux/sysctl.h> 14#include <asm/pgtable.h>
18#include <asm/mman.h>
19#include <asm/pgalloc.h> 15#include <asm/pgalloc.h>
20#include <asm/tlb.h> 16#include <asm/tlb.h>
21#include <asm/tlbflush.h>
22#include <asm/mmu_context.h>
23#include <asm/machdep.h>
24#include <asm/cputable.h>
25#include <asm/spu.h>
26 17
27#define PAGE_SHIFT_64K 16 18#define PAGE_SHIFT_64K 16
28#define PAGE_SHIFT_16M 24 19#define PAGE_SHIFT_16M 24
29#define PAGE_SHIFT_16G 34 20#define PAGE_SHIFT_16G 34
30 21
31#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT)
32#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
33#define MAX_NUMBER_GPAGES 1024 22#define MAX_NUMBER_GPAGES 1024
34 23
35/* Tracks the 16G pages after the device tree is scanned and before the 24/* Tracks the 16G pages after the device tree is scanned and before the
@@ -37,53 +26,17 @@
37static unsigned long gpage_freearray[MAX_NUMBER_GPAGES]; 26static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
38static unsigned nr_gpages; 27static unsigned nr_gpages;
39 28
40/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
41 * stored for the huge page sizes that are valid.
42 */
43unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
44
45#define hugepte_shift mmu_huge_psizes
46#define PTRS_PER_HUGEPTE(psize) (1 << hugepte_shift[psize])
47#define HUGEPTE_TABLE_SIZE(psize) (sizeof(pte_t) << hugepte_shift[psize])
48
49#define HUGEPD_SHIFT(psize) (mmu_psize_to_shift(psize) \
50 + hugepte_shift[psize])
51#define HUGEPD_SIZE(psize) (1UL << HUGEPD_SHIFT(psize))
52#define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1))
53
54/* Subtract one from array size because we don't need a cache for 4K since
55 * is not a huge page size */
56#define HUGE_PGTABLE_INDEX(psize) (HUGEPTE_CACHE_NUM + psize - 1)
57#define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize])
58
59static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
60 [MMU_PAGE_64K] = "hugepte_cache_64K",
61 [MMU_PAGE_1M] = "hugepte_cache_1M",
62 [MMU_PAGE_16M] = "hugepte_cache_16M",
63 [MMU_PAGE_16G] = "hugepte_cache_16G",
64};
65
66/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() 29/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
67 * will choke on pointers to hugepte tables, which is handy for 30 * will choke on pointers to hugepte tables, which is handy for
68 * catching screwups early. */ 31 * catching screwups early. */
69#define HUGEPD_OK 0x1
70
71typedef struct { unsigned long pd; } hugepd_t;
72
73#define hugepd_none(hpd) ((hpd).pd == 0)
74 32
75static inline int shift_to_mmu_psize(unsigned int shift) 33static inline int shift_to_mmu_psize(unsigned int shift)
76{ 34{
77 switch (shift) { 35 int psize;
78#ifndef CONFIG_PPC_64K_PAGES 36
79 case PAGE_SHIFT_64K: 37 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
80 return MMU_PAGE_64K; 38 if (mmu_psize_defs[psize].shift == shift)
81#endif 39 return psize;
82 case PAGE_SHIFT_16M:
83 return MMU_PAGE_16M;
84 case PAGE_SHIFT_16G:
85 return MMU_PAGE_16G;
86 }
87 return -1; 40 return -1;
88} 41}
89 42
@@ -94,71 +47,126 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
94 BUG(); 47 BUG();
95} 48}
96 49
50#define hugepd_none(hpd) ((hpd).pd == 0)
51
97static inline pte_t *hugepd_page(hugepd_t hpd) 52static inline pte_t *hugepd_page(hugepd_t hpd)
98{ 53{
99 BUG_ON(!(hpd.pd & HUGEPD_OK)); 54 BUG_ON(!hugepd_ok(hpd));
100 return (pte_t *)(hpd.pd & ~HUGEPD_OK); 55 return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
56}
57
58static inline unsigned int hugepd_shift(hugepd_t hpd)
59{
60 return hpd.pd & HUGEPD_SHIFT_MASK;
101} 61}
102 62
103static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, 63static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
104 struct hstate *hstate)
105{ 64{
106 unsigned int shift = huge_page_shift(hstate); 65 unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
107 int psize = shift_to_mmu_psize(shift);
108 unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
109 pte_t *dir = hugepd_page(*hpdp); 66 pte_t *dir = hugepd_page(*hpdp);
110 67
111 return dir + idx; 68 return dir + idx;
112} 69}
113 70
71pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
72{
73 pgd_t *pg;
74 pud_t *pu;
75 pmd_t *pm;
76 hugepd_t *hpdp = NULL;
77 unsigned pdshift = PGDIR_SHIFT;
78
79 if (shift)
80 *shift = 0;
81
82 pg = pgdir + pgd_index(ea);
83 if (is_hugepd(pg)) {
84 hpdp = (hugepd_t *)pg;
85 } else if (!pgd_none(*pg)) {
86 pdshift = PUD_SHIFT;
87 pu = pud_offset(pg, ea);
88 if (is_hugepd(pu))
89 hpdp = (hugepd_t *)pu;
90 else if (!pud_none(*pu)) {
91 pdshift = PMD_SHIFT;
92 pm = pmd_offset(pu, ea);
93 if (is_hugepd(pm))
94 hpdp = (hugepd_t *)pm;
95 else if (!pmd_none(*pm)) {
96 return pte_offset_map(pm, ea);
97 }
98 }
99 }
100
101 if (!hpdp)
102 return NULL;
103
104 if (shift)
105 *shift = hugepd_shift(*hpdp);
106 return hugepte_offset(hpdp, ea, pdshift);
107}
108
109pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
110{
111 return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
112}
113
114static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, 114static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
115 unsigned long address, unsigned int psize) 115 unsigned long address, unsigned pdshift, unsigned pshift)
116{ 116{
117 pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], 117 pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
118 GFP_KERNEL|__GFP_REPEAT); 118 GFP_KERNEL|__GFP_REPEAT);
119
120 BUG_ON(pshift > HUGEPD_SHIFT_MASK);
121 BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
119 122
120 if (! new) 123 if (! new)
121 return -ENOMEM; 124 return -ENOMEM;
122 125
123 spin_lock(&mm->page_table_lock); 126 spin_lock(&mm->page_table_lock);
124 if (!hugepd_none(*hpdp)) 127 if (!hugepd_none(*hpdp))
125 kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new); 128 kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
126 else 129 else
127 hpdp->pd = (unsigned long)new | HUGEPD_OK; 130 hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
128 spin_unlock(&mm->page_table_lock); 131 spin_unlock(&mm->page_table_lock);
129 return 0; 132 return 0;
130} 133}
131 134
132 135pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
133static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
134{
135 if (huge_page_shift(hstate) < PUD_SHIFT)
136 return pud_offset(pgd, addr);
137 else
138 return (pud_t *) pgd;
139}
140static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
141 struct hstate *hstate)
142{
143 if (huge_page_shift(hstate) < PUD_SHIFT)
144 return pud_alloc(mm, pgd, addr);
145 else
146 return (pud_t *) pgd;
147}
148static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
149{ 136{
150 if (huge_page_shift(hstate) < PMD_SHIFT) 137 pgd_t *pg;
151 return pmd_offset(pud, addr); 138 pud_t *pu;
152 else 139 pmd_t *pm;
153 return (pmd_t *) pud; 140 hugepd_t *hpdp = NULL;
154} 141 unsigned pshift = __ffs(sz);
155static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr, 142 unsigned pdshift = PGDIR_SHIFT;
156 struct hstate *hstate) 143
157{ 144 addr &= ~(sz-1);
158 if (huge_page_shift(hstate) < PMD_SHIFT) 145
159 return pmd_alloc(mm, pud, addr); 146 pg = pgd_offset(mm, addr);
160 else 147 if (pshift >= PUD_SHIFT) {
161 return (pmd_t *) pud; 148 hpdp = (hugepd_t *)pg;
149 } else {
150 pdshift = PUD_SHIFT;
151 pu = pud_alloc(mm, pg, addr);
152 if (pshift >= PMD_SHIFT) {
153 hpdp = (hugepd_t *)pu;
154 } else {
155 pdshift = PMD_SHIFT;
156 pm = pmd_alloc(mm, pu, addr);
157 hpdp = (hugepd_t *)pm;
158 }
159 }
160
161 if (!hpdp)
162 return NULL;
163
164 BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
165
166 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
167 return NULL;
168
169 return hugepte_offset(hpdp, addr, pdshift);
162} 170}
163 171
164/* Build list of addresses of gigantic pages. This function is used in early 172/* Build list of addresses of gigantic pages. This function is used in early
@@ -192,94 +200,38 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
192 return 1; 200 return 1;
193} 201}
194 202
195
196/* Modelled after find_linux_pte() */
197pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
198{
199 pgd_t *pg;
200 pud_t *pu;
201 pmd_t *pm;
202
203 unsigned int psize;
204 unsigned int shift;
205 unsigned long sz;
206 struct hstate *hstate;
207 psize = get_slice_psize(mm, addr);
208 shift = mmu_psize_to_shift(psize);
209 sz = ((1UL) << shift);
210 hstate = size_to_hstate(sz);
211
212 addr &= hstate->mask;
213
214 pg = pgd_offset(mm, addr);
215 if (!pgd_none(*pg)) {
216 pu = hpud_offset(pg, addr, hstate);
217 if (!pud_none(*pu)) {
218 pm = hpmd_offset(pu, addr, hstate);
219 if (!pmd_none(*pm))
220 return hugepte_offset((hugepd_t *)pm, addr,
221 hstate);
222 }
223 }
224
225 return NULL;
226}
227
228pte_t *huge_pte_alloc(struct mm_struct *mm,
229 unsigned long addr, unsigned long sz)
230{
231 pgd_t *pg;
232 pud_t *pu;
233 pmd_t *pm;
234 hugepd_t *hpdp = NULL;
235 struct hstate *hstate;
236 unsigned int psize;
237 hstate = size_to_hstate(sz);
238
239 psize = get_slice_psize(mm, addr);
240 BUG_ON(!mmu_huge_psizes[psize]);
241
242 addr &= hstate->mask;
243
244 pg = pgd_offset(mm, addr);
245 pu = hpud_alloc(mm, pg, addr, hstate);
246
247 if (pu) {
248 pm = hpmd_alloc(mm, pu, addr, hstate);
249 if (pm)
250 hpdp = (hugepd_t *)pm;
251 }
252
253 if (! hpdp)
254 return NULL;
255
256 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
257 return NULL;
258
259 return hugepte_offset(hpdp, addr, hstate);
260}
261
262int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 203int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
263{ 204{
264 return 0; 205 return 0;
265} 206}
266 207
267static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp, 208static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
268 unsigned int psize) 209 unsigned long start, unsigned long end,
210 unsigned long floor, unsigned long ceiling)
269{ 211{
270 pte_t *hugepte = hugepd_page(*hpdp); 212 pte_t *hugepte = hugepd_page(*hpdp);
213 unsigned shift = hugepd_shift(*hpdp);
214 unsigned long pdmask = ~((1UL << pdshift) - 1);
215
216 start &= pdmask;
217 if (start < floor)
218 return;
219 if (ceiling) {
220 ceiling &= pdmask;
221 if (! ceiling)
222 return;
223 }
224 if (end - 1 > ceiling - 1)
225 return;
271 226
272 hpdp->pd = 0; 227 hpdp->pd = 0;
273 tlb->need_flush = 1; 228 tlb->need_flush = 1;
274 pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, 229 pgtable_free_tlb(tlb, hugepte, pdshift - shift);
275 HUGEPTE_CACHE_NUM+psize-1,
276 PGF_CACHENUM_MASK));
277} 230}
278 231
279static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 232static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
280 unsigned long addr, unsigned long end, 233 unsigned long addr, unsigned long end,
281 unsigned long floor, unsigned long ceiling, 234 unsigned long floor, unsigned long ceiling)
282 unsigned int psize)
283{ 235{
284 pmd_t *pmd; 236 pmd_t *pmd;
285 unsigned long next; 237 unsigned long next;
@@ -291,7 +243,8 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
291 next = pmd_addr_end(addr, end); 243 next = pmd_addr_end(addr, end);
292 if (pmd_none(*pmd)) 244 if (pmd_none(*pmd))
293 continue; 245 continue;
294 free_hugepte_range(tlb, (hugepd_t *)pmd, psize); 246 free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
247 addr, next, floor, ceiling);
295 } while (pmd++, addr = next, addr != end); 248 } while (pmd++, addr = next, addr != end);
296 249
297 start &= PUD_MASK; 250 start &= PUD_MASK;
@@ -317,23 +270,19 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
317 pud_t *pud; 270 pud_t *pud;
318 unsigned long next; 271 unsigned long next;
319 unsigned long start; 272 unsigned long start;
320 unsigned int shift;
321 unsigned int psize = get_slice_psize(tlb->mm, addr);
322 shift = mmu_psize_to_shift(psize);
323 273
324 start = addr; 274 start = addr;
325 pud = pud_offset(pgd, addr); 275 pud = pud_offset(pgd, addr);
326 do { 276 do {
327 next = pud_addr_end(addr, end); 277 next = pud_addr_end(addr, end);
328 if (shift < PMD_SHIFT) { 278 if (!is_hugepd(pud)) {
329 if (pud_none_or_clear_bad(pud)) 279 if (pud_none_or_clear_bad(pud))
330 continue; 280 continue;
331 hugetlb_free_pmd_range(tlb, pud, addr, next, floor, 281 hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
332 ceiling, psize); 282 ceiling);
333 } else { 283 } else {
334 if (pud_none(*pud)) 284 free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
335 continue; 285 addr, next, floor, ceiling);
336 free_hugepte_range(tlb, (hugepd_t *)pud, psize);
337 } 286 }
338 } while (pud++, addr = next, addr != end); 287 } while (pud++, addr = next, addr != end);
339 288
@@ -364,121 +313,56 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
364{ 313{
365 pgd_t *pgd; 314 pgd_t *pgd;
366 unsigned long next; 315 unsigned long next;
367 unsigned long start;
368 316
369 /* 317 /*
370 * Comments below take from the normal free_pgd_range(). They 318 * Because there are a number of different possible pagetable
371 * apply here too. The tests against HUGEPD_MASK below are 319 * layouts for hugepage ranges, we limit knowledge of how
372 * essential, because we *don't* test for this at the bottom 320 * things should be laid out to the allocation path
373 * level. Without them we'll attempt to free a hugepte table 321 * (huge_pte_alloc(), above). Everything else works out the
374 * when we unmap just part of it, even if there are other 322 * structure as it goes from information in the hugepd
375 * active mappings using it. 323 * pointers. That means that we can't here use the
376 * 324 * optimization used in the normal page free_pgd_range(), of
377 * The next few lines have given us lots of grief... 325 * checking whether we're actually covering a large enough
378 * 326 * range to have to do anything at the top level of the walk
379 * Why are we testing HUGEPD* at this top level? Because 327 * instead of at the bottom.
380 * often there will be no work to do at all, and we'd prefer
381 * not to go all the way down to the bottom just to discover
382 * that.
383 * 328 *
384 * Why all these "- 1"s? Because 0 represents both the bottom 329 * To make sense of this, you should probably go read the big
385 * of the address space and the top of it (using -1 for the 330 * block comment at the top of the normal free_pgd_range(),
386 * top wouldn't help much: the masks would do the wrong thing). 331 * too.
387 * The rule is that addr 0 and floor 0 refer to the bottom of
388 * the address space, but end 0 and ceiling 0 refer to the top
389 * Comparisons need to use "end - 1" and "ceiling - 1" (though
390 * that end 0 case should be mythical).
391 *
392 * Wherever addr is brought up or ceiling brought down, we
393 * must be careful to reject "the opposite 0" before it
394 * confuses the subsequent tests. But what about where end is
395 * brought down by HUGEPD_SIZE below? no, end can't go down to
396 * 0 there.
397 *
398 * Whereas we round start (addr) and ceiling down, by different
399 * masks at different levels, in order to test whether a table
400 * now has no other vmas using it, so can be freed, we don't
401 * bother to round floor or end up - the tests don't need that.
402 */ 332 */
403 unsigned int psize = get_slice_psize(tlb->mm, addr);
404
405 addr &= HUGEPD_MASK(psize);
406 if (addr < floor) {
407 addr += HUGEPD_SIZE(psize);
408 if (!addr)
409 return;
410 }
411 if (ceiling) {
412 ceiling &= HUGEPD_MASK(psize);
413 if (!ceiling)
414 return;
415 }
416 if (end - 1 > ceiling - 1)
417 end -= HUGEPD_SIZE(psize);
418 if (addr > end - 1)
419 return;
420 333
421 start = addr;
422 pgd = pgd_offset(tlb->mm, addr); 334 pgd = pgd_offset(tlb->mm, addr);
423 do { 335 do {
424 psize = get_slice_psize(tlb->mm, addr);
425 BUG_ON(!mmu_huge_psizes[psize]);
426 next = pgd_addr_end(addr, end); 336 next = pgd_addr_end(addr, end);
427 if (mmu_psize_to_shift(psize) < PUD_SHIFT) { 337 if (!is_hugepd(pgd)) {
428 if (pgd_none_or_clear_bad(pgd)) 338 if (pgd_none_or_clear_bad(pgd))
429 continue; 339 continue;
430 hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); 340 hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
431 } else { 341 } else {
432 if (pgd_none(*pgd)) 342 free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
433 continue; 343 addr, next, floor, ceiling);
434 free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
435 } 344 }
436 } while (pgd++, addr = next, addr != end); 345 } while (pgd++, addr = next, addr != end);
437} 346}
438 347
439void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
440 pte_t *ptep, pte_t pte)
441{
442 if (pte_present(*ptep)) {
443 /* We open-code pte_clear because we need to pass the right
444 * argument to hpte_need_flush (huge / !huge). Might not be
445 * necessary anymore if we make hpte_need_flush() get the
446 * page size from the slices
447 */
448 unsigned int psize = get_slice_psize(mm, addr);
449 unsigned int shift = mmu_psize_to_shift(psize);
450 unsigned long sz = ((1UL) << shift);
451 struct hstate *hstate = size_to_hstate(sz);
452 pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
453 }
454 *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
455}
456
457pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
458 pte_t *ptep)
459{
460 unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
461 return __pte(old);
462}
463
464struct page * 348struct page *
465follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) 349follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
466{ 350{
467 pte_t *ptep; 351 pte_t *ptep;
468 struct page *page; 352 struct page *page;
469 unsigned int mmu_psize = get_slice_psize(mm, address); 353 unsigned shift;
354 unsigned long mask;
355
356 ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
470 357
471 /* Verify it is a huge page else bail. */ 358 /* Verify it is a huge page else bail. */
472 if (!mmu_huge_psizes[mmu_psize]) 359 if (!ptep || !shift)
473 return ERR_PTR(-EINVAL); 360 return ERR_PTR(-EINVAL);
474 361
475 ptep = huge_pte_offset(mm, address); 362 mask = (1UL << shift) - 1;
476 page = pte_page(*ptep); 363 page = pte_page(*ptep);
477 if (page) { 364 if (page)
478 unsigned int shift = mmu_psize_to_shift(mmu_psize); 365 page += (address & mask) / PAGE_SIZE;
479 unsigned long sz = ((1UL) << shift);
480 page += (address % sz) / PAGE_SIZE;
481 }
482 366
483 return page; 367 return page;
484} 368}
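The rewritten follow_huge_addr() no longer consults the slice map for the page size; it gets the shift back from find_linux_pte_or_hugepte() and indexes into the compound page with (address & mask) / PAGE_SIZE. A quick standalone illustration of that arithmetic with example shift and page-size values:

#include <stdio.h>

#define EX_PAGE_SHIFT 12                        /* 4K base pages */
#define EX_PAGE_SIZE  (1UL << EX_PAGE_SHIFT)

int main(void)
{
        unsigned int shift = 24;                /* 16M huge page */
        unsigned long mask = (1UL << shift) - 1;
        unsigned long address = 0x10a3c123;

        /* index of the 4K sub-page within the 16M compound page */
        unsigned long idx = (address & mask) / EX_PAGE_SIZE;

        printf("address %#lx -> head page + %lu\n", address, idx);
        return 0;
}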
@@ -501,6 +385,82 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
501 return NULL; 385 return NULL;
502} 386}
503 387
388static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
389 unsigned long end, int write, struct page **pages, int *nr)
390{
391 unsigned long mask;
392 unsigned long pte_end;
393 struct page *head, *page;
394 pte_t pte;
395 int refs;
396
397 pte_end = (addr + sz) & ~(sz-1);
398 if (pte_end < end)
399 end = pte_end;
400
401 pte = *ptep;
402 mask = _PAGE_PRESENT | _PAGE_USER;
403 if (write)
404 mask |= _PAGE_RW;
405
406 if ((pte_val(pte) & mask) != mask)
407 return 0;
408
409 /* hugepages are never "special" */
410 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
411
412 refs = 0;
413 head = pte_page(pte);
414
415 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
416 do {
417 VM_BUG_ON(compound_head(page) != head);
418 pages[*nr] = page;
419 (*nr)++;
420 page++;
421 refs++;
422 } while (addr += PAGE_SIZE, addr != end);
423
424 if (!page_cache_add_speculative(head, refs)) {
425 *nr -= refs;
426 return 0;
427 }
428
429 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
430 /* Could be optimized better */
431 while (*nr) {
432 put_page(page);
433 (*nr)--;
434 }
435 }
436
437 return 1;
438}
439
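gup_hugepte() takes its references optimistically: snapshot the PTE, grab the refs, then re-read the PTE and back out if it changed underneath (page_cache_add_speculative() plus the pte_val() recheck). Below is a stripped-down userspace sketch of that shape using C11 atomics; none of the names are kernel symbols and the back-out path is simplified:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long fake_pte = 0x1000UL | 1;   /* "pfn" | present */
static _Atomic int refcount = 1;

static int grab_refs(int refs)
{
        unsigned long snap = atomic_load(&fake_pte);    /* 1. snapshot */

        if (!(snap & 1))                                /* not present: bail */
                return 0;

        atomic_fetch_add(&refcount, refs);              /* 2. take references */

        if (atomic_load(&fake_pte) != snap) {           /* 3. re-check */
                atomic_fetch_sub(&refcount, refs);      /*    raced: back out */
                return 0;
        }
        return 1;                                       /* refs are now safe */
}

int main(void)
{
        printf("grab_refs: %d, refcount now %d\n",
               grab_refs(4), atomic_load(&refcount));
        return 0;
}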
440static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
441 unsigned long sz)
442{
443 unsigned long __boundary = (addr + sz) & ~(sz-1);
444 return (__boundary - 1 < end - 1) ? __boundary : end;
445}
446
447int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
448 unsigned long addr, unsigned long end,
449 int write, struct page **pages, int *nr)
450{
451 pte_t *ptep;
452 unsigned long sz = 1UL << hugepd_shift(*hugepd);
453 unsigned long next;
454
455 ptep = hugepte_offset(hugepd, addr, pdshift);
456 do {
457 next = hugepte_addr_end(addr, end, sz);
458 if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
459 return 0;
460 } while (ptep++, addr = next, addr != end);
461
462 return 1;
463}
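hugepte_addr_end() picks the smaller of the next sz-aligned boundary and end, using x - 1 comparisons so that an end of 0 (the very top of the address space) still compares as the largest value. A standalone copy of the helper with a small demo loop that mirrors how gup_hugepd() steps through a range:

#include <stdio.h>

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
                                      unsigned long sz)
{
        unsigned long boundary = (addr + sz) & ~(sz - 1);
        return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
        unsigned long sz = 1UL << 24;                   /* 16M steps */
        unsigned long addr = 0x10000000, end = 0x13800000;

        do {
                unsigned long next = hugepte_addr_end(addr, end, sz);
                printf("step [%#lx, %#lx)\n", addr, next);
                addr = next;
        } while (addr != end);
        return 0;
}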
504 464
505unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 465unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
506 unsigned long len, unsigned long pgoff, 466 unsigned long len, unsigned long pgoff,
@@ -509,8 +469,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
509 struct hstate *hstate = hstate_file(file); 469 struct hstate *hstate = hstate_file(file);
510 int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); 470 int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
511 471
512 if (!mmu_huge_psizes[mmu_psize])
513 return -EINVAL;
514 return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); 472 return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
515} 473}
516 474
@@ -521,229 +479,46 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
521 return 1UL << mmu_psize_to_shift(psize); 479 return 1UL << mmu_psize_to_shift(psize);
522} 480}
523 481
524/* 482static int __init add_huge_page_size(unsigned long long size)
525 * Called by asm hashtable.S for doing lazy icache flush
526 */
527static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
528 pte_t pte, int trap, unsigned long sz)
529{ 483{
530 struct page *page; 484 int shift = __ffs(size);
531 int i; 485 int mmu_psize;
532
533 if (!pfn_valid(pte_pfn(pte)))
534 return rflags;
535
536 page = pte_page(pte);
537
538 /* page is dirty */
539 if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
540 if (trap == 0x400) {
541 for (i = 0; i < (sz / PAGE_SIZE); i++)
542 __flush_dcache_icache(page_address(page+i));
543 set_bit(PG_arch_1, &page->flags);
544 } else {
545 rflags |= HPTE_R_N;
546 }
547 }
548 return rflags;
549}
550 486
551int hash_huge_page(struct mm_struct *mm, unsigned long access, 487 /* Check that it is a page size supported by the hardware and
552 unsigned long ea, unsigned long vsid, int local, 488 * that it fits within pagetable and slice limits. */
553 unsigned long trap) 489 if (!is_power_of_2(size)
554{ 490 || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
555 pte_t *ptep; 491 return -EINVAL;
556 unsigned long old_pte, new_pte;
557 unsigned long va, rflags, pa, sz;
558 long slot;
559 int err = 1;
560 int ssize = user_segment_size(ea);
561 unsigned int mmu_psize;
562 int shift;
563 mmu_psize = get_slice_psize(mm, ea);
564
565 if (!mmu_huge_psizes[mmu_psize])
566 goto out;
567 ptep = huge_pte_offset(mm, ea);
568
569 /* Search the Linux page table for a match with va */
570 va = hpt_va(ea, vsid, ssize);
571 492
572 /* 493 if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
573 * If no pte found or not present, send the problem up to 494 return -EINVAL;
574 * do_page_fault
575 */
576 if (unlikely(!ptep || pte_none(*ptep)))
577 goto out;
578 495
579 /* 496#ifdef CONFIG_SPU_FS_64K_LS
580 * Check the user's access rights to the page. If access should be 497 /* Disable support for 64K huge pages when 64K SPU local store
581 * prevented then send the problem up to do_page_fault. 498 * support is enabled as the current implementation conflicts.
582 */
583 if (unlikely(access & ~pte_val(*ptep)))
584 goto out;
585 /*
586 * At this point, we have a pte (old_pte) which can be used to build
587 * or update an HPTE. There are 2 cases:
588 *
589 * 1. There is a valid (present) pte with no associated HPTE (this is
590 * the most common case)
591 * 2. There is a valid (present) pte with an associated HPTE. The
592 * current values of the pp bits in the HPTE prevent access
593 * because we are doing software DIRTY bit management and the
594 * page is currently not DIRTY.
595 */ 499 */
500 if (shift == PAGE_SHIFT_64K)
501 return -EINVAL;
502#endif /* CONFIG_SPU_FS_64K_LS */
596 503
504 BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
597 505
598 do { 506 /* Return if huge page size has already been setup */
599 old_pte = pte_val(*ptep); 507 if (size_to_hstate(size))
600 if (old_pte & _PAGE_BUSY) 508 return 0;
601 goto out;
602 new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
603 } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
604 old_pte, new_pte));
605
606 rflags = 0x2 | (!(new_pte & _PAGE_RW));
607 /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
608 rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
609 shift = mmu_psize_to_shift(mmu_psize);
610 sz = ((1UL) << shift);
611 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
612 /* No CPU has hugepages but lacks no execute, so we
613 * don't need to worry about that case */
614 rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
615 trap, sz);
616
617 /* Check if pte already has an hpte (case 2) */
618 if (unlikely(old_pte & _PAGE_HASHPTE)) {
619 /* There MIGHT be an HPTE for this pte */
620 unsigned long hash, slot;
621
622 hash = hpt_hash(va, shift, ssize);
623 if (old_pte & _PAGE_F_SECOND)
624 hash = ~hash;
625 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
626 slot += (old_pte & _PAGE_F_GIX) >> 12;
627
628 if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
629 ssize, local) == -1)
630 old_pte &= ~_PAGE_HPTEFLAGS;
631 }
632
633 if (likely(!(old_pte & _PAGE_HASHPTE))) {
634 unsigned long hash = hpt_hash(va, shift, ssize);
635 unsigned long hpte_group;
636
637 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
638
639repeat:
640 hpte_group = ((hash & htab_hash_mask) *
641 HPTES_PER_GROUP) & ~0x7UL;
642
643 /* clear HPTE slot informations in new PTE */
644#ifdef CONFIG_PPC_64K_PAGES
645 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
646#else
647 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
648#endif
649 /* Add in WIMG bits */
650 rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
651 _PAGE_COHERENT | _PAGE_GUARDED));
652
653 /* Insert into the hash table, primary slot */
654 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
655 mmu_psize, ssize);
656
657 /* Primary is full, try the secondary */
658 if (unlikely(slot == -1)) {
659 hpte_group = ((~hash & htab_hash_mask) *
660 HPTES_PER_GROUP) & ~0x7UL;
661 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
662 HPTE_V_SECONDARY,
663 mmu_psize, ssize);
664 if (slot == -1) {
665 if (mftb() & 0x1)
666 hpte_group = ((hash & htab_hash_mask) *
667 HPTES_PER_GROUP)&~0x7UL;
668
669 ppc_md.hpte_remove(hpte_group);
670 goto repeat;
671 }
672 }
673
674 if (unlikely(slot == -2))
675 panic("hash_huge_page: pte_insert failed\n");
676
677 new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
678 }
679
680 /*
681 * No need to use ldarx/stdcx here
682 */
683 *ptep = __pte(new_pte & ~_PAGE_BUSY);
684
685 err = 0;
686 509
687 out: 510 hugetlb_add_hstate(shift - PAGE_SHIFT);
688 return err;
689}
690 511
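The removed hash_huge_page() located its HPTE group exactly like the regular hash fault path: the primary group index is derived from the hash of the virtual address, the secondary from its complement, each scaled by HPTES_PER_GROUP. A standalone sketch of just that index arithmetic, with an invented hash value and hash-table mask:

#include <stdio.h>

#define HPTES_PER_GROUP 8

int main(void)
{
        unsigned long htab_hash_mask = (1UL << 20) - 1; /* example table size */
        unsigned long hash = 0xbeefcafeUL;              /* example hash value */

        unsigned long primary =
                ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
        unsigned long secondary =
                ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;

        printf("primary group at slot %#lx, secondary at %#lx\n",
               primary, secondary);
        return 0;
}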
691static void __init set_huge_psize(int psize) 512 return 0;
692{
693 /* Check that it is a page size supported by the hardware and
694 * that it fits within pagetable limits. */
695 if (mmu_psize_defs[psize].shift &&
696 mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
697 (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
698 mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
699 mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
700 /* Return if huge page size has already been setup or is the
701 * same as the base page size. */
702 if (mmu_huge_psizes[psize] ||
703 mmu_psize_defs[psize].shift == PAGE_SHIFT)
704 return;
705 if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL))
706 return;
707 hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
708
709 switch (mmu_psize_defs[psize].shift) {
710 case PAGE_SHIFT_64K:
711 /* We only allow 64k hpages with 4k base page,
712 * which was checked above, and always put them
713 * at the PMD */
714 hugepte_shift[psize] = PMD_SHIFT;
715 break;
716 case PAGE_SHIFT_16M:
717 /* 16M pages can be at two different levels
718 * of pagestables based on base page size */
719 if (PAGE_SHIFT == PAGE_SHIFT_64K)
720 hugepte_shift[psize] = PMD_SHIFT;
721 else /* 4k base page */
722 hugepte_shift[psize] = PUD_SHIFT;
723 break;
724 case PAGE_SHIFT_16G:
725 /* 16G pages are always at PGD level */
726 hugepte_shift[psize] = PGDIR_SHIFT;
727 break;
728 }
729 hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
730 } else
731 hugepte_shift[psize] = 0;
732} 513}
733 514
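add_huge_page_size() boils down to deriving the shift with __ffs() and rejecting any size that is not a power of two or falls outside what the slices and pagetables can describe. A userspace sketch of that validation; the page-shift and upper-limit values are illustrative stand-ins, not the kernel's constants:

#include <stdio.h>

#define EX_PAGE_SHIFT   12      /* 4K base pages (illustrative) */
#define EX_MAX_SHIFT    34      /* stand-in for SLICE_HIGH_SHIFT */

/* Returns the page shift for a usable size, 0 if the size is rejected. */
static int huge_size_ok(unsigned long long size)
{
        int shift;

        if (!size || (size & (size - 1)))       /* must be a power of two */
                return 0;
        shift = __builtin_ctzll(size);          /* plays the role of __ffs() */
        if (shift <= EX_PAGE_SHIFT || shift > EX_MAX_SHIFT)
                return 0;
        return shift;
}

int main(void)
{
        printf("16M -> shift %d\n", huge_size_ok(16ULL << 20));  /* 24 */
        printf("3M  -> shift %d\n", huge_size_ok(3ULL << 20));   /* 0: rejected */
        return 0;
}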
734static int __init hugepage_setup_sz(char *str) 515static int __init hugepage_setup_sz(char *str)
735{ 516{
736 unsigned long long size; 517 unsigned long long size;
737 int mmu_psize;
738 int shift;
739 518
740 size = memparse(str, &str); 519 size = memparse(str, &str);
741 520
742 shift = __ffs(size); 521 if (add_huge_page_size(size) != 0)
743 mmu_psize = shift_to_mmu_psize(shift);
744 if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
745 set_huge_psize(mmu_psize);
746 else
747 printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); 522 printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
748 523
749 return 1; 524 return 1;
@@ -752,41 +527,55 @@ __setup("hugepagesz=", hugepage_setup_sz);
752 527
753static int __init hugetlbpage_init(void) 528static int __init hugetlbpage_init(void)
754{ 529{
755 unsigned int psize; 530 int psize;
756 531
757 if (!cpu_has_feature(CPU_FTR_16M_PAGE)) 532 if (!cpu_has_feature(CPU_FTR_16M_PAGE))
758 return -ENODEV; 533 return -ENODEV;
759 534
760 /* Add supported huge page sizes. Need to change HUGE_MAX_HSTATE 535 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
761 * and adjust PTE_NONCACHE_NUM if the number of supported huge page 536 unsigned shift;
762 * sizes changes. 537 unsigned pdshift;
763 */
764 set_huge_psize(MMU_PAGE_16M);
765 set_huge_psize(MMU_PAGE_16G);
766 538
767 /* Temporarily disable support for 64K huge pages when 64K SPU local 539 if (!mmu_psize_defs[psize].shift)
768 * store support is enabled as the current implementation conflicts. 540 continue;
769 */
770#ifndef CONFIG_SPU_FS_64K_LS
771 set_huge_psize(MMU_PAGE_64K);
772#endif
773 541
774 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { 542 shift = mmu_psize_to_shift(psize);
775 if (mmu_huge_psizes[psize]) { 543
776 pgtable_cache[HUGE_PGTABLE_INDEX(psize)] = 544 if (add_huge_page_size(1ULL << shift) < 0)
777 kmem_cache_create( 545 continue;
778 HUGEPTE_CACHE_NAME(psize), 546
779 HUGEPTE_TABLE_SIZE(psize), 547 if (shift < PMD_SHIFT)
780 HUGEPTE_TABLE_SIZE(psize), 548 pdshift = PMD_SHIFT;
781 0, 549 else if (shift < PUD_SHIFT)
782 NULL); 550 pdshift = PUD_SHIFT;
783 if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)]) 551 else
784 panic("hugetlbpage_init(): could not create %s"\ 552 pdshift = PGDIR_SHIFT;
785 "\n", HUGEPTE_CACHE_NAME(psize)); 553
786 } 554 pgtable_cache_add(pdshift - shift, NULL);
555 if (!PGT_CACHE(pdshift - shift))
556 panic("hugetlbpage_init(): could not create "
557 "pgtable cache for %d bit pagesize\n", shift);
787 } 558 }
788 559
560 /* Set default large page size. Currently, we pick 16M or 1M
561 * depending on what is available
562 */
563 if (mmu_psize_defs[MMU_PAGE_16M].shift)
564 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
565 else if (mmu_psize_defs[MMU_PAGE_1M].shift)
566 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
567
789 return 0; 568 return 0;
790} 569}
791 570
792module_init(hugetlbpage_init); 571module_init(hugetlbpage_init);
572
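hugetlbpage_init() now places every supported huge page size at the lowest regular pagetable level large enough to hold it, and then needs a cache for hugepd tables of 2^(pdshift - shift) pointers. A sketch of that placement with illustrative level shifts (not the kernel's exact PMD/PUD/PGDIR values):

#include <stdio.h>

#define EX_PMD_SHIFT    24      /* illustrative, not the kernel's value */
#define EX_PUD_SHIFT    33
#define EX_PGDIR_SHIFT  37

int main(void)
{
        unsigned int shifts[] = { 16, 24, 34 };  /* e.g. 64K, 16M, 16G pages */

        for (int i = 0; i < 3; i++) {
                unsigned int shift = shifts[i], pdshift;

                if (shift < EX_PMD_SHIFT)
                        pdshift = EX_PMD_SHIFT;
                else if (shift < EX_PUD_SHIFT)
                        pdshift = EX_PUD_SHIFT;
                else
                        pdshift = EX_PGDIR_SHIFT;

                printf("page shift %2u -> level shift %u, %lu-entry hugepd\n",
                       shift, pdshift, 1UL << (pdshift - shift));
        }
        return 0;
}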
573void flush_dcache_icache_hugepage(struct page *page)
574{
575 int i;
576
577 BUG_ON(!PageCompound(page));
578
579 for (i = 0; i < (1UL << compound_order(page)); i++)
580 __flush_dcache_icache(page_address(page+i));
581}
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 9ddcfb4dc139..767333005eb4 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -31,6 +31,7 @@
31#include <linux/initrd.h> 31#include <linux/initrd.h>
32#include <linux/pagemap.h> 32#include <linux/pagemap.h>
33#include <linux/lmb.h> 33#include <linux/lmb.h>
34#include <linux/gfp.h>
34 35
35#include <asm/pgalloc.h> 36#include <asm/pgalloc.h>
36#include <asm/prom.h> 37#include <asm/prom.h>
@@ -47,7 +48,7 @@
47#include "mmu_decl.h" 48#include "mmu_decl.h"
48 49
49#if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL) 50#if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL)
50/* The ammount of lowmem must be within 0xF0000000 - KERNELBASE. */ 51/* The amount of lowmem must be within 0xF0000000 - KERNELBASE. */
51#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET)) 52#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET))
52#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL" 53#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL"
53#endif 54#endif
@@ -82,6 +83,11 @@ extern struct task_struct *current_set[NR_CPUS];
82int __map_without_bats; 83int __map_without_bats;
83int __map_without_ltlbs; 84int __map_without_ltlbs;
84 85
86/*
87 * This tells the system to allow ioremapping memory marked as reserved.
88 */
89int __allow_ioremap_reserved;
90
85/* max amount of low RAM to map in */ 91/* max amount of low RAM to map in */
86unsigned long __max_low_memory = MAX_LOW_MEM; 92unsigned long __max_low_memory = MAX_LOW_MEM;
87 93
@@ -131,9 +137,13 @@ void __init MMU_init(void)
131 MMU_setup(); 137 MMU_setup();
132 138
133 if (lmb.memory.cnt > 1) { 139 if (lmb.memory.cnt > 1) {
140#ifndef CONFIG_WII
134 lmb.memory.cnt = 1; 141 lmb.memory.cnt = 1;
135 lmb_analyze(); 142 lmb_analyze();
136 printk(KERN_WARNING "Only using first contiguous memory region"); 143 printk(KERN_WARNING "Only using first contiguous memory region");
144#else
145 wii_memory_fixups();
146#endif
137 } 147 }
138 148
139 total_lowmem = total_memory = lmb_end_of_DRAM() - memstart_addr; 149 total_lowmem = total_memory = lmb_end_of_DRAM() - memstart_addr;
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 335c578b9cc3..d7fa50b09b4a 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -41,6 +41,8 @@
41#include <linux/module.h> 41#include <linux/module.h>
42#include <linux/poison.h> 42#include <linux/poison.h>
43#include <linux/lmb.h> 43#include <linux/lmb.h>
44#include <linux/hugetlb.h>
45#include <linux/slab.h>
44 46
45#include <asm/pgalloc.h> 47#include <asm/pgalloc.h>
46#include <asm/page.h> 48#include <asm/page.h>
@@ -119,30 +121,63 @@ static void pmd_ctor(void *addr)
119 memset(addr, 0, PMD_TABLE_SIZE); 121 memset(addr, 0, PMD_TABLE_SIZE);
120} 122}
121 123
122static const unsigned int pgtable_cache_size[2] = { 124struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
123 PGD_TABLE_SIZE, PMD_TABLE_SIZE 125
124}; 126/*
125static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { 127 * Create a kmem_cache() for pagetables. This is not used for PTE
126#ifdef CONFIG_PPC_64K_PAGES 128 * pages - they're linked to struct page, come from the normal free
127 "pgd_cache", "pmd_cache", 129 * pages pool and have a different entry size (see real_pte_t) to
128#else 130 * everything else. Caches created by this function are used for all
129 "pgd_cache", "pud_pmd_cache", 131 * the higher level pagetables, and for hugepage pagetables.
130#endif /* CONFIG_PPC_64K_PAGES */ 132 */
131}; 133void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
132 134{
133#ifdef CONFIG_HUGETLB_PAGE 135 char *name;
134/* Hugepages need an extra cache per hugepagesize, initialized in 136 unsigned long table_size = sizeof(void *) << shift;
135 * hugetlbpage.c. We can't put into the tables above, because HPAGE_SHIFT 137 unsigned long align = table_size;
136 * is not compile time constant. */ 138
137struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT]; 139 /* When batching pgtable pointers for RCU freeing, we store
138#else 140 * the index size in the low bits. Table alignment must be
139struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; 141 * big enough to fit it.
140#endif 142 *
143 * Likewise, hugepage pagetable pointers contain a (different)
144 * shift value in the low bits. All tables must be aligned so
145 * as to leave enough 0 bits in the address to contain it. */
146 unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
147 HUGEPD_SHIFT_MASK + 1);
148 struct kmem_cache *new;
149
150 /* It would be nice if this was a BUILD_BUG_ON(), but at the
151 * moment, gcc doesn't seem to recognize is_power_of_2 as a
152 * constant expression, so so much for that. */
153 BUG_ON(!is_power_of_2(minalign));
154 BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
155
156 if (PGT_CACHE(shift))
157 return; /* Already have a cache of this size */
158
159 align = max_t(unsigned long, align, minalign);
160 name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
161 new = kmem_cache_create(name, table_size, align, 0, ctor);
162 PGT_CACHE(shift) = new;
163
164 pr_debug("Allocated pgtable cache for order %d\n", shift);
165}
166
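pgtable_cache_add() sizes each cache as sizeof(void *) << shift and raises the alignment to a minimum large enough that the low bits of every table address stay zero and can later carry a tag (see pgtable_free_tlb() further down). A worked example with invented constants standing in for MAX_PGTABLE_INDEX_SIZE and HUGEPD_SHIFT_MASK:

#include <stdio.h>

#define EX_MAX_INDEX_SIZE       0xf     /* stand-in for MAX_PGTABLE_INDEX_SIZE */
#define EX_HUGEPD_SHIFT_MASK    0x3f    /* stand-in for HUGEPD_SHIFT_MASK */

int main(void)
{
        unsigned long minalign = (EX_HUGEPD_SHIFT_MASK + 1 > EX_MAX_INDEX_SIZE + 1)
                                 ? EX_HUGEPD_SHIFT_MASK + 1 : EX_MAX_INDEX_SIZE + 1;

        for (unsigned int shift = 4; shift <= 12; shift += 4) {
                unsigned long size = sizeof(void *) << shift;
                unsigned long align = size > minalign ? size : minalign;

                printf("shift %2u: table %5lu bytes, aligned to %lu\n",
                       shift, size, align);
        }
        return 0;
}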
141 167
142void pgtable_cache_init(void) 168void pgtable_cache_init(void)
143{ 169{
144 pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); 170 pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
145 pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); 171 pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
172 if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
173 panic("Couldn't allocate pgtable caches");
174
175 /* In all current configs, when the PUD index exists it's the
176 * same size as either the pgd or pmd index. Verify that the
177 * initialization above has also created a PUD cache. This
178 * will need re-examination if we add new possibilities for
179 * the pagetable layout. */
180 BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
146} 181}
147 182
148#ifdef CONFIG_SPARSEMEM_VMEMMAP 183#ifdef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 59736317bf0e..0f594d774bf7 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -22,6 +22,7 @@
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/errno.h> 23#include <linux/errno.h>
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/gfp.h>
25#include <linux/types.h> 26#include <linux/types.h>
26#include <linux/mm.h> 27#include <linux/mm.h>
27#include <linux/stddef.h> 28#include <linux/stddef.h>
@@ -32,6 +33,7 @@
32#include <linux/pagemap.h> 33#include <linux/pagemap.h>
33#include <linux/suspend.h> 34#include <linux/suspend.h>
34#include <linux/lmb.h> 35#include <linux/lmb.h>
36#include <linux/hugetlb.h>
35 37
36#include <asm/pgalloc.h> 38#include <asm/pgalloc.h>
37#include <asm/prom.h> 39#include <asm/prom.h>
@@ -47,6 +49,7 @@
47#include <asm/sparsemem.h> 49#include <asm/sparsemem.h>
48#include <asm/vdso.h> 50#include <asm/vdso.h>
49#include <asm/fixmap.h> 51#include <asm/fixmap.h>
52#include <asm/swiotlb.h>
50 53
51#include "mmu_decl.h" 54#include "mmu_decl.h"
52 55
@@ -319,6 +322,11 @@ void __init mem_init(void)
319 struct page *page; 322 struct page *page;
320 unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize; 323 unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
321 324
325#ifdef CONFIG_SWIOTLB
326 if (ppc_swiotlb_enable)
327 swiotlb_init(1);
328#endif
329
322 num_physpages = lmb.memory.size >> PAGE_SHIFT; 330 num_physpages = lmb.memory.size >> PAGE_SHIFT;
323 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); 331 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
324 332
@@ -417,18 +425,26 @@ EXPORT_SYMBOL(flush_dcache_page);
417 425
418void flush_dcache_icache_page(struct page *page) 426void flush_dcache_icache_page(struct page *page)
419{ 427{
428#ifdef CONFIG_HUGETLB_PAGE
429 if (PageCompound(page)) {
430 flush_dcache_icache_hugepage(page);
431 return;
432 }
433#endif
420#ifdef CONFIG_BOOKE 434#ifdef CONFIG_BOOKE
421 void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE); 435 {
422 __flush_dcache_icache(start); 436 void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE);
423 kunmap_atomic(start, KM_PPC_SYNC_ICACHE); 437 __flush_dcache_icache(start);
438 kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
439 }
424#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64) 440#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64)
425 /* On 8xx there is no need to kmap since highmem is not supported */ 441 /* On 8xx there is no need to kmap since highmem is not supported */
426 __flush_dcache_icache(page_address(page)); 442 __flush_dcache_icache(page_address(page));
427#else 443#else
428 __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT); 444 __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
429#endif 445#endif
430
431} 446}
447
432void clear_user_page(void *page, unsigned long vaddr, struct page *pg) 448void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
433{ 449{
434 clear_page(page); 450 clear_page(page);
@@ -485,13 +501,13 @@ EXPORT_SYMBOL(flush_icache_user_range);
485 * This must always be called with the pte lock held. 501 * This must always be called with the pte lock held.
486 */ 502 */
487void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, 503void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
488 pte_t pte) 504 pte_t *ptep)
489{ 505{
490#ifdef CONFIG_PPC_STD_MMU 506#ifdef CONFIG_PPC_STD_MMU
491 unsigned long access = 0, trap; 507 unsigned long access = 0, trap;
492 508
493 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ 509 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
494 if (!pte_young(pte) || address >= TASK_SIZE) 510 if (!pte_young(*ptep) || address >= TASK_SIZE)
495 return; 511 return;
496 512
497 /* We try to figure out if we are coming from an instruction 513 /* We try to figure out if we are coming from an instruction
diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap_64.c
index 0d957a4c70fe..5a783d8e8e8e 100644
--- a/arch/powerpc/mm/mmap_64.c
+++ b/arch/powerpc/mm/mmap_64.c
@@ -47,7 +47,7 @@ static inline int mmap_is_legacy(void)
47 if (current->personality & ADDR_COMPAT_LAYOUT) 47 if (current->personality & ADDR_COMPAT_LAYOUT)
48 return 1; 48 return 1;
49 49
50 if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) 50 if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
51 return 1; 51 return 1;
52 52
53 return sysctl_legacy_va_layout; 53 return sysctl_legacy_va_layout;
@@ -77,7 +77,7 @@ static unsigned long mmap_rnd(void)
77 77
78static inline unsigned long mmap_base(void) 78static inline unsigned long mmap_base(void)
79{ 79{
80 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; 80 unsigned long gap = rlimit(RLIMIT_STACK);
81 81
82 if (gap < MIN_GAP) 82 if (gap < MIN_GAP)
83 gap = MIN_GAP; 83 gap = MIN_GAP;
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index dbeb86ac90cd..2535828aa84b 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -18,11 +18,13 @@
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/spinlock.h> 19#include <linux/spinlock.h>
20#include <linux/idr.h> 20#include <linux/idr.h>
21#include <linux/module.h>
22#include <linux/gfp.h>
21 23
22#include <asm/mmu_context.h> 24#include <asm/mmu_context.h>
23 25
24static DEFINE_SPINLOCK(mmu_context_lock); 26static DEFINE_SPINLOCK(mmu_context_lock);
25static DEFINE_IDR(mmu_context_idr); 27static DEFINE_IDA(mmu_context_ida);
26 28
27/* 29/*
28 * The proto-VSID space has 2^35 - 1 segments available for user mappings. 30 * The proto-VSID space has 2^35 - 1 segments available for user mappings.
@@ -32,17 +34,17 @@ static DEFINE_IDR(mmu_context_idr);
32#define NO_CONTEXT 0 34#define NO_CONTEXT 0
33#define MAX_CONTEXT ((1UL << 19) - 1) 35#define MAX_CONTEXT ((1UL << 19) - 1)
34 36
35int init_new_context(struct task_struct *tsk, struct mm_struct *mm) 37int __init_new_context(void)
36{ 38{
37 int index; 39 int index;
38 int err; 40 int err;
39 41
40again: 42again:
41 if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL)) 43 if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
42 return -ENOMEM; 44 return -ENOMEM;
43 45
44 spin_lock(&mmu_context_lock); 46 spin_lock(&mmu_context_lock);
45 err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index); 47 err = ida_get_new_above(&mmu_context_ida, 1, &index);
46 spin_unlock(&mmu_context_lock); 48 spin_unlock(&mmu_context_lock);
47 49
48 if (err == -EAGAIN) 50 if (err == -EAGAIN)
@@ -52,27 +54,46 @@ again:
52 54
53 if (index > MAX_CONTEXT) { 55 if (index > MAX_CONTEXT) {
54 spin_lock(&mmu_context_lock); 56 spin_lock(&mmu_context_lock);
55 idr_remove(&mmu_context_idr, index); 57 ida_remove(&mmu_context_ida, index);
56 spin_unlock(&mmu_context_lock); 58 spin_unlock(&mmu_context_lock);
57 return -ENOMEM; 59 return -ENOMEM;
58 } 60 }
59 61
62 return index;
63}
64EXPORT_SYMBOL_GPL(__init_new_context);
65
66int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
67{
68 int index;
69
70 index = __init_new_context();
71 if (index < 0)
72 return index;
73
60 /* The old code would re-promote on fork, we don't do that 74 /* The old code would re-promote on fork, we don't do that
61 * when using slices as it could cause problem promoting slices 75 * when using slices as it could cause problem promoting slices
62 * that have been forced down to 4K 76 * that have been forced down to 4K
63 */ 77 */
64 if (slice_mm_new_context(mm)) 78 if (slice_mm_new_context(mm))
65 slice_set_user_psize(mm, mmu_virtual_psize); 79 slice_set_user_psize(mm, mmu_virtual_psize);
80 subpage_prot_init_new_context(mm);
66 mm->context.id = index; 81 mm->context.id = index;
67 82
68 return 0; 83 return 0;
69} 84}
70 85
71void destroy_context(struct mm_struct *mm) 86void __destroy_context(int context_id)
72{ 87{
73 spin_lock(&mmu_context_lock); 88 spin_lock(&mmu_context_lock);
74 idr_remove(&mmu_context_idr, mm->context.id); 89 ida_remove(&mmu_context_ida, context_id);
75 spin_unlock(&mmu_context_lock); 90 spin_unlock(&mmu_context_lock);
91}
92EXPORT_SYMBOL_GPL(__destroy_context);
76 93
94void destroy_context(struct mm_struct *mm)
95{
96 __destroy_context(mm->context.id);
97 subpage_prot_free(mm);
77 mm->context.id = NO_CONTEXT; 98 mm->context.id = NO_CONTEXT;
78} 99}
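__init_new_context() is essentially "hand out the lowest free id greater than 0, fail past MAX_CONTEXT", now split out and exported for use outside this file. A toy userspace equivalent using a plain array instead of the ida allocator; every name here is illustrative:

#include <stdio.h>

#define TOY_MAX_CONTEXT 512

static unsigned char used[TOY_MAX_CONTEXT + 1];

static int toy_new_context(void)
{
        for (int id = 1; id <= TOY_MAX_CONTEXT; id++) {
                if (!used[id]) {
                        used[id] = 1;
                        return id;
                }
        }
        return -1;                      /* stands in for -ENOMEM */
}

static void toy_destroy_context(int id)
{
        used[id] = 0;
}

int main(void)
{
        int a = toy_new_context(), b = toy_new_context();

        printf("contexts %d and %d\n", a, b);           /* 1 and 2 */
        toy_destroy_context(a);
        printf("reused: %d\n", toy_new_context());      /* 1 again */
        return 0;
}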
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index be4f34c30a0b..1f2d9ff09895 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -47,6 +47,7 @@
47#include <linux/bootmem.h> 47#include <linux/bootmem.h>
48#include <linux/notifier.h> 48#include <linux/notifier.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/slab.h>
50 51
51#include <asm/mmu_context.h> 52#include <asm/mmu_context.h>
52#include <asm/tlbflush.h> 53#include <asm/tlbflush.h>
@@ -56,7 +57,7 @@ static unsigned int next_context, nr_free_contexts;
56static unsigned long *context_map; 57static unsigned long *context_map;
57static unsigned long *stale_map[NR_CPUS]; 58static unsigned long *stale_map[NR_CPUS];
58static struct mm_struct **context_mm; 59static struct mm_struct **context_mm;
59static DEFINE_SPINLOCK(context_lock); 60static DEFINE_RAW_SPINLOCK(context_lock);
60 61
61#define CTX_MAP_SIZE \ 62#define CTX_MAP_SIZE \
62 (sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1)) 63 (sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1))
@@ -121,9 +122,9 @@ static unsigned int steal_context_smp(unsigned int id)
121 /* This will happen if you have more CPUs than available contexts, 122 /* This will happen if you have more CPUs than available contexts,
122 * all we can do here is wait a bit and try again 123 * all we can do here is wait a bit and try again
123 */ 124 */
124 spin_unlock(&context_lock); 125 raw_spin_unlock(&context_lock);
125 cpu_relax(); 126 cpu_relax();
126 spin_lock(&context_lock); 127 raw_spin_lock(&context_lock);
127 128
128 /* This will cause the caller to try again */ 129 /* This will cause the caller to try again */
129 return MMU_NO_CONTEXT; 130 return MMU_NO_CONTEXT;
@@ -194,7 +195,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
194 unsigned long *map; 195 unsigned long *map;
195 196
196 /* No lockless fast path .. yet */ 197 /* No lockless fast path .. yet */
197 spin_lock(&context_lock); 198 raw_spin_lock(&context_lock);
198 199
199 pr_hard("[%d] activating context for mm @%p, active=%d, id=%d", 200 pr_hard("[%d] activating context for mm @%p, active=%d, id=%d",
200 cpu, next, next->context.active, next->context.id); 201 cpu, next, next->context.active, next->context.id);
@@ -278,7 +279,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
278 /* Flick the MMU and release lock */ 279 /* Flick the MMU and release lock */
279 pr_hardcont(" -> %d\n", id); 280 pr_hardcont(" -> %d\n", id);
280 set_context(id, next->pgd); 281 set_context(id, next->pgd);
281 spin_unlock(&context_lock); 282 raw_spin_unlock(&context_lock);
282} 283}
283 284
284/* 285/*
@@ -307,7 +308,7 @@ void destroy_context(struct mm_struct *mm)
307 308
308 WARN_ON(mm->context.active != 0); 309 WARN_ON(mm->context.active != 0);
309 310
310 spin_lock_irqsave(&context_lock, flags); 311 raw_spin_lock_irqsave(&context_lock, flags);
311 id = mm->context.id; 312 id = mm->context.id;
312 if (id != MMU_NO_CONTEXT) { 313 if (id != MMU_NO_CONTEXT) {
313 __clear_bit(id, context_map); 314 __clear_bit(id, context_map);
@@ -318,7 +319,7 @@ void destroy_context(struct mm_struct *mm)
318 context_mm[id] = NULL; 319 context_mm[id] = NULL;
319 nr_free_contexts++; 320 nr_free_contexts++;
320 } 321 }
321 spin_unlock_irqrestore(&context_lock, flags); 322 raw_spin_unlock_irqrestore(&context_lock, flags);
322} 323}
323 324
324#ifdef CONFIG_SMP 325#ifdef CONFIG_SMP
@@ -353,7 +354,7 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
353 read_lock(&tasklist_lock); 354 read_lock(&tasklist_lock);
354 for_each_process(p) { 355 for_each_process(p) {
355 if (p->mm) 356 if (p->mm)
356 cpu_mask_clear_cpu(cpu, mm_cpumask(p->mm)); 357 cpumask_clear_cpu(cpu, mm_cpumask(p->mm));
357 } 358 }
358 read_unlock(&tasklist_lock); 359 read_unlock(&tasklist_lock);
359 break; 360 break;
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index d2e5321d5ea6..d49a77503e19 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -98,23 +98,13 @@ extern void _tlbia(void);
98 98
99#ifdef CONFIG_PPC32 99#ifdef CONFIG_PPC32
100 100
101struct tlbcam {
102 u32 MAS0;
103 u32 MAS1;
104 u32 MAS2;
105 u32 MAS3;
106 u32 MAS7;
107};
108
109extern void mapin_ram(void); 101extern void mapin_ram(void);
110extern int map_page(unsigned long va, phys_addr_t pa, int flags); 102extern int map_page(unsigned long va, phys_addr_t pa, int flags);
111extern void setbat(int index, unsigned long virt, phys_addr_t phys, 103extern void setbat(int index, unsigned long virt, phys_addr_t phys,
112 unsigned int size, int flags); 104 unsigned int size, int flags);
113extern void settlbcam(int index, unsigned long virt, phys_addr_t phys,
114 unsigned int size, int flags, unsigned int pid);
115extern void invalidate_tlbcam_entry(int index);
116 105
117extern int __map_without_bats; 106extern int __map_without_bats;
107extern int __allow_ioremap_reserved;
118extern unsigned long ioremap_base; 108extern unsigned long ioremap_base;
119extern unsigned int rtas_data, rtas_size; 109extern unsigned int rtas_data, rtas_size;
120 110
@@ -136,24 +126,32 @@ extern phys_addr_t total_lowmem;
136extern phys_addr_t memstart_addr; 126extern phys_addr_t memstart_addr;
137extern phys_addr_t lowmem_end_addr; 127extern phys_addr_t lowmem_end_addr;
138 128
129#ifdef CONFIG_WII
130extern unsigned long wii_hole_start;
131extern unsigned long wii_hole_size;
132
133extern unsigned long wii_mmu_mapin_mem2(unsigned long top);
134extern void wii_memory_fixups(void);
135#endif
136
139/* ...and now those things that may be slightly different between processor 137/* ...and now those things that may be slightly different between processor
140 * architectures. -- Dan 138 * architectures. -- Dan
141 */ 139 */
142#if defined(CONFIG_8xx) 140#if defined(CONFIG_8xx)
143#define MMU_init_hw() do { } while(0) 141#define MMU_init_hw() do { } while(0)
144#define mmu_mapin_ram() (0UL) 142#define mmu_mapin_ram(top) (0UL)
145 143
146#elif defined(CONFIG_4xx) 144#elif defined(CONFIG_4xx)
147extern void MMU_init_hw(void); 145extern void MMU_init_hw(void);
148extern unsigned long mmu_mapin_ram(void); 146extern unsigned long mmu_mapin_ram(unsigned long top);
149 147
150#elif defined(CONFIG_FSL_BOOKE) 148#elif defined(CONFIG_FSL_BOOKE)
151extern void MMU_init_hw(void); 149extern void MMU_init_hw(void);
152extern unsigned long mmu_mapin_ram(void); 150extern unsigned long mmu_mapin_ram(unsigned long top);
153extern void adjust_total_lowmem(void); 151extern void adjust_total_lowmem(void);
154 152
155#elif defined(CONFIG_PPC32) 153#elif defined(CONFIG_PPC32)
156/* anything 32-bit except 4xx or 8xx */ 154/* anything 32-bit except 4xx or 8xx */
157extern void MMU_init_hw(void); 155extern void MMU_init_hw(void);
158extern unsigned long mmu_mapin_ram(void); 156extern unsigned long mmu_mapin_ram(unsigned long top);
159#endif 157#endif
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b037d95eeadc..eaa7633515b7 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -242,10 +242,11 @@ EXPORT_SYMBOL_GPL(of_node_to_nid);
242 */ 242 */
243static int __init find_min_common_depth(void) 243static int __init find_min_common_depth(void)
244{ 244{
245 int depth; 245 int depth, index;
246 const unsigned int *ref_points; 246 const unsigned int *ref_points;
247 struct device_node *rtas_root; 247 struct device_node *rtas_root;
248 unsigned int len; 248 unsigned int len;
249 struct device_node *options;
249 250
250 rtas_root = of_find_node_by_path("/rtas"); 251 rtas_root = of_find_node_by_path("/rtas");
251 252
@@ -258,11 +259,23 @@ static int __init find_min_common_depth(void)
258 * configuration (should be all 0's) and the second is for a normal 259 * configuration (should be all 0's) and the second is for a normal
259 * NUMA configuration. 260 * NUMA configuration.
260 */ 261 */
262 index = 1;
261 ref_points = of_get_property(rtas_root, 263 ref_points = of_get_property(rtas_root,
262 "ibm,associativity-reference-points", &len); 264 "ibm,associativity-reference-points", &len);
263 265
266 /*
267 * For type 1 affinity information we want the first field
268 */
269 options = of_find_node_by_path("/options");
270 if (options) {
271 const char *str;
272 str = of_get_property(options, "ibm,associativity-form", NULL);
273 if (str && !strcmp(str, "1"))
274 index = 0;
275 }
276
264 if ((len >= 2 * sizeof(unsigned int)) && ref_points) { 277 if ((len >= 2 * sizeof(unsigned int)) && ref_points) {
265 depth = ref_points[1]; 278 depth = ref_points[index];
266 } else { 279 } else {
267 dbg("NUMA: ibm,associativity-reference-points not found.\n"); 280 dbg("NUMA: ibm,associativity-reference-points not found.\n");
268 depth = -1; 281 depth = -1;
@@ -451,7 +464,7 @@ static int __cpuinit numa_setup_cpu(unsigned long lcpu)
451 nid = of_node_to_nid_single(cpu); 464 nid = of_node_to_nid_single(cpu);
452 465
453 if (nid < 0 || !node_online(nid)) 466 if (nid < 0 || !node_online(nid))
454 nid = any_online_node(NODE_MASK_ALL); 467 nid = first_online_node;
455out: 468out:
456 map_cpu_to_node(lcpu, nid); 469 map_cpu_to_node(lcpu, nid);
457 470
@@ -1114,7 +1127,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
1114 int nid, found = 0; 1127 int nid, found = 0;
1115 1128
1116 if (!numa_enabled || (min_common_depth < 0)) 1129 if (!numa_enabled || (min_common_depth < 0))
1117 return any_online_node(NODE_MASK_ALL); 1130 return first_online_node;
1118 1131
1119 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 1132 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1120 if (memory) { 1133 if (memory) {
@@ -1125,7 +1138,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
1125 } 1138 }
1126 1139
1127 if (nid < 0 || !node_online(nid)) 1140 if (nid < 0 || !node_online(nid))
1128 nid = any_online_node(NODE_MASK_ALL); 1141 nid = first_online_node;
1129 1142
1130 if (NODE_DATA(nid)->node_spanned_pages) 1143 if (NODE_DATA(nid)->node_spanned_pages)
1131 return nid; 1144 return nid;
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 53040931de32..ebc2f38eb381 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/gfp.h>
25#include <linux/mm.h> 26#include <linux/mm.h>
26#include <linux/init.h> 27#include <linux/init.h>
27#include <linux/percpu.h> 28#include <linux/percpu.h>
@@ -49,12 +50,12 @@ struct pte_freelist_batch
49{ 50{
50 struct rcu_head rcu; 51 struct rcu_head rcu;
51 unsigned int index; 52 unsigned int index;
52 pgtable_free_t tables[0]; 53 unsigned long tables[0];
53}; 54};
54 55
55#define PTE_FREELIST_SIZE \ 56#define PTE_FREELIST_SIZE \
56 ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \ 57 ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
57 / sizeof(pgtable_free_t)) 58 / sizeof(unsigned long))
58 59
59static void pte_free_smp_sync(void *arg) 60static void pte_free_smp_sync(void *arg)
60{ 61{
@@ -64,13 +65,13 @@ static void pte_free_smp_sync(void *arg)
64/* This is only called when we are critically out of memory 65/* This is only called when we are critically out of memory
65 * (and fail to get a page in pte_free_tlb). 66 * (and fail to get a page in pte_free_tlb).
66 */ 67 */
67static void pgtable_free_now(pgtable_free_t pgf) 68static void pgtable_free_now(void *table, unsigned shift)
68{ 69{
69 pte_freelist_forced_free++; 70 pte_freelist_forced_free++;
70 71
71 smp_call_function(pte_free_smp_sync, NULL, 1); 72 smp_call_function(pte_free_smp_sync, NULL, 1);
72 73
73 pgtable_free(pgf); 74 pgtable_free(table, shift);
74} 75}
75 76
76static void pte_free_rcu_callback(struct rcu_head *head) 77static void pte_free_rcu_callback(struct rcu_head *head)
@@ -79,8 +80,12 @@ static void pte_free_rcu_callback(struct rcu_head *head)
79 container_of(head, struct pte_freelist_batch, rcu); 80 container_of(head, struct pte_freelist_batch, rcu);
80 unsigned int i; 81 unsigned int i;
81 82
82 for (i = 0; i < batch->index; i++) 83 for (i = 0; i < batch->index; i++) {
83 pgtable_free(batch->tables[i]); 84 void *table = (void *)(batch->tables[i] & ~MAX_PGTABLE_INDEX_SIZE);
85 unsigned shift = batch->tables[i] & MAX_PGTABLE_INDEX_SIZE;
86
87 pgtable_free(table, shift);
88 }
84 89
85 free_page((unsigned long)batch); 90 free_page((unsigned long)batch);
86} 91}
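The freelist entries above pack a table's index size into the low bits of its address, which the alignment enforced by pgtable_cache_add() guarantees are free. A self-contained sketch of that encode/decode, using an aligned heap buffer in place of a real pagetable and 0xf standing in for MAX_PGTABLE_INDEX_SIZE:

#include <stdio.h>
#include <stdlib.h>

#define EX_INDEX_MASK 0xfUL     /* stand-in for MAX_PGTABLE_INDEX_SIZE */

int main(void)
{
        unsigned long shift = 9;                        /* 512-byte "table" */
        void *table = aligned_alloc(64, 1UL << shift);  /* alignment > mask */

        if (!table)
                return 1;

        unsigned long pgf = (unsigned long)table | shift;       /* encode */

        void *dec_table = (void *)(pgf & ~EX_INDEX_MASK);       /* decode */
        unsigned long dec_shift = pgf & EX_INDEX_MASK;

        printf("table %p shift %lu -> %p %lu\n",
               table, shift, dec_table, dec_shift);
        free(table);
        return 0;
}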
@@ -91,25 +96,28 @@ static void pte_free_submit(struct pte_freelist_batch *batch)
91 call_rcu(&batch->rcu, pte_free_rcu_callback); 96 call_rcu(&batch->rcu, pte_free_rcu_callback);
92} 97}
93 98
94void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) 99void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
95{ 100{
96 /* This is safe since tlb_gather_mmu has disabled preemption */ 101 /* This is safe since tlb_gather_mmu has disabled preemption */
97 struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); 102 struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
103 unsigned long pgf;
98 104
99 if (atomic_read(&tlb->mm->mm_users) < 2 || 105 if (atomic_read(&tlb->mm->mm_users) < 2 ||
100 cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){ 106 cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){
101 pgtable_free(pgf); 107 pgtable_free(table, shift);
102 return; 108 return;
103 } 109 }
104 110
105 if (*batchp == NULL) { 111 if (*batchp == NULL) {
106 *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); 112 *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
107 if (*batchp == NULL) { 113 if (*batchp == NULL) {
108 pgtable_free_now(pgf); 114 pgtable_free_now(table, shift);
109 return; 115 return;
110 } 116 }
111 (*batchp)->index = 0; 117 (*batchp)->index = 0;
112 } 118 }
119 BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
120 pgf = (unsigned long)table | shift;
113 (*batchp)->tables[(*batchp)->index++] = pgf; 121 (*batchp)->tables[(*batchp)->index++] = pgf;
114 if ((*batchp)->index == PTE_FREELIST_SIZE) { 122 if ((*batchp)->index == PTE_FREELIST_SIZE) {
115 pte_free_submit(*batchp); 123 pte_free_submit(*batchp);
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index cb96cb2e17cc..b9243e7557ae 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -26,6 +26,8 @@
26#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/lmb.h>
30#include <linux/slab.h>
29 31
30#include <asm/pgtable.h> 32#include <asm/pgtable.h>
31#include <asm/pgalloc.h> 33#include <asm/pgalloc.h>
@@ -191,7 +193,8 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags,
191 * Don't allow anybody to remap normal RAM that we're using. 193 * Don't allow anybody to remap normal RAM that we're using.
192 * mem_init() sets high_memory so only do the check after that. 194 * mem_init() sets high_memory so only do the check after that.
193 */ 195 */
194 if (mem_init_done && (p < virt_to_phys(high_memory))) { 196 if (mem_init_done && (p < virt_to_phys(high_memory)) &&
197 !(__allow_ioremap_reserved && lmb_is_region_reserved(p, size))) {
195 printk("__ioremap(): phys addr 0x%llx is RAM lr %p\n", 198 printk("__ioremap(): phys addr 0x%llx is RAM lr %p\n",
196 (unsigned long long)p, __builtin_return_address(0)); 199 (unsigned long long)p, __builtin_return_address(0));
197 return NULL; 200 return NULL;
@@ -283,18 +286,18 @@ int map_page(unsigned long va, phys_addr_t pa, int flags)
283} 286}
284 287
285/* 288/*
286 * Map in a big chunk of physical memory starting at PAGE_OFFSET. 289 * Map in a chunk of physical memory starting at start.
287 */ 290 */
288void __init mapin_ram(void) 291void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
289{ 292{
290 unsigned long v, s, f; 293 unsigned long v, s, f;
291 phys_addr_t p; 294 phys_addr_t p;
292 int ktext; 295 int ktext;
293 296
294 s = mmu_mapin_ram(); 297 s = offset;
295 v = PAGE_OFFSET + s; 298 v = PAGE_OFFSET + s;
296 p = memstart_addr + s; 299 p = memstart_addr + s;
297 for (; s < total_lowmem; s += PAGE_SIZE) { 300 for (; s < top; s += PAGE_SIZE) {
298 ktext = ((char *) v >= _stext && (char *) v < etext); 301 ktext = ((char *) v >= _stext && (char *) v < etext);
299 f = ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL; 302 f = ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL;
300 map_page(v, p, f); 303 map_page(v, p, f);
@@ -307,6 +310,30 @@ void __init mapin_ram(void)
307 } 310 }
308} 311}
309 312
313void __init mapin_ram(void)
314{
315 unsigned long s, top;
316
317#ifndef CONFIG_WII
318 top = total_lowmem;
319 s = mmu_mapin_ram(top);
320 __mapin_ram_chunk(s, top);
321#else
322 if (!wii_hole_size) {
323 s = mmu_mapin_ram(total_lowmem);
324 __mapin_ram_chunk(s, total_lowmem);
325 } else {
326 top = wii_hole_start;
327 s = mmu_mapin_ram(top);
328 __mapin_ram_chunk(s, top);
329
330 top = lmb_end_of_DRAM();
331 s = wii_mmu_mapin_mem2(top);
332 __mapin_ram_chunk(s, top);
333 }
334#endif
335}
336
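The new mapin_ram() copes with the Wii's memory hole by mapping two separate chunks, one below the hole and one from the end of the hole up to the end of RAM. A simplified standalone sketch of that split; map_chunk() is an invented stub that only prints what would be mapped, and the BAT-mapped prefix returned by mmu_mapin_ram() is ignored:

#include <stdio.h>

static void map_chunk(unsigned long start, unsigned long top)
{
        printf("map [%#lx, %#lx)\n", start, top);
}

static void map_ram(unsigned long ram_top,
                    unsigned long hole_start, unsigned long hole_size)
{
        if (!hole_size) {
                map_chunk(0, ram_top);
        } else {
                map_chunk(0, hole_start);
                map_chunk(hole_start + hole_size, ram_top);
        }
}

int main(void)
{
        map_ram(0x08000000, 0x01800000, 0x00800000);    /* 128M, 8M hole */
        return 0;
}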
310/* Scan the real Linux page tables and return a PTE pointer for 337/* Scan the real Linux page tables and return a PTE pointer for
311 * a virtual address in a context. 338 * a virtual address in a context.
312 * Returns true (1) if PTE was found, zero otherwise. The pointer to 339 * Returns true (1) if PTE was found, zero otherwise. The pointer to
@@ -356,7 +383,7 @@ static int __change_page_attr(struct page *page, pgprot_t prot)
356 return 0; 383 return 0;
357 if (!get_pteptr(&init_mm, address, &kpte, &kpmd)) 384 if (!get_pteptr(&init_mm, address, &kpte, &kpmd))
358 return -EINVAL; 385 return -EINVAL;
359 set_pte_at(&init_mm, address, kpte, mk_pte(page, prot)); 386 __set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0);
360 wmb(); 387 wmb();
361#ifdef CONFIG_PPC_STD_MMU 388#ifdef CONFIG_PPC_STD_MMU
362 flush_hash_pages(0, address, pmd_val(*kpmd), 1); 389 flush_hash_pages(0, address, pmd_val(*kpmd), 1);
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 853d5565eed5..d95679a5fb29 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -35,6 +35,7 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/bootmem.h> 36#include <linux/bootmem.h>
37#include <linux/lmb.h> 37#include <linux/lmb.h>
38#include <linux/slab.h>
38 39
39#include <asm/pgalloc.h> 40#include <asm/pgalloc.h>
40#include <asm/page.h> 41#include <asm/page.h>
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 2d2a87e10154..f11c2cdcb0fe 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -72,7 +72,7 @@ unsigned long p_mapped_by_bats(phys_addr_t pa)
72 return 0; 72 return 0;
73} 73}
74 74
75unsigned long __init mmu_mapin_ram(void) 75unsigned long __init mmu_mapin_ram(unsigned long top)
76{ 76{
77 unsigned long tot, bl, done; 77 unsigned long tot, bl, done;
78 unsigned long max_size = (256<<20); 78 unsigned long max_size = (256<<20);
@@ -86,7 +86,7 @@ unsigned long __init mmu_mapin_ram(void)
86 86
87 /* Make sure we don't map a block larger than the 87 /* Make sure we don't map a block larger than the
88 smallest alignment of the physical address. */ 88 smallest alignment of the physical address. */
89 tot = total_lowmem; 89 tot = top;
90 for (bl = 128<<10; bl < max_size; bl <<= 1) { 90 for (bl = 128<<10; bl < max_size; bl <<= 1) {
91 if (bl * 2 > tot) 91 if (bl * 2 > tot)
92 break; 92 break;
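The BAT sizing loop above starts at 128K and keeps doubling until it reaches the 256M hardware maximum or until doubling again would overshoot the memory to be covered (the physical-alignment check mentioned in the comment is omitted here). A standalone copy of the loop with a couple of sample sizes:

#include <stdio.h>

static unsigned long pick_bat_size(unsigned long tot)
{
        unsigned long bl, max_size = 256UL << 20;       /* 256M BAT maximum */

        for (bl = 128 << 10; bl < max_size; bl <<= 1) {
                if (bl * 2 > tot)
                        break;
        }
        return bl;
}

int main(void)
{
        printf("48M  lowmem -> %luM BAT\n", pick_bat_size(48UL << 20) >> 20);
        printf("512M lowmem -> %luM BAT\n", pick_bat_size(512UL << 20) >> 20);
        return 0;
}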
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 4cafc0c33d0a..e4f8f1fc81a5 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -10,7 +10,6 @@
10#include <linux/errno.h> 10#include <linux/errno.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/slab.h>
14#include <linux/types.h> 13#include <linux/types.h>
15#include <linux/mm.h> 14#include <linux/mm.h>
16#include <linux/hugetlb.h> 15#include <linux/hugetlb.h>
@@ -24,9 +23,9 @@
24 * Also makes sure that the subpage_prot_table structure is 23 * Also makes sure that the subpage_prot_table structure is
25 * reinitialized for the next user. 24 * reinitialized for the next user.
26 */ 25 */
27void subpage_prot_free(pgd_t *pgd) 26void subpage_prot_free(struct mm_struct *mm)
28{ 27{
29 struct subpage_prot_table *spt = pgd_subpage_prot(pgd); 28 struct subpage_prot_table *spt = &mm->context.spt;
30 unsigned long i, j, addr; 29 unsigned long i, j, addr;
31 u32 **p; 30 u32 **p;
32 31
@@ -51,6 +50,13 @@ void subpage_prot_free(pgd_t *pgd)
51 spt->maxaddr = 0; 50 spt->maxaddr = 0;
52} 51}
53 52
53void subpage_prot_init_new_context(struct mm_struct *mm)
54{
55 struct subpage_prot_table *spt = &mm->context.spt;
56
57 memset(spt, 0, sizeof(*spt));
58}
59
54static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, 60static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
55 int npages) 61 int npages)
56{ 62{
@@ -87,7 +93,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
87static void subpage_prot_clear(unsigned long addr, unsigned long len) 93static void subpage_prot_clear(unsigned long addr, unsigned long len)
88{ 94{
89 struct mm_struct *mm = current->mm; 95 struct mm_struct *mm = current->mm;
90 struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd); 96 struct subpage_prot_table *spt = &mm->context.spt;
91 u32 **spm, *spp; 97 u32 **spm, *spp;
92 int i, nw; 98 int i, nw;
93 unsigned long next, limit; 99 unsigned long next, limit;
@@ -136,7 +142,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
136long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) 142long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
137{ 143{
138 struct mm_struct *mm = current->mm; 144 struct mm_struct *mm = current->mm;
139 struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd); 145 struct subpage_prot_table *spt = &mm->context.spt;
140 u32 **spm, *spp; 146 u32 **spm, *spp;
141 int i, nw; 147 int i, nw;
142 unsigned long next, limit; 148 unsigned long next, limit;
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 2b2f35f6985e..1ec06576f619 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -53,11 +53,6 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
53 53
54 i = batch->index; 54 i = batch->index;
55 55
56 /* We mask the address for the base page size. Huge pages will
57 * have applied their own masking already
58 */
59 addr &= PAGE_MASK;
60
61 /* Get page size (maybe move back to caller). 56 /* Get page size (maybe move back to caller).
62 * 57 *
63 * NOTE: when using special 64K mappings in 4K environment like 58 * NOTE: when using special 64K mappings in 4K environment like
@@ -68,12 +63,21 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
68 if (huge) { 63 if (huge) {
69#ifdef CONFIG_HUGETLB_PAGE 64#ifdef CONFIG_HUGETLB_PAGE
70 psize = get_slice_psize(mm, addr); 65 psize = get_slice_psize(mm, addr);
66 /* Mask the address for the correct page size */
67 addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
71#else 68#else
72 BUG(); 69 BUG();
73 psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */ 70 psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
74#endif 71#endif
75 } else 72 } else {
76 psize = pte_pagesize_index(mm, addr, pte); 73 psize = pte_pagesize_index(mm, addr, pte);
74 /* Mask the address for the standard page size. If we
75 * have a 64k page kernel, but the hardware does not
76 * support 64k pages, this might be different from the
77 * hardware page size encoded in the slice table. */
78 addr &= PAGE_MASK;
79 }
80
77 81
78 /* Build full vaddr */ 82 /* Build full vaddr */
79 if (!is_kernel_addr(addr)) { 83 if (!is_kernel_addr(addr)) {
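The hunk above stops masking huge-page addresses with the fixed PAGE_MASK and instead builds the mask from the page size actually backing the mapping. The arithmetic is simply addr & ~((1UL << shift) - 1); a few example shifts:

#include <stdio.h>

int main(void)
{
        unsigned long addr = 0x10a3c123;
        unsigned int shifts[] = { 12, 16, 24 };         /* 4K, 64K, 16M */

        for (int i = 0; i < 3; i++)
                printf("shift %2u: %#lx -> %#lx\n", shifts[i], addr,
                       addr & ~((1UL << shifts[i]) - 1));
        return 0;
}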
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index f288279e679d..8b04c54e596f 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -1,5 +1,5 @@
1/* 1/*
2 * Low leve TLB miss handlers for Book3E 2 * Low level TLB miss handlers for Book3E
3 * 3 *
4 * Copyright (C) 2008-2009 4 * Copyright (C) 2008-2009
5 * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. 5 * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 2fbc680c2c71..e81d5d67f834 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -150,7 +150,7 @@ EXPORT_SYMBOL(local_flush_tlb_page);
150 */ 150 */
151#ifdef CONFIG_SMP 151#ifdef CONFIG_SMP
152 152
153static DEFINE_SPINLOCK(tlbivax_lock); 153static DEFINE_RAW_SPINLOCK(tlbivax_lock);
154 154
155static int mm_is_core_local(struct mm_struct *mm) 155static int mm_is_core_local(struct mm_struct *mm)
156{ 156{
@@ -232,10 +232,10 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
232 if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { 232 if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) {
233 int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); 233 int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL);
234 if (lock) 234 if (lock)
235 spin_lock(&tlbivax_lock); 235 raw_spin_lock(&tlbivax_lock);
236 _tlbivax_bcast(vmaddr, pid, tsize, ind); 236 _tlbivax_bcast(vmaddr, pid, tsize, ind);
237 if (lock) 237 if (lock)
238 spin_unlock(&tlbivax_lock); 238 raw_spin_unlock(&tlbivax_lock);
239 goto bail; 239 goto bail;
240 } else { 240 } else {
241 struct tlb_flush_param p = { 241 struct tlb_flush_param p = {