aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/40x_mmu.c4
-rw-r--r--arch/powerpc/mm/Makefile1
-rw-r--r--arch/powerpc/mm/fsl_booke_mmu.c2
-rw-r--r--arch/powerpc/mm/hash_low_32.S4
-rw-r--r--arch/powerpc/mm/hugetlbpage.c8
-rw-r--r--arch/powerpc/mm/init_32.c2
-rw-r--r--arch/powerpc/mm/init_64.c55
-rw-r--r--arch/powerpc/mm/mmu_context_nohash.c96
-rw-r--r--arch/powerpc/mm/mmu_decl.h37
-rw-r--r--arch/powerpc/mm/pgtable.c179
-rw-r--r--arch/powerpc/mm/pgtable_32.c2
-rw-r--r--arch/powerpc/mm/pgtable_64.c59
-rw-r--r--arch/powerpc/mm/slb.c83
-rw-r--r--arch/powerpc/mm/stab.c13
-rw-r--r--arch/powerpc/mm/tlb_hash32.c3
-rw-r--r--arch/powerpc/mm/tlb_hash64.c20
-rw-r--r--arch/powerpc/mm/tlb_low_64e.S770
-rw-r--r--arch/powerpc/mm/tlb_nohash.c268
-rw-r--r--arch/powerpc/mm/tlb_nohash_low.S87
19 files changed, 1523 insertions, 170 deletions
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c
index 29954dc28942..f5e7b9ce63dd 100644
--- a/arch/powerpc/mm/40x_mmu.c
+++ b/arch/powerpc/mm/40x_mmu.c
@@ -105,7 +105,7 @@ unsigned long __init mmu_mapin_ram(void)
105 105
106 while (s >= LARGE_PAGE_SIZE_16M) { 106 while (s >= LARGE_PAGE_SIZE_16M) {
107 pmd_t *pmdp; 107 pmd_t *pmdp;
108 unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE; 108 unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE;
109 109
110 pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); 110 pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
111 pmd_val(*pmdp++) = val; 111 pmd_val(*pmdp++) = val;
@@ -120,7 +120,7 @@ unsigned long __init mmu_mapin_ram(void)
120 120
121 while (s >= LARGE_PAGE_SIZE_4M) { 121 while (s >= LARGE_PAGE_SIZE_4M) {
122 pmd_t *pmdp; 122 pmd_t *pmdp;
123 unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE; 123 unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE;
124 124
125 pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); 125 pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
126 pmd_val(*pmdp) = val; 126 pmd_val(*pmdp) = val;
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 3e68363405b7..6fb8fc8d2fea 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -13,6 +13,7 @@ obj-y := fault.o mem.o pgtable.o gup.o \
13 pgtable_$(CONFIG_WORD_SIZE).o 13 pgtable_$(CONFIG_WORD_SIZE).o
14obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ 14obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
15 tlb_nohash_low.o 15 tlb_nohash_low.o
16obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o
16obj-$(CONFIG_PPC64) += mmap_64.o 17obj-$(CONFIG_PPC64) += mmap_64.o
17hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o 18hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o
18obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \ 19obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index bb3d65998e6b..dc93e95b256e 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -161,7 +161,7 @@ unsigned long __init mmu_mapin_ram(void)
161 unsigned long virt = PAGE_OFFSET; 161 unsigned long virt = PAGE_OFFSET;
162 phys_addr_t phys = memstart_addr; 162 phys_addr_t phys = memstart_addr;
163 163
164 while (cam[tlbcam_index] && tlbcam_index < ARRAY_SIZE(cam)) { 164 while (tlbcam_index < ARRAY_SIZE(cam) && cam[tlbcam_index]) {
165 settlbcam(tlbcam_index, virt, phys, cam[tlbcam_index], PAGE_KERNEL_X, 0); 165 settlbcam(tlbcam_index, virt, phys, cam[tlbcam_index], PAGE_KERNEL_X, 0);
166 virt += cam[tlbcam_index]; 166 virt += cam[tlbcam_index];
167 phys += cam[tlbcam_index]; 167 phys += cam[tlbcam_index];
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 14af8cedab70..b13d58932bf6 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -40,7 +40,7 @@ mmu_hash_lock:
40 * The address is in r4, and r3 contains an access flag: 40 * The address is in r4, and r3 contains an access flag:
41 * _PAGE_RW (0x400) if a write. 41 * _PAGE_RW (0x400) if a write.
42 * r9 contains the SRR1 value, from which we use the MSR_PR bit. 42 * r9 contains the SRR1 value, from which we use the MSR_PR bit.
43 * SPRG3 contains the physical address of the current task's thread. 43 * SPRG_THREAD contains the physical address of the current task's thread.
44 * 44 *
45 * Returns to the caller if the access is illegal or there is no 45 * Returns to the caller if the access is illegal or there is no
46 * mapping for the address. Otherwise it places an appropriate PTE 46 * mapping for the address. Otherwise it places an appropriate PTE
@@ -68,7 +68,7 @@ _GLOBAL(hash_page)
68 /* Get PTE (linux-style) and check access */ 68 /* Get PTE (linux-style) and check access */
69 lis r0,KERNELBASE@h /* check if kernel address */ 69 lis r0,KERNELBASE@h /* check if kernel address */
70 cmplw 0,r4,r0 70 cmplw 0,r4,r0
71 mfspr r8,SPRN_SPRG3 /* current task's THREAD (phys) */ 71 mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */
72 ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ 72 ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
73 lwz r5,PGDIR(r8) /* virt page-table root */ 73 lwz r5,PGDIR(r8) /* virt page-table root */
74 blt+ 112f /* assume user more likely */ 74 blt+ 112f /* assume user more likely */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index c46ef2ffa3d9..90df6ffe3a43 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -57,8 +57,10 @@ unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
57#define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize]) 57#define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize])
58 58
59static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = { 59static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
60 "unused_4K", "hugepte_cache_64K", "unused_64K_AP", 60 [MMU_PAGE_64K] = "hugepte_cache_64K",
61 "hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G" 61 [MMU_PAGE_1M] = "hugepte_cache_1M",
62 [MMU_PAGE_16M] = "hugepte_cache_16M",
63 [MMU_PAGE_16G] = "hugepte_cache_16G",
62}; 64};
63 65
64/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() 66/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
@@ -700,6 +702,8 @@ static void __init set_huge_psize(int psize)
700 if (mmu_huge_psizes[psize] || 702 if (mmu_huge_psizes[psize] ||
701 mmu_psize_defs[psize].shift == PAGE_SHIFT) 703 mmu_psize_defs[psize].shift == PAGE_SHIFT)
702 return; 704 return;
705 if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL))
706 return;
703 hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); 707 hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
704 708
705 switch (mmu_psize_defs[psize].shift) { 709 switch (mmu_psize_defs[psize].shift) {
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 3de6a0d93824..3ef5084b90ca 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -54,8 +54,6 @@
54#endif 54#endif
55#define MAX_LOW_MEM CONFIG_LOWMEM_SIZE 55#define MAX_LOW_MEM CONFIG_LOWMEM_SIZE
56 56
57DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
58
59phys_addr_t total_memory; 57phys_addr_t total_memory;
60phys_addr_t total_lowmem; 58phys_addr_t total_lowmem;
61 59
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 68a821add28d..31582329cd67 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -205,6 +205,47 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size)
205 return 0; 205 return 0;
206} 206}
207 207
208/* On hash-based CPUs, the vmemmap is bolted in the hash table.
209 *
210 * On Book3E CPUs, the vmemmap is currently mapped in the top half of
211 * the vmalloc space using normal page tables, though the size of
212 * pages encoded in the PTEs can be different
213 */
214
215#ifdef CONFIG_PPC_BOOK3E
216static void __meminit vmemmap_create_mapping(unsigned long start,
217 unsigned long page_size,
218 unsigned long phys)
219{
220 /* Create a PTE encoding without page size */
221 unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED |
222 _PAGE_KERNEL_RW;
223
224 /* PTEs only contain page size encodings up to 32M */
225 BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
226
227 /* Encode the size in the PTE */
228 flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
229
230 /* For each PTE for that area, map things. Note that we don't
231 * increment phys because all PTEs are of the large size and
232 * thus must have the low bits clear
233 */
234 for (i = 0; i < page_size; i += PAGE_SIZE)
235 BUG_ON(map_kernel_page(start + i, phys, flags));
236}
237#else /* CONFIG_PPC_BOOK3E */
238static void __meminit vmemmap_create_mapping(unsigned long start,
239 unsigned long page_size,
240 unsigned long phys)
241{
242 int mapped = htab_bolt_mapping(start, start + page_size, phys,
243 PAGE_KERNEL, mmu_vmemmap_psize,
244 mmu_kernel_ssize);
245 BUG_ON(mapped < 0);
246}
247#endif /* CONFIG_PPC_BOOK3E */
248
208int __meminit vmemmap_populate(struct page *start_page, 249int __meminit vmemmap_populate(struct page *start_page,
209 unsigned long nr_pages, int node) 250 unsigned long nr_pages, int node)
210{ 251{
@@ -215,8 +256,11 @@ int __meminit vmemmap_populate(struct page *start_page,
215 /* Align to the page size of the linear mapping. */ 256 /* Align to the page size of the linear mapping. */
216 start = _ALIGN_DOWN(start, page_size); 257 start = _ALIGN_DOWN(start, page_size);
217 258
259 pr_debug("vmemmap_populate page %p, %ld pages, node %d\n",
260 start_page, nr_pages, node);
261 pr_debug(" -> map %lx..%lx\n", start, end);
262
218 for (; start < end; start += page_size) { 263 for (; start < end; start += page_size) {
219 int mapped;
220 void *p; 264 void *p;
221 265
222 if (vmemmap_populated(start, page_size)) 266 if (vmemmap_populated(start, page_size))
@@ -226,13 +270,10 @@ int __meminit vmemmap_populate(struct page *start_page,
226 if (!p) 270 if (!p)
227 return -ENOMEM; 271 return -ENOMEM;
228 272
229 pr_debug("vmemmap %08lx allocated at %p, physical %08lx.\n", 273 pr_debug(" * %016lx..%016lx allocated at %p\n",
230 start, p, __pa(p)); 274 start, start + page_size, p);
231 275
232 mapped = htab_bolt_mapping(start, start + page_size, __pa(p), 276 vmemmap_create_mapping(start, page_size, __pa(p));
233 pgprot_val(PAGE_KERNEL),
234 mmu_vmemmap_psize, mmu_kernel_ssize);
235 BUG_ON(mapped < 0);
236 } 277 }
237 278
238 return 0; 279 return 0;
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index b1a727def15b..c2f93dc470e6 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -25,10 +25,20 @@
25 * also clear mm->cpu_vm_mask bits when processes are migrated 25 * also clear mm->cpu_vm_mask bits when processes are migrated
26 */ 26 */
27 27
28#undef DEBUG 28#define DEBUG_MAP_CONSISTENCY
29#define DEBUG_STEAL_ONLY 29#define DEBUG_CLAMP_LAST_CONTEXT 31
30#undef DEBUG_MAP_CONSISTENCY 30//#define DEBUG_HARDER
31/*#define DEBUG_CLAMP_LAST_CONTEXT 15 */ 31
32/* We don't use DEBUG because it tends to be compiled in always nowadays
33 * and this would generate way too much output
34 */
35#ifdef DEBUG_HARDER
36#define pr_hard(args...) printk(KERN_DEBUG args)
37#define pr_hardcont(args...) printk(KERN_CONT args)
38#else
39#define pr_hard(args...) do { } while(0)
40#define pr_hardcont(args...) do { } while(0)
41#endif
32 42
33#include <linux/kernel.h> 43#include <linux/kernel.h>
34#include <linux/mm.h> 44#include <linux/mm.h>
@@ -71,7 +81,7 @@ static DEFINE_SPINLOCK(context_lock);
71static unsigned int steal_context_smp(unsigned int id) 81static unsigned int steal_context_smp(unsigned int id)
72{ 82{
73 struct mm_struct *mm; 83 struct mm_struct *mm;
74 unsigned int cpu, max; 84 unsigned int cpu, max, i;
75 85
76 max = last_context - first_context; 86 max = last_context - first_context;
77 87
@@ -89,15 +99,22 @@ static unsigned int steal_context_smp(unsigned int id)
89 id = first_context; 99 id = first_context;
90 continue; 100 continue;
91 } 101 }
92 pr_devel("[%d] steal context %d from mm @%p\n", 102 pr_hardcont(" | steal %d from 0x%p", id, mm);
93 smp_processor_id(), id, mm);
94 103
95 /* Mark this mm has having no context anymore */ 104 /* Mark this mm has having no context anymore */
96 mm->context.id = MMU_NO_CONTEXT; 105 mm->context.id = MMU_NO_CONTEXT;
97 106
98 /* Mark it stale on all CPUs that used this mm */ 107 /* Mark it stale on all CPUs that used this mm. For threaded
99 for_each_cpu(cpu, mm_cpumask(mm)) 108 * implementations, we set it on all threads on each core
100 __set_bit(id, stale_map[cpu]); 109 * represented in the mask. A future implementation will use
110 * a core map instead but this will do for now.
111 */
112 for_each_cpu(cpu, mm_cpumask(mm)) {
113 for (i = cpu_first_thread_in_core(cpu);
114 i <= cpu_last_thread_in_core(cpu); i++)
115 __set_bit(id, stale_map[i]);
116 cpu = i - 1;
117 }
101 return id; 118 return id;
102 } 119 }
103 120
@@ -126,7 +143,7 @@ static unsigned int steal_context_up(unsigned int id)
126 /* Pick up the victim mm */ 143 /* Pick up the victim mm */
127 mm = context_mm[id]; 144 mm = context_mm[id];
128 145
129 pr_devel("[%d] steal context %d from mm @%p\n", cpu, id, mm); 146 pr_hardcont(" | steal %d from 0x%p", id, mm);
130 147
131 /* Flush the TLB for that context */ 148 /* Flush the TLB for that context */
132 local_flush_tlb_mm(mm); 149 local_flush_tlb_mm(mm);
@@ -173,25 +190,20 @@ static void context_check_map(void) { }
173 190
174void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) 191void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
175{ 192{
176 unsigned int id, cpu = smp_processor_id(); 193 unsigned int i, id, cpu = smp_processor_id();
177 unsigned long *map; 194 unsigned long *map;
178 195
179 /* No lockless fast path .. yet */ 196 /* No lockless fast path .. yet */
180 spin_lock(&context_lock); 197 spin_lock(&context_lock);
181 198
182#ifndef DEBUG_STEAL_ONLY 199 pr_hard("[%d] activating context for mm @%p, active=%d, id=%d",
183 pr_devel("[%d] activating context for mm @%p, active=%d, id=%d\n", 200 cpu, next, next->context.active, next->context.id);
184 cpu, next, next->context.active, next->context.id);
185#endif
186 201
187#ifdef CONFIG_SMP 202#ifdef CONFIG_SMP
188 /* Mark us active and the previous one not anymore */ 203 /* Mark us active and the previous one not anymore */
189 next->context.active++; 204 next->context.active++;
190 if (prev) { 205 if (prev) {
191#ifndef DEBUG_STEAL_ONLY 206 pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active);
192 pr_devel(" old context %p active was: %d\n",
193 prev, prev->context.active);
194#endif
195 WARN_ON(prev->context.active < 1); 207 WARN_ON(prev->context.active < 1);
196 prev->context.active--; 208 prev->context.active--;
197 } 209 }
@@ -201,8 +213,14 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
201 213
202 /* If we already have a valid assigned context, skip all that */ 214 /* If we already have a valid assigned context, skip all that */
203 id = next->context.id; 215 id = next->context.id;
204 if (likely(id != MMU_NO_CONTEXT)) 216 if (likely(id != MMU_NO_CONTEXT)) {
217#ifdef DEBUG_MAP_CONSISTENCY
218 if (context_mm[id] != next)
219 pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n",
220 next, id, id, context_mm[id]);
221#endif
205 goto ctxt_ok; 222 goto ctxt_ok;
223 }
206 224
207 /* We really don't have a context, let's try to acquire one */ 225 /* We really don't have a context, let's try to acquire one */
208 id = next_context; 226 id = next_context;
@@ -235,11 +253,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
235 next_context = id + 1; 253 next_context = id + 1;
236 context_mm[id] = next; 254 context_mm[id] = next;
237 next->context.id = id; 255 next->context.id = id;
238 256 pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts);
239#ifndef DEBUG_STEAL_ONLY
240 pr_devel("[%d] picked up new id %d, nrf is now %d\n",
241 cpu, id, nr_free_contexts);
242#endif
243 257
244 context_check_map(); 258 context_check_map();
245 ctxt_ok: 259 ctxt_ok:
@@ -248,15 +262,21 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
248 * local TLB for it and unmark it before we use it 262 * local TLB for it and unmark it before we use it
249 */ 263 */
250 if (test_bit(id, stale_map[cpu])) { 264 if (test_bit(id, stale_map[cpu])) {
251 pr_devel("[%d] flushing stale context %d for mm @%p !\n", 265 pr_hardcont(" | stale flush %d [%d..%d]",
252 cpu, id, next); 266 id, cpu_first_thread_in_core(cpu),
267 cpu_last_thread_in_core(cpu));
268
253 local_flush_tlb_mm(next); 269 local_flush_tlb_mm(next);
254 270
255 /* XXX This clear should ultimately be part of local_flush_tlb_mm */ 271 /* XXX This clear should ultimately be part of local_flush_tlb_mm */
256 __clear_bit(id, stale_map[cpu]); 272 for (i = cpu_first_thread_in_core(cpu);
273 i <= cpu_last_thread_in_core(cpu); i++) {
274 __clear_bit(id, stale_map[i]);
275 }
257 } 276 }
258 277
259 /* Flick the MMU and release lock */ 278 /* Flick the MMU and release lock */
279 pr_hardcont(" -> %d\n", id);
260 set_context(id, next->pgd); 280 set_context(id, next->pgd);
261 spin_unlock(&context_lock); 281 spin_unlock(&context_lock);
262} 282}
@@ -266,6 +286,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
266 */ 286 */
267int init_new_context(struct task_struct *t, struct mm_struct *mm) 287int init_new_context(struct task_struct *t, struct mm_struct *mm)
268{ 288{
289 pr_hard("initing context for mm @%p\n", mm);
290
269 mm->context.id = MMU_NO_CONTEXT; 291 mm->context.id = MMU_NO_CONTEXT;
270 mm->context.active = 0; 292 mm->context.active = 0;
271 293
@@ -305,7 +327,9 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
305 unsigned long action, void *hcpu) 327 unsigned long action, void *hcpu)
306{ 328{
307 unsigned int cpu = (unsigned int)(long)hcpu; 329 unsigned int cpu = (unsigned int)(long)hcpu;
308 330#ifdef CONFIG_HOTPLUG_CPU
331 struct task_struct *p;
332#endif
309 /* We don't touch CPU 0 map, it's allocated at aboot and kept 333 /* We don't touch CPU 0 map, it's allocated at aboot and kept
310 * around forever 334 * around forever
311 */ 335 */
@@ -324,8 +348,16 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
324 pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); 348 pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu);
325 kfree(stale_map[cpu]); 349 kfree(stale_map[cpu]);
326 stale_map[cpu] = NULL; 350 stale_map[cpu] = NULL;
327 break; 351
328#endif 352 /* We also clear the cpu_vm_mask bits of CPUs going away */
353 read_lock(&tasklist_lock);
354 for_each_process(p) {
355 if (p->mm)
356 cpu_mask_clear_cpu(cpu, mm_cpumask(p->mm));
357 }
358 read_unlock(&tasklist_lock);
359 break;
360#endif /* CONFIG_HOTPLUG_CPU */
329 } 361 }
330 return NOTIFY_OK; 362 return NOTIFY_OK;
331} 363}
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index d1f9c62dc177..d2e5321d5ea6 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -36,21 +36,37 @@ static inline void _tlbil_pid(unsigned int pid)
36{ 36{
37 asm volatile ("sync; tlbia; isync" : : : "memory"); 37 asm volatile ("sync; tlbia; isync" : : : "memory");
38} 38}
39#define _tlbil_pid_noind(pid) _tlbil_pid(pid)
40
39#else /* CONFIG_40x || CONFIG_8xx */ 41#else /* CONFIG_40x || CONFIG_8xx */
40extern void _tlbil_all(void); 42extern void _tlbil_all(void);
41extern void _tlbil_pid(unsigned int pid); 43extern void _tlbil_pid(unsigned int pid);
44#ifdef CONFIG_PPC_BOOK3E
45extern void _tlbil_pid_noind(unsigned int pid);
46#else
47#define _tlbil_pid_noind(pid) _tlbil_pid(pid)
48#endif
42#endif /* !(CONFIG_40x || CONFIG_8xx) */ 49#endif /* !(CONFIG_40x || CONFIG_8xx) */
43 50
44/* 51/*
45 * On 8xx, we directly inline tlbie, on others, it's extern 52 * On 8xx, we directly inline tlbie, on others, it's extern
46 */ 53 */
47#ifdef CONFIG_8xx 54#ifdef CONFIG_8xx
48static inline void _tlbil_va(unsigned long address, unsigned int pid) 55static inline void _tlbil_va(unsigned long address, unsigned int pid,
56 unsigned int tsize, unsigned int ind)
49{ 57{
50 asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); 58 asm volatile ("tlbie %0; sync" : : "r" (address) : "memory");
51} 59}
52#else /* CONFIG_8xx */ 60#elif defined(CONFIG_PPC_BOOK3E)
53extern void _tlbil_va(unsigned long address, unsigned int pid); 61extern void _tlbil_va(unsigned long address, unsigned int pid,
62 unsigned int tsize, unsigned int ind);
63#else
64extern void __tlbil_va(unsigned long address, unsigned int pid);
65static inline void _tlbil_va(unsigned long address, unsigned int pid,
66 unsigned int tsize, unsigned int ind)
67{
68 __tlbil_va(address, pid);
69}
54#endif /* CONIFG_8xx */ 70#endif /* CONIFG_8xx */
55 71
56/* 72/*
@@ -58,10 +74,16 @@ extern void _tlbil_va(unsigned long address, unsigned int pid);
58 * implementation. When that becomes the case, this will be 74 * implementation. When that becomes the case, this will be
59 * an extern. 75 * an extern.
60 */ 76 */
61static inline void _tlbivax_bcast(unsigned long address, unsigned int pid) 77#ifdef CONFIG_PPC_BOOK3E
78extern void _tlbivax_bcast(unsigned long address, unsigned int pid,
79 unsigned int tsize, unsigned int ind);
80#else
81static inline void _tlbivax_bcast(unsigned long address, unsigned int pid,
82 unsigned int tsize, unsigned int ind)
62{ 83{
63 BUG(); 84 BUG();
64} 85}
86#endif
65 87
66#else /* CONFIG_PPC_MMU_NOHASH */ 88#else /* CONFIG_PPC_MMU_NOHASH */
67 89
@@ -99,7 +121,12 @@ extern unsigned int rtas_data, rtas_size;
99struct hash_pte; 121struct hash_pte;
100extern struct hash_pte *Hash, *Hash_end; 122extern struct hash_pte *Hash, *Hash_end;
101extern unsigned long Hash_size, Hash_mask; 123extern unsigned long Hash_size, Hash_mask;
102#endif 124
125#endif /* CONFIG_PPC32 */
126
127#ifdef CONFIG_PPC64
128extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags);
129#endif /* CONFIG_PPC64 */
103 130
104extern unsigned long ioremap_bot; 131extern unsigned long ioremap_bot;
105extern unsigned long __max_low_memory; 132extern unsigned long __max_low_memory;
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 627767d6169b..83f1551ec2c9 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -30,6 +30,16 @@
30#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
31#include <asm/tlb.h> 31#include <asm/tlb.h>
32 32
33DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
34
35#ifdef CONFIG_SMP
36
37/*
38 * Handle batching of page table freeing on SMP. Page tables are
39 * queued up and send to be freed later by RCU in order to avoid
40 * freeing a page table page that is being walked without locks
41 */
42
33static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); 43static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
34static unsigned long pte_freelist_forced_free; 44static unsigned long pte_freelist_forced_free;
35 45
@@ -116,27 +126,7 @@ void pte_free_finish(void)
116 *batchp = NULL; 126 *batchp = NULL;
117} 127}
118 128
119/* 129#endif /* CONFIG_SMP */
120 * Handle i/d cache flushing, called from set_pte_at() or ptep_set_access_flags()
121 */
122static pte_t do_dcache_icache_coherency(pte_t pte)
123{
124 unsigned long pfn = pte_pfn(pte);
125 struct page *page;
126
127 if (unlikely(!pfn_valid(pfn)))
128 return pte;
129 page = pfn_to_page(pfn);
130
131 if (!PageReserved(page) && !test_bit(PG_arch_1, &page->flags)) {
132 pr_devel("do_dcache_icache_coherency... flushing\n");
133 flush_dcache_icache_page(page);
134 set_bit(PG_arch_1, &page->flags);
135 }
136 else
137 pr_devel("do_dcache_icache_coherency... already clean\n");
138 return __pte(pte_val(pte) | _PAGE_HWEXEC);
139}
140 130
141static inline int is_exec_fault(void) 131static inline int is_exec_fault(void)
142{ 132{
@@ -145,49 +135,139 @@ static inline int is_exec_fault(void)
145 135
146/* We only try to do i/d cache coherency on stuff that looks like 136/* We only try to do i/d cache coherency on stuff that looks like
147 * reasonably "normal" PTEs. We currently require a PTE to be present 137 * reasonably "normal" PTEs. We currently require a PTE to be present
148 * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE 138 * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that
139 * on userspace PTEs
149 */ 140 */
150static inline int pte_looks_normal(pte_t pte) 141static inline int pte_looks_normal(pte_t pte)
151{ 142{
152 return (pte_val(pte) & 143 return (pte_val(pte) &
153 (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE)) == 144 (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) ==
154 (_PAGE_PRESENT); 145 (_PAGE_PRESENT | _PAGE_USER);
155} 146}
156 147
157#if defined(CONFIG_PPC_STD_MMU) 148struct page * maybe_pte_to_page(pte_t pte)
149{
150 unsigned long pfn = pte_pfn(pte);
151 struct page *page;
152
153 if (unlikely(!pfn_valid(pfn)))
154 return NULL;
155 page = pfn_to_page(pfn);
156 if (PageReserved(page))
157 return NULL;
158 return page;
159}
160
161#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0
162
158/* Server-style MMU handles coherency when hashing if HW exec permission 163/* Server-style MMU handles coherency when hashing if HW exec permission
159 * is supposed per page (currently 64-bit only). Else, we always flush 164 * is supposed per page (currently 64-bit only). If not, then, we always
160 * valid PTEs in set_pte. 165 * flush the cache for valid PTEs in set_pte. Embedded CPU without HW exec
166 * support falls into the same category.
161 */ 167 */
162static inline int pte_need_exec_flush(pte_t pte, int set_pte) 168
169static pte_t set_pte_filter(pte_t pte)
163{ 170{
164 return set_pte && pte_looks_normal(pte) && 171 pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
165 !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || 172 if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
166 cpu_has_feature(CPU_FTR_NOEXECUTE)); 173 cpu_has_feature(CPU_FTR_NOEXECUTE))) {
174 struct page *pg = maybe_pte_to_page(pte);
175 if (!pg)
176 return pte;
177 if (!test_bit(PG_arch_1, &pg->flags)) {
178 flush_dcache_icache_page(pg);
179 set_bit(PG_arch_1, &pg->flags);
180 }
181 }
182 return pte;
167} 183}
168#elif _PAGE_HWEXEC == 0 184
169/* Embedded type MMU without HW exec support (8xx only so far), we flush 185static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
170 * the cache for any present PTE 186 int dirty)
171 */
172static inline int pte_need_exec_flush(pte_t pte, int set_pte)
173{ 187{
174 return set_pte && pte_looks_normal(pte); 188 return pte;
175} 189}
176#else 190
177/* Other embedded CPUs with HW exec support per-page, we flush on exec 191#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */
178 * fault if HWEXEC is not set 192
193/* Embedded type MMU with HW exec support. This is a bit more complicated
194 * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
195 * instead we "filter out" the exec permission for non clean pages.
179 */ 196 */
180static inline int pte_need_exec_flush(pte_t pte, int set_pte) 197static pte_t set_pte_filter(pte_t pte)
181{ 198{
182 return pte_looks_normal(pte) && is_exec_fault() && 199 struct page *pg;
183 !(pte_val(pte) & _PAGE_HWEXEC); 200
201 /* No exec permission in the first place, move on */
202 if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte))
203 return pte;
204
205 /* If you set _PAGE_EXEC on weird pages you're on your own */
206 pg = maybe_pte_to_page(pte);
207 if (unlikely(!pg))
208 return pte;
209
210 /* If the page clean, we move on */
211 if (test_bit(PG_arch_1, &pg->flags))
212 return pte;
213
214 /* If it's an exec fault, we flush the cache and make it clean */
215 if (is_exec_fault()) {
216 flush_dcache_icache_page(pg);
217 set_bit(PG_arch_1, &pg->flags);
218 return pte;
219 }
220
221 /* Else, we filter out _PAGE_EXEC */
222 return __pte(pte_val(pte) & ~_PAGE_EXEC);
184} 223}
185#endif 224
225static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
226 int dirty)
227{
228 struct page *pg;
229
230 /* So here, we only care about exec faults, as we use them
231 * to recover lost _PAGE_EXEC and perform I$/D$ coherency
232 * if necessary. Also if _PAGE_EXEC is already set, same deal,
233 * we just bail out
234 */
235 if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault())
236 return pte;
237
238#ifdef CONFIG_DEBUG_VM
239 /* So this is an exec fault, _PAGE_EXEC is not set. If it was
240 * an error we would have bailed out earlier in do_page_fault()
241 * but let's make sure of it
242 */
243 if (WARN_ON(!(vma->vm_flags & VM_EXEC)))
244 return pte;
245#endif /* CONFIG_DEBUG_VM */
246
247 /* If you set _PAGE_EXEC on weird pages you're on your own */
248 pg = maybe_pte_to_page(pte);
249 if (unlikely(!pg))
250 goto bail;
251
252 /* If the page is already clean, we move on */
253 if (test_bit(PG_arch_1, &pg->flags))
254 goto bail;
255
256 /* Clean the page and set PG_arch_1 */
257 flush_dcache_icache_page(pg);
258 set_bit(PG_arch_1, &pg->flags);
259
260 bail:
261 return __pte(pte_val(pte) | _PAGE_EXEC);
262}
263
264#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */
186 265
187/* 266/*
188 * set_pte stores a linux PTE into the linux page table. 267 * set_pte stores a linux PTE into the linux page table.
189 */ 268 */
190void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) 269void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
270 pte_t pte)
191{ 271{
192#ifdef CONFIG_DEBUG_VM 272#ifdef CONFIG_DEBUG_VM
193 WARN_ON(pte_present(*ptep)); 273 WARN_ON(pte_present(*ptep));
@@ -196,9 +276,7 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte
196 * this context might not have been activated yet when this 276 * this context might not have been activated yet when this
197 * is called. 277 * is called.
198 */ 278 */
199 pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); 279 pte = set_pte_filter(pte);
200 if (pte_need_exec_flush(pte, 1))
201 pte = do_dcache_icache_coherency(pte);
202 280
203 /* Perform the setting of the PTE */ 281 /* Perform the setting of the PTE */
204 __set_pte_at(mm, addr, ptep, pte, 0); 282 __set_pte_at(mm, addr, ptep, pte, 0);
@@ -215,8 +293,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
215 pte_t *ptep, pte_t entry, int dirty) 293 pte_t *ptep, pte_t entry, int dirty)
216{ 294{
217 int changed; 295 int changed;
218 if (!dirty && pte_need_exec_flush(entry, 0)) 296 entry = set_access_flags_filter(entry, vma, dirty);
219 entry = do_dcache_icache_coherency(entry);
220 changed = !pte_same(*(ptep), entry); 297 changed = !pte_same(*(ptep), entry);
221 if (changed) { 298 if (changed) {
222 if (!(vma->vm_flags & VM_HUGETLB)) 299 if (!(vma->vm_flags & VM_HUGETLB))
@@ -242,7 +319,7 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
242 BUG_ON(pud_none(*pud)); 319 BUG_ON(pud_none(*pud));
243 pmd = pmd_offset(pud, addr); 320 pmd = pmd_offset(pud, addr);
244 BUG_ON(!pmd_present(*pmd)); 321 BUG_ON(!pmd_present(*pmd));
245 BUG_ON(!spin_is_locked(pte_lockptr(mm, pmd))); 322 assert_spin_locked(pte_lockptr(mm, pmd));
246} 323}
247#endif /* CONFIG_DEBUG_VM */ 324#endif /* CONFIG_DEBUG_VM */
248 325
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 5422169626ba..cb96cb2e17cc 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -142,7 +142,7 @@ ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags)
142 flags |= _PAGE_DIRTY | _PAGE_HWWRITE; 142 flags |= _PAGE_DIRTY | _PAGE_HWWRITE;
143 143
144 /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ 144 /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
145 flags &= ~(_PAGE_USER | _PAGE_EXEC | _PAGE_HWEXEC); 145 flags &= ~(_PAGE_USER | _PAGE_EXEC);
146 146
147 return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); 147 return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
148} 148}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index bfa7db6b2fd5..853d5565eed5 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -33,6 +33,8 @@
33#include <linux/stddef.h> 33#include <linux/stddef.h>
34#include <linux/vmalloc.h> 34#include <linux/vmalloc.h>
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/bootmem.h>
37#include <linux/lmb.h>
36 38
37#include <asm/pgalloc.h> 39#include <asm/pgalloc.h>
38#include <asm/page.h> 40#include <asm/page.h>
@@ -55,19 +57,36 @@
55 57
56unsigned long ioremap_bot = IOREMAP_BASE; 58unsigned long ioremap_bot = IOREMAP_BASE;
57 59
60
61#ifdef CONFIG_PPC_MMU_NOHASH
62static void *early_alloc_pgtable(unsigned long size)
63{
64 void *pt;
65
66 if (init_bootmem_done)
67 pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS));
68 else
69 pt = __va(lmb_alloc_base(size, size,
70 __pa(MAX_DMA_ADDRESS)));
71 memset(pt, 0, size);
72
73 return pt;
74}
75#endif /* CONFIG_PPC_MMU_NOHASH */
76
58/* 77/*
59 * map_io_page currently only called by __ioremap 78 * map_kernel_page currently only called by __ioremap
60 * map_io_page adds an entry to the ioremap page table 79 * map_kernel_page adds an entry to the ioremap page table
61 * and adds an entry to the HPT, possibly bolting it 80 * and adds an entry to the HPT, possibly bolting it
62 */ 81 */
63static int map_io_page(unsigned long ea, unsigned long pa, int flags) 82int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
64{ 83{
65 pgd_t *pgdp; 84 pgd_t *pgdp;
66 pud_t *pudp; 85 pud_t *pudp;
67 pmd_t *pmdp; 86 pmd_t *pmdp;
68 pte_t *ptep; 87 pte_t *ptep;
69 88
70 if (mem_init_done) { 89 if (slab_is_available()) {
71 pgdp = pgd_offset_k(ea); 90 pgdp = pgd_offset_k(ea);
72 pudp = pud_alloc(&init_mm, pgdp, ea); 91 pudp = pud_alloc(&init_mm, pgdp, ea);
73 if (!pudp) 92 if (!pudp)
@@ -81,6 +100,35 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
81 set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, 100 set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
82 __pgprot(flags))); 101 __pgprot(flags)));
83 } else { 102 } else {
103#ifdef CONFIG_PPC_MMU_NOHASH
104 /* Warning ! This will blow up if bootmem is not initialized
105 * which our ppc64 code is keen to do that, we'll need to
106 * fix it and/or be more careful
107 */
108 pgdp = pgd_offset_k(ea);
109#ifdef PUD_TABLE_SIZE
110 if (pgd_none(*pgdp)) {
111 pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
112 BUG_ON(pudp == NULL);
113 pgd_populate(&init_mm, pgdp, pudp);
114 }
115#endif /* PUD_TABLE_SIZE */
116 pudp = pud_offset(pgdp, ea);
117 if (pud_none(*pudp)) {
118 pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
119 BUG_ON(pmdp == NULL);
120 pud_populate(&init_mm, pudp, pmdp);
121 }
122 pmdp = pmd_offset(pudp, ea);
123 if (!pmd_present(*pmdp)) {
124 ptep = early_alloc_pgtable(PAGE_SIZE);
125 BUG_ON(ptep == NULL);
126 pmd_populate_kernel(&init_mm, pmdp, ptep);
127 }
128 ptep = pte_offset_kernel(pmdp, ea);
129 set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
130 __pgprot(flags)));
131#else /* CONFIG_PPC_MMU_NOHASH */
84 /* 132 /*
85 * If the mm subsystem is not fully up, we cannot create a 133 * If the mm subsystem is not fully up, we cannot create a
86 * linux page table entry for this mapping. Simply bolt an 134 * linux page table entry for this mapping. Simply bolt an
@@ -93,6 +141,7 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
93 "memory at %016lx !\n", pa); 141 "memory at %016lx !\n", pa);
94 return -ENOMEM; 142 return -ENOMEM;
95 } 143 }
144#endif /* !CONFIG_PPC_MMU_NOHASH */
96 } 145 }
97 return 0; 146 return 0;
98} 147}
@@ -124,7 +173,7 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
124 WARN_ON(size & ~PAGE_MASK); 173 WARN_ON(size & ~PAGE_MASK);
125 174
126 for (i = 0; i < size; i += PAGE_SIZE) 175 for (i = 0; i < size; i += PAGE_SIZE)
127 if (map_io_page((unsigned long)ea+i, pa+i, flags)) 176 if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
128 return NULL; 177 return NULL;
129 178
130 return (void __iomem *)ea; 179 return (void __iomem *)ea;
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 5b7038f248b6..1d98ecc8eecd 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -92,15 +92,13 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,
92 : "memory" ); 92 : "memory" );
93} 93}
94 94
95void slb_flush_and_rebolt(void) 95static void __slb_flush_and_rebolt(void)
96{ 96{
97 /* If you change this make sure you change SLB_NUM_BOLTED 97 /* If you change this make sure you change SLB_NUM_BOLTED
98 * appropriately too. */ 98 * appropriately too. */
99 unsigned long linear_llp, vmalloc_llp, lflags, vflags; 99 unsigned long linear_llp, vmalloc_llp, lflags, vflags;
100 unsigned long ksp_esid_data, ksp_vsid_data; 100 unsigned long ksp_esid_data, ksp_vsid_data;
101 101
102 WARN_ON(!irqs_disabled());
103
104 linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; 102 linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
105 vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; 103 vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
106 lflags = SLB_VSID_KERNEL | linear_llp; 104 lflags = SLB_VSID_KERNEL | linear_llp;
@@ -117,12 +115,6 @@ void slb_flush_and_rebolt(void)
117 ksp_vsid_data = get_slb_shadow()->save_area[2].vsid; 115 ksp_vsid_data = get_slb_shadow()->save_area[2].vsid;
118 } 116 }
119 117
120 /*
121 * We can't take a PMU exception in the following code, so hard
122 * disable interrupts.
123 */
124 hard_irq_disable();
125
126 /* We need to do this all in asm, so we're sure we don't touch 118 /* We need to do this all in asm, so we're sure we don't touch
127 * the stack between the slbia and rebolting it. */ 119 * the stack between the slbia and rebolting it. */
128 asm volatile("isync\n" 120 asm volatile("isync\n"
@@ -139,6 +131,21 @@ void slb_flush_and_rebolt(void)
139 : "memory"); 131 : "memory");
140} 132}
141 133
134void slb_flush_and_rebolt(void)
135{
136
137 WARN_ON(!irqs_disabled());
138
139 /*
140 * We can't take a PMU exception in the following code, so hard
141 * disable interrupts.
142 */
143 hard_irq_disable();
144
145 __slb_flush_and_rebolt();
146 get_paca()->slb_cache_ptr = 0;
147}
148
142void slb_vmalloc_update(void) 149void slb_vmalloc_update(void)
143{ 150{
144 unsigned long vflags; 151 unsigned long vflags;
@@ -180,12 +187,20 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2)
180/* Flush all user entries from the segment table of the current processor. */ 187/* Flush all user entries from the segment table of the current processor. */
181void switch_slb(struct task_struct *tsk, struct mm_struct *mm) 188void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
182{ 189{
183 unsigned long offset = get_paca()->slb_cache_ptr; 190 unsigned long offset;
184 unsigned long slbie_data = 0; 191 unsigned long slbie_data = 0;
185 unsigned long pc = KSTK_EIP(tsk); 192 unsigned long pc = KSTK_EIP(tsk);
186 unsigned long stack = KSTK_ESP(tsk); 193 unsigned long stack = KSTK_ESP(tsk);
187 unsigned long unmapped_base; 194 unsigned long exec_base;
188 195
196 /*
197 * We need interrupts hard-disabled here, not just soft-disabled,
198 * so that a PMU interrupt can't occur, which might try to access
199 * user memory (to get a stack trace) and possible cause an SLB miss
200 * which would update the slb_cache/slb_cache_ptr fields in the PACA.
201 */
202 hard_irq_disable();
203 offset = get_paca()->slb_cache_ptr;
189 if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) && 204 if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) &&
190 offset <= SLB_CACHE_ENTRIES) { 205 offset <= SLB_CACHE_ENTRIES) {
191 int i; 206 int i;
@@ -200,7 +215,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
200 } 215 }
201 asm volatile("isync" : : : "memory"); 216 asm volatile("isync" : : : "memory");
202 } else { 217 } else {
203 slb_flush_and_rebolt(); 218 __slb_flush_and_rebolt();
204 } 219 }
205 220
206 /* Workaround POWER5 < DD2.1 issue */ 221 /* Workaround POWER5 < DD2.1 issue */
@@ -212,42 +227,44 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
212 227
213 /* 228 /*
214 * preload some userspace segments into the SLB. 229 * preload some userspace segments into the SLB.
230 * Almost all 32 and 64bit PowerPC executables are linked at
231 * 0x10000000 so it makes sense to preload this segment.
215 */ 232 */
216 if (test_tsk_thread_flag(tsk, TIF_32BIT)) 233 exec_base = 0x10000000;
217 unmapped_base = TASK_UNMAPPED_BASE_USER32;
218 else
219 unmapped_base = TASK_UNMAPPED_BASE_USER64;
220 234
221 if (is_kernel_addr(pc)) 235 if (is_kernel_addr(pc) || is_kernel_addr(stack) ||
222 return; 236 is_kernel_addr(exec_base))
223 slb_allocate(pc);
224
225 if (esids_match(pc,stack))
226 return; 237 return;
227 238
228 if (is_kernel_addr(stack)) 239 slb_allocate(pc);
229 return;
230 slb_allocate(stack);
231 240
232 if (esids_match(pc,unmapped_base) || esids_match(stack,unmapped_base)) 241 if (!esids_match(pc, stack))
233 return; 242 slb_allocate(stack);
234 243
235 if (is_kernel_addr(unmapped_base)) 244 if (!esids_match(pc, exec_base) &&
236 return; 245 !esids_match(stack, exec_base))
237 slb_allocate(unmapped_base); 246 slb_allocate(exec_base);
238} 247}
239 248
240static inline void patch_slb_encoding(unsigned int *insn_addr, 249static inline void patch_slb_encoding(unsigned int *insn_addr,
241 unsigned int immed) 250 unsigned int immed)
242{ 251{
243 /* Assume the instruction had a "0" immediate value, just 252 *insn_addr = (*insn_addr & 0xffff0000) | immed;
244 * "or" in the new value
245 */
246 *insn_addr |= immed;
247 flush_icache_range((unsigned long)insn_addr, 4+ 253 flush_icache_range((unsigned long)insn_addr, 4+
248 (unsigned long)insn_addr); 254 (unsigned long)insn_addr);
249} 255}
250 256
257void slb_set_size(u16 size)
258{
259 extern unsigned int *slb_compare_rr_to_size;
260
261 if (mmu_slb_size == size)
262 return;
263
264 mmu_slb_size = size;
265 patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size);
266}
267
251void slb_initialize(void) 268void slb_initialize(void)
252{ 269{
253 unsigned long linear_llp, vmalloc_llp, io_llp; 270 unsigned long linear_llp, vmalloc_llp, io_llp;
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
index 98cd1dc2ae75..687fddaa24c5 100644
--- a/arch/powerpc/mm/stab.c
+++ b/arch/powerpc/mm/stab.c
@@ -31,7 +31,7 @@ struct stab_entry {
31 31
32#define NR_STAB_CACHE_ENTRIES 8 32#define NR_STAB_CACHE_ENTRIES 8
33static DEFINE_PER_CPU(long, stab_cache_ptr); 33static DEFINE_PER_CPU(long, stab_cache_ptr);
34static DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]); 34static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache);
35 35
36/* 36/*
37 * Create a segment table entry for the given esid/vsid pair. 37 * Create a segment table entry for the given esid/vsid pair.
@@ -164,7 +164,7 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
164{ 164{
165 struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr; 165 struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
166 struct stab_entry *ste; 166 struct stab_entry *ste;
167 unsigned long offset = __get_cpu_var(stab_cache_ptr); 167 unsigned long offset;
168 unsigned long pc = KSTK_EIP(tsk); 168 unsigned long pc = KSTK_EIP(tsk);
169 unsigned long stack = KSTK_ESP(tsk); 169 unsigned long stack = KSTK_ESP(tsk);
170 unsigned long unmapped_base; 170 unsigned long unmapped_base;
@@ -172,6 +172,15 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
172 /* Force previous translations to complete. DRENG */ 172 /* Force previous translations to complete. DRENG */
173 asm volatile("isync" : : : "memory"); 173 asm volatile("isync" : : : "memory");
174 174
175 /*
176 * We need interrupts hard-disabled here, not just soft-disabled,
177 * so that a PMU interrupt can't occur, which might try to access
178 * user memory (to get a stack trace) and possible cause an STAB miss
179 * which would update the stab_cache/stab_cache_ptr per-cpu variables.
180 */
181 hard_irq_disable();
182
183 offset = __get_cpu_var(stab_cache_ptr);
175 if (offset <= NR_STAB_CACHE_ENTRIES) { 184 if (offset <= NR_STAB_CACHE_ENTRIES) {
176 int i; 185 int i;
177 186
diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c
index 65190587a365..8aaa8b7eb324 100644
--- a/arch/powerpc/mm/tlb_hash32.c
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -71,6 +71,9 @@ void tlb_flush(struct mmu_gather *tlb)
71 */ 71 */
72 _tlbia(); 72 _tlbia();
73 } 73 }
74
75 /* Push out batch of freed page tables */
76 pte_free_finish();
74} 77}
75 78
76/* 79/*
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 937eb90677d9..2b2f35f6985e 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -33,11 +33,6 @@
33 33
34DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); 34DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
35 35
36/* This is declared as we are using the more or less generic
37 * arch/powerpc/include/asm/tlb.h file -- tgall
38 */
39DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
40
41/* 36/*
42 * A linux PTE was changed and the corresponding hash table entry 37 * A linux PTE was changed and the corresponding hash table entry
43 * neesd to be flushed. This function will either perform the flush 38 * neesd to be flushed. This function will either perform the flush
@@ -154,6 +149,21 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
154 batch->index = 0; 149 batch->index = 0;
155} 150}
156 151
152void tlb_flush(struct mmu_gather *tlb)
153{
154 struct ppc64_tlb_batch *tlbbatch = &__get_cpu_var(ppc64_tlb_batch);
155
156 /* If there's a TLB batch pending, then we must flush it because the
157 * pages are going to be freed and we really don't want to have a CPU
158 * access a freed page because it has a stale TLB
159 */
160 if (tlbbatch->index)
161 __flush_tlb_pending(tlbbatch);
162
163 /* Push out batch of freed page tables */
164 pte_free_finish();
165}
166
157/** 167/**
158 * __flush_hash_table_range - Flush all HPTEs for a given address range 168 * __flush_hash_table_range - Flush all HPTEs for a given address range
159 * from the hash table (and the TLB). But keeps 169 * from the hash table (and the TLB). But keeps
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
new file mode 100644
index 000000000000..ef1cccf71173
--- /dev/null
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -0,0 +1,770 @@
1/*
2 * Low leve TLB miss handlers for Book3E
3 *
4 * Copyright (C) 2008-2009
5 * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <asm/processor.h>
14#include <asm/reg.h>
15#include <asm/page.h>
16#include <asm/mmu.h>
17#include <asm/ppc_asm.h>
18#include <asm/asm-offsets.h>
19#include <asm/cputable.h>
20#include <asm/pgtable.h>
21#include <asm/reg.h>
22#include <asm/exception-64e.h>
23#include <asm/ppc-opcode.h>
24
25#ifdef CONFIG_PPC_64K_PAGES
26#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1)
27#else
28#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE)
29#endif
30#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE)
31#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE)
32#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE)
33
34
35/**********************************************************************
36 * *
37 * TLB miss handling for Book3E with TLB reservation and HES support *
38 * *
39 **********************************************************************/
40
41
42/* Data TLB miss */
43 START_EXCEPTION(data_tlb_miss)
44 TLB_MISS_PROLOG
45
46 /* Now we handle the fault proper. We only save DEAR in normal
47 * fault case since that's the only interesting values here.
48 * We could probably also optimize by not saving SRR0/1 in the
49 * linear mapping case but I'll leave that for later
50 */
51 mfspr r14,SPRN_ESR
52 mfspr r16,SPRN_DEAR /* get faulting address */
53 srdi r15,r16,60 /* get region */
54 cmpldi cr0,r15,0xc /* linear mapping ? */
55 TLB_MISS_STATS_SAVE_INFO
56 beq tlb_load_linear /* yes -> go to linear map load */
57
58 /* The page tables are mapped virtually linear. At this point, though,
59 * we don't know whether we are trying to fault in a first level
60 * virtual address or a virtual page table address. We can get that
61 * from bit 0x1 of the region ID which we have set for a page table
62 */
63 andi. r10,r15,0x1
64 bne- virt_page_table_tlb_miss
65
66 std r14,EX_TLB_ESR(r12); /* save ESR */
67 std r16,EX_TLB_DEAR(r12); /* save DEAR */
68
69 /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */
70 li r11,_PAGE_PRESENT
71 oris r11,r11,_PAGE_ACCESSED@h
72
73 /* We do the user/kernel test for the PID here along with the RW test
74 */
75 cmpldi cr0,r15,0 /* Check for user region */
76
77 /* We pre-test some combination of permissions to avoid double
78 * faults:
79 *
80 * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE
81 * ESR_ST is 0x00800000
82 * _PAGE_BAP_SW is 0x00000010
83 * So the shift is >> 19. This tests for supervisor writeability.
84 * If the page happens to be supervisor writeable and not user
85 * writeable, we will take a new fault later, but that should be
86 * a rare enough case.
87 *
88 * We also move ESR_ST in _PAGE_DIRTY position
89 * _PAGE_DIRTY is 0x00001000 so the shift is >> 11
90 *
91 * MAS1 is preset for all we need except for TID that needs to
92 * be cleared for kernel translations
93 */
94 rlwimi r11,r14,32-19,27,27
95 rlwimi r11,r14,32-16,19,19
96 beq normal_tlb_miss
97 /* XXX replace the RMW cycles with immediate loads + writes */
981: mfspr r10,SPRN_MAS1
99 cmpldi cr0,r15,8 /* Check for vmalloc region */
100 rlwinm r10,r10,0,16,1 /* Clear TID */
101 mtspr SPRN_MAS1,r10
102 beq+ normal_tlb_miss
103
104 /* We got a crappy address, just fault with whatever DEAR and ESR
105 * are here
106 */
107 TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
108 TLB_MISS_EPILOG_ERROR
109 b exc_data_storage_book3e
110
111/* Instruction TLB miss */
112 START_EXCEPTION(instruction_tlb_miss)
113 TLB_MISS_PROLOG
114
115 /* If we take a recursive fault, the second level handler may need
116 * to know whether we are handling a data or instruction fault in
117 * order to get to the right store fault handler. We provide that
118 * info by writing a crazy value in ESR in our exception frame
119 */
120 li r14,-1 /* store to exception frame is done later */
121
122 /* Now we handle the fault proper. We only save DEAR in the non
123 * linear mapping case since we know the linear mapping case will
124 * not re-enter. We could indeed optimize and also not save SRR0/1
125 * in the linear mapping case but I'll leave that for later
126 *
127 * Faulting address is SRR0 which is already in r16
128 */
129 srdi r15,r16,60 /* get region */
130 cmpldi cr0,r15,0xc /* linear mapping ? */
131 TLB_MISS_STATS_SAVE_INFO
132 beq tlb_load_linear /* yes -> go to linear map load */
133
134 /* We do the user/kernel test for the PID here along with the RW test
135 */
136 li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */
137 oris r11,r11,_PAGE_ACCESSED@h
138
139 cmpldi cr0,r15,0 /* Check for user region */
140 std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */
141 beq normal_tlb_miss
142 /* XXX replace the RMW cycles with immediate loads + writes */
1431: mfspr r10,SPRN_MAS1
144 cmpldi cr0,r15,8 /* Check for vmalloc region */
145 rlwinm r10,r10,0,16,1 /* Clear TID */
146 mtspr SPRN_MAS1,r10
147 beq+ normal_tlb_miss
148
149 /* We got a crappy address, just fault */
150 TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
151 TLB_MISS_EPILOG_ERROR
152 b exc_instruction_storage_book3e
153
154/*
155 * This is the guts of the first-level TLB miss handler for direct
156 * misses. We are entered with:
157 *
158 * r16 = faulting address
159 * r15 = region ID
160 * r14 = crap (free to use)
161 * r13 = PACA
162 * r12 = TLB exception frame in PACA
163 * r11 = PTE permission mask
164 * r10 = crap (free to use)
165 */
166normal_tlb_miss:
167 /* So we first construct the page table address. We do that by
168 * shifting the bottom of the address (not the region ID) by
169 * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and
170 * or'ing the fourth high bit.
171 *
172 * NOTE: For 64K pages, we do things slightly differently in
173 * order to handle the weird page table format used by linux
174 */
175 ori r10,r15,0x1
176#ifdef CONFIG_PPC_64K_PAGES
177 /* For the top bits, 16 bytes per PTE */
178 rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4
179 /* Now create the bottom bits as 0 in position 0x8000 and
180 * the rest calculated for 8 bytes per PTE
181 */
182 rldicl r15,r16,64-(PAGE_SHIFT-3),64-15
183 /* Insert the bottom bits in */
184 rlwimi r14,r15,0,16,31
185#else
186 rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4
187#endif
188 sldi r15,r10,60
189 clrrdi r14,r14,3
190 or r10,r15,r14
191
192BEGIN_MMU_FTR_SECTION
193 /* Set the TLB reservation and seach for existing entry. Then load
194 * the entry.
195 */
196 PPC_TLBSRX_DOT(0,r16)
197 ld r14,0(r10)
198 beq normal_tlb_miss_done
199MMU_FTR_SECTION_ELSE
200 ld r14,0(r10)
201ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
202
203finish_normal_tlb_miss:
204 /* Check if required permissions are met */
205 andc. r15,r11,r14
206 bne- normal_tlb_miss_access_fault
207
208 /* Now we build the MAS:
209 *
210 * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
211 * MAS 1 : Almost fully setup
212 * - PID already updated by caller if necessary
213 * - TSIZE need change if !base page size, not
214 * yet implemented for now
215 * MAS 2 : Defaults not useful, need to be redone
216 * MAS 3+7 : Needs to be done
217 *
218 * TODO: mix up code below for better scheduling
219 */
220 clrrdi r11,r16,12 /* Clear low crap in EA */
221 rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */
222 mtspr SPRN_MAS2,r11
223
224 /* Check page size, if not standard, update MAS1 */
225 rldicl r11,r14,64-8,64-8
226#ifdef CONFIG_PPC_64K_PAGES
227 cmpldi cr0,r11,BOOK3E_PAGESZ_64K
228#else
229 cmpldi cr0,r11,BOOK3E_PAGESZ_4K
230#endif
231 beq- 1f
232 mfspr r11,SPRN_MAS1
233 rlwimi r11,r14,31,21,24
234 rlwinm r11,r11,0,21,19
235 mtspr SPRN_MAS1,r11
2361:
237 /* Move RPN in position */
238 rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
239 clrldi r15,r11,12 /* Clear crap at the top */
240 rlwimi r15,r14,32-8,22,25 /* Move in U bits */
241 rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */
242
243 /* Mask out SW and UW if !DIRTY (XXX optimize this !) */
244 andi. r11,r14,_PAGE_DIRTY
245 bne 1f
246 li r11,MAS3_SW|MAS3_UW
247 andc r15,r15,r11
2481:
249BEGIN_MMU_FTR_SECTION
250 srdi r16,r15,32
251 mtspr SPRN_MAS3,r15
252 mtspr SPRN_MAS7,r16
253MMU_FTR_SECTION_ELSE
254 mtspr SPRN_MAS7_MAS3,r15
255ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
256
257 tlbwe
258
259normal_tlb_miss_done:
260 /* We don't bother with restoring DEAR or ESR since we know we are
261 * level 0 and just going back to userland. They are only needed
262 * if you are going to take an access fault
263 */
264 TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
265 TLB_MISS_EPILOG_SUCCESS
266 rfi
267
268normal_tlb_miss_access_fault:
269 /* We need to check if it was an instruction miss */
270 andi. r10,r11,_PAGE_EXEC
271 bne 1f
272 ld r14,EX_TLB_DEAR(r12)
273 ld r15,EX_TLB_ESR(r12)
274 mtspr SPRN_DEAR,r14
275 mtspr SPRN_ESR,r15
276 TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
277 TLB_MISS_EPILOG_ERROR
278 b exc_data_storage_book3e
2791: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
280 TLB_MISS_EPILOG_ERROR
281 b exc_instruction_storage_book3e
282
283
284/*
285 * This is the guts of the second-level TLB miss handler for direct
286 * misses. We are entered with:
287 *
288 * r16 = virtual page table faulting address
289 * r15 = region (top 4 bits of address)
290 * r14 = crap (free to use)
291 * r13 = PACA
292 * r12 = TLB exception frame in PACA
293 * r11 = crap (free to use)
294 * r10 = crap (free to use)
295 *
296 * Note that this should only ever be called as a second level handler
297 * with the current scheme when using SW load.
298 * That means we can always get the original fault DEAR at
299 * EX_TLB_DEAR-EX_TLB_SIZE(r12)
300 *
301 * It can be re-entered by the linear mapping miss handler. However, to
302 * avoid too much complication, it will restart the whole fault at level
303 * 0 so we don't care too much about clobbers
304 *
305 * XXX That code was written back when we couldn't clobber r14. We can now,
306 * so we could probably optimize things a bit
307 */
308virt_page_table_tlb_miss:
309 /* Are we hitting a kernel page table ? */
310 andi. r10,r15,0x8
311
312 /* The cool thing now is that r10 contains 0 for user and 8 for kernel,
313 * and we happen to have the swapper_pg_dir at offset 8 from the user
314 * pgdir in the PACA :-).
315 */
316 add r11,r10,r13
317
318 /* If kernel, we need to clear MAS1 TID */
319 beq 1f
320 /* XXX replace the RMW cycles with immediate loads + writes */
321 mfspr r10,SPRN_MAS1
322 rlwinm r10,r10,0,16,1 /* Clear TID */
323 mtspr SPRN_MAS1,r10
3241:
325BEGIN_MMU_FTR_SECTION
326 /* Search if we already have a TLB entry for that virtual address, and
327 * if we do, bail out.
328 */
329 PPC_TLBSRX_DOT(0,r16)
330 beq virt_page_table_tlb_miss_done
331END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
332
333 /* Now, we need to walk the page tables. First check if we are in
334 * range.
335 */
336 rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4
337 bne- virt_page_table_tlb_miss_fault
338
339 /* Get the PGD pointer */
340 ld r15,PACAPGD(r11)
341 cmpldi cr0,r15,0
342 beq- virt_page_table_tlb_miss_fault
343
344 /* Get to PGD entry */
345 rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3
346 clrrdi r10,r11,3
347 ldx r15,r10,r15
348 cmpldi cr0,r15,0
349 beq virt_page_table_tlb_miss_fault
350
351#ifndef CONFIG_PPC_64K_PAGES
352 /* Get to PUD entry */
353 rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3
354 clrrdi r10,r11,3
355 ldx r15,r10,r15
356 cmpldi cr0,r15,0
357 beq virt_page_table_tlb_miss_fault
358#endif /* CONFIG_PPC_64K_PAGES */
359
360 /* Get to PMD entry */
361 rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3
362 clrrdi r10,r11,3
363 ldx r15,r10,r15
364 cmpldi cr0,r15,0
365 beq virt_page_table_tlb_miss_fault
366
367 /* Ok, we're all right, we can now create a kernel translation for
368 * a 4K or 64K page from r16 -> r15.
369 */
370 /* Now we build the MAS:
371 *
372 * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
373 * MAS 1 : Almost fully setup
374 * - PID already updated by caller if necessary
375 * - TSIZE for now is base page size always
376 * MAS 2 : Use defaults
377 * MAS 3+7 : Needs to be done
378 *
379 * So we only do MAS 2 and 3 for now...
380 */
381 clrldi r11,r15,4 /* remove region ID from RPN */
382 ori r10,r11,1 /* Or-in SR */
383
384BEGIN_MMU_FTR_SECTION
385 srdi r16,r10,32
386 mtspr SPRN_MAS3,r10
387 mtspr SPRN_MAS7,r16
388MMU_FTR_SECTION_ELSE
389 mtspr SPRN_MAS7_MAS3,r10
390ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
391
392 tlbwe
393
394BEGIN_MMU_FTR_SECTION
395virt_page_table_tlb_miss_done:
396
397 /* We have overriden MAS2:EPN but currently our primary TLB miss
398 * handler will always restore it so that should not be an issue,
399 * if we ever optimize the primary handler to not write MAS2 on
400 * some cases, we'll have to restore MAS2:EPN here based on the
401 * original fault's DEAR. If we do that we have to modify the
402 * ITLB miss handler to also store SRR0 in the exception frame
403 * as DEAR.
404 *
405 * However, one nasty thing we did is we cleared the reservation
406 * (well, potentially we did). We do a trick here thus if we
407 * are not a level 0 exception (we interrupted the TLB miss) we
408 * offset the return address by -4 in order to replay the tlbsrx
409 * instruction there
410 */
411 subf r10,r13,r12
412 cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE
413 bne- 1f
414 ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
415 addi r10,r11,-4
416 std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
4171:
418END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
419 /* Return to caller, normal case */
420 TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK);
421 TLB_MISS_EPILOG_SUCCESS
422 rfi
423
424virt_page_table_tlb_miss_fault:
425 /* If we fault here, things are a little bit tricky. We need to call
426 * either data or instruction store fault, and we need to retreive
427 * the original fault address and ESR (for data).
428 *
429 * The thing is, we know that in normal circumstances, this is
430 * always called as a second level tlb miss for SW load or as a first
431 * level TLB miss for HW load, so we should be able to peek at the
432 * relevant informations in the first exception frame in the PACA.
433 *
434 * However, we do need to double check that, because we may just hit
435 * a stray kernel pointer or a userland attack trying to hit those
436 * areas. If that is the case, we do a data fault. (We can't get here
437 * from an instruction tlb miss anyway).
438 *
439 * Note also that when going to a fault, we must unwind the previous
440 * level as well. Since we are doing that, we don't need to clear or
441 * restore the TLB reservation neither.
442 */
443 subf r10,r13,r12
444 cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE
445 bne- virt_page_table_tlb_miss_whacko_fault
446
447 /* We dig the original DEAR and ESR from slot 0 */
448 ld r15,EX_TLB_DEAR+PACA_EXTLB(r13)
449 ld r16,EX_TLB_ESR+PACA_EXTLB(r13)
450
451 /* We check for the "special" ESR value for instruction faults */
452 cmpdi cr0,r16,-1
453 beq 1f
454 mtspr SPRN_DEAR,r15
455 mtspr SPRN_ESR,r16
456 TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT);
457 TLB_MISS_EPILOG_ERROR
458 b exc_data_storage_book3e
4591: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT);
460 TLB_MISS_EPILOG_ERROR
461 b exc_instruction_storage_book3e
462
463virt_page_table_tlb_miss_whacko_fault:
464 /* The linear fault will restart everything so ESR and DEAR will
465 * not have been clobbered, let's just fault with what we have
466 */
467 TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT);
468 TLB_MISS_EPILOG_ERROR
469 b exc_data_storage_book3e
470
471
472/**************************************************************
473 * *
474 * TLB miss handling for Book3E with hw page table support *
475 * *
476 **************************************************************/
477
478
479/* Data TLB miss */
480 START_EXCEPTION(data_tlb_miss_htw)
481 TLB_MISS_PROLOG
482
483 /* Now we handle the fault proper. We only save DEAR in normal
484 * fault case since that's the only interesting values here.
485 * We could probably also optimize by not saving SRR0/1 in the
486 * linear mapping case but I'll leave that for later
487 */
488 mfspr r14,SPRN_ESR
489 mfspr r16,SPRN_DEAR /* get faulting address */
490 srdi r11,r16,60 /* get region */
491 cmpldi cr0,r11,0xc /* linear mapping ? */
492 TLB_MISS_STATS_SAVE_INFO
493 beq tlb_load_linear /* yes -> go to linear map load */
494
495 /* We do the user/kernel test for the PID here along with the RW test
496 */
497 cmpldi cr0,r11,0 /* Check for user region */
498 ld r15,PACAPGD(r13) /* Load user pgdir */
499 beq htw_tlb_miss
500
501 /* XXX replace the RMW cycles with immediate loads + writes */
5021: mfspr r10,SPRN_MAS1
503 cmpldi cr0,r11,8 /* Check for vmalloc region */
504 rlwinm r10,r10,0,16,1 /* Clear TID */
505 mtspr SPRN_MAS1,r10
506 ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */
507 beq+ htw_tlb_miss
508
509 /* We got a crappy address, just fault with whatever DEAR and ESR
510 * are here
511 */
512 TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
513 TLB_MISS_EPILOG_ERROR
514 b exc_data_storage_book3e
515
516/* Instruction TLB miss */
517 START_EXCEPTION(instruction_tlb_miss_htw)
518 TLB_MISS_PROLOG
519
520 /* If we take a recursive fault, the second level handler may need
521 * to know whether we are handling a data or instruction fault in
522 * order to get to the right store fault handler. We provide that
523 * info by keeping a crazy value for ESR in r14
524 */
525 li r14,-1 /* store to exception frame is done later */
526
527 /* Now we handle the fault proper. We only save DEAR in the non
528 * linear mapping case since we know the linear mapping case will
529 * not re-enter. We could indeed optimize and also not save SRR0/1
530 * in the linear mapping case but I'll leave that for later
531 *
532 * Faulting address is SRR0 which is already in r16
533 */
534 srdi r11,r16,60 /* get region */
535 cmpldi cr0,r11,0xc /* linear mapping ? */
536 TLB_MISS_STATS_SAVE_INFO
537 beq tlb_load_linear /* yes -> go to linear map load */
538
539 /* We do the user/kernel test for the PID here along with the RW test
540 */
541 cmpldi cr0,r11,0 /* Check for user region */
542 ld r15,PACAPGD(r13) /* Load user pgdir */
543 beq htw_tlb_miss
544
545 /* XXX replace the RMW cycles with immediate loads + writes */
5461: mfspr r10,SPRN_MAS1
547 cmpldi cr0,r11,8 /* Check for vmalloc region */
548 rlwinm r10,r10,0,16,1 /* Clear TID */
549 mtspr SPRN_MAS1,r10
550 ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */
551 beq+ htw_tlb_miss
552
553 /* We got a crappy address, just fault */
554 TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
555 TLB_MISS_EPILOG_ERROR
556 b exc_instruction_storage_book3e
557
558
559/*
560 * This is the guts of the second-level TLB miss handler for direct
561 * misses. We are entered with:
562 *
563 * r16 = virtual page table faulting address
564 * r15 = PGD pointer
565 * r14 = ESR
566 * r13 = PACA
567 * r12 = TLB exception frame in PACA
568 * r11 = crap (free to use)
569 * r10 = crap (free to use)
570 *
571 * It can be re-entered by the linear mapping miss handler. However, to
572 * avoid too much complication, it will save/restore things for us
573 */
574htw_tlb_miss:
575 /* Search if we already have a TLB entry for that virtual address, and
576 * if we do, bail out.
577 *
578 * MAS1:IND should be already set based on MAS4
579 */
580 PPC_TLBSRX_DOT(0,r16)
581 beq htw_tlb_miss_done
582
583 /* Now, we need to walk the page tables. First check if we are in
584 * range.
585 */
586 rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
587 bne- htw_tlb_miss_fault
588
589 /* Get the PGD pointer */
590 cmpldi cr0,r15,0
591 beq- htw_tlb_miss_fault
592
593 /* Get to PGD entry */
594 rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3
595 clrrdi r10,r11,3
596 ldx r15,r10,r15
597 cmpldi cr0,r15,0
598 beq htw_tlb_miss_fault
599
600#ifndef CONFIG_PPC_64K_PAGES
601 /* Get to PUD entry */
602 rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3
603 clrrdi r10,r11,3
604 ldx r15,r10,r15
605 cmpldi cr0,r15,0
606 beq htw_tlb_miss_fault
607#endif /* CONFIG_PPC_64K_PAGES */
608
609 /* Get to PMD entry */
610 rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3
611 clrrdi r10,r11,3
612 ldx r15,r10,r15
613 cmpldi cr0,r15,0
614 beq htw_tlb_miss_fault
615
616 /* Ok, we're all right, we can now create an indirect entry for
617 * a 1M or 256M page.
618 *
619 * The last trick is now that because we use "half" pages for
620 * the HTW (1M IND is 2K and 256M IND is 32K) we need to account
621 * for an added LSB bit to the RPN. For 64K pages, there is no
622 * problem as we already use 32K arrays (half PTE pages), but for
623 * 4K page we need to extract a bit from the virtual address and
624 * insert it into the "PA52" bit of the RPN.
625 */
626#ifndef CONFIG_PPC_64K_PAGES
627 rlwimi r15,r16,32-9,20,20
628#endif
629 /* Now we build the MAS:
630 *
631 * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
632 * MAS 1 : Almost fully setup
633 * - PID already updated by caller if necessary
634 * - TSIZE for now is base ind page size always
635 * MAS 2 : Use defaults
636 * MAS 3+7 : Needs to be done
637 */
638#ifdef CONFIG_PPC_64K_PAGES
639 ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT)
640#else
641 ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
642#endif
643
644BEGIN_MMU_FTR_SECTION
645 srdi r16,r10,32
646 mtspr SPRN_MAS3,r10
647 mtspr SPRN_MAS7,r16
648MMU_FTR_SECTION_ELSE
649 mtspr SPRN_MAS7_MAS3,r10
650ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
651
652 tlbwe
653
654htw_tlb_miss_done:
655 /* We don't bother with restoring DEAR or ESR since we know we are
656 * level 0 and just going back to userland. They are only needed
657 * if you are going to take an access fault
658 */
659 TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK)
660 TLB_MISS_EPILOG_SUCCESS
661 rfi
662
663htw_tlb_miss_fault:
664 /* We need to check if it was an instruction miss. We know this
665 * though because r14 would contain -1
666 */
667 cmpdi cr0,r14,-1
668 beq 1f
669 mtspr SPRN_DEAR,r16
670 mtspr SPRN_ESR,r14
671 TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT)
672 TLB_MISS_EPILOG_ERROR
673 b exc_data_storage_book3e
6741: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT)
675 TLB_MISS_EPILOG_ERROR
676 b exc_instruction_storage_book3e
677
678/*
679 * This is the guts of "any" level TLB miss handler for kernel linear
680 * mapping misses. We are entered with:
681 *
682 *
683 * r16 = faulting address
684 * r15 = crap (free to use)
685 * r14 = ESR (data) or -1 (instruction)
686 * r13 = PACA
687 * r12 = TLB exception frame in PACA
688 * r11 = crap (free to use)
689 * r10 = crap (free to use)
690 *
691 * In addition we know that we will not re-enter, so in theory, we could
692 * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later.
693 *
694 * We also need to be careful about MAS registers here & TLB reservation,
695 * as we know we'll have clobbered them if we interrupt the main TLB miss
696 * handlers in which case we probably want to do a full restart at level
697 * 0 rather than saving / restoring the MAS.
698 *
699 * Note: If we care about performance of that core, we can easily shuffle
700 * a few things around
701 */
702tlb_load_linear:
703 /* For now, we assume the linear mapping is contiguous and stops at
704 * linear_map_top. We also assume the size is a multiple of 1G, thus
705 * we only use 1G pages for now. That might have to be changed in a
706 * final implementation, especially when dealing with hypervisors
707 */
708 ld r11,PACATOC(r13)
709 ld r11,linear_map_top@got(r11)
710 ld r10,0(r11)
711 cmpld cr0,r10,r16
712 bge tlb_load_linear_fault
713
714 /* MAS1 need whole new setup. */
715 li r15,(BOOK3E_PAGESZ_1GB<<MAS1_TSIZE_SHIFT)
716 oris r15,r15,MAS1_VALID@h /* MAS1 needs V and TSIZE */
717 mtspr SPRN_MAS1,r15
718
719 /* Already somebody there ? */
720 PPC_TLBSRX_DOT(0,r16)
721 beq tlb_load_linear_done
722
723 /* Now we build the remaining MAS. MAS0 and 2 should be fine
724 * with their defaults, which leaves us with MAS 3 and 7. The
725 * mapping is linear, so we just take the address, clear the
726 * region bits, and or in the permission bits which are currently
727 * hard wired
728 */
729 clrrdi r10,r16,30 /* 1G page index */
730 clrldi r10,r10,4 /* clear region bits */
731 ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX
732
733BEGIN_MMU_FTR_SECTION
734 srdi r16,r10,32
735 mtspr SPRN_MAS3,r10
736 mtspr SPRN_MAS7,r16
737MMU_FTR_SECTION_ELSE
738 mtspr SPRN_MAS7_MAS3,r10
739ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
740
741 tlbwe
742
743tlb_load_linear_done:
744 /* We use the "error" epilog for success as we do want to
745 * restore to the initial faulting context, whatever it was.
746 * We do that because we can't resume a fault within a TLB
747 * miss handler, due to MAS and TLB reservation being clobbered.
748 */
749 TLB_MISS_STATS_X(MMSTAT_TLB_MISS_LINEAR)
750 TLB_MISS_EPILOG_ERROR
751 rfi
752
753tlb_load_linear_fault:
754 /* We keep the DEAR and ESR around, this shouldn't have happened */
755 cmpdi cr0,r14,-1
756 beq 1f
757 TLB_MISS_EPILOG_ERROR_SPECIAL
758 b exc_data_storage_book3e
7591: TLB_MISS_EPILOG_ERROR_SPECIAL
760 b exc_instruction_storage_book3e
761
762
763#ifdef CONFIG_BOOK3E_MMU_TLB_STATS
764.tlb_stat_inc:
7651: ldarx r8,0,r9
766 addi r8,r8,1
767 stdcx. r8,0,r9
768 bne- 1b
769 blr
770#endif
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index ad2eb4d34dd4..2fbc680c2c71 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -7,8 +7,8 @@
7 * 7 *
8 * -- BenH 8 * -- BenH
9 * 9 *
10 * Copyright 2008 Ben Herrenschmidt <benh@kernel.crashing.org> 10 * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org>
11 * IBM Corp. 11 * IBM Corp.
12 * 12 *
13 * Derived from arch/ppc/mm/init.c: 13 * Derived from arch/ppc/mm/init.c:
14 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) 14 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -34,12 +34,71 @@
34#include <linux/pagemap.h> 34#include <linux/pagemap.h>
35#include <linux/preempt.h> 35#include <linux/preempt.h>
36#include <linux/spinlock.h> 36#include <linux/spinlock.h>
37#include <linux/lmb.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
39#include <asm/tlb.h> 40#include <asm/tlb.h>
41#include <asm/code-patching.h>
40 42
41#include "mmu_decl.h" 43#include "mmu_decl.h"
42 44
45#ifdef CONFIG_PPC_BOOK3E
46struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
47 [MMU_PAGE_4K] = {
48 .shift = 12,
49 .enc = BOOK3E_PAGESZ_4K,
50 },
51 [MMU_PAGE_16K] = {
52 .shift = 14,
53 .enc = BOOK3E_PAGESZ_16K,
54 },
55 [MMU_PAGE_64K] = {
56 .shift = 16,
57 .enc = BOOK3E_PAGESZ_64K,
58 },
59 [MMU_PAGE_1M] = {
60 .shift = 20,
61 .enc = BOOK3E_PAGESZ_1M,
62 },
63 [MMU_PAGE_16M] = {
64 .shift = 24,
65 .enc = BOOK3E_PAGESZ_16M,
66 },
67 [MMU_PAGE_256M] = {
68 .shift = 28,
69 .enc = BOOK3E_PAGESZ_256M,
70 },
71 [MMU_PAGE_1G] = {
72 .shift = 30,
73 .enc = BOOK3E_PAGESZ_1GB,
74 },
75};
76static inline int mmu_get_tsize(int psize)
77{
78 return mmu_psize_defs[psize].enc;
79}
80#else
81static inline int mmu_get_tsize(int psize)
82{
83 /* This isn't used on !Book3E for now */
84 return 0;
85}
86#endif
87
88/* The variables below are currently only used on 64-bit Book3E
89 * though this will probably be made common with other nohash
90 * implementations at some point
91 */
92#ifdef CONFIG_PPC64
93
94int mmu_linear_psize; /* Page size used for the linear mapping */
95int mmu_pte_psize; /* Page size used for PTE pages */
96int mmu_vmemmap_psize; /* Page size used for the virtual mem map */
97int book3e_htw_enabled; /* Is HW tablewalk enabled ? */
98unsigned long linear_map_top; /* Top of linear mapping */
99
100#endif /* CONFIG_PPC64 */
101
43/* 102/*
44 * Base TLB flushing operations: 103 * Base TLB flushing operations:
45 * 104 *
@@ -67,18 +126,24 @@ void local_flush_tlb_mm(struct mm_struct *mm)
67} 126}
68EXPORT_SYMBOL(local_flush_tlb_mm); 127EXPORT_SYMBOL(local_flush_tlb_mm);
69 128
70void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) 129void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
130 int tsize, int ind)
71{ 131{
72 unsigned int pid; 132 unsigned int pid;
73 133
74 preempt_disable(); 134 preempt_disable();
75 pid = vma ? vma->vm_mm->context.id : 0; 135 pid = mm ? mm->context.id : 0;
76 if (pid != MMU_NO_CONTEXT) 136 if (pid != MMU_NO_CONTEXT)
77 _tlbil_va(vmaddr, pid); 137 _tlbil_va(vmaddr, pid, tsize, ind);
78 preempt_enable(); 138 preempt_enable();
79} 139}
80EXPORT_SYMBOL(local_flush_tlb_page);
81 140
141void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
142{
143 __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
144 mmu_get_tsize(mmu_virtual_psize), 0);
145}
146EXPORT_SYMBOL(local_flush_tlb_page);
82 147
83/* 148/*
84 * And here are the SMP non-local implementations 149 * And here are the SMP non-local implementations
@@ -87,9 +152,17 @@ EXPORT_SYMBOL(local_flush_tlb_page);
87 152
88static DEFINE_SPINLOCK(tlbivax_lock); 153static DEFINE_SPINLOCK(tlbivax_lock);
89 154
155static int mm_is_core_local(struct mm_struct *mm)
156{
157 return cpumask_subset(mm_cpumask(mm),
158 topology_thread_cpumask(smp_processor_id()));
159}
160
90struct tlb_flush_param { 161struct tlb_flush_param {
91 unsigned long addr; 162 unsigned long addr;
92 unsigned int pid; 163 unsigned int pid;
164 unsigned int tsize;
165 unsigned int ind;
93}; 166};
94 167
95static void do_flush_tlb_mm_ipi(void *param) 168static void do_flush_tlb_mm_ipi(void *param)
@@ -103,7 +176,7 @@ static void do_flush_tlb_page_ipi(void *param)
103{ 176{
104 struct tlb_flush_param *p = param; 177 struct tlb_flush_param *p = param;
105 178
106 _tlbil_va(p->addr, p->pid); 179 _tlbil_va(p->addr, p->pid, p->tsize, p->ind);
107} 180}
108 181
109 182
@@ -131,7 +204,7 @@ void flush_tlb_mm(struct mm_struct *mm)
131 pid = mm->context.id; 204 pid = mm->context.id;
132 if (unlikely(pid == MMU_NO_CONTEXT)) 205 if (unlikely(pid == MMU_NO_CONTEXT))
133 goto no_context; 206 goto no_context;
134 if (!cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { 207 if (!mm_is_core_local(mm)) {
135 struct tlb_flush_param p = { .pid = pid }; 208 struct tlb_flush_param p = { .pid = pid };
136 /* Ignores smp_processor_id() even if set. */ 209 /* Ignores smp_processor_id() even if set. */
137 smp_call_function_many(mm_cpumask(mm), 210 smp_call_function_many(mm_cpumask(mm),
@@ -143,37 +216,49 @@ void flush_tlb_mm(struct mm_struct *mm)
143} 216}
144EXPORT_SYMBOL(flush_tlb_mm); 217EXPORT_SYMBOL(flush_tlb_mm);
145 218
146void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) 219void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
220 int tsize, int ind)
147{ 221{
148 struct cpumask *cpu_mask; 222 struct cpumask *cpu_mask;
149 unsigned int pid; 223 unsigned int pid;
150 224
151 preempt_disable(); 225 preempt_disable();
152 pid = vma ? vma->vm_mm->context.id : 0; 226 pid = mm ? mm->context.id : 0;
153 if (unlikely(pid == MMU_NO_CONTEXT)) 227 if (unlikely(pid == MMU_NO_CONTEXT))
154 goto bail; 228 goto bail;
155 cpu_mask = mm_cpumask(vma->vm_mm); 229 cpu_mask = mm_cpumask(mm);
156 if (!cpumask_equal(cpu_mask, cpumask_of(smp_processor_id()))) { 230 if (!mm_is_core_local(mm)) {
157 /* If broadcast tlbivax is supported, use it */ 231 /* If broadcast tlbivax is supported, use it */
158 if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { 232 if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) {
159 int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); 233 int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL);
160 if (lock) 234 if (lock)
161 spin_lock(&tlbivax_lock); 235 spin_lock(&tlbivax_lock);
162 _tlbivax_bcast(vmaddr, pid); 236 _tlbivax_bcast(vmaddr, pid, tsize, ind);
163 if (lock) 237 if (lock)
164 spin_unlock(&tlbivax_lock); 238 spin_unlock(&tlbivax_lock);
165 goto bail; 239 goto bail;
166 } else { 240 } else {
167 struct tlb_flush_param p = { .pid = pid, .addr = vmaddr }; 241 struct tlb_flush_param p = {
242 .pid = pid,
243 .addr = vmaddr,
244 .tsize = tsize,
245 .ind = ind,
246 };
168 /* Ignores smp_processor_id() even if set in cpu_mask */ 247 /* Ignores smp_processor_id() even if set in cpu_mask */
169 smp_call_function_many(cpu_mask, 248 smp_call_function_many(cpu_mask,
170 do_flush_tlb_page_ipi, &p, 1); 249 do_flush_tlb_page_ipi, &p, 1);
171 } 250 }
172 } 251 }
173 _tlbil_va(vmaddr, pid); 252 _tlbil_va(vmaddr, pid, tsize, ind);
174 bail: 253 bail:
175 preempt_enable(); 254 preempt_enable();
176} 255}
256
257void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
258{
259 __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
260 mmu_get_tsize(mmu_virtual_psize), 0);
261}
177EXPORT_SYMBOL(flush_tlb_page); 262EXPORT_SYMBOL(flush_tlb_page);
178 263
179#endif /* CONFIG_SMP */ 264#endif /* CONFIG_SMP */
@@ -207,3 +292,156 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
207 flush_tlb_mm(vma->vm_mm); 292 flush_tlb_mm(vma->vm_mm);
208} 293}
209EXPORT_SYMBOL(flush_tlb_range); 294EXPORT_SYMBOL(flush_tlb_range);
295
296void tlb_flush(struct mmu_gather *tlb)
297{
298 flush_tlb_mm(tlb->mm);
299
300 /* Push out batch of freed page tables */
301 pte_free_finish();
302}
303
304/*
305 * Below are functions specific to the 64-bit variant of Book3E though that
306 * may change in the future
307 */
308
309#ifdef CONFIG_PPC64
310
311/*
312 * Handling of virtual linear page tables or indirect TLB entries
313 * flushing when PTE pages are freed
314 */
315void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
316{
317 int tsize = mmu_psize_defs[mmu_pte_psize].enc;
318
319 if (book3e_htw_enabled) {
320 unsigned long start = address & PMD_MASK;
321 unsigned long end = address + PMD_SIZE;
322 unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
323
324 /* This isn't the most optimal, ideally we would factor out the
325 * while preempt & CPU mask mucking around, or even the IPI but
326 * it will do for now
327 */
328 while (start < end) {
329 __flush_tlb_page(tlb->mm, start, tsize, 1);
330 start += size;
331 }
332 } else {
333 unsigned long rmask = 0xf000000000000000ul;
334 unsigned long rid = (address & rmask) | 0x1000000000000000ul;
335 unsigned long vpte = address & ~rmask;
336
337#ifdef CONFIG_PPC_64K_PAGES
338 vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful;
339#else
340 vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
341#endif
342 vpte |= rid;
343 __flush_tlb_page(tlb->mm, vpte, tsize, 0);
344 }
345}
346
347/*
348 * Early initialization of the MMU TLB code
349 */
350static void __early_init_mmu(int boot_cpu)
351{
352 extern unsigned int interrupt_base_book3e;
353 extern unsigned int exc_data_tlb_miss_htw_book3e;
354 extern unsigned int exc_instruction_tlb_miss_htw_book3e;
355
356 unsigned int *ibase = &interrupt_base_book3e;
357 unsigned int mas4;
358
359 /* XXX This will have to be decided at runtime, but right
360 * now our boot and TLB miss code hard wires it. Ideally
361 * we should find out a suitable page size and patch the
362 * TLB miss code (either that or use the PACA to store
363 * the value we want)
364 */
365 mmu_linear_psize = MMU_PAGE_1G;
366
367 /* XXX This should be decided at runtime based on supported
368 * page sizes in the TLB, but for now let's assume 16M is
369 * always there and a good fit (which it probably is)
370 */
371 mmu_vmemmap_psize = MMU_PAGE_16M;
372
373 /* Check if HW tablewalk is present, and if yes, enable it by:
374 *
375 * - patching the TLB miss handlers to branch to the
376 * one dedicates to it
377 *
378 * - setting the global book3e_htw_enabled
379 *
380 * - Set MAS4:INDD and default page size
381 */
382
383 /* XXX This code only checks for TLB 0 capabilities and doesn't
384 * check what page size combos are supported by the HW. It
385 * also doesn't handle the case where a separate array holds
386 * the IND entries from the array loaded by the PT.
387 */
388 if (boot_cpu) {
389 unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
390
391 /* Check if HW loader is supported */
392 if ((tlb0cfg & TLBnCFG_IND) &&
393 (tlb0cfg & TLBnCFG_PT)) {
394 patch_branch(ibase + (0x1c0 / 4),
395 (unsigned long)&exc_data_tlb_miss_htw_book3e, 0);
396 patch_branch(ibase + (0x1e0 / 4),
397 (unsigned long)&exc_instruction_tlb_miss_htw_book3e, 0);
398 book3e_htw_enabled = 1;
399 }
400 pr_info("MMU: Book3E Page Tables %s\n",
401 book3e_htw_enabled ? "Enabled" : "Disabled");
402 }
403
404 /* Set MAS4 based on page table setting */
405
406 mas4 = 0x4 << MAS4_WIMGED_SHIFT;
407 if (book3e_htw_enabled) {
408 mas4 |= mas4 | MAS4_INDD;
409#ifdef CONFIG_PPC_64K_PAGES
410 mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT;
411 mmu_pte_psize = MMU_PAGE_256M;
412#else
413 mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;
414 mmu_pte_psize = MMU_PAGE_1M;
415#endif
416 } else {
417#ifdef CONFIG_PPC_64K_PAGES
418 mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT;
419#else
420 mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
421#endif
422 mmu_pte_psize = mmu_virtual_psize;
423 }
424 mtspr(SPRN_MAS4, mas4);
425
426 /* Set the global containing the top of the linear mapping
427 * for use by the TLB miss code
428 */
429 linear_map_top = lmb_end_of_DRAM();
430
431 /* A sync won't hurt us after mucking around with
432 * the MMU configuration
433 */
434 mb();
435}
436
437void __init early_init_mmu(void)
438{
439 __early_init_mmu(1);
440}
441
442void __cpuinit early_init_mmu_secondary(void)
443{
444 __early_init_mmu(0);
445}
446
447#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
index 3037911279b1..bbdc5b577b85 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -39,7 +39,7 @@
39/* 39/*
40 * 40x implementation needs only tlbil_va 40 * 40x implementation needs only tlbil_va
41 */ 41 */
42_GLOBAL(_tlbil_va) 42_GLOBAL(__tlbil_va)
43 /* We run the search with interrupts disabled because we have to change 43 /* We run the search with interrupts disabled because we have to change
44 * the PID and I don't want to preempt when that happens. 44 * the PID and I don't want to preempt when that happens.
45 */ 45 */
@@ -71,7 +71,7 @@ _GLOBAL(_tlbil_va)
71 * 440 implementation uses tlbsx/we for tlbil_va and a full sweep 71 * 440 implementation uses tlbsx/we for tlbil_va and a full sweep
72 * of the TLB for everything else. 72 * of the TLB for everything else.
73 */ 73 */
74_GLOBAL(_tlbil_va) 74_GLOBAL(__tlbil_va)
75 mfspr r5,SPRN_MMUCR 75 mfspr r5,SPRN_MMUCR
76 rlwimi r5,r4,0,24,31 /* Set TID */ 76 rlwimi r5,r4,0,24,31 /* Set TID */
77 77
@@ -124,8 +124,6 @@ _GLOBAL(_tlbil_pid)
124 * to have the larger code path before the _SECTION_ELSE 124 * to have the larger code path before the _SECTION_ELSE
125 */ 125 */
126 126
127#define MMUCSR0_TLBFI (MMUCSR0_TLB0FI | MMUCSR0_TLB1FI | \
128 MMUCSR0_TLB2FI | MMUCSR0_TLB3FI)
129/* 127/*
130 * Flush MMU TLB on the local processor 128 * Flush MMU TLB on the local processor
131 */ 129 */
@@ -170,7 +168,7 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX)
170 * Flush MMU TLB for a particular address, but only on the local processor 168 * Flush MMU TLB for a particular address, but only on the local processor
171 * (no broadcast) 169 * (no broadcast)
172 */ 170 */
173_GLOBAL(_tlbil_va) 171_GLOBAL(__tlbil_va)
174 mfmsr r10 172 mfmsr r10
175 wrteei 0 173 wrteei 0
176 slwi r4,r4,16 174 slwi r4,r4,16
@@ -191,6 +189,85 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
191 isync 189 isync
1921: wrtee r10 1901: wrtee r10
193 blr 191 blr
192#elif defined(CONFIG_PPC_BOOK3E)
193/*
194 * New Book3E (>= 2.06) implementation
195 *
196 * Note: We may be able to get away without the interrupt masking stuff
197 * if we save/restore MAS6 on exceptions that might modify it
198 */
199_GLOBAL(_tlbil_pid)
200 slwi r4,r3,MAS6_SPID_SHIFT
201 mfmsr r10
202 wrteei 0
203 mtspr SPRN_MAS6,r4
204 PPC_TLBILX_PID(0,0)
205 wrtee r10
206 msync
207 isync
208 blr
209
210_GLOBAL(_tlbil_pid_noind)
211 slwi r4,r3,MAS6_SPID_SHIFT
212 mfmsr r10
213 ori r4,r4,MAS6_SIND
214 wrteei 0
215 mtspr SPRN_MAS6,r4
216 PPC_TLBILX_PID(0,0)
217 wrtee r10
218 msync
219 isync
220 blr
221
222_GLOBAL(_tlbil_all)
223 PPC_TLBILX_ALL(0,0)
224 msync
225 isync
226 blr
227
228_GLOBAL(_tlbil_va)
229 mfmsr r10
230 wrteei 0
231 cmpwi cr0,r6,0
232 slwi r4,r4,MAS6_SPID_SHIFT
233 rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK
234 beq 1f
235 rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
2361: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */
237 PPC_TLBILX_VA(0,r3)
238 msync
239 isync
240 wrtee r10
241 blr
242
243_GLOBAL(_tlbivax_bcast)
244 mfmsr r10
245 wrteei 0
246 cmpwi cr0,r6,0
247 slwi r4,r4,MAS6_SPID_SHIFT
248 rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK
249 beq 1f
250 rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
2511: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */
252 PPC_TLBIVAX(0,r3)
253 eieio
254 tlbsync
255 sync
256 wrtee r10
257 blr
258
259_GLOBAL(set_context)
260#ifdef CONFIG_BDI_SWITCH
261 /* Context switch the PTE pointer for the Abatron BDI2000.
262 * The PGDIR is the second parameter.
263 */
264 lis r5, abatron_pteptrs@h
265 ori r5, r5, abatron_pteptrs@l
266 stw r4, 0x4(r5)
267#endif
268 mtspr SPRN_PID,r3
269 isync /* Force context change */
270 blr
194#else 271#else
195#error Unsupported processor type ! 272#error Unsupported processor type !
196#endif 273#endif