author     Benjamin Herrenschmidt <benh@kernel.crashing.org>   2007-04-10 03:09:37 -0400
committer  Paul Mackerras <paulus@samba.org>                   2007-04-12 14:09:38 -0400
commit     a741e67969577163a4cfc78d7fd2753219087ef1 (patch)
tree       bac4162aaf15367e896429afa60465e201c9204c
parent     e4ee3891db35aa9a069bb403c2a66a8fbfa274d6 (diff)
[POWERPC] Make tlb flush batch use lazy MMU mode
The current TLB flush code on 64-bit powerpc has a subtle race: since we no longer hold the page table lock across the flush, new PTEs can be faulted in after a previous one has been removed but before the corresponding hash entry has been evicted, which can lead to all sorts of fatal problems.

This patch reworks the batch code completely. It no longer uses the mmu_gather mechanism. Instead, we use the lazy MMU hooks that were added by the paravirt code. They have the nice property that the enter/leave lazy MMU mode pair is always fully contained by the PTE lock for a given range of PTEs, so we can guarantee that all batches are flushed on a given CPU before it drops that lock.

We also generalize batching to any PTE update that requires a flush.

Batching is now enabled on a CPU by arch_enter_lazy_mmu_mode() and disabled by arch_leave_lazy_mmu_mode(). The code expects that this is always contained within a PTE lock section, so no preemption can happen and no PTE can be inserted in that range from another CPU. When batching is enabled on a CPU, every PTE update that needs a hash flush goes through the batch.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
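To make the locking constraint above concrete, here is a minimal sketch (illustration only, not part of the patch; example_clear_range() is a hypothetical caller modelled on generic mm paths such as zap_pte_range()) of how the enter/leave pair is expected to bracket a run of PTE updates while the PTE lock is held:

	/*
	 * Illustration only: how a generic-mm-style caller is expected to
	 * use the lazy MMU hooks after this patch.  example_clear_range()
	 * is a made-up name; the real callers live in mm/memory.c.
	 */
	static void example_clear_range(struct mm_struct *mm, pmd_t *pmd,
					unsigned long addr, unsigned long end)
	{
		spinlock_t *ptl;
		pte_t *pte;

		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);	/* take the PTE lock */
		arch_enter_lazy_mmu_mode();	/* batch->active = 1 on this CPU */
		do {
			/* each update of a hashed PTE is queued in the per-CPU batch */
			ptep_get_and_clear(mm, addr, pte);
		} while (pte++, addr += PAGE_SIZE, addr != end);
		arch_leave_lazy_mmu_mode();	/* flush anything still pending */
		pte_unmap_unlock(pte - 1, ptl);	/* only now is the PTE lock dropped */
	}

Because the flush happens before pte_unmap_unlock(), no other CPU can fault a new PTE into the range while stale hash entries are still live.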
-rw-r--r--  arch/powerpc/kernel/process.c   |  4
-rw-r--r--  arch/powerpc/kernel/smp.c       |  4
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c   | 16
-rw-r--r--  arch/powerpc/mm/tlb_64.c        | 68
-rw-r--r--  include/asm-powerpc/pgtable.h   | 50
-rw-r--r--  include/asm-powerpc/tlb.h       |  1
-rw-r--r--  include/asm-powerpc/tlbflush.h  | 39
7 files changed, 90 insertions(+), 92 deletions(-)
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 949092dccf44..e509aae2feb3 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -305,9 +305,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
 		set_dabr(new->thread.dabr);
 		__get_cpu_var(current_dabr) = new->thread.dabr;
 	}
-
-	flush_tlb_pending();
-#endif
+#endif /* CONFIG_PPC64 */
 
 	new_thread = &new->thread;
 	old_thread = &current->thread;
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 924d692bc8f9..d8e503b2e1af 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -428,10 +428,6 @@ void generic_mach_cpu_die(void)
 	smp_wmb();
 	while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
 		cpu_relax();
-
-#ifdef CONFIG_PPC64
-	flush_tlb_pending();
-#endif
 	cpu_set(cpu, cpu_online_map);
 	local_irq_enable();
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index f6ffaaa7a5bf..8508f973d9cc 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -316,12 +316,11 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 {
 	if (pte_present(*ptep)) {
 		/* We open-code pte_clear because we need to pass the right
-		 * argument to hpte_update (huge / !huge)
+		 * argument to hpte_need_flush (huge / !huge). Might not be
+		 * necessary anymore if we make hpte_need_flush() get the
+		 * page size from the slices
 		 */
-		unsigned long old = pte_update(ptep, ~0UL);
-		if (old & _PAGE_HASHPTE)
-			hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
-		flush_tlb_pending();
+		pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
 	}
 	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 }
@@ -329,12 +328,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep)
 {
-	unsigned long old = pte_update(ptep, ~0UL);
-
-	if (old & _PAGE_HASHPTE)
-		hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
-	*ptep = __pte(0);
-
+	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
 	return __pte(old);
 }
 
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
index b58baa65c4a7..fd8d08c325eb 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_64.c
@@ -120,17 +120,20 @@ void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
 }
 
 /*
- * Update the MMU hash table to correspond with a change to
- * a Linux PTE.  If wrprot is true, it is permissible to
- * change the existing HPTE to read-only rather than removing it
- * (if we remove it we should clear the _PTE_HPTEFLAGS bits).
+ * A linux PTE was changed and the corresponding hash table entry
+ * needs to be flushed. This function will either perform the flush
+ * immediately or will batch it up if the current CPU has an active
+ * batch on it.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
  */
-void hpte_update(struct mm_struct *mm, unsigned long addr,
+void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 		 pte_t *ptep, unsigned long pte, int huge)
 {
 	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
-	unsigned long vsid;
+	unsigned long vsid, vaddr;
 	unsigned int psize;
+	real_pte_t rpte;
 	int i;
 
 	i = batch->index;
@@ -151,6 +154,26 @@ void hpte_update(struct mm_struct *mm, unsigned long addr,
 	} else
 		psize = pte_pagesize_index(pte);
 
+	/* Build full vaddr */
+	if (!is_kernel_addr(addr)) {
+		vsid = get_vsid(mm->context.id, addr);
+		WARN_ON(vsid == 0);
+	} else
+		vsid = get_kernel_vsid(addr);
+	vaddr = (vsid << 28 ) | (addr & 0x0fffffff);
+	rpte = __real_pte(__pte(pte), ptep);
+
+	/*
+	 * Check if we have an active batch on this CPU. If not, just
+	 * flush now and return. For now, we do global invalidates
+	 * in that case, might be worth testing the mm cpu mask though
+	 * and deciding to use local invalidates instead...
+	 */
+	if (!batch->active) {
+		flush_hash_page(vaddr, rpte, psize, 0);
+		return;
+	}
+
 	/*
 	 * This can happen when we are in the middle of a TLB batch and
 	 * we encounter memory pressure (eg copy_page_range when it tries
@@ -162,47 +185,42 @@ void hpte_update(struct mm_struct *mm, unsigned long addr,
 	 * batch
 	 */
 	if (i != 0 && (mm != batch->mm || batch->psize != psize)) {
-		flush_tlb_pending();
+		__flush_tlb_pending(batch);
 		i = 0;
 	}
 	if (i == 0) {
 		batch->mm = mm;
 		batch->psize = psize;
 	}
-	if (!is_kernel_addr(addr)) {
-		vsid = get_vsid(mm->context.id, addr);
-		WARN_ON(vsid == 0);
-	} else
-		vsid = get_kernel_vsid(addr);
-	batch->vaddr[i] = (vsid << 28 ) | (addr & 0x0fffffff);
-	batch->pte[i] = __real_pte(__pte(pte), ptep);
+	batch->pte[i] = rpte;
+	batch->vaddr[i] = vaddr;
 	batch->index = ++i;
 	if (i >= PPC64_TLB_BATCH_NR)
-		flush_tlb_pending();
+		__flush_tlb_pending(batch);
 }
 
+/*
+ * This function is called when terminating an mmu batch or when a batch
+ * is full. It will perform the flush of all the entries currently stored
+ * in a batch.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
+ */
 void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
 {
-	int i;
-	int cpu;
 	cpumask_t tmp;
-	int local = 0;
+	int i, local = 0;
 
-	BUG_ON(in_interrupt());
-
-	cpu = get_cpu();
 	i = batch->index;
-	tmp = cpumask_of_cpu(cpu);
+	tmp = cpumask_of_cpu(smp_processor_id());
 	if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
 		local = 1;
-
 	if (i == 1)
 		flush_hash_page(batch->vaddr[0], batch->pte[0],
 				batch->psize, local);
 	else
 		flush_hash_range(i, local);
 	batch->index = 0;
-	put_cpu();
 }
 
 void pte_free_finish(void)
diff --git a/include/asm-powerpc/pgtable.h b/include/asm-powerpc/pgtable.h
index 10f52743f4ff..c7142c7e0e05 100644
--- a/include/asm-powerpc/pgtable.h
+++ b/include/asm-powerpc/pgtable.h
@@ -272,7 +272,10 @@ static inline pte_t pte_mkhuge(pte_t pte) {
 	return pte; }
 
 /* Atomic PTE updates */
-static inline unsigned long pte_update(pte_t *p, unsigned long clr)
+static inline unsigned long pte_update(struct mm_struct *mm,
+				       unsigned long addr,
+				       pte_t *ptep, unsigned long clr,
+				       int huge)
 {
 	unsigned long old, tmp;
 
@@ -283,20 +286,15 @@ static inline unsigned long pte_update(pte_t *p, unsigned long clr)
 	andc	%1,%0,%4 \n\
 	stdcx.	%1,0,%3 \n\
 	bne-	1b"
-	: "=&r" (old), "=&r" (tmp), "=m" (*p)
-	: "r" (p), "r" (clr), "m" (*p), "i" (_PAGE_BUSY)
+	: "=&r" (old), "=&r" (tmp), "=m" (*ptep)
+	: "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY)
 	: "cc" );
+
+	if (old & _PAGE_HASHPTE)
+		hpte_need_flush(mm, addr, ptep, old, huge);
 	return old;
 }
 
-/* PTE updating functions, this function puts the PTE in the
- * batch, doesn't actually triggers the hash flush immediately,
- * you need to call flush_tlb_pending() to do that.
- * Pass -1 for "normal" size (4K or 64K)
- */
-extern void hpte_update(struct mm_struct *mm, unsigned long addr,
-			pte_t *ptep, unsigned long pte, int huge);
-
 static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
 					      unsigned long addr, pte_t *ptep)
 {
@@ -304,11 +302,7 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
 
 	if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
 		return 0;
-	old = pte_update(ptep, _PAGE_ACCESSED);
-	if (old & _PAGE_HASHPTE) {
-		hpte_update(mm, addr, ptep, old, 0);
-		flush_tlb_pending();
-	}
+	old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0);
 	return (old & _PAGE_ACCESSED) != 0;
 }
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
@@ -331,9 +325,7 @@ static inline int __ptep_test_and_clear_dirty(struct mm_struct *mm,
 
 	if ((pte_val(*ptep) & _PAGE_DIRTY) == 0)
 		return 0;
-	old = pte_update(ptep, _PAGE_DIRTY);
-	if (old & _PAGE_HASHPTE)
-		hpte_update(mm, addr, ptep, old, 0);
+	old = pte_update(mm, addr, ptep, _PAGE_DIRTY, 0);
 	return (old & _PAGE_DIRTY) != 0;
 }
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
@@ -352,9 +344,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 
 	if ((pte_val(*ptep) & _PAGE_RW) == 0)
 		return;
-	old = pte_update(ptep, _PAGE_RW);
-	if (old & _PAGE_HASHPTE)
-		hpte_update(mm, addr, ptep, old, 0);
+	old = pte_update(mm, addr, ptep, _PAGE_RW, 0);
 }
 
 /*
@@ -378,7 +368,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 ({									\
 	int __dirty = __ptep_test_and_clear_dirty((__vma)->vm_mm, __address, \
 						  __ptep);		\
-	flush_tlb_page(__vma, __address);				\
 	__dirty;							\
 })
 
@@ -386,20 +375,14 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 				       unsigned long addr, pte_t *ptep)
 {
-	unsigned long old = pte_update(ptep, ~0UL);
-
-	if (old & _PAGE_HASHPTE)
-		hpte_update(mm, addr, ptep, old, 0);
+	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0);
 	return __pte(old);
 }
 
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
 			     pte_t * ptep)
 {
-	unsigned long old = pte_update(ptep, ~0UL);
-
-	if (old & _PAGE_HASHPTE)
-		hpte_update(mm, addr, ptep, old, 0);
+	pte_update(mm, addr, ptep, ~0UL, 0);
 }
 
 /*
@@ -408,10 +391,8 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
 static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t pte)
 {
-	if (pte_present(*ptep)) {
+	if (pte_present(*ptep))
 		pte_clear(mm, addr, ptep);
-		flush_tlb_pending();
-	}
 	pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 	*ptep = pte;
 }
@@ -522,6 +503,7 @@ void pgtable_cache_init(void);
 	return pt;
 }
 
+
 #include <asm-generic/pgtable.h>
 
 #endif /* __ASSEMBLY__ */
diff --git a/include/asm-powerpc/tlb.h b/include/asm-powerpc/tlb.h
index 4e2a834683fb..0a17682663d8 100644
--- a/include/asm-powerpc/tlb.h
+++ b/include/asm-powerpc/tlb.h
@@ -38,7 +38,6 @@ extern void pte_free_finish(void);
 
 static inline void tlb_flush(struct mmu_gather *tlb)
 {
-	flush_tlb_pending();
 	pte_free_finish();
 }
 
diff --git a/include/asm-powerpc/tlbflush.h b/include/asm-powerpc/tlbflush.h
index 93c7d0c7230f..0bc5a5e506be 100644
--- a/include/asm-powerpc/tlbflush.h
+++ b/include/asm-powerpc/tlbflush.h
@@ -28,25 +28,41 @@ struct mm_struct;
 #define PPC64_TLB_BATCH_NR 192
 
 struct ppc64_tlb_batch {
-	unsigned long index;
-	struct mm_struct *mm;
-	real_pte_t pte[PPC64_TLB_BATCH_NR];
-	unsigned long vaddr[PPC64_TLB_BATCH_NR];
-	unsigned int psize;
+	int			active;
+	unsigned long		index;
+	struct mm_struct	*mm;
+	real_pte_t		pte[PPC64_TLB_BATCH_NR];
+	unsigned long		vaddr[PPC64_TLB_BATCH_NR];
+	unsigned int		psize;
 };
 DECLARE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
 
 extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch);
 
-static inline void flush_tlb_pending(void)
+extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
+			    pte_t *ptep, unsigned long pte, int huge);
+
+#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+
+static inline void arch_enter_lazy_mmu_mode(void)
+{
+	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+
+	batch->active = 1;
+}
+
+static inline void arch_leave_lazy_mmu_mode(void)
 {
-	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
+	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
 
 	if (batch->index)
 		__flush_tlb_pending(batch);
-	put_cpu_var(ppc64_tlb_batch);
+	batch->active = 0;
 }
 
+#define arch_flush_lazy_mmu_mode() do {} while (0)
+
+
 extern void flush_hash_page(unsigned long va, real_pte_t pte, int psize,
 			    int local);
 extern void flush_hash_range(unsigned long number, int local);
@@ -88,15 +104,12 @@ extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t);
 
 static inline void flush_tlb_mm(struct mm_struct *mm)
 {
-	flush_tlb_pending();
 }
 
 static inline void flush_tlb_page(struct vm_area_struct *vma,
 				  unsigned long vmaddr)
 {
-#ifdef CONFIG_PPC64
-	flush_tlb_pending();
-#else
+#ifndef CONFIG_PPC64
 	_tlbie(vmaddr);
 #endif
 }
@@ -112,13 +125,11 @@ static inline void flush_tlb_page_nohash(struct vm_area_struct *vma,
 static inline void flush_tlb_range(struct vm_area_struct *vma,
 				   unsigned long start, unsigned long end)
 {
-	flush_tlb_pending();
 }
 
 static inline void flush_tlb_kernel_range(unsigned long start,
 					  unsigned long end)
 {
-	flush_tlb_pending();
 }
 
 #else	/* 6xx, 7xx, 7xxx cpus */