aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2006-03-16 05:02:32 -0500
committerDavid S. Miller <davem@sunset.davemloft.net>2006-03-20 04:16:33 -0500
commit7a1ac5264108fc3ed22d17a3cdd76212ed1666d1 (patch)
tree75378a1b470afa54900f1f15a5b41966d301520d /arch
parenta858f1ca726edc5eb7ed39722f7966d005f1c9ca (diff)
[SPARC64]: Fix and re-enable dynamic TSB sizing.
This is good for up to 50% performance improvement in some test cases. The problem has been the race conditions, and hopefully I've plugged them all up here. 1) There was a serious race in switch_mm() wrt. lazy TLB switching to and from kernel threads. We could erroneously skip a tsb_context_switch() and thus use a stale TSB across a TSB grow event. There is a big comment now in that function describing exactly how it can happen. 2) All code paths that do something with the TSB need to be guarded with the mm->context.lock spinlock. This makes page table flushing paths properly synchronize with both TSB growing and TLB context changes. 3) TSB growing events are moved to the end of successful fault processing. Previously it was in update_mmu_cache() but that is deadlock-prone. At the end of do_sparc64_fault() we hold no spinlocks that could deadlock the TSB grow sequence. We also have dropped the address space semaphore. While we're here, add prefetching to the copy_tsb() routine and put it in assembler into the tsb.S file. This piece of code is quite time critical. There are some small negative side effects to this code which can be improved upon. In particular we grab the mm->context.lock even for the tsb insert done by update_mmu_cache() now and that's a bit excessive. We can get rid of that locking, and the same lock taking in flush_tsb_user(), by disabling PSTATE_IE around the whole operation including the capturing of the tsb pointer and tsb_nentries value. That would work because anyone growing the TSB won't free up the old TSB until all cpus respond to the TSB change cross call. I'm not yet confident enough in that optimization to put it in right now, but eventually we might be able to and the description is here for reference. This code seems very solid now. It passes several parallel GCC bootstrap builds, and our favorite "nut cruncher" stress test which is a full "make -j8192" build of a "make allmodconfig" kernel. 
That puts about 256 processes on each cpu's run queue, makes lots of process cpu migrations occur, causes lots of page table and TLB flushing activity, incurs many context version number changes, and it swaps the machine real far out to disk even though there is 16GB of ram on this test system. :-) Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'arch')
-rw-r--r--arch/sparc64/kernel/tsb.S71
-rw-r--r--arch/sparc64/mm/fault.c8
-rw-r--r--arch/sparc64/mm/init.c7
-rw-r--r--arch/sparc64/mm/tsb.c185
4 files changed, 165 insertions, 106 deletions
diff --git a/arch/sparc64/kernel/tsb.S b/arch/sparc64/kernel/tsb.S
index d738910153f6..1b154c863628 100644
--- a/arch/sparc64/kernel/tsb.S
+++ b/arch/sparc64/kernel/tsb.S
@@ -34,8 +34,9 @@ tsb_miss_itlb:
34 ldxa [%g4] ASI_IMMU, %g4 34 ldxa [%g4] ASI_IMMU, %g4
35 35
36 /* At this point we have: 36 /* At this point we have:
37 * %g4 -- missing virtual address
38 * %g1 -- TSB entry address 37 * %g1 -- TSB entry address
38 * %g3 -- FAULT_CODE_{D,I}TLB
39 * %g4 -- missing virtual address
39 * %g6 -- TAG TARGET (vaddr >> 22) 40 * %g6 -- TAG TARGET (vaddr >> 22)
40 */ 41 */
41tsb_miss_page_table_walk: 42tsb_miss_page_table_walk:
@@ -45,6 +46,12 @@ tsb_miss_page_table_walk:
45tsb_miss_page_table_walk_sun4v_fastpath: 46tsb_miss_page_table_walk_sun4v_fastpath:
46 USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault) 47 USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault)
47 48
49 /* At this point we have:
50 * %g1 -- TSB entry address
51 * %g3 -- FAULT_CODE_{D,I}TLB
52 * %g5 -- physical address of PTE in Linux page tables
53 * %g6 -- TAG TARGET (vaddr >> 22)
54 */
48tsb_reload: 55tsb_reload:
49 TSB_LOCK_TAG(%g1, %g2, %g7) 56 TSB_LOCK_TAG(%g1, %g2, %g7)
50 57
@@ -199,6 +206,7 @@ __tsb_insert:
199 wrpr %o5, %pstate 206 wrpr %o5, %pstate
200 retl 207 retl
201 nop 208 nop
209 .size __tsb_insert, .-__tsb_insert
202 210
203 /* Flush the given TSB entry if it has the matching 211 /* Flush the given TSB entry if it has the matching
204 * tag. 212 * tag.
@@ -208,6 +216,7 @@ __tsb_insert:
208 */ 216 */
209 .align 32 217 .align 32
210 .globl tsb_flush 218 .globl tsb_flush
219 .type tsb_flush,#function
211tsb_flush: 220tsb_flush:
212 sethi %hi(TSB_TAG_LOCK_HIGH), %g2 221 sethi %hi(TSB_TAG_LOCK_HIGH), %g2
2131: TSB_LOAD_TAG(%o0, %g1) 2221: TSB_LOAD_TAG(%o0, %g1)
@@ -225,6 +234,7 @@ tsb_flush:
225 nop 234 nop
2262: retl 2352: retl
227 TSB_MEMBAR 236 TSB_MEMBAR
237 .size tsb_flush, .-tsb_flush
228 238
229 /* Reload MMU related context switch state at 239 /* Reload MMU related context switch state at
230 * schedule() time. 240 * schedule() time.
@@ -241,6 +251,7 @@ tsb_flush:
241 */ 251 */
242 .align 32 252 .align 32
243 .globl __tsb_context_switch 253 .globl __tsb_context_switch
254 .type __tsb_context_switch,#function
244__tsb_context_switch: 255__tsb_context_switch:
245 rdpr %pstate, %o5 256 rdpr %pstate, %o5
246 wrpr %o5, PSTATE_IE, %pstate 257 wrpr %o5, PSTATE_IE, %pstate
@@ -302,3 +313,61 @@ __tsb_context_switch:
302 313
303 retl 314 retl
304 nop 315 nop
316 .size __tsb_context_switch, .-__tsb_context_switch
317
318#define TSB_PASS_BITS ((1 << TSB_TAG_LOCK_BIT) | \
319 (1 << TSB_TAG_INVALID_BIT))
320
321 .align 32
322 .globl copy_tsb
323 .type copy_tsb,#function
324copy_tsb: /* %o0=old_tsb_base, %o1=old_tsb_size
325 * %o2=new_tsb_base, %o3=new_tsb_size
326 */
327 sethi %uhi(TSB_PASS_BITS), %g7
328 srlx %o3, 4, %o3
329 add %o0, %o1, %g1 /* end of old tsb */
330 sllx %g7, 32, %g7
331 sub %o3, 1, %o3 /* %o3 == new tsb hash mask */
332
333661: prefetcha [%o0] ASI_N, #one_read
334 .section .tsb_phys_patch, "ax"
335 .word 661b
336 prefetcha [%o0] ASI_PHYS_USE_EC, #one_read
337 .previous
338
33990: andcc %o0, (64 - 1), %g0
340 bne 1f
341 add %o0, 64, %o5
342
343661: prefetcha [%o5] ASI_N, #one_read
344 .section .tsb_phys_patch, "ax"
345 .word 661b
346 prefetcha [%o5] ASI_PHYS_USE_EC, #one_read
347 .previous
348
3491: TSB_LOAD_QUAD(%o0, %g2) /* %g2/%g3 == TSB entry */
350 andcc %g2, %g7, %g0 /* LOCK or INVALID set? */
351 bne,pn %xcc, 80f /* Skip it */
352 sllx %g2, 22, %o4 /* TAG --> VADDR */
353
354 /* This can definitely be computed faster... */
355 srlx %o0, 4, %o5 /* Build index */
356 and %o5, 511, %o5 /* Mask index */
357 sllx %o5, PAGE_SHIFT, %o5 /* Put into vaddr position */
358 or %o4, %o5, %o4 /* Full VADDR. */
359 srlx %o4, PAGE_SHIFT, %o4 /* Shift down to create index */
360 and %o4, %o3, %o4 /* Mask with new_tsb_nents-1 */
361 sllx %o4, 4, %o4 /* Shift back up into tsb ent offset */
362 TSB_STORE(%o2 + %o4, %g2) /* Store TAG */
363 add %o4, 0x8, %o4 /* Advance to TTE */
364 TSB_STORE(%o2 + %o4, %g3) /* Store TTE */
365
36680: add %o0, 16, %o0
367 cmp %o0, %g1
368 bne,pt %xcc, 90b
369 nop
370
371 retl
372 TSB_MEMBAR
373 .size copy_tsb, .-copy_tsb
diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c
index b97bd054aad3..63b6cc0cd5d5 100644
--- a/arch/sparc64/mm/fault.c
+++ b/arch/sparc64/mm/fault.c
@@ -29,6 +29,7 @@
29#include <asm/lsu.h> 29#include <asm/lsu.h>
30#include <asm/sections.h> 30#include <asm/sections.h>
31#include <asm/kdebug.h> 31#include <asm/kdebug.h>
32#include <asm/mmu_context.h>
32 33
33/* 34/*
34 * To debug kernel to catch accesses to certain virtual/physical addresses. 35 * To debug kernel to catch accesses to certain virtual/physical addresses.
@@ -258,7 +259,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
258 struct vm_area_struct *vma; 259 struct vm_area_struct *vma;
259 unsigned int insn = 0; 260 unsigned int insn = 0;
260 int si_code, fault_code; 261 int si_code, fault_code;
261 unsigned long address; 262 unsigned long address, mm_rss;
262 263
263 fault_code = get_thread_fault_code(); 264 fault_code = get_thread_fault_code();
264 265
@@ -407,6 +408,11 @@ good_area:
407 } 408 }
408 409
409 up_read(&mm->mmap_sem); 410 up_read(&mm->mmap_sem);
411
412 mm_rss = get_mm_rss(mm);
413 if (unlikely(mm_rss >= mm->context.tsb_rss_limit))
414 tsb_grow(mm, mm_rss);
415
410 return; 416 return;
411 417
412 /* 418 /*
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index b40f6477dea0..d703b67bc7b9 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -279,7 +279,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t p
279{ 279{
280 struct mm_struct *mm; 280 struct mm_struct *mm;
281 struct tsb *tsb; 281 struct tsb *tsb;
282 unsigned long tag; 282 unsigned long tag, flags;
283 283
284 if (tlb_type != hypervisor) { 284 if (tlb_type != hypervisor) {
285 unsigned long pfn = pte_pfn(pte); 285 unsigned long pfn = pte_pfn(pte);
@@ -308,10 +308,15 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t p
308 } 308 }
309 309
310 mm = vma->vm_mm; 310 mm = vma->vm_mm;
311
312 spin_lock_irqsave(&mm->context.lock, flags);
313
311 tsb = &mm->context.tsb[(address >> PAGE_SHIFT) & 314 tsb = &mm->context.tsb[(address >> PAGE_SHIFT) &
312 (mm->context.tsb_nentries - 1UL)]; 315 (mm->context.tsb_nentries - 1UL)];
313 tag = (address >> 22UL); 316 tag = (address >> 22UL);
314 tsb_insert(tsb, tag, pte_val(pte)); 317 tsb_insert(tsb, tag, pte_val(pte));
318
319 spin_unlock_irqrestore(&mm->context.lock, flags);
315} 320}
316 321
317void flush_dcache_page(struct page *page) 322void flush_dcache_page(struct page *page)
diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c
index f36799b7152c..7fbe1e0cd105 100644
--- a/arch/sparc64/mm/tsb.c
+++ b/arch/sparc64/mm/tsb.c
@@ -48,11 +48,15 @@ void flush_tsb_kernel_range(unsigned long start, unsigned long end)
48void flush_tsb_user(struct mmu_gather *mp) 48void flush_tsb_user(struct mmu_gather *mp)
49{ 49{
50 struct mm_struct *mm = mp->mm; 50 struct mm_struct *mm = mp->mm;
51 struct tsb *tsb = mm->context.tsb; 51 unsigned long nentries, base, flags;
52 unsigned long nentries = mm->context.tsb_nentries; 52 struct tsb *tsb;
53 unsigned long base;
54 int i; 53 int i;
55 54
55 spin_lock_irqsave(&mm->context.lock, flags);
56
57 tsb = mm->context.tsb;
58 nentries = mm->context.tsb_nentries;
59
56 if (tlb_type == cheetah_plus || tlb_type == hypervisor) 60 if (tlb_type == cheetah_plus || tlb_type == hypervisor)
57 base = __pa(tsb); 61 base = __pa(tsb);
58 else 62 else
@@ -70,6 +74,8 @@ void flush_tsb_user(struct mmu_gather *mp)
70 74
71 tsb_flush(ent, tag); 75 tsb_flush(ent, tag);
72 } 76 }
77
78 spin_unlock_irqrestore(&mm->context.lock, flags);
73} 79}
74 80
75static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_bytes) 81static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_bytes)
@@ -201,86 +207,9 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_bytes)
201 } 207 }
202} 208}
203 209
204/* The page tables are locked against modifications while this
205 * runs.
206 *
207 * XXX do some prefetching...
208 */
209static void copy_tsb(struct tsb *old_tsb, unsigned long old_size,
210 struct tsb *new_tsb, unsigned long new_size)
211{
212 unsigned long old_nentries = old_size / sizeof(struct tsb);
213 unsigned long new_nentries = new_size / sizeof(struct tsb);
214 unsigned long i;
215
216 for (i = 0; i < old_nentries; i++) {
217 register unsigned long tag asm("o4");
218 register unsigned long pte asm("o5");
219 unsigned long v, hash;
220
221 if (tlb_type == hypervisor) {
222 __asm__ __volatile__(
223 "ldda [%2] %3, %0"
224 : "=r" (tag), "=r" (pte)
225 : "r" (__pa(&old_tsb[i])),
226 "i" (ASI_QUAD_LDD_PHYS_4V));
227 } else if (tlb_type == cheetah_plus) {
228 __asm__ __volatile__(
229 "ldda [%2] %3, %0"
230 : "=r" (tag), "=r" (pte)
231 : "r" (__pa(&old_tsb[i])),
232 "i" (ASI_QUAD_LDD_PHYS));
233 } else {
234 __asm__ __volatile__(
235 "ldda [%2] %3, %0"
236 : "=r" (tag), "=r" (pte)
237 : "r" (&old_tsb[i]),
238 "i" (ASI_NUCLEUS_QUAD_LDD));
239 }
240
241 if (tag & ((1UL << TSB_TAG_LOCK_BIT) |
242 (1UL << TSB_TAG_INVALID_BIT)))
243 continue;
244
245 /* We only put base page size PTEs into the TSB,
246 * but that might change in the future. This code
247 * would need to be changed if we start putting larger
248 * page size PTEs into there.
249 */
250 WARN_ON((pte & _PAGE_ALL_SZ_BITS) != _PAGE_SZBITS);
251
252 /* The tag holds bits 22 to 63 of the virtual address
253 * and the context. Clear out the context, and shift
254 * up to make a virtual address.
255 */
256 v = (tag & ((1UL << 42UL) - 1UL)) << 22UL;
257
258 /* The implied bits of the tag (bits 13 to 21) are
259 * determined by the TSB entry index, so fill that in.
260 */
261 v |= (i & (512UL - 1UL)) << 13UL;
262
263 hash = tsb_hash(v, new_nentries);
264 if (tlb_type == cheetah_plus ||
265 tlb_type == hypervisor) {
266 __asm__ __volatile__(
267 "stxa %0, [%1] %2\n\t"
268 "stxa %3, [%4] %2"
269 : /* no outputs */
270 : "r" (tag),
271 "r" (__pa(&new_tsb[hash].tag)),
272 "i" (ASI_PHYS_USE_EC),
273 "r" (pte),
274 "r" (__pa(&new_tsb[hash].pte)));
275 } else {
276 new_tsb[hash].tag = tag;
277 new_tsb[hash].pte = pte;
278 }
279 }
280}
281
282/* When the RSS of an address space exceeds mm->context.tsb_rss_limit, 210/* When the RSS of an address space exceeds mm->context.tsb_rss_limit,
283 * update_mmu_cache() invokes this routine to try and grow the TSB. 211 * do_sparc64_fault() invokes this routine to try and grow the TSB.
212 *
284 * When we reach the maximum TSB size supported, we stick ~0UL into 213 * When we reach the maximum TSB size supported, we stick ~0UL into
285 * mm->context.tsb_rss_limit so the grow checks in update_mmu_cache() 214 * mm->context.tsb_rss_limit so the grow checks in update_mmu_cache()
286 * will not trigger any longer. 215 * will not trigger any longer.
@@ -293,12 +222,12 @@ static void copy_tsb(struct tsb *old_tsb, unsigned long old_size,
293 * the number of entries that the current TSB can hold at once. Currently, 222 * the number of entries that the current TSB can hold at once. Currently,
294 * we trigger when the RSS hits 3/4 of the TSB capacity. 223 * we trigger when the RSS hits 3/4 of the TSB capacity.
295 */ 224 */
296void tsb_grow(struct mm_struct *mm, unsigned long rss, gfp_t gfp_flags) 225void tsb_grow(struct mm_struct *mm, unsigned long rss)
297{ 226{
298 unsigned long max_tsb_size = 1 * 1024 * 1024; 227 unsigned long max_tsb_size = 1 * 1024 * 1024;
299 unsigned long size, old_size; 228 unsigned long size, old_size, flags;
300 struct page *page; 229 struct page *page;
301 struct tsb *old_tsb; 230 struct tsb *old_tsb, *new_tsb;
302 231
303 if (max_tsb_size > (PAGE_SIZE << MAX_ORDER)) 232 if (max_tsb_size > (PAGE_SIZE << MAX_ORDER))
304 max_tsb_size = (PAGE_SIZE << MAX_ORDER); 233 max_tsb_size = (PAGE_SIZE << MAX_ORDER);
@@ -311,12 +240,51 @@ void tsb_grow(struct mm_struct *mm, unsigned long rss, gfp_t gfp_flags)
311 break; 240 break;
312 } 241 }
313 242
314 page = alloc_pages(gfp_flags, get_order(size)); 243 page = alloc_pages(GFP_KERNEL, get_order(size));
315 if (unlikely(!page)) 244 if (unlikely(!page))
316 return; 245 return;
317 246
318 /* Mark all tags as invalid. */ 247 /* Mark all tags as invalid. */
319 memset(page_address(page), 0x40, size); 248 new_tsb = page_address(page);
249 memset(new_tsb, 0x40, size);
250
251 /* Ok, we are about to commit the changes. If we are
252 * growing an existing TSB the locking is very tricky,
253 * so WATCH OUT!
254 *
255 * We have to hold mm->context.lock while committing to the
256 * new TSB, this synchronizes us with processors in
257 * flush_tsb_user() and switch_mm() for this address space.
258 *
259 * But even with that lock held, processors run asynchronously
260 * accessing the old TSB via TLB miss handling. This is OK
261 * because those actions are just propagating state from the
262 * Linux page tables into the TSB, page table mappings are not
263 * being changed. If a real fault occurs, the processor will
264 * synchronize with us when it hits flush_tsb_user(), this is
265 * also true for the case where vmscan is modifying the page
266 * tables. The only thing we need to be careful with is to
267 * skip any locked TSB entries during copy_tsb().
268 *
269 * When we finish committing to the new TSB, we have to drop
270 * the lock and ask all other cpus running this address space
271 * to run tsb_context_switch() to see the new TSB table.
272 */
273 spin_lock_irqsave(&mm->context.lock, flags);
274
275 old_tsb = mm->context.tsb;
276 old_size = mm->context.tsb_nentries * sizeof(struct tsb);
277
278 /* Handle multiple threads trying to grow the TSB at the same time.
279 * One will get in here first, and bump the size and the RSS limit.
280 * The others will get in here next and hit this check.
281 */
282 if (unlikely(old_tsb && (rss < mm->context.tsb_rss_limit))) {
283 spin_unlock_irqrestore(&mm->context.lock, flags);
284
285 free_pages((unsigned long) new_tsb, get_order(size));
286 return;
287 }
320 288
321 if (size == max_tsb_size) 289 if (size == max_tsb_size)
322 mm->context.tsb_rss_limit = ~0UL; 290 mm->context.tsb_rss_limit = ~0UL;
@@ -324,30 +292,37 @@ void tsb_grow(struct mm_struct *mm, unsigned long rss, gfp_t gfp_flags)
324 mm->context.tsb_rss_limit = 292 mm->context.tsb_rss_limit =
325 ((size / sizeof(struct tsb)) * 3) / 4; 293 ((size / sizeof(struct tsb)) * 3) / 4;
326 294
327 old_tsb = mm->context.tsb; 295 if (old_tsb) {
328 old_size = mm->context.tsb_nentries * sizeof(struct tsb); 296 extern void copy_tsb(unsigned long old_tsb_base,
329 297 unsigned long old_tsb_size,
330 if (old_tsb) 298 unsigned long new_tsb_base,
331 copy_tsb(old_tsb, old_size, page_address(page), size); 299 unsigned long new_tsb_size);
300 unsigned long old_tsb_base = (unsigned long) old_tsb;
301 unsigned long new_tsb_base = (unsigned long) new_tsb;
302
303 if (tlb_type == cheetah_plus || tlb_type == hypervisor) {
304 old_tsb_base = __pa(old_tsb_base);
305 new_tsb_base = __pa(new_tsb_base);
306 }
307 copy_tsb(old_tsb_base, old_size, new_tsb_base, size);
308 }
332 309
333 mm->context.tsb = page_address(page); 310 mm->context.tsb = new_tsb;
334 setup_tsb_params(mm, size); 311 setup_tsb_params(mm, size);
335 312
313 spin_unlock_irqrestore(&mm->context.lock, flags);
314
336 /* If old_tsb is NULL, we're being invoked for the first time 315 /* If old_tsb is NULL, we're being invoked for the first time
337 * from init_new_context(). 316 * from init_new_context().
338 */ 317 */
339 if (old_tsb) { 318 if (old_tsb) {
340 /* Now force all other processors to reload the new 319 /* Reload it on the local cpu. */
341 * TSB state.
342 */
343 smp_tsb_sync(mm);
344
345 /* Finally reload it on the local cpu. No further
346 * references will remain to the old TSB and we can
347 * thus free it up.
348 */
349 tsb_context_switch(mm); 320 tsb_context_switch(mm);
350 321
322 /* Now force other processors to do the same. */
323 smp_tsb_sync(mm);
324
325 /* Now it is safe to free the old tsb. */
351 free_pages((unsigned long) old_tsb, get_order(old_size)); 326 free_pages((unsigned long) old_tsb, get_order(old_size));
352 } 327 }
353} 328}
@@ -363,7 +338,11 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
363 * will be confused and think there is an older TSB to free up. 338 * will be confused and think there is an older TSB to free up.
364 */ 339 */
365 mm->context.tsb = NULL; 340 mm->context.tsb = NULL;
366 tsb_grow(mm, 0, GFP_KERNEL); 341
342 /* If this is fork, inherit the parent's TSB size. We would
343 * grow it to that size on the first page fault anyways.
344 */
345 tsb_grow(mm, get_mm_rss(mm));
367 346
368 if (unlikely(!mm->context.tsb)) 347 if (unlikely(!mm->context.tsb))
369 return -ENOMEM; 348 return -ENOMEM;