diff options
author | Jack Steiner <steiner@sgi.com> | 2006-11-22 10:55:08 -0500 |
---|---|---|
committer | Tony Luck <tony.luck@intel.com> | 2006-12-12 14:47:09 -0500 |
commit | 1cf24bdbbbd2eb5439796dc399ab1649d150ed1d (patch) | |
tree | f1ef2033b15e43d01a5759f90130900455ea5957 /arch | |
parent | 8b9c106856d92c8266697328b148d115538b59ce (diff) |
[IA64] - Reduce overhead of FP exception logging messages
Improve the scalability of the fpswa code that rate-limits
logging of messages.
There are 2 distinctly different problems in this code.
1) If prctl is used to disable logging, last_time is never
updated. The result is that fpu_swa_count is zeroed out on
EVERY fp fault. This causes a very very hot cache line.
The fix reduces the wallclock time of a 1024p FP exception test
from 28734 sec to 19 sec!!!
2) On VERY large systems, excessive messages are logged because
multiple cpus can each reset or increment fpu_swa_count at
about the same time. The result is that hundreds of messages
are logged each second. The fix reduces the logging rate
to ~1 per second.
Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/ia64/kernel/traps.c | 50 |
1 files changed, 40 insertions, 10 deletions
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index fffa9e0826bc..ab684747036f 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c | |||
@@ -307,6 +307,15 @@ fp_emulate (int fp_fault, void *bundle, long *ipsr, long *fpsr, long *isr, long | |||
307 | return ret.status; | 307 | return ret.status; |
308 | } | 308 | } |
309 | 309 | ||
310 | struct fpu_swa_msg { | ||
311 | unsigned long count; | ||
312 | unsigned long time; | ||
313 | }; | ||
314 | static DEFINE_PER_CPU(struct fpu_swa_msg, cpulast); | ||
315 | DECLARE_PER_CPU(struct fpu_swa_msg, cpulast); | ||
316 | static struct fpu_swa_msg last __cacheline_aligned; | ||
317 | |||
318 | |||
310 | /* | 319 | /* |
311 | * Handle floating-point assist faults and traps. | 320 | * Handle floating-point assist faults and traps. |
312 | */ | 321 | */ |
@@ -316,8 +325,6 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr) | |||
316 | long exception, bundle[2]; | 325 | long exception, bundle[2]; |
317 | unsigned long fault_ip; | 326 | unsigned long fault_ip; |
318 | struct siginfo siginfo; | 327 | struct siginfo siginfo; |
319 | static int fpu_swa_count = 0; | ||
320 | static unsigned long last_time; | ||
321 | 328 | ||
322 | fault_ip = regs->cr_iip; | 329 | fault_ip = regs->cr_iip; |
323 | if (!fp_fault && (ia64_psr(regs)->ri == 0)) | 330 | if (!fp_fault && (ia64_psr(regs)->ri == 0)) |
@@ -325,14 +332,37 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr) | |||
325 | if (copy_from_user(bundle, (void __user *) fault_ip, sizeof(bundle))) | 332 | if (copy_from_user(bundle, (void __user *) fault_ip, sizeof(bundle))) |
326 | return -1; | 333 | return -1; |
327 | 334 | ||
328 | if (jiffies - last_time > 5*HZ) | 335 | if (!(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT)) { |
329 | fpu_swa_count = 0; | 336 | unsigned long count, current_jiffies = jiffies; |
330 | if ((fpu_swa_count < 4) && !(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT)) { | 337 | struct fpu_swa_msg *cp = &__get_cpu_var(cpulast); |
331 | last_time = jiffies; | 338 | |
332 | ++fpu_swa_count; | 339 | if (unlikely(current_jiffies > cp->time)) |
333 | printk(KERN_WARNING | 340 | cp->count = 0; |
334 | "%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n", | 341 | if (unlikely(cp->count < 5)) { |
335 | current->comm, current->pid, regs->cr_iip + ia64_psr(regs)->ri, isr); | 342 | cp->count++; |
343 | cp->time = current_jiffies + 5 * HZ; | ||
344 | |||
345 | /* minimize races by grabbing a copy of count BEFORE checking last.time. */ | ||
346 | count = last.count; | ||
347 | barrier(); | ||
348 | |||
349 | /* | ||
350 | * Lower 4 bits are used as a count. Upper bits are a sequence | ||
351 | * number that is updated when count is reset. The cmpxchg will | ||
352 | * fail if seqno has changed. This minimizes multiple cpus | ||
353 | * resetting the count. | ||
354 | */ | ||
355 | if (current_jiffies > last.time) | ||
356 | (void) cmpxchg_acq(&last.count, count, 16 + (count & ~15)); | ||
357 | |||
358 | /* used fetchadd to atomically update the count */ | ||
359 | if ((last.count & 15) < 5 && (ia64_fetchadd(1, &last.count, acq) & 15) < 5) { | ||
360 | last.time = current_jiffies + 5 * HZ; | ||
361 | printk(KERN_WARNING | ||
362 | "%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n", | ||
363 | current->comm, current->pid, regs->cr_iip + ia64_psr(regs)->ri, isr); | ||
364 | } | ||
365 | } | ||
336 | } | 366 | } |
337 | 367 | ||
338 | exception = fp_emulate(fp_fault, bundle, &regs->cr_ipsr, &regs->ar_fpsr, &isr, &regs->pr, | 368 | exception = fp_emulate(fp_fault, bundle, &regs->cr_ipsr, &regs->ar_fpsr, &isr, &regs->pr, |