author		Giuseppe CAVALLARO <peppe.cavallaro@st.com>	2009-07-07 10:25:10 -0400
committer	Paul Mundt <lethal@linux-sh.org>	2009-11-24 02:23:38 -0500
commit		a0458b07c17a10ea316e6ae65ab15b78bf5f44ee (patch)
tree		16211bec010bd65fe08f818ecb94075bec4d988e
parent		a8a8a669ea13d792296737505adc43ccacf3a648 (diff)
sh: add sleazy FPU optimization
SH port of the sleazy FPU feature currently implemented for some architectures such as i386.

Right now the SH kernel has 100% lazy FPU behaviour. This is of course great for applications that have very sporadic or no FPU use. However, for very frequent FPU users you take an extra trap on every context switch. This patch adds a simple heuristic: after 5 consecutive context switches of FPU use, the lazy behaviour is disabled and the context gets restored on every context switch. After 256 switches this is reset and the 100% lazy behaviour returns.

Tests with LMbench showed no regression; I saw a small improvement (~2%) due to the prefetching. The tests below also show that, with this sleazy patch, the number of FPU exceptions is indeed reduced. To test this, I hacked the lat_ctx LMbench test to use the FPU a little more.

sleazy implementation
===========================================
switch_to calls            | 79326
sleazy calls               | 42577
do_fpu_state_restore calls | 59232
restore_fpu calls          | 59032
Exceptions: 0x800 (FPU disabled): 16604

100% lazy (default implementation)
===========================================
switch_to calls            | 79690
do_fpu_state_restore calls | 53299
restore_fpu calls          | 53101
Exceptions: 0x800 (FPU disabled): 53273

Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: Stuart Menefy <stuart.menefy@st.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
-rw-r--r--	arch/sh/include/asm/fpu.h	|  3
-rw-r--r--	arch/sh/kernel/cpu/sh4/fpu.c	| 16
-rw-r--r--	arch/sh/kernel/process_32.c	| 16
3 files changed, 31 insertions(+), 4 deletions(-)
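For illustration only (not part of the patch): a minimal user-space C sketch of the heuristic described in the commit message, assuming an 8-bit fpu_counter as on i386. The struct and function names here (struct task, restore_eagerly, the simulated trap count) are placeholders, not SH kernel symbols.

/* sleazy_fpu_sketch.c - illustrative simulation of the heuristic above,
 * not kernel code.  Assumes fpu_counter is an 8-bit counter, so it wraps
 * and periodically re-enables lazy behaviour, as described above. */
#include <stdio.h>
#include <stdbool.h>

struct task {
	unsigned char fpu_counter;	/* wraps back to 0 after 255 */
};

/* Heuristic from the patch: restore the FPU context eagerly once the
 * task has used the FPU on more than 5 consecutive context switches. */
static bool restore_eagerly(const struct task *t)
{
	return t->fpu_counter > 5;
}

int main(void)
{
	struct task t = { .fpu_counter = 0 };
	int traps = 0;

	/* Simulate 300 context switches where the task uses the FPU in
	 * every timeslice (the worst case for a 100% lazy scheme). */
	for (int i = 0; i < 300; i++) {
		if (!restore_eagerly(&t))
			traps++;	/* lazy: take the FPU-disabled trap */
		t.fpu_counter++;	/* task used the FPU this timeslice */
	}

	/* With a fully lazy scheme this would be 300 traps; with the
	 * heuristic only the ramp-up switches (and the ones after the
	 * counter wraps) still trap. */
	printf("FPU-disabled traps taken: %d of 300 switches\n", traps);
	return 0;
}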
diff --git a/arch/sh/include/asm/fpu.h b/arch/sh/include/asm/fpu.h
index 1d3aee04b5cc..bfd78e19de1b 100644
--- a/arch/sh/include/asm/fpu.h
+++ b/arch/sh/include/asm/fpu.h
@@ -19,6 +19,7 @@ static inline void grab_fpu(struct pt_regs *regs)
 struct task_struct;
 
 extern void save_fpu(struct task_struct *__tsk, struct pt_regs *regs);
+void fpu_state_restore(struct pt_regs *regs);
 #else
 
 #define release_fpu(regs)	do { } while (0)
@@ -44,6 +45,8 @@ static inline void unlazy_fpu(struct task_struct *tsk, struct pt_regs *regs)
 	preempt_disable();
 	if (test_tsk_thread_flag(tsk, TIF_USEDFPU))
 		save_fpu(tsk, regs);
+	else
+		tsk->fpu_counter = 0;
 	preempt_enable();
 }
 
diff --git a/arch/sh/kernel/cpu/sh4/fpu.c b/arch/sh/kernel/cpu/sh4/fpu.c
index e3ea5411da6d..d79226fa59d1 100644
--- a/arch/sh/kernel/cpu/sh4/fpu.c
+++ b/arch/sh/kernel/cpu/sh4/fpu.c
@@ -483,18 +483,18 @@ BUILD_TRAP_HANDLER(fpu_error)
 	force_sig(SIGFPE, tsk);
 }
 
-BUILD_TRAP_HANDLER(fpu_state_restore)
+void fpu_state_restore(struct pt_regs *regs)
 {
 	struct task_struct *tsk = current;
-	TRAP_HANDLER_DECL;
 
 	grab_fpu(regs);
-	if (!user_mode(regs)) {
+	if (unlikely(!user_mode(regs))) {
 		printk(KERN_ERR "BUG: FPU is used in kernel mode.\n");
+		BUG();
 		return;
 	}
 
-	if (used_math()) {
+	if (likely(used_math())) {
 		/* Using the FPU again. */
 		restore_fpu(tsk);
 	} else {
@@ -503,4 +503,12 @@ BUILD_TRAP_HANDLER(fpu_state_restore)
 		set_used_math();
 	}
 	set_tsk_thread_flag(tsk, TIF_USEDFPU);
+	tsk->fpu_counter++;
+}
+
+BUILD_TRAP_HANDLER(fpu_state_restore)
+{
+	TRAP_HANDLER_DECL;
+
+	fpu_state_restore(regs);
 }
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 0673c4746be3..aff5fe02e393 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -288,8 +288,14 @@ static void ubc_set_tracing(int asid, unsigned long pc)
 __notrace_funcgraph struct task_struct *
 __switch_to(struct task_struct *prev, struct task_struct *next)
 {
+	struct thread_struct *next_t = &next->thread;
+
 #if defined(CONFIG_SH_FPU)
 	unlazy_fpu(prev, task_pt_regs(prev));
+
+	/* we're going to use this soon, after a few expensive things */
+	if (next->fpu_counter > 5)
+		prefetch(&next_t->fpu.hard);
 #endif
 
 #ifdef CONFIG_MMU
@@ -321,6 +327,16 @@ __switch_to(struct task_struct *prev, struct task_struct *next)
 #endif
 	}
 
+#if defined(CONFIG_SH_FPU)
+	/* If the task has used fpu the last 5 timeslices, just do a full
+	 * restore of the math state immediately to avoid the trap; the
+	 * chances of needing FPU soon are obviously high now
+	 */
+	if (next->fpu_counter > 5) {
+		fpu_state_restore(task_pt_regs(next));
+	}
+#endif
+
 	return prev;
 }
 