author	Chuck Ebbert <76306.1226@compuserve.com>	2006-12-06 20:14:01 -0500
committer	Andi Kleen <andi@basil.nowhere.org>	2006-12-06 20:14:01 -0500
commit	acc207616a91a413a50fdd8847a747c4a7324167 (patch)
tree	71f603615d7c9da8af47fd89346dce9a2e341456
parent	be44d2aabce2d62f72d5751d1871b6212bf7a1c7 (diff)
[PATCH] i386: add sleazy FPU optimization
i386 port of the sLeAZY-fpu feature.  Chuck reports that this gives him
a +/- 0.4% improvement on his simple benchmark.

The x86_64 description follows:

Right now the kernel on x86-64 has 100% lazy fpu behavior: after *every*
context switch, a trap is taken on the first FPU use to restore the FPU
context lazily.  This is of course great for applications that have very
sporadic or no FPU use (since then you avoid doing the expensive
save/restore all the time).  However, for very frequent FPU users you
take an extra trap on every context switch.

The patch below adds a simple heuristic to this code: after 5
consecutive context switches of FPU use, the lazy behavior is disabled
and the context gets restored on every context switch.  If the app
indeed uses the FPU, the trap is avoided.  (The chance of the 6th time
slice using the FPU after the previous 5 have done so is obviously quite
high.)  After 256 switches this is reset and lazy behavior resumes
(until there are 5 consecutive ones again).  The reason for the reset is
to give apps that do only occasional longer bursts of FPU use the lazy
behavior back after some time.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>
-rw-r--r--	arch/i386/kernel/process.c	| 12 ++++++++++++
-rw-r--r--	arch/i386/kernel/traps.c	|  3 ++-
-rw-r--r--	include/asm-i386/i387.h		|  5 ++++-
3 files changed, 18 insertions(+), 2 deletions(-)
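The heuristic is small enough to model outside the kernel.  The
following stand-alone C sketch uses hypothetical names (struct task,
context_switch_in); in the real patch the pieces live in
__unlazy_fpu(), math_state_restore(), and __switch_to(), and the
256-switch reset is not a separate check: fpu_counter is an 8-bit
per-task field, so it simply wraps to zero and re-enables lazy mode.

#include <stdio.h>

/* Stand-alone model of the sleazy-FPU heuristic; not kernel code. */
struct task {
	unsigned char fpu_counter;	/* wraps after 256 FPU-using switches */
};

/* Called at each context switch toward 't'; 'used_fpu' says whether
 * the task touched the FPU during its previous timeslice. */
static void context_switch_in(struct task *t, int used_fpu)
{
	if (!used_fpu) {
		t->fpu_counter = 0;	/* streak broken: fully lazy again */
		printf("lazy (trap on first FPU use)\n");
		return;
	}
	if (t->fpu_counter > 5)
		printf("eager restore, trap avoided\n");
	else
		printf("lazy (trap on first FPU use)\n");
	t->fpu_counter++;		/* what math_state_restore() does */
}

int main(void)
{
	struct task t = { 0 };
	for (int i = 0; i < 8; i++)
		context_switch_in(&t, 1);	/* FPU-heavy: eager from the 7th switch */
	context_switch_in(&t, 0);		/* one FPU-free slice resets the streak */
	return 0;
}

Running it shows the switch to eager restore once the counter exceeds 5
and the immediate reset after a single timeslice without FPU use.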
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index dd53c58f64f1..ae924c416b68 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -648,6 +648,11 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 
 	__unlazy_fpu(prev_p);
 
+	/* we're going to use this soon, after a few expensive things */
+	if (next_p->fpu_counter > 5)
+		prefetch(&next->i387.fxsave);
+
 	/*
 	 * Reload esp0.
 	 */
@@ -697,6 +702,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 
 	disable_tsc(prev_p, next_p);
 
+	/* If the task has used fpu the last 5 timeslices, just do a full
+	 * restore of the math state immediately to avoid the trap; the
+	 * chances of needing FPU soon are obviously high now
+	 */
+	if (next_p->fpu_counter > 5)
+		math_state_restore();
+
 	return prev_p;
 }
 
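Note the placement of the two hunks: the prefetch is issued at the very
top of __switch_to() so that the 512-byte fxsave image starts streaming
into the cache while the "expensive things" in between execute, and the
math_state_restore() call at the bottom then finds warm cache lines.  A
generic sketch of that prefetch-then-use pattern (hypothetical names;
GCC's __builtin_prefetch standing in for the kernel's prefetch()):

/* sketch.c: compile with gcc -O2 -c sketch.c */
struct fpu_image { unsigned char bytes[512]; };	/* like thread.i387.fxsave */

static void expensive_switch_work(void) { /* esp0, GDT/TLS updates, ... */ }
static void restore_math_state(const struct fpu_image *f) { (void)f; }

void switch_in(struct fpu_image *f, unsigned char fpu_counter)
{
	if (fpu_counter > 5)
		__builtin_prefetch(&f->bytes);	/* start the cache fill early */

	expensive_switch_work();		/* overlaps with the prefetch */

	if (fpu_counter > 5)
		restore_math_state(f);		/* data is likely cached now */
}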
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index f9bb1f89d687..4a6fa2837df2 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -1118,7 +1118,7 @@ fastcall unsigned long patch_espfix_desc(unsigned long uesp,
  * Must be called with kernel preemption disabled (in this case,
  * local interrupts are disabled at the call-site in entry.S).
  */
-asmlinkage void math_state_restore(struct pt_regs regs)
+asmlinkage void math_state_restore(void)
 {
 	struct thread_info *thread = current_thread_info();
 	struct task_struct *tsk = thread->task;
@@ -1128,6 +1128,7 @@ asmlinkage void math_state_restore(struct pt_regs regs)
 	init_fpu(tsk);
 	restore_fpu(tsk);
 	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */
+	tsk->fpu_counter++;
 }
 
 #ifndef CONFIG_MATH_EMULATION
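Two things change here.  First, the restore path now counts:
tsk->fpu_counter++ extends the streak each time the FPU context is
restored, whether that happens via the #NM trap or via the new eager
call.  Second, the unused struct pt_regs parameter is dropped, which is
what makes the direct call from __switch_to() above legal; with a
pt_regs argument the function was only sensibly reachable from the
entry.S trap path.  Before and after (illustrative, not one compilable
unit):

/* before: shaped for the device-not-available trap entry; regs unused */
asmlinkage void math_state_restore(struct pt_regs regs);

/* after: an ordinary C function, callable directly from __switch_to() */
asmlinkage void math_state_restore(void);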
diff --git a/include/asm-i386/i387.h b/include/asm-i386/i387.h
index bc1d6edae1ed..434936c732d6 100644
--- a/include/asm-i386/i387.h
+++ b/include/asm-i386/i387.h
@@ -76,7 +76,9 @@ static inline void __save_init_fpu( struct task_struct *tsk )
 
 #define __unlazy_fpu( tsk ) do {	\
 	if (task_thread_info(tsk)->status & TS_USEDFPU) \
-		save_init_fpu( tsk ); \
+		save_init_fpu( tsk );	\
+	else				\
+		tsk->fpu_counter = 0;	\
 } while (0)
 
 #define __clear_fpu( tsk ) \
@@ -118,6 +120,7 @@ static inline void save_init_fpu( struct task_struct *tsk )
 extern unsigned short get_fpu_cwd( struct task_struct *tsk );
 extern unsigned short get_fpu_swd( struct task_struct *tsk );
 extern unsigned short get_fpu_mxcsr( struct task_struct *tsk );
+extern asmlinkage void math_state_restore(void);
 
 /*
  * Signal frame handlers...
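The new else branch is what makes fpu_counter measure *consecutive*
FPU-using timeslices: a task switched out without TS_USEDFPU set has its
streak zeroed.  Written as a plain function instead of a macro, with
stand-in types and save_init_fpu() elided, the logic is:

struct task_stub {
	unsigned int status;		/* stand-in for thread_info->status */
	unsigned char fpu_counter;	/* consecutive FPU-using timeslices */
};
#define TS_USEDFPU 0x0001		/* FPU was used this quantum */

static void unlazy_fpu(struct task_stub *tsk)
{
	if (tsk->status & TS_USEDFPU) {
		/* save_init_fpu(tsk) here: fnsave/fxsave, clears TS_USEDFPU */
	} else {
		tsk->fpu_counter = 0;	/* streak broken: fully lazy again */
	}
}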