aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Arjan van de Ven <arjan@linux.intel.com> 2006-09-26 04:52:36 -0400
committer: Andi Kleen <andi@basil.nowhere.org> 2006-09-26 04:52:36 -0400
commit: e07e23e1fd3000289fc7ccc6c71879070d3b19e0 (patch)
tree: 1290385cacd89e39b7bc1b12b7515b68423d78d3
parent: 73fea175303926055440c06bc8894f0c5c58afc8 (diff)
[PATCH] non lazy "sleazy" fpu implementation
Right now the kernel on x86-64 has 100% lazy FPU behavior: after *every* context switch, a trap is taken on the first FPU use to restore the FPU context lazily. This is of course great for applications that have very sporadic or no FPU use (since you then avoid doing the expensive save/restore all the time). However, very frequent FPU users take an extra trap on every context switch. The patch below adds a simple heuristic to this code: after 5 consecutive context switches with FPU use, the lazy behavior is disabled and the context gets restored on every context switch. If the app indeed uses the FPU, the trap is avoided (the chance of the 6th time slice using the FPU after the previous 5 have done so is obviously quite high). After 256 switches, this is reset and the lazy behavior is restored (until there are 5 consecutive ones again). The reason for this is to give apps that do longer bursts of FPU use the lazy behavior back after some time. [akpm@osdl.org: place new task_struct field next to jit_keyring to save space] Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Andi Kleen <ak@muc.de> Signed-off-by: Andrew Morton <akpm@osdl.org>
-rw-r--r-- arch/x86_64/kernel/process.c 10
-rw-r--r-- arch/x86_64/kernel/traps.c 1
-rw-r--r-- include/asm-x86_64/i387.h 5
-rw-r--r-- include/linux/sched.h 9
4 files changed, 24 insertions, 1 deletions
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 6fbd19564e4..9e9a70e50c7 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -552,6 +552,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
552 int cpu = smp_processor_id(); 552 int cpu = smp_processor_id();
553 struct tss_struct *tss = &per_cpu(init_tss, cpu); 553 struct tss_struct *tss = &per_cpu(init_tss, cpu);
554 554
555 /* we're going to use this soon, after a few expensive things */
556 if (next_p->fpu_counter>5)
557 prefetch(&next->i387.fxsave);
558
555 /* 559 /*
556 * Reload esp0, LDT and the page table pointer: 560 * Reload esp0, LDT and the page table pointer:
557 */ 561 */
@@ -629,6 +633,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
629 || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) 633 || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
630 __switch_to_xtra(prev_p, next_p, tss); 634 __switch_to_xtra(prev_p, next_p, tss);
631 635
636 /* If the task has used fpu the last 5 timeslices, just do a full
637 * restore of the math state immediately to avoid the trap; the
638 * chances of needing FPU soon are obviously high now
639 */
640 if (next_p->fpu_counter>5)
641 math_state_restore();
632 return prev_p; 642 return prev_p;
633} 643}
634 644
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 28e53342f29..ffc40cff1e0 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -1136,6 +1136,7 @@ asmlinkage void math_state_restore(void)
1136 init_fpu(me); 1136 init_fpu(me);
1137 restore_fpu_checking(&me->thread.i387.fxsave); 1137 restore_fpu_checking(&me->thread.i387.fxsave);
1138 task_thread_info(me)->status |= TS_USEDFPU; 1138 task_thread_info(me)->status |= TS_USEDFPU;
1139 me->fpu_counter++;
1139} 1140}
1140 1141
1141void __init trap_init(void) 1142void __init trap_init(void)
diff --git a/include/asm-x86_64/i387.h b/include/asm-x86_64/i387.h
index cba8a3b0cde..60c0f4853fd 100644
--- a/include/asm-x86_64/i387.h
+++ b/include/asm-x86_64/i387.h
@@ -24,6 +24,7 @@ extern unsigned int mxcsr_feature_mask;
24extern void mxcsr_feature_mask_init(void); 24extern void mxcsr_feature_mask_init(void);
25extern void init_fpu(struct task_struct *child); 25extern void init_fpu(struct task_struct *child);
26extern int save_i387(struct _fpstate __user *buf); 26extern int save_i387(struct _fpstate __user *buf);
27extern asmlinkage void math_state_restore(void);
27 28
28/* 29/*
29 * FPU lazy state save handling... 30 * FPU lazy state save handling...
@@ -31,7 +32,9 @@ extern int save_i387(struct _fpstate __user *buf);
31 32
32#define unlazy_fpu(tsk) do { \ 33#define unlazy_fpu(tsk) do { \
33 if (task_thread_info(tsk)->status & TS_USEDFPU) \ 34 if (task_thread_info(tsk)->status & TS_USEDFPU) \
34 save_init_fpu(tsk); \ 35 save_init_fpu(tsk); \
36 else \
37 tsk->fpu_counter = 0; \
35} while (0) 38} while (0)
36 39
37/* Ignore delayed exceptions from user space */ 40/* Ignore delayed exceptions from user space */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 34ed0d99b1b..807556c5bcd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -865,6 +865,15 @@ struct task_struct {
865 struct key *thread_keyring; /* keyring private to this thread */ 865 struct key *thread_keyring; /* keyring private to this thread */
866 unsigned char jit_keyring; /* default keyring to attach requested keys to */ 866 unsigned char jit_keyring; /* default keyring to attach requested keys to */
867#endif 867#endif
868 /*
869 * fpu_counter contains the number of consecutive context switches
870 * that the FPU is used. If this is over a threshold, the lazy fpu
871 * saving becomes unlazy to save the trap. This is an unsigned char
872 * so that after 256 times the counter wraps and the behavior turns
873 * lazy again; this to deal with bursty apps that only use FPU for
874 * a short time
875 */
876 unsigned char fpu_counter;
868 int oomkilladj; /* OOM kill score adjustment (bit shift). */ 877 int oomkilladj; /* OOM kill score adjustment (bit shift). */
869 char comm[TASK_COMM_LEN]; /* executable name excluding path 878 char comm[TASK_COMM_LEN]; /* executable name excluding path
870 - access with [gs]et_task_comm (which lock 879 - access with [gs]et_task_comm (which lock