aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2012-02-19 16:27:00 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-02-20 13:58:54 -0500
commit: 7e16838d94b566a17b65231073d179bc04d590c8 (patch)
tree: 356ae3999d89d2419fd4b85b062a24820f4a4d82
parent: 80ab6f1e8c981b1b6604b2f22e36c917526235cd (diff)
i387: support lazy restore of FPU state
This makes us recognize when we try to restore FPU state that matches
what we already have in the FPU on this CPU, and avoids the restore
entirely if so.

To do this, we add two new data fields:

 - a percpu 'fpu_owner_task' variable that gets written any time we
   update the "has_fpu" field, and thus acts as a kind of back-pointer
   to the task that owns the CPU. The exception is when we save the FPU
   state as part of a context switch - if the save can keep the FPU
   state around, we leave the 'fpu_owner_task' variable pointing at the
   task whose FP state still remains on the CPU.

 - a per-thread 'last_cpu' field, that indicates which CPU that thread
   used its FPU on last. We update this on every context switch
   (writing an invalid CPU number if the last context switch didn't
   leave the FPU in a lazily usable state), so we know that *that*
   thread has done nothing else with the FPU since.

These two fields together can be used when next switching back to the
task to see if the CPU still matches: if 'fpu_owner_task' matches the
task we are switching to, we know that no other task (or kernel FPU
usage) touched the FPU on this CPU in the meantime, and if the current
CPU number matches the 'last_cpu' field, we know that this thread did no
other FP work on any other CPU, so the FPU state on the CPU must match
what was saved on last context switch.

In that case, we can avoid the 'f[x]rstor' entirely, and just clear the
CR0.TS bit.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  arch/x86/include/asm/i387.h       35
-rw-r--r--  arch/x86/include/asm/processor.h   3
-rw-r--r--  arch/x86/kernel/cpu/common.c       2
-rw-r--r--  arch/x86/kernel/process_32.c       2
-rw-r--r--  arch/x86/kernel/process_64.c       2
5 files changed, 29 insertions, 15 deletions
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 74c607b37e87..247904945d3f 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -32,6 +32,8 @@ extern int init_fpu(struct task_struct *child);
32extern void math_state_restore(void); 32extern void math_state_restore(void);
33extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); 33extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
34 34
35DECLARE_PER_CPU(struct task_struct *, fpu_owner_task);
36
35extern user_regset_active_fn fpregs_active, xfpregs_active; 37extern user_regset_active_fn fpregs_active, xfpregs_active;
36extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, 38extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
37 xstateregs_get; 39 xstateregs_get;
@@ -276,7 +278,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
276 "emms\n\t" /* clear stack tags */ 278 "emms\n\t" /* clear stack tags */
277 "fildl %P[addr]", /* set F?P to defined value */ 279 "fildl %P[addr]", /* set F?P to defined value */
278 X86_FEATURE_FXSAVE_LEAK, 280 X86_FEATURE_FXSAVE_LEAK,
279 [addr] "m" (tsk->thread.has_fpu)); 281 [addr] "m" (tsk->thread.fpu.has_fpu));
280 282
281 return fpu_restore_checking(&tsk->thread.fpu); 283 return fpu_restore_checking(&tsk->thread.fpu);
282} 284}
@@ -288,19 +290,21 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
288 */ 290 */
289static inline int __thread_has_fpu(struct task_struct *tsk) 291static inline int __thread_has_fpu(struct task_struct *tsk)
290{ 292{
291 return tsk->thread.has_fpu; 293 return tsk->thread.fpu.has_fpu;
292} 294}
293 295
294/* Must be paired with an 'stts' after! */ 296/* Must be paired with an 'stts' after! */
295static inline void __thread_clear_has_fpu(struct task_struct *tsk) 297static inline void __thread_clear_has_fpu(struct task_struct *tsk)
296{ 298{
297 tsk->thread.has_fpu = 0; 299 tsk->thread.fpu.has_fpu = 0;
300 percpu_write(fpu_owner_task, NULL);
298} 301}
299 302
300/* Must be paired with a 'clts' before! */ 303/* Must be paired with a 'clts' before! */
301static inline void __thread_set_has_fpu(struct task_struct *tsk) 304static inline void __thread_set_has_fpu(struct task_struct *tsk)
302{ 305{
303 tsk->thread.has_fpu = 1; 306 tsk->thread.fpu.has_fpu = 1;
307 percpu_write(fpu_owner_task, tsk);
304} 308}
305 309
306/* 310/*
@@ -345,18 +349,22 @@ typedef struct { int preload; } fpu_switch_t;
345 * We don't do that yet, so "fpu_lazy_restore()" always returns 349 * We don't do that yet, so "fpu_lazy_restore()" always returns
346 * false, but some day.. 350 * false, but some day..
347 */ 351 */
348#define fpu_lazy_restore(tsk) (0) 352static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
349#define fpu_lazy_state_intact(tsk) do { } while (0) 353{
354 return new == percpu_read_stable(fpu_owner_task) &&
355 cpu == new->thread.fpu.last_cpu;
356}
350 357
351static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new) 358static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu)
352{ 359{
353 fpu_switch_t fpu; 360 fpu_switch_t fpu;
354 361
355 fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; 362 fpu.preload = tsk_used_math(new) && new->fpu_counter > 5;
356 if (__thread_has_fpu(old)) { 363 if (__thread_has_fpu(old)) {
357 if (__save_init_fpu(old)) 364 if (!__save_init_fpu(old))
358 fpu_lazy_state_intact(old); 365 cpu = ~0;
359 __thread_clear_has_fpu(old); 366 old->thread.fpu.last_cpu = cpu;
367 old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */
360 368
361 /* Don't change CR0.TS if we just switch! */ 369 /* Don't change CR0.TS if we just switch! */
362 if (fpu.preload) { 370 if (fpu.preload) {
@@ -367,9 +375,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
367 stts(); 375 stts();
368 } else { 376 } else {
369 old->fpu_counter = 0; 377 old->fpu_counter = 0;
378 old->thread.fpu.last_cpu = ~0;
370 if (fpu.preload) { 379 if (fpu.preload) {
371 new->fpu_counter++; 380 new->fpu_counter++;
372 if (fpu_lazy_restore(new)) 381 if (fpu_lazy_restore(new, cpu))
373 fpu.preload = 0; 382 fpu.preload = 0;
374 else 383 else
375 prefetch(new->thread.fpu.state); 384 prefetch(new->thread.fpu.state);
@@ -463,8 +472,10 @@ static inline void kernel_fpu_begin(void)
463 __save_init_fpu(me); 472 __save_init_fpu(me);
464 __thread_clear_has_fpu(me); 473 __thread_clear_has_fpu(me);
465 /* We do 'stts()' in kernel_fpu_end() */ 474 /* We do 'stts()' in kernel_fpu_end() */
466 } else 475 } else {
476 percpu_write(fpu_owner_task, NULL);
467 clts(); 477 clts();
478 }
468} 479}
469 480
470static inline void kernel_fpu_end(void) 481static inline void kernel_fpu_end(void)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f7c89e231c6c..58545c97d071 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -374,6 +374,8 @@ union thread_xstate {
374}; 374};
375 375
376struct fpu { 376struct fpu {
377 unsigned int last_cpu;
378 unsigned int has_fpu;
377 union thread_xstate *state; 379 union thread_xstate *state;
378}; 380};
379 381
@@ -454,7 +456,6 @@ struct thread_struct {
454 unsigned long trap_no; 456 unsigned long trap_no;
455 unsigned long error_code; 457 unsigned long error_code;
456 /* floating point and extended processor state */ 458 /* floating point and extended processor state */
457 unsigned long has_fpu;
458 struct fpu fpu; 459 struct fpu fpu;
459#ifdef CONFIG_X86_32 460#ifdef CONFIG_X86_32
460 /* Virtual 86 mode info */ 461 /* Virtual 86 mode info */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d43cad74f166..b667148dfad7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1044,6 +1044,8 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
1044 1044
1045DEFINE_PER_CPU(unsigned int, irq_count) = -1; 1045DEFINE_PER_CPU(unsigned int, irq_count) = -1;
1046 1046
1047DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
1048
1047/* 1049/*
1048 * Special IST stacks which the CPU switches to when it calls 1050 * Special IST stacks which the CPU switches to when it calls
1049 * an IST-marked descriptor entry. Up to 7 stacks (hardware 1051 * an IST-marked descriptor entry. Up to 7 stacks (hardware
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index bc32761bc27a..c08d1ff12b7c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -304,7 +304,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
304 304
305 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 305 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
306 306
307 fpu = switch_fpu_prepare(prev_p, next_p); 307 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
308 308
309 /* 309 /*
310 * Reload esp0. 310 * Reload esp0.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 8ad880b3bc1c..cfa5c90c01db 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -389,7 +389,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
389 unsigned fsindex, gsindex; 389 unsigned fsindex, gsindex;
390 fpu_switch_t fpu; 390 fpu_switch_t fpu;
391 391
392 fpu = switch_fpu_prepare(prev_p, next_p); 392 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
393 393
394 /* 394 /*
395 * Reload esp0, LDT and the page table pointer: 395 * Reload esp0, LDT and the page table pointer: