diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-02-19 16:27:00 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-02-20 13:58:54 -0500 |
commit | 7e16838d94b566a17b65231073d179bc04d590c8 (patch) | |
tree | 356ae3999d89d2419fd4b85b062a24820f4a4d82 | |
parent | 80ab6f1e8c981b1b6604b2f22e36c917526235cd (diff) |
i387: support lazy restore of FPU state
This makes us recognize when we try to restore FPU state that matches
what we already have in the FPU on this CPU, and avoids the restore
entirely if so.
To do this, we add two new data fields:
- a percpu 'fpu_owner_task' variable that gets written any time we
update the "has_fpu" field, and thus acts as a kind of back-pointer
to the task that owns the FPU on this CPU. The exception is when we save the FPU
state as part of a context switch - if the save can keep the FPU
state around, we leave the 'fpu_owner_task' variable pointing at the
task whose FP state still remains on the CPU.
- a per-thread 'last_cpu' field that indicates which CPU that thread
used its FPU on last. We update this on every context switch
(writing an invalid CPU number if the last context switch didn't
leave the FPU in a lazily usable state), so we know that *that*
thread has done nothing else with the FPU since.
These two fields together can be used when next switching back to the
task to see if the CPU still matches: if 'fpu_owner_task' matches the
task we are switching to, we know that no other task (or kernel FPU
usage) touched the FPU on this CPU in the meantime, and if the current
CPU number matches the 'last_cpu' field, we know that this thread did no
other FP work on any other CPU, so the FPU state on the CPU must match
what was saved on last context switch.
In that case, we can avoid the 'f[x]rstor' entirely, and just clear the
CR0.TS bit.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | arch/x86/include/asm/i387.h | 35 | ||||
-rw-r--r-- | arch/x86/include/asm/processor.h | 3 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/process_32.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/process_64.c | 2 |
5 files changed, 29 insertions, 15 deletions
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 74c607b37e87..247904945d3f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h | |||
@@ -32,6 +32,8 @@ extern int init_fpu(struct task_struct *child); | |||
32 | extern void math_state_restore(void); | 32 | extern void math_state_restore(void); |
33 | extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); | 33 | extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); |
34 | 34 | ||
35 | DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); | ||
36 | |||
35 | extern user_regset_active_fn fpregs_active, xfpregs_active; | 37 | extern user_regset_active_fn fpregs_active, xfpregs_active; |
36 | extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, | 38 | extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, |
37 | xstateregs_get; | 39 | xstateregs_get; |
@@ -276,7 +278,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk) | |||
276 | "emms\n\t" /* clear stack tags */ | 278 | "emms\n\t" /* clear stack tags */ |
277 | "fildl %P[addr]", /* set F?P to defined value */ | 279 | "fildl %P[addr]", /* set F?P to defined value */ |
278 | X86_FEATURE_FXSAVE_LEAK, | 280 | X86_FEATURE_FXSAVE_LEAK, |
279 | [addr] "m" (tsk->thread.has_fpu)); | 281 | [addr] "m" (tsk->thread.fpu.has_fpu)); |
280 | 282 | ||
281 | return fpu_restore_checking(&tsk->thread.fpu); | 283 | return fpu_restore_checking(&tsk->thread.fpu); |
282 | } | 284 | } |
@@ -288,19 +290,21 @@ static inline int restore_fpu_checking(struct task_struct *tsk) | |||
288 | */ | 290 | */ |
289 | static inline int __thread_has_fpu(struct task_struct *tsk) | 291 | static inline int __thread_has_fpu(struct task_struct *tsk) |
290 | { | 292 | { |
291 | return tsk->thread.has_fpu; | 293 | return tsk->thread.fpu.has_fpu; |
292 | } | 294 | } |
293 | 295 | ||
294 | /* Must be paired with an 'stts' after! */ | 296 | /* Must be paired with an 'stts' after! */ |
295 | static inline void __thread_clear_has_fpu(struct task_struct *tsk) | 297 | static inline void __thread_clear_has_fpu(struct task_struct *tsk) |
296 | { | 298 | { |
297 | tsk->thread.has_fpu = 0; | 299 | tsk->thread.fpu.has_fpu = 0; |
300 | percpu_write(fpu_owner_task, NULL); | ||
298 | } | 301 | } |
299 | 302 | ||
300 | /* Must be paired with a 'clts' before! */ | 303 | /* Must be paired with a 'clts' before! */ |
301 | static inline void __thread_set_has_fpu(struct task_struct *tsk) | 304 | static inline void __thread_set_has_fpu(struct task_struct *tsk) |
302 | { | 305 | { |
303 | tsk->thread.has_fpu = 1; | 306 | tsk->thread.fpu.has_fpu = 1; |
307 | percpu_write(fpu_owner_task, tsk); | ||
304 | } | 308 | } |
305 | 309 | ||
306 | /* | 310 | /* |
@@ -345,18 +349,22 @@ typedef struct { int preload; } fpu_switch_t; | |||
345 | * We don't do that yet, so "fpu_lazy_restore()" always returns | 349 | * We don't do that yet, so "fpu_lazy_restore()" always returns |
346 | * false, but some day.. | 350 | * false, but some day.. |
347 | */ | 351 | */ |
348 | #define fpu_lazy_restore(tsk) (0) | 352 | static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) |
349 | #define fpu_lazy_state_intact(tsk) do { } while (0) | 353 | { |
354 | return new == percpu_read_stable(fpu_owner_task) && | ||
355 | cpu == new->thread.fpu.last_cpu; | ||
356 | } | ||
350 | 357 | ||
351 | static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new) | 358 | static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) |
352 | { | 359 | { |
353 | fpu_switch_t fpu; | 360 | fpu_switch_t fpu; |
354 | 361 | ||
355 | fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; | 362 | fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; |
356 | if (__thread_has_fpu(old)) { | 363 | if (__thread_has_fpu(old)) { |
357 | if (__save_init_fpu(old)) | 364 | if (!__save_init_fpu(old)) |
358 | fpu_lazy_state_intact(old); | 365 | cpu = ~0; |
359 | __thread_clear_has_fpu(old); | 366 | old->thread.fpu.last_cpu = cpu; |
367 | old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ | ||
360 | 368 | ||
361 | /* Don't change CR0.TS if we just switch! */ | 369 | /* Don't change CR0.TS if we just switch! */ |
362 | if (fpu.preload) { | 370 | if (fpu.preload) { |
@@ -367,9 +375,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta | |||
367 | stts(); | 375 | stts(); |
368 | } else { | 376 | } else { |
369 | old->fpu_counter = 0; | 377 | old->fpu_counter = 0; |
378 | old->thread.fpu.last_cpu = ~0; | ||
370 | if (fpu.preload) { | 379 | if (fpu.preload) { |
371 | new->fpu_counter++; | 380 | new->fpu_counter++; |
372 | if (fpu_lazy_restore(new)) | 381 | if (fpu_lazy_restore(new, cpu)) |
373 | fpu.preload = 0; | 382 | fpu.preload = 0; |
374 | else | 383 | else |
375 | prefetch(new->thread.fpu.state); | 384 | prefetch(new->thread.fpu.state); |
@@ -463,8 +472,10 @@ static inline void kernel_fpu_begin(void) | |||
463 | __save_init_fpu(me); | 472 | __save_init_fpu(me); |
464 | __thread_clear_has_fpu(me); | 473 | __thread_clear_has_fpu(me); |
465 | /* We do 'stts()' in kernel_fpu_end() */ | 474 | /* We do 'stts()' in kernel_fpu_end() */ |
466 | } else | 475 | } else { |
476 | percpu_write(fpu_owner_task, NULL); | ||
467 | clts(); | 477 | clts(); |
478 | } | ||
468 | } | 479 | } |
469 | 480 | ||
470 | static inline void kernel_fpu_end(void) | 481 | static inline void kernel_fpu_end(void) |
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f7c89e231c6c..58545c97d071 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h | |||
@@ -374,6 +374,8 @@ union thread_xstate { | |||
374 | }; | 374 | }; |
375 | 375 | ||
376 | struct fpu { | 376 | struct fpu { |
377 | unsigned int last_cpu; | ||
378 | unsigned int has_fpu; | ||
377 | union thread_xstate *state; | 379 | union thread_xstate *state; |
378 | }; | 380 | }; |
379 | 381 | ||
@@ -454,7 +456,6 @@ struct thread_struct { | |||
454 | unsigned long trap_no; | 456 | unsigned long trap_no; |
455 | unsigned long error_code; | 457 | unsigned long error_code; |
456 | /* floating point and extended processor state */ | 458 | /* floating point and extended processor state */ |
457 | unsigned long has_fpu; | ||
458 | struct fpu fpu; | 459 | struct fpu fpu; |
459 | #ifdef CONFIG_X86_32 | 460 | #ifdef CONFIG_X86_32 |
460 | /* Virtual 86 mode info */ | 461 | /* Virtual 86 mode info */ |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d43cad74f166..b667148dfad7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -1044,6 +1044,8 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = | |||
1044 | 1044 | ||
1045 | DEFINE_PER_CPU(unsigned int, irq_count) = -1; | 1045 | DEFINE_PER_CPU(unsigned int, irq_count) = -1; |
1046 | 1046 | ||
1047 | DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); | ||
1048 | |||
1047 | /* | 1049 | /* |
1048 | * Special IST stacks which the CPU switches to when it calls | 1050 | * Special IST stacks which the CPU switches to when it calls |
1049 | * an IST-marked descriptor entry. Up to 7 stacks (hardware | 1051 | * an IST-marked descriptor entry. Up to 7 stacks (hardware |
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bc32761bc27a..c08d1ff12b7c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -304,7 +304,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
304 | 304 | ||
305 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ | 305 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ |
306 | 306 | ||
307 | fpu = switch_fpu_prepare(prev_p, next_p); | 307 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); |
308 | 308 | ||
309 | /* | 309 | /* |
310 | * Reload esp0. | 310 | * Reload esp0. |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 8ad880b3bc1c..cfa5c90c01db 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -389,7 +389,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
389 | unsigned fsindex, gsindex; | 389 | unsigned fsindex, gsindex; |
390 | fpu_switch_t fpu; | 390 | fpu_switch_t fpu; |
391 | 391 | ||
392 | fpu = switch_fpu_prepare(prev_p, next_p); | 392 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); |
393 | 393 | ||
394 | /* | 394 | /* |
395 | * Reload esp0, LDT and the page table pointer: | 395 | * Reload esp0, LDT and the page table pointer: |