aboutsummaryrefslogtreecommitdiffstats
path: root/arch/sh/kernel/process_32.c
diff options
context:
space:
mode:
author Stuart Menefy <stuart.menefy@st.com> 2009-09-25 13:25:10 -0400
committer Paul Mundt <lethal@linux-sh.org> 2009-11-24 03:45:38 -0500
commit d3ea9fa0a563620fe9f416f94bb8927c64390917 (patch)
tree 0aa1278ac7929f936fc4fd8daf235930f6164d18 /arch/sh/kernel/process_32.c
parent 39ac11c1607f1d566e7cf885acd403fa4f07f8a2 (diff)
sh: Minor optimisations to FPU handling
A number of small optimisations to FPU handling, in particular:

- move the task USEDFPU flag from the thread_info flags field (which is accessed asynchronously to the thread) to a new status field, which is only accessed by the thread itself. This allows locking to be removed in most cases, or can be reduced to a preempt_lock(). This mimics the i386 behaviour.

- move the modification of regs->sr and thread_info->status flags out of save_fpu() to __unlazy_fpu(). This gives the compiler a better chance to optimise things, as well as making save_fpu() symmetrical with restore_fpu() and init_fpu().

- implement prepare_to_copy(), so that when creating a thread, we can unlazy the FPU prior to copying the thread data structures.

Also make sure that the FPU is disabled while in the kernel, in particular while booting, and for newly created kernel threads.

In a very artificial benchmark, the execution time for 2500000 context switches was reduced from 50 to 45 seconds.

Signed-off-by: Stuart Menefy <stuart.menefy@st.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Diffstat (limited to 'arch/sh/kernel/process_32.c')
-rw-r--r-- arch/sh/kernel/process_32.c | 24
1 files changed, 16 insertions, 8 deletions
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 7733f5fa6bb5..d721f9297c09 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -134,7 +134,10 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
134 regs.regs[5] = (unsigned long)fn; 134 regs.regs[5] = (unsigned long)fn;
135 135
136 regs.pc = (unsigned long)kernel_thread_helper; 136 regs.pc = (unsigned long)kernel_thread_helper;
137 regs.sr = (1 << 30); 137 regs.sr = SR_MD;
138#if defined(CONFIG_SH_FPU)
139 regs.sr |= SR_FD;
140#endif
138 141
139 /* Ok, create the new process.. */ 142 /* Ok, create the new process.. */
140 pid = do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, 143 pid = do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0,
@@ -189,6 +192,15 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
189} 192}
190EXPORT_SYMBOL(dump_fpu); 193EXPORT_SYMBOL(dump_fpu);
191 194
195/*
196 * This gets called before we allocate a new thread and copy
197 * the current task into it.
198 */
199void prepare_to_copy(struct task_struct *tsk)
200{
201 unlazy_fpu(tsk, task_pt_regs(tsk));
202}
203
192asmlinkage void ret_from_fork(void); 204asmlinkage void ret_from_fork(void);
193 205
194int copy_thread(unsigned long clone_flags, unsigned long usp, 206int copy_thread(unsigned long clone_flags, unsigned long usp,
@@ -197,16 +209,10 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
197{ 209{
198 struct thread_info *ti = task_thread_info(p); 210 struct thread_info *ti = task_thread_info(p);
199 struct pt_regs *childregs; 211 struct pt_regs *childregs;
200#if defined(CONFIG_SH_FPU) || defined(CONFIG_SH_DSP) 212#if defined(CONFIG_SH_DSP)
201 struct task_struct *tsk = current; 213 struct task_struct *tsk = current;
202#endif 214#endif
203 215
204#if defined(CONFIG_SH_FPU)
205 unlazy_fpu(tsk, regs);
206 p->thread.fpu = tsk->thread.fpu;
207 copy_to_stopped_child_used_math(p);
208#endif
209
210#if defined(CONFIG_SH_DSP) 216#if defined(CONFIG_SH_DSP)
211 if (is_dsp_enabled(tsk)) { 217 if (is_dsp_enabled(tsk)) {
212 /* We can use the __save_dsp or just copy the struct: 218 /* We can use the __save_dsp or just copy the struct:
@@ -226,6 +232,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
226 } else { 232 } else {
227 childregs->regs[15] = (unsigned long)childregs; 233 childregs->regs[15] = (unsigned long)childregs;
228 ti->addr_limit = KERNEL_DS; 234 ti->addr_limit = KERNEL_DS;
235 ti->status &= ~TS_USEDFPU;
236 p->fpu_counter = 0;
229 } 237 }
230 238
231 if (clone_flags & CLONE_SETTLS) 239 if (clone_flags & CLONE_SETTLS)