diff options
author | Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> | 2008-01-29 00:39:33 -0500 |
---|---|---|
committer | Tony Luck <tony.luck@intel.com> | 2008-03-10 19:35:47 -0400 |
commit | 4fe01c68eba53c3f324807faff71535218c41e9c (patch) | |
tree | 43e061a07d84b2ec80cd40e91156d008e4d0ef55 /arch/ia64/kernel | |
parent | cdeeeae056a429e729ae9e914fa8142ee45bee93 (diff) |
[IA64] cleanup and improve fsys_gettimeofday
This patch does:
- Remove outdated comments (which someday I marked with "?").
- Reassemble instructions to fit them in fewer bundles.
- If McKinley Errata 9 workaround is not needed, the workaround
bundles will be patched out with NOPs. However it also not
needed to have a totally NOP bundle (nop * 3) before branch.
As a result, this makes the code path 3 (or 2) bundles shorter
(and remove 1 unnecessary stop bit). It seems to be 1% faster.
(10sec loop test, with nojitter @ Madison 1.5GHz x 4)
Before:
CPU 0: 0.14 (usecs) (0 errors / 69598875 iterations)
CPU 1: 0.14 (usecs) (0 errors / 69630721 iterations)
CPU 2: 0.14 (usecs) (0 errors / 69607850 iterations)
CPU 3: 0.14 (usecs) (0 errors / 69619832 iterations)
After:
CPU 0: 0.14 (usecs) (0 errors / 70257728 iterations)
CPU 1: 0.14 (usecs) (0 errors / 70309498 iterations)
CPU 2: 0.14 (usecs) (0 errors / 70280639 iterations)
CPU 3: 0.14 (usecs) (0 errors / 70260682 iterations)
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'arch/ia64/kernel')
-rw-r--r-- | arch/ia64/kernel/fsys.S | 28 | ||||
-rw-r--r-- | arch/ia64/kernel/patch.c | 8 |
2 files changed, 16 insertions, 20 deletions
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 44841971f077..6a72db7ddecc 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S | |||
@@ -210,27 +210,25 @@ ENTRY(fsys_gettimeofday) | |||
210 | // Note that instructions are optimized for McKinley. McKinley can | 210 | // Note that instructions are optimized for McKinley. McKinley can |
211 | // process two bundles simultaneously and therefore we continuously | 211 | // process two bundles simultaneously and therefore we continuously |
212 | // try to feed the CPU two bundles and then a stop. | 212 | // try to feed the CPU two bundles and then a stop. |
213 | // | 213 | |
214 | // Additional note that code has changed a lot. Optimization is TBD. | ||
215 | // Comments begin with "?" are maybe outdated. | ||
216 | tnat.nz p6,p0 = r31 // ? branch deferred to fit later bundle | ||
217 | mov pr = r30,0xc000 // Set predicates according to function | ||
218 | add r2 = TI_FLAGS+IA64_TASK_SIZE,r16 | 214 | add r2 = TI_FLAGS+IA64_TASK_SIZE,r16 |
215 | tnat.nz p6,p0 = r31 // guard against Nat argument | ||
216 | (p6) br.cond.spnt.few .fail_einval | ||
219 | movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address | 217 | movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address |
220 | ;; | 218 | ;; |
219 | ld4 r2 = [r2] // process work pending flags | ||
221 | movl r29 = itc_jitter_data // itc_jitter | 220 | movl r29 = itc_jitter_data // itc_jitter |
222 | add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20 // wall_time | 221 | add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20 // wall_time |
223 | ld4 r2 = [r2] // process work pending flags | ||
224 | ;; | ||
225 | (p15) add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20 // monotonic_time | ||
226 | add r21 = IA64_CLKSRC_MMIO_OFFSET,r20 | 222 | add r21 = IA64_CLKSRC_MMIO_OFFSET,r20 |
227 | add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29 | 223 | mov pr = r30,0xc000 // Set predicates according to function |
224 | ;; | ||
228 | and r2 = TIF_ALLWORK_MASK,r2 | 225 | and r2 = TIF_ALLWORK_MASK,r2 |
229 | (p6) br.cond.spnt.few .fail_einval // ? deferred branch | 226 | add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29 |
227 | (p15) add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20 // monotonic_time | ||
230 | ;; | 228 | ;; |
231 | add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20 // clksrc_cycle_last | 229 | add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20 // clksrc_cycle_last |
232 | cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled | 230 | cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled |
233 | (p6) br.cond.spnt.many fsys_fallback_syscall | 231 | (p6) br.cond.spnt.many fsys_fallback_syscall |
234 | ;; | 232 | ;; |
235 | // Begin critical section | 233 | // Begin critical section |
236 | .time_redo: | 234 | .time_redo: |
@@ -258,7 +256,6 @@ ENTRY(fsys_gettimeofday) | |||
258 | (p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!! | 256 | (p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!! |
259 | (p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues.. | 257 | (p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues.. |
260 | (p13) ld8 r25 = [r19] // get itc_lastcycle value | 258 | (p13) ld8 r25 = [r19] // get itc_lastcycle value |
261 | ;; // ? could be removed by moving the last add upward | ||
262 | ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec | 259 | ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec |
263 | ;; | 260 | ;; |
264 | ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET // tv_nsec | 261 | ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET // tv_nsec |
@@ -285,13 +282,12 @@ ENTRY(fsys_gettimeofday) | |||
285 | EX(.fail_efault, probe.w.fault r31, 3) | 282 | EX(.fail_efault, probe.w.fault r31, 3) |
286 | xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter) | 283 | xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter) |
287 | ;; | 284 | ;; |
288 | // ? simulate tbit.nz.or p7,p0 = r28,0 | ||
289 | getf.sig r2 = f8 | 285 | getf.sig r2 = f8 |
290 | mf | 286 | mf |
291 | ;; | 287 | ;; |
292 | ld4 r10 = [r20] // gtod_lock.sequence | 288 | ld4 r10 = [r20] // gtod_lock.sequence |
293 | shr.u r2 = r2,r23 // shift by factor | 289 | shr.u r2 = r2,r23 // shift by factor |
294 | ;; // ? overloaded 3 bundles! | 290 | ;; |
295 | add r8 = r8,r2 // Add xtime.nsecs | 291 | add r8 = r8,r2 // Add xtime.nsecs |
296 | cmp4.ne p7,p0 = r28,r10 | 292 | cmp4.ne p7,p0 = r28,r10 |
297 | (p7) br.cond.dpnt.few .time_redo // sequence number changed, redo | 293 | (p7) br.cond.dpnt.few .time_redo // sequence number changed, redo |
@@ -319,9 +315,9 @@ EX(.fail_efault, probe.w.fault r31, 3) | |||
319 | EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles | 315 | EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles |
320 | (p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it | 316 | (p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it |
321 | ;; | 317 | ;; |
322 | mov r8 = r0 | ||
323 | (p14) getf.sig r2 = f8 | 318 | (p14) getf.sig r2 = f8 |
324 | ;; | 319 | ;; |
320 | mov r8 = r0 | ||
325 | (p14) shr.u r21 = r2, 4 | 321 | (p14) shr.u r21 = r2, 4 |
326 | ;; | 322 | ;; |
327 | EX(.fail_efault, st8 [r31] = r9) | 323 | EX(.fail_efault, st8 [r31] = r9) |
diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c index 2cb9425e0421..e0dca8743dbb 100644 --- a/arch/ia64/kernel/patch.c +++ b/arch/ia64/kernel/patch.c | |||
@@ -135,10 +135,10 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end) | |||
135 | 135 | ||
136 | while (offp < (s32 *) end) { | 136 | while (offp < (s32 *) end) { |
137 | wp = (u64 *) ia64_imva((char *) offp + *offp); | 137 | wp = (u64 *) ia64_imva((char *) offp + *offp); |
138 | wp[0] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */ | 138 | wp[0] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */ |
139 | wp[1] = 0x0004000000000200UL; | 139 | wp[1] = 0x0084006880000200UL; |
140 | wp[2] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */ | 140 | wp[2] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */ |
141 | wp[3] = 0x0084006880000200UL; | 141 | wp[3] = 0x0004000000000200UL; |
142 | ia64_fc(wp); ia64_fc(wp + 2); | 142 | ia64_fc(wp); ia64_fc(wp + 2); |
143 | ++offp; | 143 | ++offp; |
144 | } | 144 | } |