[IA64] cleanup and improve fsys_gettimeofday

This patch does the following:
 - Removes outdated comments (the ones I had earlier marked with "?").
 - Rearranges instructions so that they fit in fewer bundles.
 - If the McKinley Errata 9 workaround is not needed, the workaround bundles will be patched out with NOPs. However, it is also not necessary to have an all-NOP bundle (nop * 3) before the branch.

As a result, this makes the code path 3 (or 2) bundles shorter (and removes 1 unnecessary stop bit).

It seems to be 1% faster. (10sec loop test, with nojitter @ Madison 1.5GHz x 4)

Before:
 CPU 0: 0.14 (usecs) (0 errors / 69598875 iterations)
 CPU 1: 0.14 (usecs) (0 errors / 69630721 iterations)
 CPU 2: 0.14 (usecs) (0 errors / 69607850 iterations)
 CPU 3: 0.14 (usecs) (0 errors / 69619832 iterations)

After:
 CPU 0: 0.14 (usecs) (0 errors / 70257728 iterations)
 CPU 1: 0.14 (usecs) (0 errors / 70309498 iterations)
 CPU 2: 0.14 (usecs) (0 errors / 70280639 iterations)
 CPU 3: 0.14 (usecs) (0 errors / 70260682 iterations)

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
author: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> 2008-01-29 00:39:33 -0500
committer: Tony Luck <tony.luck@intel.com> 2008-03-10 19:35:47 -0400
commit: 4fe01c68eba53c3f324807faff71535218c41e9c (patch)
tree: 43e061a07d84b2ec80cd40e91156d008e4d0ef55 /arch/ia64
parent: cdeeeae056a429e729ae9e914fa8142ee45bee93 (diff)
2 files changed, 16 insertions, 20 deletions
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
index 44841971f077..6a72db7ddecc 100644
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -210,27 +210,25 @@ ENTRY(fsys_gettimeofday)
        // Note that instructions are optimized for McKinley. McKinley can
        // process two bundles simultaneously and therefore we continuously
        // try to feed the CPU two bundles and then a stop.
-        //
-        // Additional note that code has changed a lot. Optimization is TBD.
-        // Comments begin with "?" are maybe outdated.
-        tnat.nz p6,p0 = r31     // ? branch deferred to fit later bundle
-        mov pr = r30,0xc000     // Set predicates according to function
        add r2 = TI_FLAGS+IA64_TASK_SIZE,r16
+        tnat.nz p6,p0 = r31             // guard against Nat argument
+(p6)    br.cond.spnt.few .fail_einval
        movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address
        ;;
+        ld4 r2 = [r2]                   // process work pending flags
        movl r29 = itc_jitter_data      // itc_jitter
        add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20        // wall_time
-        ld4 r2 = [r2]           // process work pending flags
-        ;;
-(p15)   add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20        // monotonic_time
        add r21 = IA64_CLKSRC_MMIO_OFFSET,r20
-        add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29
+        mov pr = r30,0xc000     // Set predicates according to function
+        ;;
        and r2 = TIF_ALLWORK_MASK,r2
-(p6)    br.cond.spnt.few .fail_einval   // ? deferred branch
+        add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29
+(p15)   add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20        // monotonic_time
        ;;
-        add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20 // clksrc_cycle_last
+        add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20     // clksrc_cycle_last
        cmp.ne p6, p0 = 0, r2   // Fallback if work is scheduled
-(p6)    br.cond.spnt.many fsys_fallback_syscall
+(p6)    br.cond.spnt.many fsys_fallback_syscall
        ;;
        // Begin critical section
 .time_redo:
@@ -258,7 +256,6 @@ ENTRY(fsys_gettimeofday)
 (p8)    mov r2 = ar.itc         // CPU_TIMER. 36 clocks latency!!!
 (p9)    ld8 r2 = [r30]          // MMIO_TIMER. Could also have latency issues..
 (p13)   ld8 r25 = [r19]         // get itc_lastcycle value
-        ;;              // ? could be removed by moving the last add upward
        ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET     // tv_sec
        ;;
        ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET    // tv_nsec
@@ -285,13 +282,12 @@ ENTRY(fsys_gettimeofday)
 EX(.fail_efault, probe.w.fault r31, 3)
        xmpy.l f8 = f8,f7       // nsec_per_cyc*(counter-last_counter)
        ;;
-        // ? simulate tbit.nz.or p7,p0 = r28,0
        getf.sig r2 = f8
        mf
        ;;
        ld4 r10 = [r20]         // gtod_lock.sequence
        shr.u r2 = r2,r23       // shift by factor
-        ;;              // ? overloaded 3 bundles!
+        ;;
        add r8 = r8,r2          // Add xtime.nsecs
        cmp4.ne p7,p0 = r28,r10
 (p7)    br.cond.dpnt.few .time_redo     // sequence number changed, redo
@@ -319,9 +315,9 @@ EX(.fail_efault, probe.w.fault r31, 3)
 EX(.fail_efault, probe.w.fault r23, 3)  // This also costs 5 cycles
 (p14)   xmpy.hu f8 = f8, f7             // xmpy has 5 cycles latency so use it
        ;;
-        mov r8 = r0
 (p14)   getf.sig r2 = f8
        ;;
+        mov r8 = r0
 (p14)   shr.u r21 = r2, 4
        ;;
 EX(.fail_efault, st8 [r31] = r9)
diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c
index 2cb9425e0421..e0dca8743dbb 100644
--- a/arch/ia64/kernel/patch.c
+++ b/arch/ia64/kernel/patch.c
@@ -135,10 +135,10 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end)
        while (offp < (s32 *) end) {
                wp = (u64 *) ia64_imva((char *) offp + *offp);
-                wp[0] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */
+                wp[0] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */
-                wp[1] = 0x0004000000000200UL;
+                wp[1] = 0x0084006880000200UL;
-                wp[2] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */
+                wp[2] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */
-                wp[3] = 0x0084006880000200UL;
+                wp[3] = 0x0004000000000200UL;
                ia64_fc(wp); ia64_fc(wp + 2);
                ++offp;
        }
author	Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>	2008-01-29 00:39:33 -0500
committer	Tony Luck <tony.luck@intel.com>	2008-03-10 19:35:47 -0400
commit	4fe01c68eba53c3f324807faff71535218c41e9c (patch)
tree	43e061a07d84b2ec80cd40e91156d008e4d0ef55 /arch/ia64
parent	cdeeeae056a429e729ae9e914fa8142ee45bee93 (diff)

diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 44841971f077..6a72db7ddecc 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S
@@ -210,27 +210,25 @@ ENTRY(fsys_gettimeofday)
210	// Note that instructions are optimized for McKinley. McKinley can	210	// Note that instructions are optimized for McKinley. McKinley can
211	// process two bundles simultaneously and therefore we continuously	211	// process two bundles simultaneously and therefore we continuously
212	// try to feed the CPU two bundles and then a stop.	212	// try to feed the CPU two bundles and then a stop.
213	//	213
214	// Additional note that code has changed a lot. Optimization is TBD.
215	// Comments begin with "?" are maybe outdated.
216	tnat.nz p6,p0 = r31 // ? branch deferred to fit later bundle
217	mov pr = r30,0xc000 // Set predicates according to function
218	add r2 = TI_FLAGS+IA64_TASK_SIZE,r16	214	add r2 = TI_FLAGS+IA64_TASK_SIZE,r16
		215	tnat.nz p6,p0 = r31 // guard against Nat argument
		216	(p6) br.cond.spnt.few .fail_einval
219	movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address	217	movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address
220	;;	218	;;
		219	ld4 r2 = [r2] // process work pending flags
221	movl r29 = itc_jitter_data // itc_jitter	220	movl r29 = itc_jitter_data // itc_jitter
222	add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20 // wall_time	221	add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20 // wall_time
223	ld4 r2 = [r2] // process work pending flags
224	;;
225	(p15) add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20 // monotonic_time
226	add r21 = IA64_CLKSRC_MMIO_OFFSET,r20	222	add r21 = IA64_CLKSRC_MMIO_OFFSET,r20
227	add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29	223	mov pr = r30,0xc000 // Set predicates according to function
		224	;;
228	and r2 = TIF_ALLWORK_MASK,r2	225	and r2 = TIF_ALLWORK_MASK,r2
229	(p6) br.cond.spnt.few .fail_einval // ? deferred branch	226	add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29
		227	(p15) add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20 // monotonic_time
230	;;	228	;;
231	add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20 // clksrc_cycle_last	229	add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20 // clksrc_cycle_last
232	cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled	230	cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled
233	(p6) br.cond.spnt.many fsys_fallback_syscall	231	(p6) br.cond.spnt.many fsys_fallback_syscall
234	;;	232	;;
235	// Begin critical section	233	// Begin critical section
236	.time_redo:	234	.time_redo:
@@ -258,7 +256,6 @@ ENTRY(fsys_gettimeofday)
258	(p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!!	256	(p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!!
259	(p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues..	257	(p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues..
260	(p13) ld8 r25 = [r19] // get itc_lastcycle value	258	(p13) ld8 r25 = [r19] // get itc_lastcycle value
261	;; // ? could be removed by moving the last add upward
262	ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec	259	ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec
263	;;	260	;;
264	ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET // tv_nsec	261	ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET // tv_nsec
@@ -285,13 +282,12 @@ ENTRY(fsys_gettimeofday)
285	EX(.fail_efault, probe.w.fault r31, 3)	282	EX(.fail_efault, probe.w.fault r31, 3)
286	xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter)	283	xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter)
287	;;	284	;;
288	// ? simulate tbit.nz.or p7,p0 = r28,0
289	getf.sig r2 = f8	285	getf.sig r2 = f8
290	mf	286	mf
291	;;	287	;;
292	ld4 r10 = [r20] // gtod_lock.sequence	288	ld4 r10 = [r20] // gtod_lock.sequence
293	shr.u r2 = r2,r23 // shift by factor	289	shr.u r2 = r2,r23 // shift by factor
294	;; // ? overloaded 3 bundles!	290	;;
295	add r8 = r8,r2 // Add xtime.nsecs	291	add r8 = r8,r2 // Add xtime.nsecs
296	cmp4.ne p7,p0 = r28,r10	292	cmp4.ne p7,p0 = r28,r10
297	(p7) br.cond.dpnt.few .time_redo // sequence number changed, redo	293	(p7) br.cond.dpnt.few .time_redo // sequence number changed, redo
@@ -319,9 +315,9 @@ EX(.fail_efault, probe.w.fault r31, 3)
319	EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles	315	EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles
320	(p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it	316	(p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it
321	;;	317	;;
322	mov r8 = r0
323	(p14) getf.sig r2 = f8	318	(p14) getf.sig r2 = f8
324	;;	319	;;
		320	mov r8 = r0
325	(p14) shr.u r21 = r2, 4	321	(p14) shr.u r21 = r2, 4
326	;;	322	;;
327	EX(.fail_efault, st8 [r31] = r9)	323	EX(.fail_efault, st8 [r31] = r9)


diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c index 2cb9425e0421..e0dca8743dbb 100644 --- a/arch/ia64/kernel/patch.c +++ b/arch/ia64/kernel/patch.c
@@ -135,10 +135,10 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end)
135		135
136	while (offp < (s32 *) end) {	136	while (offp < (s32 *) end) {
137	wp = (u64 *) ia64_imva((char *) offp + *offp);	137	wp = (u64 *) ia64_imva((char *) offp + *offp);
138	wp[0] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */	138	wp[0] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */
139	wp[1] = 0x0004000000000200UL;	139	wp[1] = 0x0084006880000200UL;
140	wp[2] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */	140	wp[2] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */
141	wp[3] = 0x0084006880000200UL;	141	wp[3] = 0x0004000000000200UL;
142	ia64_fc(wp); ia64_fc(wp + 2);	142	ia64_fc(wp); ia64_fc(wp + 2);
143	++offp;	143	++offp;
144	}	144	}