[IA64] Reschedule __kernel_syscall_via_epc().

Avoid some stalls, which saves about 2 cycles when invoking a light-weight handler. When invoking a heavy-weight handler, this helps by about 7 cycles, with most of the improvement coming from the better branch prediction achieved by splitting the BBB bundle into two MIB bundles.

Signed-off-by: David Mosberger-Tang <davidm@hpl.hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
author: David Mosberger-Tang <davidm@hpl.hp.com> 2005-04-28 00:19:37 -0400
committer: Tony Luck <tony.luck@intel.com> 2005-04-28 00:19:37 -0400
commit: 70929a57cfea8c18de13fcea9ae6771018a98949 (patch)
tree: 1371e183617f368b7a92b185a2dee829c70d0efd
parent: f8fa5448fc9b4a7806b1297a0b57808f12fe4d43 (diff)
1 files changed, 18 insertions, 13 deletions
diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S
index facf75acdc85..3cd3f2e971f6 100644
--- a/arch/ia64/kernel/gate.S
+++ b/arch/ia64/kernel/gate.S
@@ -79,31 +79,34 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
        ;;
        rsm psr.be // note: on McKinley "rsm psr.be/srlz.d" is slightly faster than "rum psr.be"
        LOAD_FSYSCALL_TABLE(r14)
+        ;;
        mov r16=IA64_KR(CURRENT)                // 12 cycle read latency
-        tnat.nz p10,p9=r15
+        shladd r18=r17,3,r14
        mov r19=NR_syscalls-1
        ;;
-        shladd r18=r17,3,r14
+        lfetch [r18]                            // M0|1
+        mov r29=psr                             // read psr (12 cyc load latency)
-        srlz.d
-        cmp.ne p8,p0=r0,r0                      // p8 <- FALSE
        /* Note: if r17 is a NaT, p6 will be set to zero.  */
        cmp.geu p6,p7=r19,r17                   // (syscall > 0 && syscall < 1024+NR_syscalls)?
        ;;
-(p6)    ld8 r18=[r18]
        mov r21=ar.fpsr
-        add r14=-8,r14                          // r14 <- addr of fsys_bubble_down entry
+        tnat.nz p10,p9=r15
+        mov r26=ar.pfs
        ;;
+        srlz.d
+(p6)    ld8 r18=[r18]
+        nop.i 0
+        ;;
+        nop.m 0
 (p6)    mov b7=r18
-(p6)    tbit.z p8,p0=r18,0
+(p6)    tbit.z.unc p8,p0=r18,0
+        nop.m 0
+        nop.i 0
 (p8)    br.dptk.many b7
-(p6)    rsm psr.i
        mov r27=ar.rsc
-        mov r26=ar.pfs
+(p6)    rsm psr.i
-        ;;
-        mov r29=psr                             // read psr (12 cyc load latency)
 /*
 * brl.cond doesn't work as intended because the linker would convert this branch
 * into a branch to a PLT.  Perhaps there will be a way to avoid this with some
@@ -111,6 +114,8 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
 * instead.
 */
 #ifdef CONFIG_ITANIUM
+        add r14=-8,r14                          // r14 <- addr of fsys_bubble_down entry
+        ;;
 (p6)    ld8 r14=[r14]                           // r14 <- fsys_bubble_down
        ;;
 (p6)    mov b7=r14
author	David Mosberger-Tang <davidm@hpl.hp.com>	2005-04-28 00:19:37 -0400
committer	Tony Luck <tony.luck@intel.com>	2005-04-28 00:19:37 -0400
commit	70929a57cfea8c18de13fcea9ae6771018a98949 (patch)
tree	1371e183617f368b7a92b185a2dee829c70d0efd
parent	f8fa5448fc9b4a7806b1297a0b57808f12fe4d43 (diff)

diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S
index facf75acdc85..3cd3f2e971f6 100644
--- a/arch/ia64/kernel/gate.S
+++ b/arch/ia64/kernel/gate.S
@@ -79,31 +79,34 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
79	;;	79	;;
80	rsm psr.be // note: on McKinley "rsm psr.be/srlz.d" is slightly faster than "rum psr.be"	80	rsm psr.be // note: on McKinley "rsm psr.be/srlz.d" is slightly faster than "rum psr.be"
81	LOAD_FSYSCALL_TABLE(r14)	81	LOAD_FSYSCALL_TABLE(r14)
82		82	;;
83	mov r16=IA64_KR(CURRENT) // 12 cycle read latency	83	mov r16=IA64_KR(CURRENT) // 12 cycle read latency
84	tnat.nz p10,p9=r15	84	shladd r18=r17,3,r14
85	mov r19=NR_syscalls-1	85	mov r19=NR_syscalls-1
86	;;	86	;;
87	shladd r18=r17,3,r14	87	lfetch [r18] // M0|1
88		88	mov r29=psr // read psr (12 cyc load latency)
89	srlz.d
90	cmp.ne p8,p0=r0,r0 // p8 <- FALSE
91	/* Note: if r17 is a NaT, p6 will be set to zero. */	89	/* Note: if r17 is a NaT, p6 will be set to zero. */
92	cmp.geu p6,p7=r19,r17 // (syscall > 0 && syscall < 1024+NR_syscalls)?	90	cmp.geu p6,p7=r19,r17 // (syscall > 0 && syscall < 1024+NR_syscalls)?
93	;;	91	;;
94	(p6) ld8 r18=[r18]
95	mov r21=ar.fpsr	92	mov r21=ar.fpsr
96	add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry	93	tnat.nz p10,p9=r15
		94	mov r26=ar.pfs
97	;;	95	;;
		96	srlz.d
		97	(p6) ld8 r18=[r18]
		98	nop.i 0
		99	;;
		100	nop.m 0
98	(p6) mov b7=r18	101	(p6) mov b7=r18
99	(p6) tbit.z p8,p0=r18,0	102	(p6) tbit.z.unc p8,p0=r18,0
		103
		104	nop.m 0
		105	nop.i 0
100	(p8) br.dptk.many b7	106	(p8) br.dptk.many b7
101		107
102	(p6) rsm psr.i
103	mov r27=ar.rsc	108	mov r27=ar.rsc
104	mov r26=ar.pfs	109	(p6) rsm psr.i
105	;;
106	mov r29=psr // read psr (12 cyc load latency)
107	/*	110	/*
108	* brl.cond doesn't work as intended because the linker would convert this branch	111	* brl.cond doesn't work as intended because the linker would convert this branch
109	* into a branch to a PLT. Perhaps there will be a way to avoid this with some	112	* into a branch to a PLT. Perhaps there will be a way to avoid this with some
@@ -111,6 +114,8 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
111	* instead.	114	* instead.
112	*/	115	*/
113	#ifdef CONFIG_ITANIUM	116	#ifdef CONFIG_ITANIUM
		117	add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry
		118	;;
114	(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down	119	(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down
115	;;	120	;;
116	(p6) mov b7=r14	121	(p6) mov b7=r14