aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: David Mosberger-Tang <davidm@hpl.hp.com> 2005-04-28 00:19:37 -0400
committer: Tony Luck <tony.luck@intel.com> 2005-04-28 00:19:37 -0400
commit: 70929a57cfea8c18de13fcea9ae6771018a98949 (patch)
tree: 1371e183617f368b7a92b185a2dee829c70d0efd
parent: f8fa5448fc9b4a7806b1297a0b57808f12fe4d43 (diff)
[IA64] Reschedule __kernel_syscall_via_epc().
Avoid some stalls, which is good for about 2 cycles when invoking a light-weight handler. When invoking a heavy-weight handler, this helps by about 7 cycles, with most of the improvement coming from the improved branch-prediction achieved by splitting the BBB bundle into two MIB bundles.

Signed-off-by: David Mosberger-Tang <davidm@hpl.hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
-rw-r--r-- arch/ia64/kernel/gate.S 31
1 file changed, 18 insertions, 13 deletions
diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S
index facf75acdc85..3cd3f2e971f6 100644
--- a/arch/ia64/kernel/gate.S
+++ b/arch/ia64/kernel/gate.S
@@ -79,31 +79,34 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
79 ;; 79 ;;
80 rsm psr.be // note: on McKinley "rsm psr.be/srlz.d" is slightly faster than "rum psr.be" 80 rsm psr.be // note: on McKinley "rsm psr.be/srlz.d" is slightly faster than "rum psr.be"
81 LOAD_FSYSCALL_TABLE(r14) 81 LOAD_FSYSCALL_TABLE(r14)
82 82 ;;
83 mov r16=IA64_KR(CURRENT) // 12 cycle read latency 83 mov r16=IA64_KR(CURRENT) // 12 cycle read latency
84 tnat.nz p10,p9=r15 84 shladd r18=r17,3,r14
85 mov r19=NR_syscalls-1 85 mov r19=NR_syscalls-1
86 ;; 86 ;;
87 shladd r18=r17,3,r14 87 lfetch [r18] // M0|1
88 88 mov r29=psr // read psr (12 cyc load latency)
89 srlz.d
90 cmp.ne p8,p0=r0,r0 // p8 <- FALSE
91 /* Note: if r17 is a NaT, p6 will be set to zero. */ 89 /* Note: if r17 is a NaT, p6 will be set to zero. */
92 cmp.geu p6,p7=r19,r17 // (syscall > 0 && syscall < 1024+NR_syscalls)? 90 cmp.geu p6,p7=r19,r17 // (syscall > 0 && syscall < 1024+NR_syscalls)?
93 ;; 91 ;;
94(p6) ld8 r18=[r18]
95 mov r21=ar.fpsr 92 mov r21=ar.fpsr
96 add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry 93 tnat.nz p10,p9=r15
94 mov r26=ar.pfs
97 ;; 95 ;;
96 srlz.d
97(p6) ld8 r18=[r18]
98 nop.i 0
99 ;;
100 nop.m 0
98(p6) mov b7=r18 101(p6) mov b7=r18
99(p6) tbit.z p8,p0=r18,0 102(p6) tbit.z.unc p8,p0=r18,0
103
104 nop.m 0
105 nop.i 0
100(p8) br.dptk.many b7 106(p8) br.dptk.many b7
101 107
102(p6) rsm psr.i
103 mov r27=ar.rsc 108 mov r27=ar.rsc
104 mov r26=ar.pfs 109(p6) rsm psr.i
105 ;;
106 mov r29=psr // read psr (12 cyc load latency)
107/* 110/*
108 * brl.cond doesn't work as intended because the linker would convert this branch 111 * brl.cond doesn't work as intended because the linker would convert this branch
109 * into a branch to a PLT. Perhaps there will be a way to avoid this with some 112 * into a branch to a PLT. Perhaps there will be a way to avoid this with some
@@ -111,6 +114,8 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
111 * instead. 114 * instead.
112 */ 115 */
113#ifdef CONFIG_ITANIUM 116#ifdef CONFIG_ITANIUM
117 add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry
118 ;;
114(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down 119(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down
115 ;; 120 ;;
116(p6) mov b7=r14 121(p6) mov b7=r14