diff options
| author | David Mosberger-Tang <davidm@hpl.hp.com> | 2005-04-28 00:19:37 -0400 |
|---|---|---|
| committer | Tony Luck <tony.luck@intel.com> | 2005-04-28 00:19:37 -0400 |
| commit | 70929a57cfea8c18de13fcea9ae6771018a98949 (patch) | |
| tree | 1371e183617f368b7a92b185a2dee829c70d0efd | |
| parent | f8fa5448fc9b4a7806b1297a0b57808f12fe4d43 (diff) | |
[IA64] Reschedule __kernel_syscall_via_epc().
Avoid some stalls, which is good for about 2 cycles when invoking a
light-weight handler. When invoking a heavy-weight handler, this
helps by about 7 cycles, with most of the improvement coming from the
improved branch-prediction achieved by splitting the BBB bundle into
two MIB bundles.
Signed-off-by: David Mosberger-Tang <davidm@hpl.hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
| -rw-r--r-- | arch/ia64/kernel/gate.S | 31 |
1 file changed, 18 insertions(+), 13 deletions(-)
diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S index facf75acdc85..3cd3f2e971f6 100644 --- a/arch/ia64/kernel/gate.S +++ b/arch/ia64/kernel/gate.S | |||
| @@ -79,31 +79,34 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) | |||
| 79 | ;; | 79 | ;; |
| 80 | rsm psr.be // note: on McKinley "rsm psr.be/srlz.d" is slightly faster than "rum psr.be" | 80 | rsm psr.be // note: on McKinley "rsm psr.be/srlz.d" is slightly faster than "rum psr.be" |
| 81 | LOAD_FSYSCALL_TABLE(r14) | 81 | LOAD_FSYSCALL_TABLE(r14) |
| 82 | 82 | ;; | |
| 83 | mov r16=IA64_KR(CURRENT) // 12 cycle read latency | 83 | mov r16=IA64_KR(CURRENT) // 12 cycle read latency |
| 84 | tnat.nz p10,p9=r15 | 84 | shladd r18=r17,3,r14 |
| 85 | mov r19=NR_syscalls-1 | 85 | mov r19=NR_syscalls-1 |
| 86 | ;; | 86 | ;; |
| 87 | shladd r18=r17,3,r14 | 87 | lfetch [r18] // M0|1 |
| 88 | 88 | mov r29=psr // read psr (12 cyc load latency) | |
| 89 | srlz.d | ||
| 90 | cmp.ne p8,p0=r0,r0 // p8 <- FALSE | ||
| 91 | /* Note: if r17 is a NaT, p6 will be set to zero. */ | 89 | /* Note: if r17 is a NaT, p6 will be set to zero. */ |
| 92 | cmp.geu p6,p7=r19,r17 // (syscall > 0 && syscall < 1024+NR_syscalls)? | 90 | cmp.geu p6,p7=r19,r17 // (syscall > 0 && syscall < 1024+NR_syscalls)? |
| 93 | ;; | 91 | ;; |
| 94 | (p6) ld8 r18=[r18] | ||
| 95 | mov r21=ar.fpsr | 92 | mov r21=ar.fpsr |
| 96 | add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry | 93 | tnat.nz p10,p9=r15 |
| 94 | mov r26=ar.pfs | ||
| 97 | ;; | 95 | ;; |
| 96 | srlz.d | ||
| 97 | (p6) ld8 r18=[r18] | ||
| 98 | nop.i 0 | ||
| 99 | ;; | ||
| 100 | nop.m 0 | ||
| 98 | (p6) mov b7=r18 | 101 | (p6) mov b7=r18 |
| 99 | (p6) tbit.z p8,p0=r18,0 | 102 | (p6) tbit.z.unc p8,p0=r18,0 |
| 103 | |||
| 104 | nop.m 0 | ||
| 105 | nop.i 0 | ||
| 100 | (p8) br.dptk.many b7 | 106 | (p8) br.dptk.many b7 |
| 101 | 107 | ||
| 102 | (p6) rsm psr.i | ||
| 103 | mov r27=ar.rsc | 108 | mov r27=ar.rsc |
| 104 | mov r26=ar.pfs | 109 | (p6) rsm psr.i |
| 105 | ;; | ||
| 106 | mov r29=psr // read psr (12 cyc load latency) | ||
| 107 | /* | 110 | /* |
| 108 | * brl.cond doesn't work as intended because the linker would convert this branch | 111 | * brl.cond doesn't work as intended because the linker would convert this branch |
| 109 | * into a branch to a PLT. Perhaps there will be a way to avoid this with some | 112 | * into a branch to a PLT. Perhaps there will be a way to avoid this with some |
| @@ -111,6 +114,8 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) | |||
| 111 | * instead. | 114 | * instead. |
| 112 | */ | 115 | */ |
| 113 | #ifdef CONFIG_ITANIUM | 116 | #ifdef CONFIG_ITANIUM |
| 117 | add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry | ||
| 118 | ;; | ||
| 114 | (p6) ld8 r14=[r14] // r14 <- fsys_bubble_down | 119 | (p6) ld8 r14=[r14] // r14 <- fsys_bubble_down |
| 115 | ;; | 120 | ;; |
| 116 | (p6) mov b7=r14 | 121 | (p6) mov b7=r14 |
