aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDenys Vlasenko <dvlasenk@redhat.com>2015-03-06 15:55:32 -0500
committerIngo Molnar <mingo@kernel.org>2015-03-07 05:12:43 -0500
commit3e1aa7cb59aff4b245b45e326fcdba1bf7f105c6 (patch)
tree8c61dc1e18de4087fadd2d566f0f72b3782be79d
parenta7fcf28d431ef70afaa91496e64e16dc51dccec4 (diff)
x86/asm: Optimize unnecessarily wide TEST instructions
By the nature of the TEST operation, it is often possible to test a narrower part of the operand: "testl $3, mem" -> "testb $3, mem", "testq $3, %rcx" -> "testb $3, %cl" This results in shorter instructions, because the TEST instruction has no sign-entending byte-immediate forms unlike other ALU ops. Note that this change does not create any LCP (Length-Changing Prefix) stalls, which happen when adding a 0x66 prefix, which happens when 16-bit immediates are used, which changes such TEST instructions: [test_opcode] [modrm] [imm32] to: [0x66] [test_opcode] [modrm] [imm16] where [imm16] has a *different length* now: 2 bytes instead of 4. This confuses the decoder and slows down execution. REX prefixes were carefully designed to almost never hit this case: adding REX prefix does not change instruction length except MOVABS and MOV [addr],RAX instruction. This patch does not add instructions which would use a 0x66 prefix, code changes in assembly are: -48 f7 07 01 00 00 00 testq $0x1,(%rdi) +f6 07 01 testb $0x1,(%rdi) -48 f7 c1 01 00 00 00 test $0x1,%rcx +f6 c1 01 test $0x1,%cl -48 f7 c1 02 00 00 00 test $0x2,%rcx +f6 c1 02 test $0x2,%cl -41 f7 c2 01 00 00 00 test $0x1,%r10d +41 f6 c2 01 test $0x1,%r10b -48 f7 c1 04 00 00 00 test $0x4,%rcx +f6 c1 04 test $0x4,%cl -48 f7 c1 08 00 00 00 test $0x8,%rcx +f6 c1 08 test $0x8,%cl Linus further notes: "There are no stalls from using 8-bit instruction forms. Now, changing from 64-bit or 32-bit 'test' instructions to 8-bit ones *could* cause problems if it ends up having forwarding issues, so that instead of just forwarding the result, you end up having to wait for it to be stable in the L1 cache (or possibly the register file). The forwarding from the store buffer is simplest and most reliable if the read is done at the exact same address and the exact same size as the write that gets forwarded. But that's true only if: (a) the write was very recent and is still in the write queue. I'm not sure that's the case here anyway. (b) on at least most Intel microarchitectures, you have to test a different byte than the lowest one (so forwarding a 64-bit write to a 8-bit read ends up working fine, as long as the 8-bit read is of the low 8 bits of the written data). A very similar issue *might* show up for registers too, not just memory writes, if you use 'testb' with a high-byte register (where instead of forwarding the value from the original producer it needs to go through the register file and then shifted). But it's mainly a problem for store buffers. But afaik, the way Denys changed the test instructions, neither of the above issues should be true. The real problem for store buffer forwarding tends to be "write 8 bits, read 32 bits". That can be really surprisingly expensive, because the read ends up having to wait until the write has hit the cacheline, and we might talk tens of cycles of latency here. But "write 32 bits, read the low 8 bits" *should* be fast on pretty much all x86 chips, afaik." Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com> Acked-by: Andy Lutomirski <luto@amacapital.net> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: H. Peter Anvin <hpa@linux.intel.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Kees Cook <keescook@chromium.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Will Drewry <wad@chromium.org> Link: http://lkml.kernel.org/r/1425675332-31576-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--arch/x86/kernel/head_64.S2
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S8
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S8
-rw-r--r--arch/x86/lib/checksum_32.S4
-rw-r--r--arch/x86/lib/csum-copy_64.S2
5 files changed, 12 insertions, 12 deletions
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 9a0919678f97..ae6588b301c2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -146,7 +146,7 @@ startup_64:
146 leaq level2_kernel_pgt(%rip), %rdi 146 leaq level2_kernel_pgt(%rip), %rdi
147 leaq 4096(%rdi), %r8 147 leaq 4096(%rdi), %r8
148 /* See if it is a valid page table entry */ 148 /* See if it is a valid page table entry */
1491: testq $1, 0(%rdi) 1491: testb $1, 0(%rdi)
150 jz 2f 150 jz 2f
151 addq %rbp, 0(%rdi) 151 addq %rbp, 0(%rdi)
152 /* Go to the next page */ 152 /* Go to the next page */
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index e13f8e7c22a6..77630d57e7bf 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -226,23 +226,23 @@ swap_pages:
226 movl (%ebx), %ecx 226 movl (%ebx), %ecx
227 addl $4, %ebx 227 addl $4, %ebx
2281: 2281:
229 testl $0x1, %ecx /* is it a destination page */ 229 testb $0x1, %cl /* is it a destination page */
230 jz 2f 230 jz 2f
231 movl %ecx, %edi 231 movl %ecx, %edi
232 andl $0xfffff000, %edi 232 andl $0xfffff000, %edi
233 jmp 0b 233 jmp 0b
2342: 2342:
235 testl $0x2, %ecx /* is it an indirection page */ 235 testb $0x2, %cl /* is it an indirection page */
236 jz 2f 236 jz 2f
237 movl %ecx, %ebx 237 movl %ecx, %ebx
238 andl $0xfffff000, %ebx 238 andl $0xfffff000, %ebx
239 jmp 0b 239 jmp 0b
2402: 2402:
241 testl $0x4, %ecx /* is it the done indicator */ 241 testb $0x4, %cl /* is it the done indicator */
242 jz 2f 242 jz 2f
243 jmp 3f 243 jmp 3f
2442: 2442:
245 testl $0x8, %ecx /* is it the source indicator */ 245 testb $0x8, %cl /* is it the source indicator */
246 jz 0b /* Ignore it otherwise */ 246 jz 0b /* Ignore it otherwise */
247 movl %ecx, %esi /* For every source page do a copy */ 247 movl %ecx, %esi /* For every source page do a copy */
248 andl $0xfffff000, %esi 248 andl $0xfffff000, %esi
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 3fd2c693e475..04cb1790a596 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -221,23 +221,23 @@ swap_pages:
221 movq (%rbx), %rcx 221 movq (%rbx), %rcx
222 addq $8, %rbx 222 addq $8, %rbx
2231: 2231:
224 testq $0x1, %rcx /* is it a destination page? */ 224 testb $0x1, %cl /* is it a destination page? */
225 jz 2f 225 jz 2f
226 movq %rcx, %rdi 226 movq %rcx, %rdi
227 andq $0xfffffffffffff000, %rdi 227 andq $0xfffffffffffff000, %rdi
228 jmp 0b 228 jmp 0b
2292: 2292:
230 testq $0x2, %rcx /* is it an indirection page? */ 230 testb $0x2, %cl /* is it an indirection page? */
231 jz 2f 231 jz 2f
232 movq %rcx, %rbx 232 movq %rcx, %rbx
233 andq $0xfffffffffffff000, %rbx 233 andq $0xfffffffffffff000, %rbx
234 jmp 0b 234 jmp 0b
2352: 2352:
236 testq $0x4, %rcx /* is it the done indicator? */ 236 testb $0x4, %cl /* is it the done indicator? */
237 jz 2f 237 jz 2f
238 jmp 3f 238 jmp 3f
2392: 2392:
240 testq $0x8, %rcx /* is it the source indicator? */ 240 testb $0x8, %cl /* is it the source indicator? */
241 jz 0b /* Ignore it otherwise */ 241 jz 0b /* Ignore it otherwise */
242 movq %rcx, %rsi /* For ever source page do a copy */ 242 movq %rcx, %rsi /* For ever source page do a copy */
243 andq $0xfffffffffffff000, %rsi 243 andq $0xfffffffffffff000, %rsi
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index c3b9953d3fa0..9bc944a91274 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -125,7 +125,7 @@ ENTRY(csum_partial)
1256: addl %ecx,%eax 1256: addl %ecx,%eax
126 adcl $0, %eax 126 adcl $0, %eax
1277: 1277:
128 testl $1, 12(%esp) 128 testb $1, 12(%esp)
129 jz 8f 129 jz 8f
130 roll $8, %eax 130 roll $8, %eax
1318: 1318:
@@ -245,7 +245,7 @@ ENTRY(csum_partial)
245 addl %ebx,%eax 245 addl %ebx,%eax
246 adcl $0,%eax 246 adcl $0,%eax
24780: 24780:
248 testl $1, 12(%esp) 248 testb $1, 12(%esp)
249 jz 90f 249 jz 90f
250 roll $8, %eax 250 roll $8, %eax
25190: 25190:
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 2419d5fefae3..9734182966f3 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic)
196 196
197 /* handle last odd byte */ 197 /* handle last odd byte */
198.Lhandle_1: 198.Lhandle_1:
199 testl $1, %r10d 199 testb $1, %r10b
200 jz .Lende 200 jz .Lende
201 xorl %ebx, %ebx 201 xorl %ebx, %ebx
202 source 202 source