author     Jan Beulich <JBeulich@suse.com>    2012-01-26 10:55:32 -0500
committer  Ingo Molnar <mingo@elte.hu>        2012-01-26 15:19:20 -0500
commit     9d8e22777e66f420e46490e9fc6f8cb7e0e2222b (patch)
tree       dd0ec6122dda1409206dda70f6ae4fd3c9a2cd35 /arch/x86/lib
parent     2ab560911a427fdc73bfd3a7d2944d8ee0ca6db8 (diff)
x86-64: Handle byte-wise tail copying in memcpy() without a loop
While hard to measure, reducing the number of possibly/likely
mis-predicted branches can generally be expected to be slightly
better. Contrary to what might appear at first glance, this also
doesn't grow the function size (the alignment gap to the next
function just gets smaller).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/4F218584020000780006F422@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/lib')
-rw-r--r--  arch/x86/lib/memcpy_64.S | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 1235b04a9a60..1c273be7c97e 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -164,18 +164,19 @@ ENTRY(memcpy)
 	retq
 	.p2align 4
 .Lless_3bytes:
-	cmpl $0, %edx
-	je .Lend
+	subl $1, %edx
+	jb .Lend
 	/*
 	 * Move data from 1 bytes to 3 bytes.
 	 */
-.Lloop_1:
-	movb (%rsi), %r8b
-	movb %r8b, (%rdi)
-	incq %rdi
-	incq %rsi
-	decl %edx
-	jnz .Lloop_1
+	movzbl (%rsi), %ecx
+	jz .Lstore_1byte
+	movzbq 1(%rsi), %r8
+	movzbq (%rsi, %rdx), %r9
+	movb %r8b, 1(%rdi)
+	movb %r9b, (%rdi, %rdx)
+.Lstore_1byte:
+	movb %cl, (%rdi)
 
 .Lend:
 	retq
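
For readers who find the AT&T syntax above hard to follow, the new tail path
corresponds roughly to the C sketch below. This is illustrative only and not
part of the patch; the function name copy_tail_upto3 and its signature are
made up for this example.

#include <stddef.h>

/*
 * Rough C rendering of the new 1..3-byte tail copy: instead of looping
 * once per byte, fetch the first, second and last byte up front and
 * store them with possibly overlapping writes.
 */
void copy_tail_upto3(unsigned char *dst, const unsigned char *src,
		     size_t len)
{
	unsigned char first;

	if (len == 0)			/* subl $1, %edx ; jb .Lend */
		return;

	first = src[0];			/* movzbl (%rsi), %ecx */
	if (len > 1) {			/* jz .Lstore_1byte when len == 1 */
		unsigned char second = src[1];		/* movzbq 1(%rsi), %r8 */
		unsigned char last = src[len - 1];	/* movzbq (%rsi, %rdx), %r9 */

		dst[1] = second;	/* movb %r8b, 1(%rdi) */
		dst[len - 1] = last;	/* movb %r9b, (%rdi, %rdx) */
	}
	dst[0] = first;			/* movb %cl, (%rdi) */
}

The point of the rewrite is visible in the sketch: for a 2- or 3-byte tail the
second and last stores are allowed to overlap (for len == 2 they hit the same
byte), so the whole remainder is copied with straight-line code instead of the
old one-byte-per-iteration loop and its data-dependent branch.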