[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions

They cause quite bad performance regressions on Netburst. This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to the 2.6.14 level. The only change is that I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD, fixed the check for the flag, and also fixed some comments. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Andi Kleen <ak@suse.de> 2006-02-03 15:51:02 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-02-04 19:43:13 -0500
commit: 7bcd3f34e262bbebffa954d80eab3a84f053da31 (patch)
tree: f0765da9eaa8024a2b1d67d3e43730cb32f99fa7 /arch/x86_64/lib/memcpy.S
parent: 6bca52b544489b626c7d0db801df6b4aa3d5adb5 (diff)
1 files changed, 91 insertions, 2 deletions
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index 92dd80544602..5554948b5554 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,8 +11,6 @@
 * 
 * Output:
 * rax original destination
- * 
- * TODO: check best memcpy for PSC
 */     
        .globl __memcpy
@@ -20,6 +18,95 @@
        .p2align 4
 __memcpy:                         /* second name for the same entry point as memcpy below */
 memcpy:         /* forward copy. in: rdi = destination, rsi = source, rdx = count. out: rax = original destination */
+        pushq %rbx                /* rbx is saved here and restored at .Lende; nothing in between references it */
+        movq %rdi,%rax            /* return value: destination pointer before it is advanced */
+        movl %edx,%ecx            /* NOTE(review): only the low 32 bits of the count in rdx are consumed */
+        shrl $6,%ecx              /* ecx = number of whole 64-byte chunks */
+        jz .Lhandle_tail          /* no whole 64-byte chunk: skip the unrolled loop */
+        .p2align 4
+.Lloop_64:                        /* each iteration copies 64 bytes as eight 8-byte load/store pairs */
+        decl %ecx                 /* sets ZF for the jnz at the loop bottom; the movq/leaq in between do not modify flags */
+        movq (%rsi),%r11          /* r8-r11 serve as temporaries: two loads, then the two matching stores */
+        movq 8(%rsi),%r8
+        movq %r11,(%rdi)
+        movq %r8,1*8(%rdi)
+        movq 2*8(%rsi),%r9
+        movq 3*8(%rsi),%r10
+        movq %r9,2*8(%rdi)
+        movq %r10,3*8(%rdi)
+        movq 4*8(%rsi),%r11
+        movq 5*8(%rsi),%r8
+        movq %r11,4*8(%rdi)
+        movq %r8,5*8(%rdi)
+        movq 6*8(%rsi),%r9
+        movq 7*8(%rsi),%r10
+        movq %r9,6*8(%rdi)
+        movq %r10,7*8(%rdi)
+        leaq 64(%rsi),%rsi        /* leaq advances the pointer without touching flags, so ZF from decl is still valid */
+        leaq 64(%rdi),%rdi
+        jnz  .Lloop_64            /* repeat while the chunk counter decremented above is non-zero */
+.Lhandle_tail:
+        movl %edx,%ecx            /* reload the count; edx has not been modified */
+        andl $63,%ecx             /* bytes left over after the 64-byte chunks */
+        shrl $3,%ecx              /* ecx = number of whole 8-byte words left (0..7) */
+        jz   .Lhandle_7           /* none: go straight to the byte tail */
+        .p2align 4
+.Lloop_8:                         /* each iteration copies one 8-byte word */
+        decl %ecx                 /* same pattern as .Lloop_64: ZF is consumed by the jnz below */
+        movq (%rsi),%r8
+        movq %r8,(%rdi)
+        leaq 8(%rdi),%rdi         /* flag-preserving pointer advance */
+        leaq 8(%rsi),%rsi
+        jnz  .Lloop_8
+.Lhandle_7:
+        movl %edx,%ecx            /* reload the count once more */
+        andl $7,%ecx              /* ecx = trailing bytes (0..7) */
+        jz .Lende                 /* nothing left to copy */
+        .p2align 4
+.Lloop_1:                         /* each iteration copies one byte */
+        movb (%rsi),%r8b
+        movb %r8b,(%rdi)
+        incq %rdi
+        incq %rsi
+        decl %ecx                 /* decrement last here, so jnz tests it directly */
+        jnz .Lloop_1
+.Lende:
+        popq %rbx                 /* matches the pushq at entry */
+        ret                       /* rax still holds the original destination */
+.Lfinal:                         /* end of the unrolled memcpy; used below to compute its size */
+        /* Some CPUs run faster using the string copy instructions.
+           It is also a lot simpler. Use this when possible */
+        .section .altinstructions,"a"    /* NOTE(review): one record follows; field meanings are inferred from the operands -- confirm against the consumer of this section */
+        .align 8
+        .quad  memcpy                    /* address of the unrolled routine above */
+        .quad  memcpy_c                  /* address of the string-instruction variant */
+        .byte  X86_FEATURE_REP_GOOD      /* presumably the CPU feature that selects memcpy_c; verify */
+        .byte  .Lfinal-memcpy            /* size in bytes of the unrolled routine; stored in a single byte */
+        .byte  memcpy_c_end-memcpy_c     /* size in bytes of memcpy_c; stored in a single byte */
+        .previous                        /* switch back to the section in use before .altinstructions */
+        .section .altinstr_replacement,"ax"
+ /* rdi destination
+  * rsi source
+  * rdx count
+  */
+memcpy_c:
        movq %rdi,%rax
        movl %edx,%ecx
        shrl $3,%ecx
@@ -30,3 +117,5 @@ memcpy:
        rep
        movsb
        ret
+memcpy_c_end:
+        .previous
author	Andi Kleen <ak@suse.de>	2006-02-03 15:51:02 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-02-04 19:43:13 -0500
commit	7bcd3f34e262bbebffa954d80eab3a84f053da31 (patch)
tree	f0765da9eaa8024a2b1d67d3e43730cb32f99fa7 /arch/x86_64/lib/memcpy.S
parent	6bca52b544489b626c7d0db801df6b4aa3d5adb5 (diff)

diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S index 92dd80544602..5554948b5554 100644 --- a/arch/x86_64/lib/memcpy.S +++ b/arch/x86_64/lib/memcpy.S
@@ -11,8 +11,6 @@
11	*	11	*
12	* Output:	12	* Output:
13	* rax original destination	13	* rax original destination
14	*
15	* TODO: check best memcpy for PSC
16	*/	14	*/
17		15
18	.globl __memcpy	16	.globl __memcpy
@@ -20,6 +18,95 @@
20	.p2align 4	18	.p2align 4
21	__memcpy:	19	__memcpy:
22	memcpy:	20	memcpy:
		21	pushq %rbx
		22	movq %rdi,%rax
		23
		24	movl %edx,%ecx
		25	shrl $6,%ecx
		26	jz .Lhandle_tail
		27
		28	.p2align 4
		29	.Lloop_64:
		30	decl %ecx
		31
		32	movq (%rsi),%r11
		33	movq 8(%rsi),%r8
		34
		35	movq %r11,(%rdi)
		36	movq %r8,1*8(%rdi)
		37
		38	movq 2*8(%rsi),%r9
		39	movq 3*8(%rsi),%r10
		40
		41	movq %r9,2*8(%rdi)
		42	movq %r10,3*8(%rdi)
		43
		44	movq 4*8(%rsi),%r11
		45	movq 5*8(%rsi),%r8
		46
		47	movq %r11,4*8(%rdi)
		48	movq %r8,5*8(%rdi)
		49
		50	movq 6*8(%rsi),%r9
		51	movq 7*8(%rsi),%r10
		52
		53	movq %r9,6*8(%rdi)
		54	movq %r10,7*8(%rdi)
		55
		56	leaq 64(%rsi),%rsi
		57	leaq 64(%rdi),%rdi
		58	jnz .Lloop_64
		59
		60	.Lhandle_tail:
		61	movl %edx,%ecx
		62	andl $63,%ecx
		63	shrl $3,%ecx
		64	jz .Lhandle_7
		65	.p2align 4
		66	.Lloop_8:
		67	decl %ecx
		68	movq (%rsi),%r8
		69	movq %r8,(%rdi)
		70	leaq 8(%rdi),%rdi
		71	leaq 8(%rsi),%rsi
		72	jnz .Lloop_8
		73
		74	.Lhandle_7:
		75	movl %edx,%ecx
		76	andl $7,%ecx
		77	jz .Lende
		78	.p2align 4
		79	.Lloop_1:
		80	movb (%rsi),%r8b
		81	movb %r8b,(%rdi)
		82	incq %rdi
		83	incq %rsi
		84	decl %ecx
		85	jnz .Lloop_1
		86
		87	.Lende:
		88	popq %rbx
		89	ret
		90	.Lfinal:
		91
		92	/* Some CPUs run faster using the string copy instructions.
		93	It is also a lot simpler. Use this when possible */
		94
		95	.section .altinstructions,"a"
		96	.align 8
		97	.quad memcpy
		98	.quad memcpy_c
		99	.byte X86_FEATURE_REP_GOOD
		100	.byte .Lfinal-memcpy
		101	.byte memcpy_c_end-memcpy_c
		102	.previous
		103
		104	.section .altinstr_replacement,"ax"
		105	/* rdi destination
		106	* rsi source
		107	* rdx count
		108	*/
		109	memcpy_c:
23	movq %rdi,%rax	110	movq %rdi,%rax
24	movl %edx,%ecx	111	movl %edx,%ecx
25	shrl $3,%ecx	112	shrl $3,%ecx
@@ -30,3 +117,5 @@ memcpy:
30	rep	117	rep
31	movsb	118	movsb
32	ret	119	ret
		120	memcpy_c_end:
		121	.previous