author	Andi Kleen <ak@suse.de>	2006-02-03 15:51:02 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-02-04 19:43:13 -0500
commit	7bcd3f34e262bbebffa954d80eab3a84f053da31 (patch)
tree	f0765da9eaa8024a2b1d67d3e43730cb32f99fa7 /arch/x86_64/lib/memset.S
parent	6bca52b544489b626c7d0db801df6b4aa3d5adb5 (diff)
[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions
They cause quite bad performance regressions on Netburst. This is temporary until we can get new optimized functions for these CPUs.

This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. The only change is that I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD, fixed the check for the flag, and fixed some comments.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
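For orientation, here is a minimal C sketch (not kernel code) of what the X86_FEATURE_REP_GOOD alternative in the diff below amounts to: CPUs whose string instructions are known to be fast get the short rep stos variant (memset_c), everything else keeps the unrolled loop being restored by this patch. In the kernel the choice is made once at boot by patching the code in place through the .altinstructions records; the runtime branch, the cpu_has_rep_good flag and the function names below are illustrative assumptions, not kernel APIs.

#include <stddef.h>
#include <stdint.h>

/* Stand-in for the X86_FEATURE_REP_GOOD cpufeature bit (illustrative). */
static int cpu_has_rep_good;

/* Roughly what memset_c does: let the string instruction fill the buffer.
 * (GCC/Clang extended asm, x86-64 only.) */
static void *memset_rep(void *dst, int c, size_t n)
{
	void *d = dst;

	asm volatile("rep stosb"
		     : "+D" (d), "+c" (n)
		     : "a" (c)
		     : "memory");
	return dst;
}

/* Placeholder for the unrolled version restored in the diff below. */
static void *memset_unrolled(void *dst, int c, size_t n)
{
	unsigned char *p = dst;

	while (n--)
		*p++ = (unsigned char)c;
	return dst;
}

/* In the kernel the selection happens once at boot (code patching), not
 * per call; a branch keeps this sketch simple. */
void *memset_pick(void *dst, int c, size_t n)
{
	return cpu_has_rep_good ? memset_rep(dst, c, n)
				: memset_unrolled(dst, c, n);
}

The real memset_c uses rep stosq for the 8-byte chunks and rep stosb only for the remainder; collapsing it to a single rep stosb keeps the sketch short.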
Diffstat (limited to 'arch/x86_64/lib/memset.S')
-rw-r--r--	arch/x86_64/lib/memset.S	94
1 file changed, 94 insertions, 0 deletions
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 2aa48f24ed1e..ad397f2c7de8 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -13,6 +13,98 @@
 	.p2align 4
 memset:
 __memset:
+	movq %rdi,%r10
+	movq %rdx,%r11
+
+	/* expand byte value */
+	movzbl %sil,%ecx
+	movabs $0x0101010101010101,%rax
+	mul %rcx		/* with rax, clobbers rdx */
+
+	/* align dst */
+	movl %edi,%r9d
+	andl $7,%r9d
+	jnz .Lbad_alignment
+.Lafter_bad_alignment:
+
+	movl %r11d,%ecx
+	shrl $6,%ecx
+	jz .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:
+	decl %ecx
+	movq %rax,(%rdi)
+	movq %rax,8(%rdi)
+	movq %rax,16(%rdi)
+	movq %rax,24(%rdi)
+	movq %rax,32(%rdi)
+	movq %rax,40(%rdi)
+	movq %rax,48(%rdi)
+	movq %rax,56(%rdi)
+	leaq 64(%rdi),%rdi
+	jnz .Lloop_64
+
+	/* Handle tail in loops. The loops should be faster than hard
+	   to predict jump tables. */
+	.p2align 4
+.Lhandle_tail:
+	movl %r11d,%ecx
+	andl $63&(~7),%ecx
+	jz .Lhandle_7
+	shrl $3,%ecx
+	.p2align 4
+.Lloop_8:
+	decl %ecx
+	movq %rax,(%rdi)
+	leaq 8(%rdi),%rdi
+	jnz .Lloop_8
+
+.Lhandle_7:
+	movl %r11d,%ecx
+	andl $7,%ecx
+	jz .Lende
+	.p2align 4
+.Lloop_1:
+	decl %ecx
+	movb %al,(%rdi)
+	leaq 1(%rdi),%rdi
+	jnz .Lloop_1
+
+.Lende:
+	movq %r10,%rax
+	ret
+
+.Lbad_alignment:
+	cmpq $7,%r11
+	jbe .Lhandle_7
+	movq %rax,(%rdi)	/* unaligned store */
+	movq $8,%r8
+	subq %r9,%r8
+	addq %r8,%rdi
+	subq %r8,%r11
+	jmp .Lafter_bad_alignment
+
+	/* Some CPUs run faster using the string instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+	.section .altinstructions,"a"
+	.align 8
+	.quad memset
+	.quad memset_c
+	.byte X86_FEATURE_REP_GOOD
+	.byte memset_c_end-memset_c
+	.byte memset_c_end-memset_c
+	.previous
+
+	.section .altinstr_replacement,"ax"
+/* rdi	destination
+ * rsi	value
+ * rdx	count
+ */
+memset_c:
 	movq %rdi,%r9
 	movl %edx,%r8d
 	andl $7,%r8d
@@ -29,3 +121,5 @@ __memset:
 	stosb
 	movq %r9,%rax
 	ret
+memset_c_end:
+	.previous
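As a reading aid, the following C sketch mirrors the structure of the unrolled memset restored above: the byte is broadcast into all eight lanes by the 0x0101010101010101 multiply, the destination is aligned first, the main loop stores 64 bytes per iteration (.Lloop_64), and the tail is finished in plain loops rather than a jump table. It is an illustration under LP64 assumptions, not the kernel's code; the name memset_unrolled64 is made up.

#include <stddef.h>
#include <stdint.h>

/* Illustrative C mirror of the assembly above; not the kernel's code. */
void *memset_unrolled64(void *dst, int c, size_t n)
{
	unsigned char *p = dst;
	/* "expand byte value": one multiply copies the byte into all 8 lanes,
	 * exactly what movzbl + movabs $0x0101010101010101 + mul computes. */
	uint64_t v = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;

	/* ".Lbad_alignment": get the destination 8-byte aligned first
	 * (done here with byte stores instead of one unaligned movq). */
	while (((uintptr_t)p & 7) && n) {
		*p++ = (unsigned char)c;
		n--;
	}

	/* ".Lloop_64": the unrolled main loop, 64 bytes per iteration. */
	while (n >= 64) {
		uint64_t *q = (uint64_t *)p;

		q[0] = v; q[1] = v; q[2] = v; q[3] = v;
		q[4] = v; q[5] = v; q[6] = v; q[7] = v;
		p += 64;
		n -= 64;
	}

	/* ".Lloop_8" / ".Lloop_1": tail handled in plain loops, which the
	 * comment above prefers to a hard-to-predict jump table. */
	while (n >= 8) {
		*(uint64_t *)p = v;
		p += 8;
		n -= 8;
	}
	while (n--)
		*p++ = (unsigned char)c;

	return dst;
}

Calling memset_unrolled64(buf, 0xab, len) fills len bytes with 0xab and returns the original pointer, matching the %r10 save and restore in the assembly.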