aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/lib/copy_page.S
diff options
context:
space:
mode:
authorAndi Kleen <ak@suse.de>2006-02-03 15:51:02 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-02-04 19:43:13 -0500
commit7bcd3f34e262bbebffa954d80eab3a84f053da31 (patch)
treef0765da9eaa8024a2b1d67d3e43730cb32f99fa7 /arch/x86_64/lib/copy_page.S
parent6bca52b544489b626c7d0db801df6b4aa3d5adb5 (diff)
[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions
They cause quite bad performance regressions on Netburst This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. Only change is I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD and fixed the check for the flag and also fixed some comments. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64/lib/copy_page.S')
-rw-r--r--arch/x86_64/lib/copy_page.S87
1 files changed, 87 insertions, 0 deletions
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
index 621a19769406..8fa19d96a7ee 100644
--- a/arch/x86_64/lib/copy_page.S
+++ b/arch/x86_64/lib/copy_page.S
@@ -8,7 +8,94 @@
8 .globl copy_page 8 .globl copy_page
9 .p2align 4 9 .p2align 4
10copy_page: 10copy_page:
11 subq $3*8,%rsp
12 movq %rbx,(%rsp)
13 movq %r12,1*8(%rsp)
14 movq %r13,2*8(%rsp)
15
16 movl $(4096/64)-5,%ecx
17 .p2align 4
18.Loop64:
19 dec %rcx
20
21 movq (%rsi), %rax
22 movq 8 (%rsi), %rbx
23 movq 16 (%rsi), %rdx
24 movq 24 (%rsi), %r8
25 movq 32 (%rsi), %r9
26 movq 40 (%rsi), %r10
27 movq 48 (%rsi), %r11
28 movq 56 (%rsi), %r12
29
30 prefetcht0 5*64(%rsi)
31
32 movq %rax, (%rdi)
33 movq %rbx, 8 (%rdi)
34 movq %rdx, 16 (%rdi)
35 movq %r8, 24 (%rdi)
36 movq %r9, 32 (%rdi)
37 movq %r10, 40 (%rdi)
38 movq %r11, 48 (%rdi)
39 movq %r12, 56 (%rdi)
40
41 leaq 64 (%rsi), %rsi
42 leaq 64 (%rdi), %rdi
43
44 jnz .Loop64
45
46 movl $5,%ecx
47 .p2align 4
48.Loop2:
49 decl %ecx
50
51 movq (%rsi), %rax
52 movq 8 (%rsi), %rbx
53 movq 16 (%rsi), %rdx
54 movq 24 (%rsi), %r8
55 movq 32 (%rsi), %r9
56 movq 40 (%rsi), %r10
57 movq 48 (%rsi), %r11
58 movq 56 (%rsi), %r12
59
60 movq %rax, (%rdi)
61 movq %rbx, 8 (%rdi)
62 movq %rdx, 16 (%rdi)
63 movq %r8, 24 (%rdi)
64 movq %r9, 32 (%rdi)
65 movq %r10, 40 (%rdi)
66 movq %r11, 48 (%rdi)
67 movq %r12, 56 (%rdi)
68
69 leaq 64(%rdi),%rdi
70 leaq 64(%rsi),%rsi
71
72 jnz .Loop2
73
74 movq (%rsp),%rbx
75 movq 1*8(%rsp),%r12
76 movq 2*8(%rsp),%r13
77 addq $3*8,%rsp
78 ret
79
80 /* Some CPUs run faster using the string copy instructions.
81 It is also a lot simpler. Use this when possible */
82
83#include <asm/cpufeature.h>
84
85 .section .altinstructions,"a"
86 .align 8
87 .quad copy_page
88 .quad copy_page_c
89 .byte X86_FEATURE_REP_GOOD
90 .byte copy_page_c_end-copy_page_c
91 .byte copy_page_c_end-copy_page_c
92 .previous
93
94 .section .altinstr_replacement,"ax"
95copy_page_c:
11 movl $4096/8,%ecx 96 movl $4096/8,%ecx
12 rep 97 rep
13 movsq 98 movsq
14 ret 99 ret
100copy_page_c_end:
101 .previous