path: root/arch/x86/lib/memcpy_64.S
author		Ma Ling <ling.ma@intel.com>	2010-06-28 15:24:25 -0400
committer	H. Peter Anvin <hpa@linux.intel.com>	2010-08-23 17:56:41 -0400
commit		59daa706fbec745684702741b9f5373142dd9fdc
tree		53c20211ef0bf1be250a31a98ae0d966dce49308
parent		fdf4289679fd41d76553ce224750e9737cd80eea
x86, mem: Optimize memcpy by avoiding memory false dependence
All read operations after the allocation stage can run speculatively, while all write operations run in program order; a read may run ahead of an older write if their addresses differ, otherwise it must wait until the write commits. However, the CPU does not compare every address bit, so a read can fail to recognize a different address even when the two accesses fall in different pages.

For example, if %rsi is 0xf004 and %rdi is 0xe008, the following sequence incurs a large latency:

1. movq (%rsi), %rax
2. movq %rax, (%rdi)
3. movq 8(%rsi), %rax
4. movq %rax, 8(%rdi)

If %rsi and %rdi really were in the same memory page, there would be a true read-after-write dependence: instruction 2 writes at 0x008 and instruction 3 reads at 0x00c, so the two accesses partially overlap. Here they are in different pages and there is no real conflict, but because the CPU does not check every address bit it may assume they share a page. Instruction 3 then has to wait for instruction 2 to drain its data from the write buffer into the cache before it can load, and the time the read spends waiting is comparable to an mfence instruction. We can avoid this by reordering the operations:

1. movq 8(%rsi), %rax
2. movq %rax, 8(%rdi)
3. movq (%rsi), %rax
4. movq %rax, (%rdi)

Now instruction 3 reads at 0x004 while instruction 2 writes at 0x010, so there is no apparent dependence. On Core 2 this gives a 1.83x speedup over the original instruction sequence.

In this patch we first handle small sizes (less than 0x20 bytes), then jump to the appropriate copy mode. In our micro-benchmark we saw up to a 2x improvement for small copies of 1 to 127 bytes and up to a 1.5x improvement for 1024-byte copies on Core i7. (These numbers come from our own micro-benchmark; we will run further tests as required.)

Signed-off-by: Ma Ling <ling.ma@intel.com>
LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
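For illustration only, and not part of the patch (the helper name and the byte-wise tail are hypothetical), the reordered forward copy can be sketched in C: each 32-byte block issues all four 8-byte loads before any of the four stores, so no load has to be disambiguated against a just-issued store to a nearby address.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Illustrative sketch only (not the kernel code): forward copy in
 * 32-byte blocks with all four loads issued before the four stores,
 * mirroring the structure of .Lcopy_forward_loop below.
 */
static void copy_forward_32(unsigned char *dst, const unsigned char *src,
			    size_t len)
{
	while (len >= 32) {
		uint64_t a, b, c, d;

		memcpy(&a, src +  0, 8);	/* loads first ...     */
		memcpy(&b, src +  8, 8);
		memcpy(&c, src + 16, 8);
		memcpy(&d, src + 24, 8);
		src += 32;

		memcpy(dst +  0, &a, 8);	/* ... then the stores */
		memcpy(dst +  8, &b, 8);
		memcpy(dst + 16, &c, 8);
		memcpy(dst + 24, &d, 8);
		dst += 32;

		len -= 32;
	}

	while (len--)				/* byte-wise tail, for brevity */
		*dst++ = *src++;
}

The actual patch goes further: it falls back to a backward copy when the low address bits of source and destination would otherwise provoke the false dependence, and it finishes tails of fewer than 0x20 bytes with overlapping moves, as the diff below shows.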
Diffstat (limited to 'arch/x86/lib/memcpy_64.S')
-rw-r--r--	arch/x86/lib/memcpy_64.S	158
1 file changed, 103 insertions, 55 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e0f7d..75ef61e35e3 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -40,84 +40,132 @@
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
+	movq %rdi, %rax
 
 	/*
-	 * Put the number of full 64-byte blocks into %ecx.
-	 * Tail portion is handled at the end:
+	 * Use 32bit CMP here to avoid long NOP padding.
 	 */
-	movq %rdi, %rax
-	movl %edx, %ecx
-	shrl $6, %ecx
-	jz .Lhandle_tail
+	cmp $0x20, %edx
+	jb .Lhandle_tail
 
-	.p2align 4
-.Lloop_64:
 	/*
-	 * We decrement the loop index here - and the zero-flag is
-	 * checked at the end of the loop (instructions inbetween do
-	 * not change the zero flag):
+	 * We check whether memory false dependece could occur,
+	 * then jump to corresponding copy mode.
 	 */
-	decl %ecx
+	cmp %dil, %sil
+	jl .Lcopy_backward
+	subl $0x20, %edx
+.Lcopy_forward_loop:
+	subq $0x20, %rdx
 
 	/*
-	 * Move in blocks of 4x16 bytes:
+	 * Move in blocks of 4x8 bytes:
 	 */
-	movq 0*8(%rsi), %r11
-	movq 1*8(%rsi), %r8
-	movq %r11, 0*8(%rdi)
-	movq %r8, 1*8(%rdi)
-
-	movq 2*8(%rsi), %r9
-	movq 3*8(%rsi), %r10
-	movq %r9, 2*8(%rdi)
-	movq %r10, 3*8(%rdi)
-
-	movq 4*8(%rsi), %r11
-	movq 5*8(%rsi), %r8
-	movq %r11, 4*8(%rdi)
-	movq %r8, 5*8(%rdi)
-
-	movq 6*8(%rsi), %r9
-	movq 7*8(%rsi), %r10
-	movq %r9, 6*8(%rdi)
-	movq %r10, 7*8(%rdi)
-
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-
-	jnz .Lloop_64
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq 2*8(%rsi), %r10
+	movq 3*8(%rsi), %r11
+	leaq 4*8(%rsi), %rsi
+
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, 2*8(%rdi)
+	movq %r11, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae .Lcopy_forward_loop
+	addq $0x20, %rdx
+	jmp .Lhandle_tail
+
+.Lcopy_backward:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * At most 3 ALU operations in one cycle,
+	 * so append NOPS in the same 16bytes trunk.
+	 */
+	.p2align 4
+.Lcopy_backward_loop:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r8
+	movq -2*8(%rsi), %r9
+	movq -3*8(%rsi), %r10
+	movq -4*8(%rsi), %r11
+	leaq -4*8(%rsi), %rsi
+	movq %r8, -1*8(%rdi)
+	movq %r9, -2*8(%rdi)
+	movq %r10, -3*8(%rdi)
+	movq %r11, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae .Lcopy_backward_loop
 
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
 .Lhandle_tail:
-	movl %edx, %ecx
-	andl $63, %ecx
-	shrl $3, %ecx
-	jz .Lhandle_7
+	cmpq $16, %rdx
+	jb .Lless_16bytes
 
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq -2*8(%rsi, %rdx), %r10
+	movq -1*8(%rsi, %rdx), %r11
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, -2*8(%rdi, %rdx)
+	movq %r11, -1*8(%rdi, %rdx)
+	retq
 	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi), %r8
-	movq %r8, (%rdi)
-	leaq 8(%rdi), %rdi
-	leaq 8(%rsi), %rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
-	movl %edx, %ecx
-	andl $7, %ecx
-	jz .Lend
+.Lless_16bytes:
+	cmpq $8, %rdx
+	jb .Lless_8bytes
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq -1*8(%rsi, %rdx), %r9
+	movq %r8, 0*8(%rdi)
+	movq %r9, -1*8(%rdi, %rdx)
+	retq
+	.p2align 4
+.Lless_8bytes:
+	cmpq $4, %rdx
+	jb .Lless_3bytes
 
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %ecx
+	movl -4(%rsi, %rdx), %r8d
+	movl %ecx, (%rdi)
+	movl %r8d, -4(%rdi, %rdx)
+	retq
 	.p2align 4
+.Lless_3bytes:
+	cmpl $0, %edx
+	je .Lend
+	/*
+	 * Move data from 1 bytes to 3 bytes.
+	 */
 .Lloop_1:
 	movb (%rsi), %r8b
 	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
-	decl %ecx
+	decl %edx
 	jnz .Lloop_1
 
 .Lend:
-	ret
+	retq
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
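
As a footnote to the tail handling above: the 16-to-31-byte and 8-to-15-byte cases avoid a byte loop by issuing one group of moves anchored at the head of the remaining range and one ending exactly at its last byte, letting the two overlap in the middle. A minimal C sketch of the 8-to-15-byte case (hypothetical helper, not the kernel code, assuming 8 <= len <= 15):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Sketch of the .Lless_16bytes idea: for 8 <= len <= 15, one 8-byte
 * move from the start and one ending at the last byte cover the whole
 * range; the two moves simply overlap in the middle. Both loads are
 * issued before either store, matching the assembly.
 */
static void copy_tail_8_to_15(unsigned char *dst, const unsigned char *src,
			      size_t len)
{
	uint64_t head, tail;

	memcpy(&head, src, 8);			/* bytes [0, 8)       */
	memcpy(&tail, src + len - 8, 8);	/* bytes [len-8, len) */
	memcpy(dst, &head, 8);
	memcpy(dst + len - 8, &tail, 8);
}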