path: root/arch/x86/lib/memcpy_64.S
author		Ma Ling <ling.ma@intel.com>	2010-06-28 15:24:25 -0400
committer	H. Peter Anvin <hpa@linux.intel.com>	2010-08-23 17:56:41 -0400
commit		59daa706fbec745684702741b9f5373142dd9fdc
tree		53c20211ef0bf1be250a31a98ae0d966dce49308
parent		fdf4289679fd41d76553ce224750e9737cd80eea
x86, mem: Optimize memcpy by avoiding memory false dependence
All read operations after the allocation stage can run speculatively, while all write operations run in program order; a read may run ahead of an older write if their addresses differ, otherwise it must wait until the write commits. However, the CPU does not compare every address bit, so a read can fail to recognize a different address even when the two accesses fall in different pages.

For example, if %rsi is 0xf004 and %rdi is 0xe008, the following sequence incurs a large latency:

1. movq (%rsi), %rax
2. movq %rax, (%rdi)
3. movq 8(%rsi), %rax
4. movq %rax, 8(%rdi)

If %rsi and %rdi really were in the same memory page, there would be a true read-after-write dependence: instruction 2 writes at 0x008 and instruction 3 reads at 0x00c, so the two accesses partially overlap. Here they are in different pages and there is no real conflict, but because the CPU does not check every address bit it may assume they share a page. Instruction 3 then has to wait for instruction 2 to drain its data from the write buffer into the cache before it can load, and the time the read spends waiting is comparable to an mfence instruction. We can avoid this by reordering the operations:

1. movq 8(%rsi), %rax
2. movq %rax, 8(%rdi)
3. movq (%rsi), %rax
4. movq %rax, (%rdi)

Now instruction 3 reads at 0x004 while instruction 2 writes at 0x010, so there is no apparent dependence. On Core 2 this gives a 1.83x speedup over the original instruction sequence.

In this patch we first handle small sizes (less than 0x20 bytes), then jump to the appropriate copy mode. In our micro-benchmark we saw up to a 2x improvement for small copies of 1 to 127 bytes and up to a 1.5x improvement for 1024-byte copies on Core i7. (These numbers come from our own micro-benchmark; we will run further tests as required.)

Signed-off-by: Ma Ling <ling.ma@intel.com>
LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
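For illustration only, and not part of the patch (the helper name and the byte-wise tail are hypothetical), the reordered forward copy can be sketched in C: each 32-byte block issues all four 8-byte loads before any of the four stores, so no load has to be disambiguated against a just-issued store to a nearby address.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Illustrative sketch only (not the kernel code): forward copy in
 * 32-byte blocks with all four loads issued before the four stores,
 * mirroring the structure of .Lcopy_forward_loop below.
 */
static void copy_forward_32(unsigned char *dst, const unsigned char *src,
			    size_t len)
{
	while (len >= 32) {
		uint64_t a, b, c, d;

		memcpy(&a, src +  0, 8);	/* loads first ...     */
		memcpy(&b, src +  8, 8);
		memcpy(&c, src + 16, 8);
		memcpy(&d, src + 24, 8);
		src += 32;

		memcpy(dst +  0, &a, 8);	/* ... then the stores */
		memcpy(dst +  8, &b, 8);
		memcpy(dst + 16, &c, 8);
		memcpy(dst + 24, &d, 8);
		dst += 32;

		len -= 32;
	}

	while (len--)				/* byte-wise tail, for brevity */
		*dst++ = *src++;
}

The actual patch goes further: it falls back to a backward copy when the low address bits of source and destination would otherwise provoke the false dependence, and it finishes tails of fewer than 0x20 bytes with overlapping moves, as the diff below shows.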
Diffstat (limited to 'arch/x86/lib/memcpy_64.S')
-rw-r--r--	arch/x86/lib/memcpy_64.S	158
1 file changed, 103 insertions, 55 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e0f7d..75ef61e35e3 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -40,84 +40,132 @@
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
+	movq %rdi, %rax
 
 	/*
-	 * Put the number of full 64-byte blocks into %ecx.
-	 * Tail portion is handled at the end:
+	 * Use 32bit CMP here to avoid long NOP padding.
 	 */
-	movq %rdi, %rax
-	movl %edx, %ecx
-	shrl $6, %ecx
-	jz .Lhandle_tail
+	cmp $0x20, %edx
+	jb .Lhandle_tail
 
-	.p2align 4
-.Lloop_64:
 	/*
-	 * We decrement the loop index here - and the zero-flag is
-	 * checked at the end of the loop (instructions inbetween do
-	 * not change the zero flag):
+	 * We check whether memory false dependece could occur,
+	 * then jump to corresponding copy mode.
 	 */
-	decl %ecx
+	cmp %dil, %sil
+	jl .Lcopy_backward
+	subl $0x20, %edx
+.Lcopy_forward_loop:
+	subq $0x20, %rdx
 
 	/*
-	 * Move in blocks of 4x16 bytes:
+	 * Move in blocks of 4x8 bytes:
 	 */
-	movq 0*8(%rsi), %r11
-	movq 1*8(%rsi), %r8
-	movq %r11, 0*8(%rdi)
-	movq %r8, 1*8(%rdi)
-
-	movq 2*8(%rsi), %r9
-	movq 3*8(%rsi), %r10
-	movq %r9, 2*8(%rdi)
-	movq %r10, 3*8(%rdi)
-
-	movq 4*8(%rsi), %r11
-	movq 5*8(%rsi), %r8
-	movq %r11, 4*8(%rdi)
-	movq %r8, 5*8(%rdi)
-
-	movq 6*8(%rsi), %r9
-	movq 7*8(%rsi), %r10
-	movq %r9, 6*8(%rdi)
-	movq %r10, 7*8(%rdi)
-
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-
-	jnz .Lloop_64
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq 2*8(%rsi), %r10
+	movq 3*8(%rsi), %r11
+	leaq 4*8(%rsi), %rsi
+
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, 2*8(%rdi)
+	movq %r11, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae .Lcopy_forward_loop
+	addq $0x20, %rdx
+	jmp .Lhandle_tail
+
+.Lcopy_backward:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * At most 3 ALU operations in one cycle,
+	 * so append NOPS in the same 16bytes trunk.
+	 */
+	.p2align 4
+.Lcopy_backward_loop:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r8
+	movq -2*8(%rsi), %r9
+	movq -3*8(%rsi), %r10
+	movq -4*8(%rsi), %r11
+	leaq -4*8(%rsi), %rsi
+	movq %r8, -1*8(%rdi)
+	movq %r9, -2*8(%rdi)
+	movq %r10, -3*8(%rdi)
+	movq %r11, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae .Lcopy_backward_loop
 
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
 .Lhandle_tail:
-	movl %edx, %ecx
-	andl $63, %ecx
-	shrl $3, %ecx
-	jz .Lhandle_7
+	cmpq $16, %rdx
+	jb .Lless_16bytes
 
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq -2*8(%rsi, %rdx), %r10
+	movq -1*8(%rsi, %rdx), %r11
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, -2*8(%rdi, %rdx)
+	movq %r11, -1*8(%rdi, %rdx)
+	retq
 	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi), %r8
-	movq %r8, (%rdi)
-	leaq 8(%rdi), %rdi
-	leaq 8(%rsi), %rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
-	movl %edx, %ecx
-	andl $7, %ecx
-	jz .Lend
+.Lless_16bytes:
+	cmpq $8, %rdx
+	jb .Lless_8bytes
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq -1*8(%rsi, %rdx), %r9
+	movq %r8, 0*8(%rdi)
+	movq %r9, -1*8(%rdi, %rdx)
+	retq
+	.p2align 4
+.Lless_8bytes:
+	cmpq $4, %rdx
+	jb .Lless_3bytes
 
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %ecx
+	movl -4(%rsi, %rdx), %r8d
+	movl %ecx, (%rdi)
+	movl %r8d, -4(%rdi, %rdx)
+	retq
 	.p2align 4
+.Lless_3bytes:
+	cmpl $0, %edx
+	je .Lend
+	/*
+	 * Move data from 1 bytes to 3 bytes.
+	 */
 .Lloop_1:
 	movb (%rsi), %r8b
 	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
-	decl %ecx
+	decl %edx
 	jnz .Lloop_1
 
 .Lend:
-	ret
+	retq
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
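
As a footnote to the tail handling above: the 16-to-31-byte and 8-to-15-byte cases avoid a byte loop by issuing one group of moves anchored at the head of the remaining range and one ending exactly at its last byte, letting the two overlap in the middle. A minimal C sketch of the 8-to-15-byte case (hypothetical helper, not the kernel code, assuming 8 <= len <= 15):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Sketch of the .Lless_16bytes idea: for 8 <= len <= 15, one 8-byte
 * move from the start and one ending at the last byte cover the whole
 * range; the two moves simply overlap in the middle. Both loads are
 * issued before either store, matching the assembly.
 */
static void copy_tail_8_to_15(unsigned char *dst, const unsigned char *src,
			      size_t len)
{
	uint64_t head, tail;

	memcpy(&head, src, 8);			/* bytes [0, 8)       */
	memcpy(&tail, src + len - 8, 8);	/* bytes [len-8, len) */
	memcpy(dst, &head, 8);
	memcpy(dst + len - 8, &tail, 8);
}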