1 files changed, 103 insertions, 55 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e0f7d5..75ef61e35e38 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -40,84 +40,132 @@
 ENTRY(__memcpy)
 ENTRY(memcpy)
        CFI_STARTPROC
+        movq %rdi, %rax                /* rax = dst; never written again, so it is what the caller gets back */
        /*
-         * Put the number of full 64-byte blocks into %ecx.
+         * Compare the full 64-bit count so sizes >= 4GB are not truncated.
-         * Tail portion is handled at the end:
         */
-        movq %rdi, %rax
+        cmpq $0x20, %rdx
-        movl %edx, %ecx
+        jb .Lhandle_tail
-        shrl   $6, %ecx
-        jz .Lhandle_tail
-        .p2align 4
-.Lloop_64:
        /*
-         * We decrement the loop index here - and the zero-flag is
+         * Compare the low address bytes (signed): if src's is below dst's,
-         * checked at the end of the loop (instructions inbetween do
+         * copy backward so that no memory false dependence can occur.
-         * not change the zero flag):
         */
-        decl %ecx
+        cmp  %dil, %sil
+        jl .Lcopy_backward
+        subq $0x20, %rdx               /* pre-bias the count; the loop ends on the borrow */
+.Lcopy_forward_loop:
+        subq $0x20,     %rdx
        /*
-         * Move in blocks of 4x16 bytes:
+         * Move in blocks of 4x8 bytes:
         */
-        movq 0*8(%rsi),         %r11
+        movq 0*8(%rsi), %r8
-        movq 1*8(%rsi),         %r8
+        movq 1*8(%rsi), %r9
-        movq %r11,              0*8(%rdi)
+        movq 2*8(%rsi), %r10
-        movq %r8,               1*8(%rdi)
+        movq 3*8(%rsi), %r11
+        leaq 4*8(%rsi), %rsi
-        movq 2*8(%rsi),         %r9
-        movq 3*8(%rsi),         %r10
+        movq %r8,       0*8(%rdi)
-        movq %r9,               2*8(%rdi)
+        movq %r9,       1*8(%rdi)
-        movq %r10,              3*8(%rdi)
+        movq %r10,      2*8(%rdi)
+        movq %r11,      3*8(%rdi)
-        movq 4*8(%rsi),         %r11
+        leaq 4*8(%rdi), %rdi
-        movq 5*8(%rsi),         %r8
+        jae  .Lcopy_forward_loop       /* CF is from the subq above; mov/lea leave flags alone */
-        movq %r11,              4*8(%rdi)
+        addq $0x20,     %rdx           /* undo the bias: rdx = bytes still to copy (< 0x20) */
-        movq %r8,               5*8(%rdi)
+        jmp  .Lhandle_tail
-        movq 6*8(%rsi),         %r9
+.Lcopy_backward:
-        movq 7*8(%rsi),         %r10
+        /*
-        movq %r9,               6*8(%rdi)
+         * Point src and dst just past the end of their buffers.
-        movq %r10,              7*8(%rdi)
+         */
+        addq %rdx,      %rsi
-        leaq 64(%rsi), %rsi
+        addq %rdx,      %rdi
-        leaq 64(%rdi), %rdi
+        subq $0x20,     %rdx
+        /*
-        jnz  .Lloop_64
+         * At most 3 ALU operations in one cycle,
+         * so pad with NOPs within the same 16-byte chunk.
+         */
+        .p2align 4
+.Lcopy_backward_loop:
+        subq $0x20,     %rdx
+        movq -1*8(%rsi),        %r8
+        movq -2*8(%rsi),        %r9
+        movq -3*8(%rsi),        %r10
+        movq -4*8(%rsi),        %r11
+        leaq -4*8(%rsi),        %rsi
+        movq %r8,               -1*8(%rdi)
+        movq %r9,               -2*8(%rdi)
+        movq %r10,              -3*8(%rdi)
+        movq %r11,              -4*8(%rdi)
+        leaq -4*8(%rdi),        %rdi
+        jae  .Lcopy_backward_loop
+        /*
+         * Undo the bias and move src/dst back to the head of the rest.
+         */
+        addq $0x20,     %rdx
+        subq %rdx,      %rsi
+        subq %rdx,      %rdi
 .Lhandle_tail:
-        movl %edx, %ecx
+        cmpq $16,       %rdx
-        andl  $63, %ecx
+        jb   .Lless_16bytes
-        shrl   $3, %ecx
-        jz   .Lhandle_7
+        /*
+         * Move data from 16 bytes to 31 bytes.
+         */
+        movq 0*8(%rsi), %r8
+        movq 1*8(%rsi), %r9
+        movq -2*8(%rsi, %rdx),  %r10
+        movq -1*8(%rsi, %rdx),  %r11
+        movq %r8,       0*8(%rdi)
+        movq %r9,       1*8(%rdi)
+        movq %r10,      -2*8(%rdi, %rdx)
+        movq %r11,      -1*8(%rdi, %rdx)
+        retq
        .p2align 4
-.Lloop_8:
+.Lless_16bytes:
-        decl %ecx
+        cmpq $8,        %rdx
-        movq (%rsi),            %r8
+        jb   .Lless_8bytes
-        movq %r8,               (%rdi)
+        /*
-        leaq 8(%rdi),           %rdi
+         * Move data from 8 bytes to 15 bytes.
-        leaq 8(%rsi),           %rsi
+         */
-        jnz  .Lloop_8
+        movq 0*8(%rsi), %r8
+        movq -1*8(%rsi, %rdx),  %r9
-.Lhandle_7:
+        movq %r8,       0*8(%rdi)
-        movl %edx, %ecx
+        movq %r9,       -1*8(%rdi, %rdx)
-        andl $7, %ecx
+        retq
-        jz .Lend
+        .p2align 4
+.Lless_8bytes:
+        cmpq $4,        %rdx
+        jb   .Lless_3bytes
+        /*
+         * Move data from 4 bytes to 7 bytes.
+         */
+        movl (%rsi), %ecx
+        movl -4(%rsi, %rdx), %r8d
+        movl %ecx, (%rdi)
+        movl %r8d, -4(%rdi, %rdx)
+        retq
        .p2align 4
+.Lless_3bytes:
+        testl %edx, %edx               /* rdx < 4 here, so edx holds the whole count */
+        je .Lend
+        /*
+         * Move data from 1 byte to 3 bytes.
+         */
 .Lloop_1:
        movb (%rsi), %r8b
        movb %r8b, (%rdi)
        incq %rdi
        incq %rsi
-        decl %ecx
+        decl %edx
        jnz .Lloop_1
 .Lend:
-        ret
+        retq
        CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index bcbcd1e0f7d5..75ef61e35e38 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S
@@ -40,84 +40,132 @@
40	ENTRY(__memcpy)	40	ENTRY(__memcpy)
41	ENTRY(memcpy)	41	ENTRY(memcpy)
42	CFI_STARTPROC	42	CFI_STARTPROC
		43	movq %rdi, %rax /* rax = dst; never written again, so it is what the caller gets back */
43		44
44	/*	45	/*
45	* Put the number of full 64-byte blocks into %ecx.	46	* Compare the full 64-bit count so sizes >= 4GB are not truncated.
46	* Tail portion is handled at the end:
47	*/	47	*/
48	movq %rdi, %rax	48	cmpq $0x20, %rdx
49	movl %edx, %ecx	49	jb .Lhandle_tail
50	shrl $6, %ecx
51	jz .Lhandle_tail
52		50
53	.p2align 4
54	.Lloop_64:
55	/*	51	/*
56	* We decrement the loop index here - and the zero-flag is	52	* Compare the low address bytes (signed): if src's is below dst's,
57	* checked at the end of the loop (instructions inbetween do	53	* copy backward so that no memory false dependence can occur.
58	* not change the zero flag):
59	*/	54	*/
60	decl %ecx	55	cmp %dil, %sil
		56	jl .Lcopy_backward
		57	subq $0x20, %rdx /* pre-bias the count; the loop ends on the borrow */
		58	.Lcopy_forward_loop:
		59	subq $0x20, %rdx
61		60
62	/*	61	/*
63	* Move in blocks of 4x16 bytes:	62	* Move in blocks of 4x8 bytes:
64	*/	63	*/
65	movq 0*8(%rsi), %r11	64	movq 0*8(%rsi), %r8
66	movq 1*8(%rsi), %r8	65	movq 1*8(%rsi), %r9
67	movq %r11, 0*8(%rdi)	66	movq 2*8(%rsi), %r10
68	movq %r8, 1*8(%rdi)	67	movq 3*8(%rsi), %r11
69		68	leaq 4*8(%rsi), %rsi
70	movq 2*8(%rsi), %r9	69
71	movq 3*8(%rsi), %r10	70	movq %r8, 0*8(%rdi)
72	movq %r9, 2*8(%rdi)	71	movq %r9, 1*8(%rdi)
73	movq %r10, 3*8(%rdi)	72	movq %r10, 2*8(%rdi)
74		73	movq %r11, 3*8(%rdi)
75	movq 4*8(%rsi), %r11	74	leaq 4*8(%rdi), %rdi
76	movq 5*8(%rsi), %r8	75	jae .Lcopy_forward_loop /* CF is from the subq above; mov/lea leave flags alone */
77	movq %r11, 4*8(%rdi)	76	addq $0x20, %rdx /* undo the bias: rdx = bytes still to copy (< 0x20) */
78	movq %r8, 5*8(%rdi)	77	jmp .Lhandle_tail
79		78
80	movq 6*8(%rsi), %r9	79	.Lcopy_backward:
81	movq 7*8(%rsi), %r10	80	/*
82	movq %r9, 6*8(%rdi)	81	* Point src and dst just past the end of their buffers.
83	movq %r10, 7*8(%rdi)	82	*/
84		83	addq %rdx, %rsi
85	leaq 64(%rsi), %rsi	84	addq %rdx, %rdi
86	leaq 64(%rdi), %rdi	85	subq $0x20, %rdx
87		86	/*
88	jnz .Lloop_64	87	* At most 3 ALU operations in one cycle,
		88	* so pad with NOPs within the same 16-byte chunk.
		89	*/
		90	.p2align 4
		91	.Lcopy_backward_loop:
		92	subq $0x20, %rdx
		93	movq -1*8(%rsi), %r8
		94	movq -2*8(%rsi), %r9
		95	movq -3*8(%rsi), %r10
		96	movq -4*8(%rsi), %r11
		97	leaq -4*8(%rsi), %rsi
		98	movq %r8, -1*8(%rdi)
		99	movq %r9, -2*8(%rdi)
		100	movq %r10, -3*8(%rdi)
		101	movq %r11, -4*8(%rdi)
		102	leaq -4*8(%rdi), %rdi
		103	jae .Lcopy_backward_loop
89		104
		105	/*
		106	* Undo the bias and move src/dst back to the head of the rest.
		107	*/
		108	addq $0x20, %rdx
		109	subq %rdx, %rsi
		110	subq %rdx, %rdi
90	.Lhandle_tail:	111	.Lhandle_tail:
91	movl %edx, %ecx	112	cmpq $16, %rdx
92	andl $63, %ecx	113	jb .Lless_16bytes
93	shrl $3, %ecx
94	jz .Lhandle_7
95		114
		115	/*
		116	* Move data from 16 bytes to 31 bytes.
		117	*/
		118	movq 0*8(%rsi), %r8
		119	movq 1*8(%rsi), %r9
		120	movq -2*8(%rsi, %rdx), %r10
		121	movq -1*8(%rsi, %rdx), %r11
		122	movq %r8, 0*8(%rdi)
		123	movq %r9, 1*8(%rdi)
		124	movq %r10, -2*8(%rdi, %rdx)
		125	movq %r11, -1*8(%rdi, %rdx)
		126	retq
96	.p2align 4	127	.p2align 4
97	.Lloop_8:	128	.Lless_16bytes:
98	decl %ecx	129	cmpq $8, %rdx
99	movq (%rsi), %r8	130	jb .Lless_8bytes
100	movq %r8, (%rdi)	131	/*
101	leaq 8(%rdi), %rdi	132	* Move data from 8 bytes to 15 bytes.
102	leaq 8(%rsi), %rsi	133	*/
103	jnz .Lloop_8	134	movq 0*8(%rsi), %r8
104		135	movq -1*8(%rsi, %rdx), %r9
105	.Lhandle_7:	136	movq %r8, 0*8(%rdi)
106	movl %edx, %ecx	137	movq %r9, -1*8(%rdi, %rdx)
107	andl $7, %ecx	138	retq
108	jz .Lend	139	.p2align 4
		140	.Lless_8bytes:
		141	cmpq $4, %rdx
		142	jb .Lless_3bytes
109		143
		144	/*
		145	* Move data from 4 bytes to 7 bytes.
		146	*/
		147	movl (%rsi), %ecx
		148	movl -4(%rsi, %rdx), %r8d
		149	movl %ecx, (%rdi)
		150	movl %r8d, -4(%rdi, %rdx)
		151	retq
110	.p2align 4	152	.p2align 4
		153	.Lless_3bytes:
		154	testl %edx, %edx /* rdx < 4 here, so edx holds the whole count */
		155	je .Lend
		156	/*
		157	* Move data from 1 byte to 3 bytes.
		158	*/
111	.Lloop_1:	159	.Lloop_1:
112	movb (%rsi), %r8b	160	movb (%rsi), %r8b
113	movb %r8b, (%rdi)	161	movb %r8b, (%rdi)
114	incq %rdi	162	incq %rdi
115	incq %rsi	163	incq %rsi
116	decl %ecx	164	decl %edx
117	jnz .Lloop_1	165	jnz .Lloop_1
118		166
119	.Lend:	167	.Lend:
120	ret	168	retq
121	CFI_ENDPROC	169	CFI_ENDPROC
122	ENDPROC(memcpy)	170	ENDPROC(memcpy)
123	ENDPROC(__memcpy)	171	ENDPROC(__memcpy)