 arch/x86/lib/memcpy_64.S | 136 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 81 insertions(+), 55 deletions(-)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 10c067694af4..ad5441ed1b57 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,30 +1,38 @@
 /* Copyright 2002 Andi Kleen */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
+
 #include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
 
 /*
  * memcpy - Copy a memory block.
  *
  * Input:
  * rdi destination
  * rsi source
  * rdx count
  *
  * Output:
  * rax original destination
  */
 
+/*
+ * memcpy_c() - fast string ops (REP MOVSQ) based variant.
+ *
+ * Calls to this get patched into the kernel image via the
+ * alternative instructions framework:
+ */
 	ALIGN
 memcpy_c:
 	CFI_STARTPROC
-	movq %rdi,%rax
-	movl %edx,%ecx
-	shrl $3,%ecx
-	andl $7,%edx
+	movq %rdi, %rax
+
+	movl %edx, %ecx
+	shrl $3, %ecx
+	andl $7, %edx
 	rep movsq
-	movl %edx,%ecx
+	movl %edx, %ecx
 	rep movsb
 	ret
 	CFI_ENDPROC
@@ -33,92 +41,110 @@ ENDPROC(memcpy_c)
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
-	movq %rdi,%rax
 
-	movl %edx,%ecx
-	shrl $6,%ecx
+	/*
+	 * Put the number of full 64-byte blocks into %ecx.
+	 * Tail portion is handled at the end:
+	 */
+	movq %rdi, %rax
+	movl %edx, %ecx
+	shrl $6, %ecx
 	jz .Lhandle_tail
 
 	.p2align 4
 .Lloop_64:
+	/*
+	 * We decrement the loop index here - and the zero-flag is
+	 * checked at the end of the loop (instructions inbetween do
+	 * not change the zero flag):
+	 */
 	decl %ecx
 
-	movq (%rsi),%r11
-	movq 8(%rsi),%r8
+	/*
+	 * Move in blocks of 4x16 bytes:
+	 */
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r8
+	movq %r11, 0*8(%rdi)
+	movq %r8, 1*8(%rdi)
 
-	movq %r11,(%rdi)
-	movq %r8,1*8(%rdi)
+	movq 2*8(%rsi), %r9
+	movq 3*8(%rsi), %r10
+	movq %r9, 2*8(%rdi)
+	movq %r10, 3*8(%rdi)
 
-	movq 2*8(%rsi),%r9
-	movq 3*8(%rsi),%r10
+	movq 4*8(%rsi), %r11
+	movq 5*8(%rsi), %r8
+	movq %r11, 4*8(%rdi)
+	movq %r8, 5*8(%rdi)
 
-	movq %r9,2*8(%rdi)
-	movq %r10,3*8(%rdi)
+	movq 6*8(%rsi), %r9
+	movq 7*8(%rsi), %r10
+	movq %r9, 6*8(%rdi)
+	movq %r10, 7*8(%rdi)
 
-	movq 4*8(%rsi),%r11
-	movq 5*8(%rsi),%r8
+	leaq 64(%rsi), %rsi
+	leaq 64(%rdi), %rdi
 
-	movq %r11,4*8(%rdi)
-	movq %r8,5*8(%rdi)
-
-	movq 6*8(%rsi),%r9
-	movq 7*8(%rsi),%r10
-
-	movq %r9,6*8(%rdi)
-	movq %r10,7*8(%rdi)
-
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
 	jnz .Lloop_64
 
 .Lhandle_tail:
-	movl %edx,%ecx
-	andl $63,%ecx
-	shrl $3,%ecx
+	movl %edx, %ecx
+	andl $63, %ecx
+	shrl $3, %ecx
 	jz .Lhandle_7
+
 	.p2align 4
 .Lloop_8:
 	decl %ecx
-	movq (%rsi),%r8
-	movq %r8,(%rdi)
-	leaq 8(%rdi),%rdi
-	leaq 8(%rsi),%rsi
+	movq (%rsi), %r8
+	movq %r8, (%rdi)
+	leaq 8(%rdi), %rdi
+	leaq 8(%rsi), %rsi
 	jnz .Lloop_8
 
 .Lhandle_7:
-	movl %edx,%ecx
-	andl $7,%ecx
-	jz .Lende
+	movl %edx, %ecx
+	andl $7, %ecx
+	jz .Lend
+
 	.p2align 4
 .Lloop_1:
-	movb (%rsi),%r8b
-	movb %r8b,(%rdi)
+	movb (%rsi), %r8b
+	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
 	decl %ecx
 	jnz .Lloop_1
 
-.Lende:
+.Lend:
 	ret
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
-/* Some CPUs run faster using the string copy instructions.
-   It is also a lot simpler. Use this when possible */
+/*
+ * Some CPUs run faster using the string copy instructions.
+ * It is also a lot simpler. Use this when possible:
+ */
 
-	.section .altinstr_replacement,"ax"
+	.section .altinstr_replacement, "ax"
 1:	.byte 0xeb				/* jmp <disp8> */
 	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
 2:
 	.previous
-	.section .altinstructions,"a"
+
+	.section .altinstructions, "a"
 	.align 8
 	.quad memcpy
 	.quad 1b
 	.byte X86_FEATURE_REP_GOOD
-	/* Replace only beginning, memcpy is used to apply alternatives, so it
-	 * is silly to overwrite itself with nops - reboot is only outcome... */
+
+	/*
+	 * Replace only beginning, memcpy is used to apply alternatives,
+	 * so it is silly to overwrite itself with nops - reboot is the
+	 * only outcome...
+	 */
 	.byte 2b - 1b
 	.byte 2b - 1b
 	.previous
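
For orientation, here is a minimal C sketch of the copy strategy the reworked assembly implements: full 64-byte blocks first, then the remaining 8-byte words, then the trailing bytes. The function name and types are illustrative and not part of the kernel source; the unaligned 64-bit accesses are fine on x86-64 (the only target of this file) but are not portable C.

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch only - mirrors the structure of the assembly above. */
static void *memcpy_sketch(void *dest, const void *src, size_t count)
{
	unsigned char *d = dest;
	const unsigned char *s = src;
	size_t blocks = count >> 6;	/* shrl $6, %ecx: full 64-byte blocks */
	size_t words, bytes;

	while (blocks--) {
		/* the assembly unrolls this into four 2x8-byte load/store pairs */
		for (int i = 0; i < 8; i++)
			((uint64_t *)d)[i] = ((const uint64_t *)s)[i];
		s += 64;		/* leaq 64(%rsi), %rsi */
		d += 64;		/* leaq 64(%rdi), %rdi */
	}

	words = (count & 63) >> 3;	/* .Lhandle_tail: remaining 8-byte words */
	while (words--) {
		*(uint64_t *)d = *(const uint64_t *)s;
		s += 8;
		d += 8;
	}

	bytes = count & 7;		/* .Lhandle_7: trailing bytes */
	while (bytes--)
		*d++ = *s++;

	return dest;			/* %rax = original destination */
}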
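
The replacement stub in .altinstr_replacement is just a two-byte short jump: 0xeb is the JMP rel8 opcode, and the displacement byte is (memcpy_c - memcpy) - (2f - 1b), i.e. the jump target minus the address immediately after the two-byte instruction that ends up at the start of memcpy(). The .altinstructions record then tells apply_alternatives() where to patch, what to copy in, and which CPU feature gates the patch. As a rough sketch of what those directives populate (field names follow the kernel's struct alt_instr of this period; the authoritative layout, including 64-bit padding, is in arch/x86/include/asm/alternative.h):

typedef unsigned char u8;

/* Approximate descriptor layout, for orientation only: */
struct alt_instr {
	u8 *instr;		/* .quad memcpy: address of the code to patch     */
	u8 *replacement;	/* .quad 1b: address of the replacement bytes     */
	u8  cpuid;		/* .byte X86_FEATURE_REP_GOOD: gating feature bit */
	u8  instrlen;		/* .byte 2b - 1b: bytes to patch at the target    */
	u8  replacementlen;	/* .byte 2b - 1b: bytes available to copy in      */
};

Because both lengths cover only the two bytes of the jmp, just the very start of memcpy() is rewritten, which is exactly what the "Replace only beginning" comment above refers to.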
