author     Ma Ling <ling.ma@intel.com>        2012-10-17 15:52:45 -0400
committer  Ingo Molnar <mingo@kernel.org>     2012-10-24 06:42:47 -0400
commit     269833bd5a0f4443873da358b71675a890b47c3c
tree       40d476ea896ded8c0957f20c77e583673f8d1d0b /arch/x86/lib
parent     0e9e3e306c7e472bdcffa34c4c4584301eda03b3
x86/asm: Clean up copy_page_*() comments and code
Modern CPUs use fast-string instructions to accelerate copy
performance, combining the data into 128-bit chunks internally.
Modify the comments and coding style to match this.
Signed-off-by: Ma Ling <ling.ma@intel.com>
Cc: iant@google.com
Link: http://lkml.kernel.org/r/1350503565-19167-1-git-send-email-ling.ma@intel.com
[ Cleaned up the clean up. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
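
The copy_page_rep path renamed above is essentially a single rep movsq
over the whole page. As a minimal C sketch with GCC extended inline asm,
assuming a 4096-byte page (copy_page_rep_equiv() is an illustrative name,
not a kernel symbol):

	/* Sketch only: "rep movsq" copies %rcx quadwords from (%rsi)
	 * to (%rdi). On fast-string CPUs the microcode coalesces these
	 * 8-byte moves into wider (e.g. 128-bit) internal operations. */
	static inline void copy_page_rep_equiv(void *to, const void *from)
	{
		unsigned long quads = 4096 / 8;

		asm volatile("rep movsq"
			     : "+D" (to), "+S" (from), "+c" (quads)
			     : /* no other inputs */
			     : "memory");
	}

That internal coalescing is why this trivial loop can beat the
hand-unrolled copy on CPUs that implement fast strings.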
Diffstat (limited to 'arch/x86/lib')
-rw-r--r--  arch/x86/lib/copy_page_64.S | 120
1 file changed, 59 insertions(+), 61 deletions(-)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 6b34d04d096a..176cca67212b 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -5,91 +5,89 @@
 #include <asm/alternative-asm.h>
 
 	ALIGN
-copy_page_c:
+copy_page_rep:
 	CFI_STARTPROC
-	movl	$4096/8,%ecx
+	movl	$4096/8, %ecx
 	rep	movsq
 	ret
 	CFI_ENDPROC
-ENDPROC(copy_page_c)
+ENDPROC(copy_page_rep)
 
-/* Don't use streaming store because it's better when the target
-   ends up in cache. */
-
-/* Could vary the prefetch distance based on SMP/UP */
+/*
+ * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
+ * Could vary the prefetch distance based on SMP/UP.
+ */
 
 ENTRY(copy_page)
 	CFI_STARTPROC
-	subq	$2*8,%rsp
+	subq	$2*8, %rsp
 	CFI_ADJUST_CFA_OFFSET 2*8
-	movq	%rbx,(%rsp)
+	movq	%rbx, (%rsp)
 	CFI_REL_OFFSET rbx, 0
-	movq	%r12,1*8(%rsp)
+	movq	%r12, 1*8(%rsp)
 	CFI_REL_OFFSET r12, 1*8
 
-	movl	$(4096/64)-5,%ecx
+	movl	$(4096/64)-5, %ecx
 	.p2align 4
 .Loop64:
 	dec	%rcx
-
-	movq	(%rsi), %rax
-	movq	8 (%rsi), %rbx
-	movq	16 (%rsi), %rdx
-	movq	24 (%rsi), %r8
-	movq	32 (%rsi), %r9
-	movq	40 (%rsi), %r10
-	movq	48 (%rsi), %r11
-	movq	56 (%rsi), %r12
+	movq	0x8*0(%rsi), %rax
+	movq	0x8*1(%rsi), %rbx
+	movq	0x8*2(%rsi), %rdx
+	movq	0x8*3(%rsi), %r8
+	movq	0x8*4(%rsi), %r9
+	movq	0x8*5(%rsi), %r10
+	movq	0x8*6(%rsi), %r11
+	movq	0x8*7(%rsi), %r12
 
 	prefetcht0 5*64(%rsi)
 
-	movq	%rax, (%rdi)
-	movq	%rbx, 8 (%rdi)
-	movq	%rdx, 16 (%rdi)
-	movq	%r8, 24 (%rdi)
-	movq	%r9, 32 (%rdi)
-	movq	%r10, 40 (%rdi)
-	movq	%r11, 48 (%rdi)
-	movq	%r12, 56 (%rdi)
+	movq	%rax, 0x8*0(%rdi)
+	movq	%rbx, 0x8*1(%rdi)
+	movq	%rdx, 0x8*2(%rdi)
+	movq	%r8, 0x8*3(%rdi)
+	movq	%r9, 0x8*4(%rdi)
+	movq	%r10, 0x8*5(%rdi)
+	movq	%r11, 0x8*6(%rdi)
+	movq	%r12, 0x8*7(%rdi)
 
 	leaq	64 (%rsi), %rsi
 	leaq	64 (%rdi), %rdi
 
 	jnz	.Loop64
 
-	movl	$5,%ecx
+	movl	$5, %ecx
 	.p2align 4
.Loop2:
 	decl	%ecx
 
-	movq	(%rsi), %rax
-	movq	8 (%rsi), %rbx
-	movq	16 (%rsi), %rdx
-	movq	24 (%rsi), %r8
-	movq	32 (%rsi), %r9
-	movq	40 (%rsi), %r10
-	movq	48 (%rsi), %r11
-	movq	56 (%rsi), %r12
+	movq	0x8*0(%rsi), %rax
+	movq	0x8*1(%rsi), %rbx
+	movq	0x8*2(%rsi), %rdx
+	movq	0x8*3(%rsi), %r8
+	movq	0x8*4(%rsi), %r9
+	movq	0x8*5(%rsi), %r10
+	movq	0x8*6(%rsi), %r11
+	movq	0x8*7(%rsi), %r12
 
-	movq	%rax, (%rdi)
-	movq	%rbx, 8 (%rdi)
-	movq	%rdx, 16 (%rdi)
-	movq	%r8, 24 (%rdi)
-	movq	%r9, 32 (%rdi)
-	movq	%r10, 40 (%rdi)
-	movq	%r11, 48 (%rdi)
-	movq	%r12, 56 (%rdi)
+	movq	%rax, 0x8*0(%rdi)
+	movq	%rbx, 0x8*1(%rdi)
+	movq	%rdx, 0x8*2(%rdi)
+	movq	%r8, 0x8*3(%rdi)
+	movq	%r9, 0x8*4(%rdi)
+	movq	%r10, 0x8*5(%rdi)
+	movq	%r11, 0x8*6(%rdi)
+	movq	%r12, 0x8*7(%rdi)
 
-	leaq	64(%rdi),%rdi
-	leaq	64(%rsi),%rsi
-
+	leaq	64(%rdi), %rdi
+	leaq	64(%rsi), %rsi
 	jnz	.Loop2
 
-	movq	(%rsp),%rbx
+	movq	(%rsp), %rbx
 	CFI_RESTORE rbx
-	movq	1*8(%rsp),%r12
+	movq	1*8(%rsp), %r12
 	CFI_RESTORE r12
-	addq	$2*8,%rsp
+	addq	$2*8, %rsp
 	CFI_ADJUST_CFA_OFFSET -2*8
 	ret
.Lcopy_page_end:
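
Restated as a hedged C sketch (copy_page_unrolled(), PAGE_SIZE and
PREFETCH_AHEAD are illustrative names, not kernel identifiers): the
unrolled path copies one 64-byte cache line per iteration while
prefetching five lines ahead, then finishes with five prefetch-free
iterations so it never prefetches past the end of the source page.

	#include <stdint.h>

	#define PAGE_SIZE      4096
	#define CACHELINE      64
	#define PREFETCH_AHEAD 5	/* matches "prefetcht0 5*64(%rsi)" */

	/* Sketch of ENTRY(copy_page): eight quadword moves per 64-byte
	 * cache line, with a software prefetch five lines ahead. */
	static void copy_page_unrolled(void *to, const void *from)
	{
		uint64_t *d = to;
		const uint64_t *s = from;
		int i, j;

		/* .Loop64: (4096/64)-5 iterations, with prefetch */
		for (i = 0; i < PAGE_SIZE / CACHELINE - PREFETCH_AHEAD; i++) {
			__builtin_prefetch((const char *)s +
					   PREFETCH_AHEAD * CACHELINE, 0, 3);
			for (j = 0; j < 8; j++)
				d[j] = s[j];
			s += 8;
			d += 8;
		}

		/* .Loop2: the last five cache lines, no prefetch */
		for (i = 0; i < PREFETCH_AHEAD; i++) {
			for (j = 0; j < 8; j++)
				d[j] = s[j];
			s += 8;
			d += 8;
		}
	}

Here __builtin_prefetch(addr, 0, 3) corresponds to prefetcht0: a read
prefetch with maximum temporal locality.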
@@ -103,7 +101,7 @@ ENDPROC(copy_page)
 
 	.section .altinstr_replacement,"ax"
 1:	.byte 0xeb					/* jmp <disp8> */
-	.byte (copy_page_c - copy_page) - (2f - 1b)	/* offset */
+	.byte (copy_page_rep - copy_page) - (2f - 1b)	/* offset */
 2:
 	.previous
 	.section .altinstructions,"a"
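
This second hunk is the alternatives plumbing. The replacement is the
two-byte short jump 0xEB followed by a disp8; since a disp8 is relative
to the end of the jump instruction, the offset byte is computed as
(copy_page_rep - copy_page) - (2f - 1b). At boot, apply_alternatives()
patches this jump over the start of copy_page() when the CPU advertises
X86_FEATURE_REP_GOOD. A rough C model of the resulting dispatch, with
function pointers standing in for the in-place patching
(pick_copy_page() is hypothetical):

	typedef void (*copy_page_fn)(void *to, void *from);

	extern void copy_page(void *to, void *from);	 /* unrolled loop */
	extern void copy_page_rep(void *to, void *from); /* rep movsq    */

	/* Model only: the real mechanism rewrites copy_page()'s first
	 * bytes into "jmp copy_page_rep" instead of indirecting. */
	static copy_page_fn pick_copy_page(int has_rep_good)
	{
		return has_rep_good ? copy_page_rep : copy_page;
	}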