path: root/arch/x86/lib/memcpy_64.S
author	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit	c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree	ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/x86/lib/memcpy_64.S
parent	ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent	6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts: litmus/sched_cedf.c
Diffstat (limited to 'arch/x86/lib/memcpy_64.S')
-rw-r--r--	arch/x86/lib/memcpy_64.S	203
1 files changed, 135 insertions, 68 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e0f7d5..efbf2a0ecdea 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * memcpy - Copy a memory block.
@@ -37,107 +38,173 @@
 .Lmemcpy_e:
 	.previous
 
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
+ * memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+	movq %rdi, %rax
+
+	movl %edx, %ecx
+	rep movsb
+	ret
+.Lmemcpy_e_e:
+	.previous
+
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
+	movq %rdi, %rax
 
 	/*
-	 * Put the number of full 64-byte blocks into %ecx.
-	 * Tail portion is handled at the end:
+	 * Use 32bit CMP here to avoid long NOP padding.
 	 */
-	movq %rdi, %rax
-	movl %edx, %ecx
-	shrl $6, %ecx
-	jz .Lhandle_tail
+	cmp $0x20, %edx
+	jb .Lhandle_tail
 
-	.p2align 4
-.Lloop_64:
 	/*
-	 * We decrement the loop index here - and the zero-flag is
-	 * checked at the end of the loop (instructions inbetween do
-	 * not change the zero flag):
+	 * We check whether memory false dependence could occur,
+	 * then jump to corresponding copy mode.
 	 */
-	decl %ecx
+	cmp %dil, %sil
+	jl .Lcopy_backward
+	subl $0x20, %edx
+.Lcopy_forward_loop:
+	subq $0x20, %rdx
 
 	/*
-	 * Move in blocks of 4x16 bytes:
+	 * Move in blocks of 4x8 bytes:
 	 */
-	movq 0*8(%rsi), %r11
-	movq 1*8(%rsi), %r8
-	movq %r11, 0*8(%rdi)
-	movq %r8, 1*8(%rdi)
-
-	movq 2*8(%rsi), %r9
-	movq 3*8(%rsi), %r10
-	movq %r9, 2*8(%rdi)
-	movq %r10, 3*8(%rdi)
-
-	movq 4*8(%rsi), %r11
-	movq 5*8(%rsi), %r8
-	movq %r11, 4*8(%rdi)
-	movq %r8, 5*8(%rdi)
-
-	movq 6*8(%rsi), %r9
-	movq 7*8(%rsi), %r10
-	movq %r9, 6*8(%rdi)
-	movq %r10, 7*8(%rdi)
-
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-
-	jnz .Lloop_64
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq 2*8(%rsi), %r10
+	movq 3*8(%rsi), %r11
+	leaq 4*8(%rsi), %rsi
+
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, 2*8(%rdi)
+	movq %r11, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae .Lcopy_forward_loop
+	addq $0x20, %rdx
+	jmp .Lhandle_tail
+
+.Lcopy_backward:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * At most 3 ALU operations in one cycle,
+	 * so append NOPS in the same 16bytes trunk.
+	 */
+	.p2align 4
+.Lcopy_backward_loop:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r8
+	movq -2*8(%rsi), %r9
+	movq -3*8(%rsi), %r10
+	movq -4*8(%rsi), %r11
+	leaq -4*8(%rsi), %rsi
+	movq %r8, -1*8(%rdi)
+	movq %r9, -2*8(%rdi)
+	movq %r10, -3*8(%rdi)
+	movq %r11, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae .Lcopy_backward_loop
 
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
 .Lhandle_tail:
-	movl %edx, %ecx
-	andl $63, %ecx
-	shrl $3, %ecx
-	jz .Lhandle_7
+	cmpq $16, %rdx
+	jb .Lless_16bytes
 
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq -2*8(%rsi, %rdx), %r10
+	movq -1*8(%rsi, %rdx), %r11
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, -2*8(%rdi, %rdx)
+	movq %r11, -1*8(%rdi, %rdx)
+	retq
 	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi), %r8
-	movq %r8, (%rdi)
-	leaq 8(%rdi), %rdi
-	leaq 8(%rsi), %rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
-	movl %edx, %ecx
-	andl $7, %ecx
-	jz .Lend
+.Lless_16bytes:
+	cmpq $8, %rdx
+	jb .Lless_8bytes
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq -1*8(%rsi, %rdx), %r9
+	movq %r8, 0*8(%rdi)
+	movq %r9, -1*8(%rdi, %rdx)
+	retq
+	.p2align 4
+.Lless_8bytes:
+	cmpq $4, %rdx
+	jb .Lless_3bytes
 
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %ecx
+	movl -4(%rsi, %rdx), %r8d
+	movl %ecx, (%rdi)
+	movl %r8d, -4(%rdi, %rdx)
+	retq
 	.p2align 4
+.Lless_3bytes:
+	cmpl $0, %edx
+	je .Lend
+	/*
+	 * Move data from 1 bytes to 3 bytes.
+	 */
 .Lloop_1:
 	movb (%rsi), %r8b
 	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
-	decl %ecx
+	decl %edx
 	jnz .Lloop_1
 
 .Lend:
-	ret
+	retq
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
 	/*
-	 * Some CPUs run faster using the string copy instructions.
-	 * It is also a lot simpler. Use this when possible:
-	 */
-
-	.section .altinstructions, "a"
-	.align 8
-	.quad memcpy
-	.quad .Lmemcpy_c
-	.word X86_FEATURE_REP_GOOD
-
-	/*
+	 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
+	 * If the feature is supported, memcpy_c_e() is the first choice.
+	 * If enhanced rep movsb copy is not available, use fast string copy
+	 * memcpy_c() when possible. This is faster and code is simpler than
+	 * original memcpy().
+	 * Otherwise, original memcpy() is used.
+	 * In .altinstructions section, ERMS feature is placed after REG_GOOD
+	 * feature to implement the right patch order.
+	 *
 	 * Replace only beginning, memcpy is used to apply alternatives,
 	 * so it is silly to overwrite itself with nops - reboot is the
 	 * only outcome...
 	 */
-	.byte .Lmemcpy_e - .Lmemcpy_c
-	.byte .Lmemcpy_e - .Lmemcpy_c
+	.section .altinstructions, "a"
+	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
 	.previous
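
A note on the .altinstructions ordering above: alternatives entries are applied in order at boot, so listing X86_FEATURE_ERMS after X86_FEATURE_REP_GOOD lets the ERMS replacement (rep movsb) patch over the REP_GOOD one (rep movsq) when both features are present. The short C program below only illustrates that priority; it is not kernel code, the two feature booleans are made-up stand-ins for the CPUID tests, and the real kernel rewrites the first bytes of memcpy in place instead of branching at run time.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the boot-time CPUID feature tests (assumed values). */
static bool cpu_has_rep_good = true;	/* X86_FEATURE_REP_GOOD */
static bool cpu_has_erms     = true;	/* X86_FEATURE_ERMS */

int main(void)
{
	const char *variant = "memcpy (unrolled original)";

	/*
	 * Entries are applied in order, so a later matching entry
	 * overrides an earlier one: ERMS is listed last and therefore
	 * wins whenever it is available.
	 */
	if (cpu_has_rep_good)
		variant = "memcpy_c (rep movsq)";
	if (cpu_has_erms)
		variant = "memcpy_c_e (rep movsb)";

	printf("patched-in variant: %s\n", variant);
	return 0;
}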
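
For the rewritten copy body itself, the strategy is: for 32 bytes or more, copy forward in 4x8-byte blocks unless the destination sits above the source (the assembly compares only the low pointer bytes, %dil vs %sil, as a cheap heuristic), in which case the blocks are copied backward from the tail to sidestep the store-to-load memory false dependence; anything under 32 bytes, plus whatever remains after the block loop, is finished with overlapping 8- and 4-byte loads and stores so only lengths of 1-3 bytes need a byte loop. The C sketch below models that flow for illustration only; memcpy_model, ld8/st8 and ld4/st4 are made-up helper names, and the actual implementation is the assembly above.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Unaligned load/store helpers standing in for movq/movl. */
static uint64_t ld8(const unsigned char *p) { uint64_t v; memcpy(&v, p, 8); return v; }
static void st8(unsigned char *p, uint64_t v) { memcpy(p, &v, 8); }
static uint32_t ld4(const unsigned char *p) { uint32_t v; memcpy(&v, p, 4); return v; }
static void st4(unsigned char *p, uint32_t v) { memcpy(p, &v, 4); }

/* Hypothetical C model of the strategy; not the kernel implementation. */
static void *memcpy_model(void *dst, const void *src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (n >= 32) {
		if (d <= s) {
			/* Forward copy in 4x8-byte blocks. */
			while (n >= 32) {
				uint64_t r8 = ld8(s), r9 = ld8(s + 8);
				uint64_t r10 = ld8(s + 16), r11 = ld8(s + 24);
				st8(d, r8); st8(d + 8, r9);
				st8(d + 16, r10); st8(d + 24, r11);
				s += 32; d += 32; n -= 32;
			}
		} else {
			/*
			 * Destination is above the source: copy the blocks
			 * backward from the tail so stores never land just
			 * ahead of upcoming loads (the false dependence).
			 */
			const unsigned char *se = s + n;
			unsigned char *de = d + n;
			while (n >= 32) {
				se -= 32; de -= 32; n -= 32;
				uint64_t r8 = ld8(se), r9 = ld8(se + 8);
				uint64_t r10 = ld8(se + 16), r11 = ld8(se + 24);
				st8(de, r8); st8(de + 8, r9);
				st8(de + 16, r10); st8(de + 24, r11);
			}
			/* The uncopied remainder is back at the head (s, d). */
		}
	}

	/* Tail: overlapping loads/stores; all loads issue before the stores. */
	if (n >= 16) {			/* 16..31 bytes */
		uint64_t a = ld8(s), b = ld8(s + 8);
		uint64_t c = ld8(s + n - 16), e = ld8(s + n - 8);
		st8(d, a); st8(d + 8, b);
		st8(d + n - 16, c); st8(d + n - 8, e);
	} else if (n >= 8) {		/* 8..15 bytes */
		uint64_t a = ld8(s), b = ld8(s + n - 8);
		st8(d, a); st8(d + n - 8, b);
	} else if (n >= 4) {		/* 4..7 bytes */
		uint32_t a = ld4(s), b = ld4(s + n - 4);
		st4(d, a); st4(d + n - 4, b);
	} else {			/* 0..3 bytes */
		while (n--) *d++ = *s++;
	}
	return dst;
}

int main(void)
{
	unsigned char src[256], dst[256];
	for (size_t i = 0; i < sizeof(src); i++) src[i] = (unsigned char)i;

	/* Check every length against the libc memcpy (non-overlapping buffers). */
	for (size_t len = 0; len <= sizeof(src); len++) {
		memset(dst, 0xaa, sizeof(dst));
		memcpy_model(dst, src, len);
		if (memcmp(dst, src, len) != 0)
			printf("mismatch at len %zu\n", len);
	}
	printf("all lengths checked\n");
	return 0;
}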