author     Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer  Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit     c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree       ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/x86/lib/memcpy_64.S
parent     ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent     6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c

Diffstat (limited to 'arch/x86/lib/memcpy_64.S')
 -rw-r--r--  arch/x86/lib/memcpy_64.S  |  203
 1 file changed, 135 insertions(+), 68 deletions(-)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e0f7d5..efbf2a0ecdea 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * memcpy - Copy a memory block.
@@ -37,107 +38,173 @@
 .Lmemcpy_e:
 	.previous
 
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
+ * memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+	movq %rdi, %rax
+
+	movl %edx, %ecx
+	rep movsb
+	ret
+.Lmemcpy_e_e:
+	.previous
+
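The variant added above leans entirely on the hardware fast-string copy: once the CPU advertises ERMS (Enhanced REP MOVSB/STOSB), a single REP MOVSB handles the whole length. As a rough user-space illustration only, assuming GCC-style inline assembly on x86-64 (the function name and wrapper are not the kernel's code):

#include <stddef.h>

/* Sketch of what .Lmemcpy_c_e boils down to: count in RCX, source in RSI,
 * destination in RDI, one REP MOVSB to copy everything. */
static void *memcpy_erms_sketch(void *dst, const void *src, size_t len)
{
	void *ret = dst;

	asm volatile("rep movsb"
		     : "+D" (dst), "+S" (src), "+c" (len)
		     : : "memory");
	return ret;
}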
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
+	movq %rdi, %rax
 
 	/*
-	 * Put the number of full 64-byte blocks into %ecx.
-	 * Tail portion is handled at the end:
+	 * Use 32bit CMP here to avoid long NOP padding.
 	 */
-	movq %rdi, %rax
-	movl %edx, %ecx
-	shrl $6, %ecx
-	jz .Lhandle_tail
+	cmp $0x20, %edx
+	jb .Lhandle_tail
 
-	.p2align 4
-.Lloop_64:
 	/*
-	 * We decrement the loop index here - and the zero-flag is
-	 * checked at the end of the loop (instructions inbetween do
-	 * not change the zero flag):
+	 * We check whether memory false dependence could occur,
+	 * then jump to corresponding copy mode.
 	 */
-	decl %ecx
+	cmp %dil, %sil
+	jl .Lcopy_backward
+	subl $0x20, %edx
+.Lcopy_forward_loop:
+	subq $0x20, %rdx
 
 	/*
-	 * Move in blocks of 4x16 bytes:
+	 * Move in blocks of 4x8 bytes:
 	 */
-	movq 0*8(%rsi), %r11
-	movq 1*8(%rsi), %r8
-	movq %r11, 0*8(%rdi)
-	movq %r8, 1*8(%rdi)
-
-	movq 2*8(%rsi), %r9
-	movq 3*8(%rsi), %r10
-	movq %r9, 2*8(%rdi)
-	movq %r10, 3*8(%rdi)
-
-	movq 4*8(%rsi), %r11
-	movq 5*8(%rsi), %r8
-	movq %r11, 4*8(%rdi)
-	movq %r8, 5*8(%rdi)
-
-	movq 6*8(%rsi), %r9
-	movq 7*8(%rsi), %r10
-	movq %r9, 6*8(%rdi)
-	movq %r10, 7*8(%rdi)
-
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-
-	jnz .Lloop_64
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq 2*8(%rsi), %r10
+	movq 3*8(%rsi), %r11
+	leaq 4*8(%rsi), %rsi
+
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, 2*8(%rdi)
+	movq %r11, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae .Lcopy_forward_loop
+	addq $0x20, %rdx
+	jmp .Lhandle_tail
+
+.Lcopy_backward:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * At most 3 ALU operations in one cycle,
+	 * so append NOPS in the same 16bytes trunk.
+	 */
+	.p2align 4
+.Lcopy_backward_loop:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r8
+	movq -2*8(%rsi), %r9
+	movq -3*8(%rsi), %r10
+	movq -4*8(%rsi), %r11
+	leaq -4*8(%rsi), %rsi
+	movq %r8, -1*8(%rdi)
+	movq %r9, -2*8(%rdi)
+	movq %r10, -3*8(%rdi)
+	movq %r11, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae .Lcopy_backward_loop
 
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
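For lengths of at least 32 bytes, the rewritten body above first picks a copy direction: if the low address bits of the source compare below those of the destination (cmp %dil, %sil), it copies 32-byte blocks backward from the tail to avoid the memory false dependence the comment mentions; otherwise it streams forward. A hedged C sketch of that control flow (names and the byte-wise inner copy are illustrative; the real code leaves the sub-32-byte remainder to the tail path):

#include <stdint.h>
#include <stddef.h>

/* Copy len bytes in 32-byte blocks, choosing the direction the way the
 * assembly does; returns how many bytes (< 32) are left for the tail path. */
static size_t copy_32byte_blocks_sketch(unsigned char *dst,
					const unsigned char *src, size_t len)
{
	size_t done;

	if (len < 0x20)
		return len;

	/* cheap heuristic on the low 8 address bits, as "cmp %dil, %sil" does */
	if ((int8_t)(uintptr_t)src < (int8_t)(uintptr_t)dst) {
		/* backward: 32-byte blocks from the end towards the head */
		while (len >= 0x20) {
			len -= 0x20;
			for (int i = 0x20 - 1; i >= 0; i--)
				dst[len + i] = src[len + i];
		}
		return len;		/* remainder sits at the head */
	}

	/* forward: 32-byte blocks from the start towards the tail */
	for (done = 0; len - done >= 0x20; done += 0x20)
		for (int i = 0; i < 0x20; i++)
			dst[done + i] = src[done + i];

	return len - done;		/* remainder sits at the tail */
}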
 .Lhandle_tail:
-	movl %edx, %ecx
-	andl $63, %ecx
-	shrl $3, %ecx
-	jz .Lhandle_7
+	cmpq $16, %rdx
+	jb .Lless_16bytes
 
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq -2*8(%rsi, %rdx), %r10
+	movq -1*8(%rsi, %rdx), %r11
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, -2*8(%rdi, %rdx)
+	movq %r11, -1*8(%rdi, %rdx)
+	retq
 	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi), %r8
-	movq %r8, (%rdi)
-	leaq 8(%rdi), %rdi
-	leaq 8(%rsi), %rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
-	movl %edx, %ecx
-	andl $7, %ecx
-	jz .Lend
+.Lless_16bytes:
+	cmpq $8, %rdx
+	jb .Lless_8bytes
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq -1*8(%rsi, %rdx), %r9
+	movq %r8, 0*8(%rdi)
+	movq %r9, -1*8(%rdi, %rdx)
+	retq
+	.p2align 4
+.Lless_8bytes:
+	cmpq $4, %rdx
+	jb .Lless_3bytes
 
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %ecx
+	movl -4(%rsi, %rdx), %r8d
+	movl %ecx, (%rdi)
+	movl %r8d, -4(%rdi, %rdx)
+	retq
 	.p2align 4
+.Lless_3bytes:
+	cmpl $0, %edx
+	je .Lend
+	/*
+	 * Move data from 1 bytes to 3 bytes.
+	 */
 .Lloop_1:
 	movb (%rsi), %r8b
 	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
-	decl %ecx
+	decl %edx
 	jnz .Lloop_1
 
 .Lend:
-	ret
+	retq
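The tail path above never loops for 4 to 31 leftover bytes: it issues loads anchored at the start of the remainder and loads anchored at its end, then stores that may overlap but together always cover the whole range. A small C sketch of the same trick (a hypothetical helper, assuming a fixed-size memcpy compiles to a single unaligned load or store):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Copy a remainder of 0..31 bytes the way .Lhandle_tail does: all loads
 * first, then possibly overlapping stores that cover the whole range. */
static void copy_tail_sketch(unsigned char *dst, const unsigned char *src,
			     size_t len)
{
	if (len >= 16) {			/* 16..31 bytes */
		uint64_t a, b, c, d;
		memcpy(&a, src, 8);
		memcpy(&b, src + 8, 8);
		memcpy(&c, src + len - 16, 8);
		memcpy(&d, src + len - 8, 8);
		memcpy(dst, &a, 8);
		memcpy(dst + 8, &b, 8);
		memcpy(dst + len - 16, &c, 8);
		memcpy(dst + len - 8, &d, 8);
	} else if (len >= 8) {			/* 8..15 bytes */
		uint64_t a, b;
		memcpy(&a, src, 8);
		memcpy(&b, src + len - 8, 8);
		memcpy(dst, &a, 8);
		memcpy(dst + len - 8, &b, 8);
	} else if (len >= 4) {			/* 4..7 bytes */
		uint32_t a, b;
		memcpy(&a, src, 4);
		memcpy(&b, src + len - 4, 4);
		memcpy(dst, &a, 4);
		memcpy(dst + len - 4, &b, 4);
	} else {				/* 1..3 bytes: plain byte loop */
		while (len--)
			*dst++ = *src++;
	}
}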
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
 	/*
-	 * Some CPUs run faster using the string copy instructions.
-	 * It is also a lot simpler. Use this when possible:
-	 */
-
-	.section .altinstructions, "a"
-	.align 8
-	.quad memcpy
-	.quad .Lmemcpy_c
-	.word X86_FEATURE_REP_GOOD
-
-	/*
+	 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
+	 * If the feature is supported, memcpy_c_e() is the first choice.
+	 * If enhanced rep movsb copy is not available, use fast string copy
+	 * memcpy_c() when possible. This is faster and code is simpler than
+	 * original memcpy().
+	 * Otherwise, original memcpy() is used.
+	 * In .altinstructions section, ERMS feature is placed after REG_GOOD
+	 * feature to implement the right patch order.
+	 *
 	 * Replace only beginning, memcpy is used to apply alternatives,
 	 * so it is silly to overwrite itself with nops - reboot is the
 	 * only outcome...
 	 */
-	.byte .Lmemcpy_e - .Lmemcpy_c
-	.byte .Lmemcpy_e - .Lmemcpy_c
+	.section .altinstructions, "a"
+	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
 	.previous
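The two altinstruction_entry records are applied in order, so on a CPU that has both REP_GOOD and ERMS the second patch wins and memcpy ends up as the rep movsb variant. Expressed as plain runtime dispatch rather than boot-time binary patching (a simplification; the stub names and feature flags below are illustrative, not the kernel's interface):

#include <stddef.h>
#include <string.h>

typedef void *(*memcpy_fn)(void *, const void *, size_t);

/* stand-ins for the three bodies the alternatives machinery chooses from */
static void *memcpy_unrolled(void *d, const void *s, size_t n)  { return memcpy(d, s, n); }
static void *memcpy_rep_movsq(void *d, const void *s, size_t n) { return memcpy(d, s, n); }
static void *memcpy_rep_movsb(void *d, const void *s, size_t n) { return memcpy(d, s, n); }

/* Later entries override earlier ones, mirroring the REP_GOOD-then-ERMS order. */
static memcpy_fn pick_memcpy(int has_rep_good, int has_erms)
{
	memcpy_fn fn = memcpy_unrolled;		/* default: the open-coded loop */

	if (has_rep_good)
		fn = memcpy_rep_movsq;		/* .Lmemcpy_c   */
	if (has_erms)
		fn = memcpy_rep_movsb;		/* .Lmemcpy_c_e */
	return fn;
}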