diff options
Diffstat (limited to 'include/asm-x86/xor_64.h')
-rw-r--r-- | include/asm-x86/xor_64.h | 361 |
1 files changed, 0 insertions, 361 deletions
diff --git a/include/asm-x86/xor_64.h b/include/asm-x86/xor_64.h deleted file mode 100644 index 2d3a18de295b..000000000000 --- a/include/asm-x86/xor_64.h +++ /dev/null | |||
@@ -1,361 +0,0 @@ | |||
1 | #ifndef ASM_X86__XOR_64_H | ||
2 | #define ASM_X86__XOR_64_H | ||
3 | |||
4 | /* | ||
5 | * Optimized RAID-5 checksumming functions for MMX and SSE. | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; either version 2, or (at your option) | ||
10 | * any later version. | ||
11 | * | ||
12 | * You should have received a copy of the GNU General Public License | ||
13 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
14 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
15 | */ | ||
16 | |||
17 | |||
18 | /* | ||
19 | * Cache avoiding checksumming functions utilizing KNI instructions | ||
20 | * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) | ||
21 | */ | ||
22 | |||
23 | /* | ||
24 | * Based on | ||
25 | * High-speed RAID5 checksumming functions utilizing SSE instructions. | ||
26 | * Copyright (C) 1998 Ingo Molnar. | ||
27 | */ | ||
28 | |||
29 | /* | ||
30 | * x86-64 changes / gcc fixes from Andi Kleen. | ||
31 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
32 | * | ||
33 | * This hasn't been optimized for the hammer yet, but there are likely | ||
34 | * no advantages to be gotten from x86-64 here anyways. | ||
35 | */ | ||
36 | |||
/*
 * Backing store for one 128-bit XMM register.  The 16-byte alignment
 * attribute makes every element of an xmm_store_t array start on a
 * 16-byte boundary.
 */
typedef struct {
	unsigned long a;
	unsigned long b;
} __attribute__((aligned(16))) xmm_store_t;
40 | |||
/*
 * XMMS_SAVE - prepare the CPU for using xmm0-xmm3.
 *
 * Expects two variables in the expanding scope:
 *   cr0      - an lvalue that receives the current value of %cr0
 *   xmm_save - storage for four XMM registers (offsets 0x00..0x30)
 *
 * Preemption is disabled first; XMMS_RESTORE re-enables it.  The registers
 * are saved by hand rather than through gcc clobbers, because there is no
 * easy way to make gcc emit the clts before its own register saving.
 */
#define XMMS_SAVE						\
do {								\
	preempt_disable();					\
	asm volatile(						\
		"movq %%cr0,%0 ;\n\t"				\
		"clts ;\n\t"					\
		"movups %%xmm0,(%1) ;\n\t"			\
		"movups %%xmm1,0x10(%1) ;\n\t"			\
		"movups %%xmm2,0x20(%1) ;\n\t"			\
		"movups %%xmm3,0x30(%1) ;\n\t"			\
		: "=&r" (cr0)					\
		: "r" (xmm_save)				\
		: "memory");					\
} while (0)
57 | |||
/*
 * XMMS_RESTORE - undo XMMS_SAVE.
 *
 * Issues an sfence, reloads xmm0-xmm3 from xmm_save, writes the saved
 * value in cr0 back to %cr0 and finally re-enables preemption.  Uses the
 * same `cr0` and `xmm_save` variables from the expanding scope that
 * XMMS_SAVE filled in.
 */
#define XMMS_RESTORE						\
do {								\
	asm volatile(						\
		"sfence ;\n\t"					\
		"movups (%1),%%xmm0 ;\n\t"			\
		"movups 0x10(%1),%%xmm1 ;\n\t"			\
		"movups 0x20(%1),%%xmm2 ;\n\t"			\
		"movups 0x30(%1),%%xmm3 ;\n\t"			\
		"movq %0,%%cr0 ;\n\t"				\
		:						\
		: "r" (cr0), "r" (xmm_save)			\
		: "memory");					\
	preempt_enable();					\
} while (0)
72 | |||
/*
 * String builders for the inline-asm bodies of the xor_sse_*() routines.
 * Each macro expands to one assembler line as a string literal.
 *
 *   OFFS(x)     byte offset of the x-th 16-byte slot
 *   PF_OFFS(x)  the same offset, 256 bytes further ahead (prefetch target)
 *   LD(x, y)    load slot x of [p1] into %xmm<y>
 *   ST(x, y)    store %xmm<y> into slot x of [p1]
 *   PFn(x)      prefetchnta on the n-th buffer: PF0 -> [p1] ... PF5 -> [p6]
 *   XOn(x, y)   xorps slot x of buffer n+1 into %xmm<y>: XO1 -> [p2] ... XO5 -> [p6]
 */
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"

/* Generators: `reg` is the asm operand name of the buffer pointer. */
#define XOR_SSE_PF(reg, x)	" prefetchnta " PF_OFFS(x) "(%[" #reg "]) ;\n"
#define XOR_SSE_XO(reg, x, y)	" xorps " OFFS(x) "(%[" #reg "]), %%xmm" #y " ;\n"

#define LD(x, y)	" movaps " OFFS(x) "(%[p1]), %%xmm" #y " ;\n"
#define ST(x, y)	" movaps %%xmm" #y ", " OFFS(x) "(%[p1]) ;\n"

#define PF0(x)		XOR_SSE_PF(p1, x)
#define PF1(x)		XOR_SSE_PF(p2, x)
#define PF2(x)		XOR_SSE_PF(p3, x)
#define PF3(x)		XOR_SSE_PF(p4, x)
#define PF4(x)		XOR_SSE_PF(p5, x)
#define PF5(x)		XOR_SSE_PF(p6, x)

#define XO1(x, y)	XOR_SSE_XO(p2, x, y)
#define XO2(x, y)	XOR_SSE_XO(p3, x, y)
#define XO3(x, y)	XOR_SSE_XO(p4, x, y)
#define XO4(x, y)	XOR_SSE_XO(p5, x, y)
#define XO5(x, y)	XOR_SSE_XO(p6, x, y)
88 | |||
89 | |||
90 | static void | ||
91 | xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | ||
92 | { | ||
93 | unsigned int lines = bytes >> 8; | ||
94 | unsigned long cr0; | ||
95 | xmm_store_t xmm_save[4]; | ||
96 | |||
97 | XMMS_SAVE; | ||
98 | |||
99 | asm volatile( | ||
100 | #undef BLOCK | ||
101 | #define BLOCK(i) \ | ||
102 | LD(i, 0) \ | ||
103 | LD(i + 1, 1) \ | ||
104 | PF1(i) \ | ||
105 | PF1(i + 2) \ | ||
106 | LD(i + 2, 2) \ | ||
107 | LD(i + 3, 3) \ | ||
108 | PF0(i + 4) \ | ||
109 | PF0(i + 6) \ | ||
110 | XO1(i, 0) \ | ||
111 | XO1(i + 1, 1) \ | ||
112 | XO1(i + 2, 2) \ | ||
113 | XO1(i + 3, 3) \ | ||
114 | ST(i, 0) \ | ||
115 | ST(i + 1, 1) \ | ||
116 | ST(i + 2, 2) \ | ||
117 | ST(i + 3, 3) \ | ||
118 | |||
119 | |||
120 | PF0(0) | ||
121 | PF0(2) | ||
122 | |||
123 | " .align 32 ;\n" | ||
124 | " 1: ;\n" | ||
125 | |||
126 | BLOCK(0) | ||
127 | BLOCK(4) | ||
128 | BLOCK(8) | ||
129 | BLOCK(12) | ||
130 | |||
131 | " addq %[inc], %[p1] ;\n" | ||
132 | " addq %[inc], %[p2] ;\n" | ||
133 | " decl %[cnt] ; jnz 1b" | ||
134 | : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines) | ||
135 | : [inc] "r" (256UL) | ||
136 | : "memory"); | ||
137 | |||
138 | XMMS_RESTORE; | ||
139 | } | ||
140 | |||
141 | static void | ||
142 | xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
143 | unsigned long *p3) | ||
144 | { | ||
145 | unsigned int lines = bytes >> 8; | ||
146 | xmm_store_t xmm_save[4]; | ||
147 | unsigned long cr0; | ||
148 | |||
149 | XMMS_SAVE; | ||
150 | |||
151 | asm volatile( | ||
152 | #undef BLOCK | ||
153 | #define BLOCK(i) \ | ||
154 | PF1(i) \ | ||
155 | PF1(i + 2) \ | ||
156 | LD(i, 0) \ | ||
157 | LD(i + 1, 1) \ | ||
158 | LD(i + 2, 2) \ | ||
159 | LD(i + 3, 3) \ | ||
160 | PF2(i) \ | ||
161 | PF2(i + 2) \ | ||
162 | PF0(i + 4) \ | ||
163 | PF0(i + 6) \ | ||
164 | XO1(i, 0) \ | ||
165 | XO1(i + 1, 1) \ | ||
166 | XO1(i + 2, 2) \ | ||
167 | XO1(i + 3, 3) \ | ||
168 | XO2(i, 0) \ | ||
169 | XO2(i + 1, 1) \ | ||
170 | XO2(i + 2, 2) \ | ||
171 | XO2(i + 3, 3) \ | ||
172 | ST(i, 0) \ | ||
173 | ST(i + 1, 1) \ | ||
174 | ST(i + 2, 2) \ | ||
175 | ST(i + 3, 3) \ | ||
176 | |||
177 | |||
178 | PF0(0) | ||
179 | PF0(2) | ||
180 | |||
181 | " .align 32 ;\n" | ||
182 | " 1: ;\n" | ||
183 | |||
184 | BLOCK(0) | ||
185 | BLOCK(4) | ||
186 | BLOCK(8) | ||
187 | BLOCK(12) | ||
188 | |||
189 | " addq %[inc], %[p1] ;\n" | ||
190 | " addq %[inc], %[p2] ;\n" | ||
191 | " addq %[inc], %[p3] ;\n" | ||
192 | " decl %[cnt] ; jnz 1b" | ||
193 | : [cnt] "+r" (lines), | ||
194 | [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) | ||
195 | : [inc] "r" (256UL) | ||
196 | : "memory"); | ||
197 | XMMS_RESTORE; | ||
198 | } | ||
199 | |||
200 | static void | ||
201 | xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
202 | unsigned long *p3, unsigned long *p4) | ||
203 | { | ||
204 | unsigned int lines = bytes >> 8; | ||
205 | xmm_store_t xmm_save[4]; | ||
206 | unsigned long cr0; | ||
207 | |||
208 | XMMS_SAVE; | ||
209 | |||
210 | asm volatile( | ||
211 | #undef BLOCK | ||
212 | #define BLOCK(i) \ | ||
213 | PF1(i) \ | ||
214 | PF1(i + 2) \ | ||
215 | LD(i, 0) \ | ||
216 | LD(i + 1, 1) \ | ||
217 | LD(i + 2, 2) \ | ||
218 | LD(i + 3, 3) \ | ||
219 | PF2(i) \ | ||
220 | PF2(i + 2) \ | ||
221 | XO1(i, 0) \ | ||
222 | XO1(i + 1, 1) \ | ||
223 | XO1(i + 2, 2) \ | ||
224 | XO1(i + 3, 3) \ | ||
225 | PF3(i) \ | ||
226 | PF3(i + 2) \ | ||
227 | PF0(i + 4) \ | ||
228 | PF0(i + 6) \ | ||
229 | XO2(i, 0) \ | ||
230 | XO2(i + 1, 1) \ | ||
231 | XO2(i + 2, 2) \ | ||
232 | XO2(i + 3, 3) \ | ||
233 | XO3(i, 0) \ | ||
234 | XO3(i + 1, 1) \ | ||
235 | XO3(i + 2, 2) \ | ||
236 | XO3(i + 3, 3) \ | ||
237 | ST(i, 0) \ | ||
238 | ST(i + 1, 1) \ | ||
239 | ST(i + 2, 2) \ | ||
240 | ST(i + 3, 3) \ | ||
241 | |||
242 | |||
243 | PF0(0) | ||
244 | PF0(2) | ||
245 | |||
246 | " .align 32 ;\n" | ||
247 | " 1: ;\n" | ||
248 | |||
249 | BLOCK(0) | ||
250 | BLOCK(4) | ||
251 | BLOCK(8) | ||
252 | BLOCK(12) | ||
253 | |||
254 | " addq %[inc], %[p1] ;\n" | ||
255 | " addq %[inc], %[p2] ;\n" | ||
256 | " addq %[inc], %[p3] ;\n" | ||
257 | " addq %[inc], %[p4] ;\n" | ||
258 | " decl %[cnt] ; jnz 1b" | ||
259 | : [cnt] "+c" (lines), | ||
260 | [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) | ||
261 | : [inc] "r" (256UL) | ||
262 | : "memory" ); | ||
263 | |||
264 | XMMS_RESTORE; | ||
265 | } | ||
266 | |||
267 | static void | ||
268 | xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
269 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | ||
270 | { | ||
271 | unsigned int lines = bytes >> 8; | ||
272 | xmm_store_t xmm_save[4]; | ||
273 | unsigned long cr0; | ||
274 | |||
275 | XMMS_SAVE; | ||
276 | |||
277 | asm volatile( | ||
278 | #undef BLOCK | ||
279 | #define BLOCK(i) \ | ||
280 | PF1(i) \ | ||
281 | PF1(i + 2) \ | ||
282 | LD(i, 0) \ | ||
283 | LD(i + 1, 1) \ | ||
284 | LD(i + 2, 2) \ | ||
285 | LD(i + 3, 3) \ | ||
286 | PF2(i) \ | ||
287 | PF2(i + 2) \ | ||
288 | XO1(i, 0) \ | ||
289 | XO1(i + 1, 1) \ | ||
290 | XO1(i + 2, 2) \ | ||
291 | XO1(i + 3, 3) \ | ||
292 | PF3(i) \ | ||
293 | PF3(i + 2) \ | ||
294 | XO2(i, 0) \ | ||
295 | XO2(i + 1, 1) \ | ||
296 | XO2(i + 2, 2) \ | ||
297 | XO2(i + 3, 3) \ | ||
298 | PF4(i) \ | ||
299 | PF4(i + 2) \ | ||
300 | PF0(i + 4) \ | ||
301 | PF0(i + 6) \ | ||
302 | XO3(i, 0) \ | ||
303 | XO3(i + 1, 1) \ | ||
304 | XO3(i + 2, 2) \ | ||
305 | XO3(i + 3, 3) \ | ||
306 | XO4(i, 0) \ | ||
307 | XO4(i + 1, 1) \ | ||
308 | XO4(i + 2, 2) \ | ||
309 | XO4(i + 3, 3) \ | ||
310 | ST(i, 0) \ | ||
311 | ST(i + 1, 1) \ | ||
312 | ST(i + 2, 2) \ | ||
313 | ST(i + 3, 3) \ | ||
314 | |||
315 | |||
316 | PF0(0) | ||
317 | PF0(2) | ||
318 | |||
319 | " .align 32 ;\n" | ||
320 | " 1: ;\n" | ||
321 | |||
322 | BLOCK(0) | ||
323 | BLOCK(4) | ||
324 | BLOCK(8) | ||
325 | BLOCK(12) | ||
326 | |||
327 | " addq %[inc], %[p1] ;\n" | ||
328 | " addq %[inc], %[p2] ;\n" | ||
329 | " addq %[inc], %[p3] ;\n" | ||
330 | " addq %[inc], %[p4] ;\n" | ||
331 | " addq %[inc], %[p5] ;\n" | ||
332 | " decl %[cnt] ; jnz 1b" | ||
333 | : [cnt] "+c" (lines), | ||
334 | [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), | ||
335 | [p5] "+r" (p5) | ||
336 | : [inc] "r" (256UL) | ||
337 | : "memory"); | ||
338 | |||
339 | XMMS_RESTORE; | ||
340 | } | ||
341 | |||
342 | static struct xor_block_template xor_block_sse = { | ||
343 | .name = "generic_sse", | ||
344 | .do_2 = xor_sse_2, | ||
345 | .do_3 = xor_sse_3, | ||
346 | .do_4 = xor_sse_4, | ||
347 | .do_5 = xor_sse_5, | ||
348 | }; | ||
349 | |||
/* The SSE template is the only candidate handed to xor_speed(). */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES			\
do {						\
	xor_speed(&xor_block_sse);		\
} while (0)

/*
 * Always pick the SSE xor block, whatever FASTEST is, because it can write
 * around L2.  We may also be able to load into the L1 only, depending on
 * how the cpu deals with a load to a line that is being prefetched.
 */
#define XOR_SELECT_TEMPLATE(FASTEST)	(&xor_block_sse)
360 | |||
361 | #endif /* ASM_X86__XOR_64_H */ | ||