Diffstat (limited to 'arch/x86/include')
-rw-r--r--  arch/x86/include/asm/linkage.h            |  18
-rw-r--r--  arch/x86/include/asm/pgtable.h            |  12
-rw-r--r--  arch/x86/include/asm/pgtable_32.h         |   7
-rw-r--r--  arch/x86/include/asm/pgtable_64.h         |   3
-rw-r--r--  arch/x86/include/asm/required-features.h  |   8
-rw-r--r--  arch/x86/include/asm/xor.h                | 491
-rw-r--r--  arch/x86/include/asm/xor_32.h             | 309
-rw-r--r--  arch/x86/include/asm/xor_64.h             | 305
8 files changed, 534 insertions(+), 619 deletions(-)
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 48142971b25d..79327e9483a3 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -27,20 +27,20 @@
 #define __asmlinkage_protect0(ret) \
 	__asmlinkage_protect_n(ret)
 #define __asmlinkage_protect1(ret, arg1) \
-	__asmlinkage_protect_n(ret, "g" (arg1))
+	__asmlinkage_protect_n(ret, "m" (arg1))
 #define __asmlinkage_protect2(ret, arg1, arg2) \
-	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2))
+	__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2))
 #define __asmlinkage_protect3(ret, arg1, arg2, arg3) \
-	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3))
+	__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3))
 #define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \
-	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
-			       "g" (arg4))
+	__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
+			       "m" (arg4))
 #define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \
-	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
-			       "g" (arg4), "g" (arg5))
+	__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
+			       "m" (arg4), "m" (arg5))
 #define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \
-	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
-			       "g" (arg4), "g" (arg5), "g" (arg6))
+	__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
+			       "m" (arg4), "m" (arg5), "m" (arg6))
 
 #endif /* CONFIG_X86_32 */
 
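For context on the constraint switch above: a "g" operand lets the compiler satisfy the input with a register or even an immediate, while "m" forces it to reference the variable's in-memory (stack-slot) form, which is what keeps the on-stack arguments of an asmlinkage function alive across the protected call. A minimal standalone sketch of the difference, not part of the patch (the function name is invented for illustration):

/* Illustrative sketch only -- keep_arg_slot_alive() is a made-up name.
 * The empty asm claims to read both the register copy ("0"/"=r") and the
 * variable's memory slot ("m"), so the compiler must keep arg1's stack
 * slot valid; a "g" constraint could instead be satisfied with a register
 * copy, leaving the slot free to be reused.
 */
static inline long keep_arg_slot_alive(long arg1)
{
	long ret = arg1;

	asm volatile("" : "=r" (ret) : "0" (ret), "m" (arg1));
	return ret;
}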
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1c1a955e67c0..fc304279b559 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -786,6 +786,18 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 	memcpy(dst, src, count * sizeof(pgd_t));
 }
 
+/*
+ * The x86 doesn't have any external MMU info: the kernel page
+ * tables contain all the necessary information.
+ */
+static inline void update_mmu_cache(struct vm_area_struct *vma,
+		unsigned long addr, pte_t *ptep)
+{
+}
+static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
+		unsigned long addr, pmd_t *pmd)
+{
+}
 
 #include <asm-generic/pgtable.h>
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 8faa215a503e..9ee322103c6d 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -66,13 +66,6 @@ do { \
 	__flush_tlb_one((vaddr)); \
 } while (0)
 
-/*
- * The i386 doesn't have any external MMU info: the kernel page
- * tables contain all the necessary information.
- */
-#define update_mmu_cache(vma, address, ptep) do { } while (0)
-#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
-
 #endif /* !__ASSEMBLY__ */
 
 /*
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 47356f9df82e..615b0c78449f 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -142,9 +142,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
 #define pte_unmap(pte) ((void)(pte))/* NOP */
 
-#define update_mmu_cache(vma, address, ptep) do { } while (0)
-#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
-
 /* Encode and de-code a swap entry */
 #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
 #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 6c7fc25f2c34..5c6e4fb370f5 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -47,6 +47,12 @@
 # define NEED_NOPL 0
 #endif
 
+#ifdef CONFIG_MATOM
+# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31))
+#else
+# define NEED_MOVBE 0
+#endif
+
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_PARAVIRT
 /* Paravirtualized systems may not have PSE or PGE available */
@@ -80,7 +86,7 @@
 
 #define REQUIRED_MASK2 0
 #define REQUIRED_MASK3 (NEED_NOPL)
-#define REQUIRED_MASK4 0
+#define REQUIRED_MASK4 (NEED_MOVBE)
 #define REQUIRED_MASK5 0
 #define REQUIRED_MASK6 0
 #define REQUIRED_MASK7 0
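The NEED_MOVBE plumbing follows the usual required-features pattern: the feature constant encodes a (capability word, bit) pair, the `& 31` keeps only the bit index, and REQUIRED_MASK4 is the mask checked against capability word 4 during early CPU verification. A worked sketch of the arithmetic, assuming the conventional encoding of MOVBE as CPUID leaf 1, ECX bit 22 (capability word 4) -- the exact value is not shown in this patch:

/* Worked example with an assumed encoding (names prefixed EXAMPLE_ to
 * make clear they are illustrative, not the kernel's definitions). */
#define EXAMPLE_X86_FEATURE_MOVBE	(4 * 32 + 22)	/* word 4, bit 22 */
#define EXAMPLE_NEED_MOVBE	(1 << (EXAMPLE_X86_FEATURE_MOVBE & 31))	/* == 1 << 22 */
/* With CONFIG_MATOM, REQUIRED_MASK4 therefore carries bit 22, so the
 * early feature check rejects booting an Atom-tuned kernel on a CPU
 * that lacks MOVBE. */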
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index f8fde90bc45e..d8829751b3f8 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,10 +1,499 @@
 #ifdef CONFIG_KMEMCHECK
 /* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
 # include <asm-generic/xor.h>
+#elif !defined(_ASM_X86_XOR_H)
+#define _ASM_X86_XOR_H
+
+/*
+ * Optimized RAID-5 checksumming functions for SSE.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Cache avoiding checksumming functions utilizing KNI instructions
+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
+ */
+
+/*
+ * Based on
+ * High-speed RAID5 checksumming functions utilizing SSE instructions.
+ * Copyright (C) 1998 Ingo Molnar.
+ */
+
+/*
+ * x86-64 changes / gcc fixes from Andi Kleen.
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ *
+ * This hasn't been optimized for the hammer yet, but there are likely
+ * no advantages to be gotten from x86-64 here anyways.
+ */
+
+#include <asm/i387.h>
+
+#ifdef CONFIG_X86_32
+/* reduce register pressure */
+# define XOR_CONSTANT_CONSTRAINT "i"
 #else
+# define XOR_CONSTANT_CONSTRAINT "re"
+#endif
+
+#define OFFS(x) "16*("#x")"
+#define PF_OFFS(x) "256+16*("#x")"
+#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
+#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
+#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
+#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
+#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
+#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
+#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
+#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
+#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
+#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
+#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
+#define NOP(x)
+
+#define BLK64(pf, op, i) \
+	pf(i) \
+	op(i, 0) \
+	op(i + 1, 1) \
+	op(i + 2, 2) \
+	op(i + 3, 3)
+
+static void
+xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	LD(i, 0) \
+	LD(i + 1, 1) \
+	PF1(i) \
+	PF1(i + 2) \
+	LD(i + 2, 2) \
+	LD(i + 3, 3) \
+	PF0(i + 4) \
+	PF0(i + 6) \
+	XO1(i, 0) \
+	XO1(i + 1, 1) \
+	XO1(i + 2, 2) \
+	XO1(i + 3, 3) \
+	ST(i, 0) \
+	ST(i + 1, 1) \
+	ST(i + 2, 2) \
+	ST(i + 3, 3) \
+
+
+	PF0(0)
+	PF0(2)
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	BLK64(PF0, LD, i) \
+	BLK64(PF1, XO1, i) \
+	BLK64(NOP, ST, i) \
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	  unsigned long *p3)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	PF1(i) \
+	PF1(i + 2) \
+	LD(i, 0) \
+	LD(i + 1, 1) \
+	LD(i + 2, 2) \
+	LD(i + 3, 3) \
+	PF2(i) \
+	PF2(i + 2) \
+	PF0(i + 4) \
+	PF0(i + 6) \
+	XO1(i, 0) \
+	XO1(i + 1, 1) \
+	XO1(i + 2, 2) \
+	XO1(i + 3, 3) \
+	XO2(i, 0) \
+	XO2(i + 1, 1) \
+	XO2(i + 2, 2) \
+	XO2(i + 3, 3) \
+	ST(i, 0) \
+	ST(i + 1, 1) \
+	ST(i + 2, 2) \
+	ST(i + 3, 3) \
+
+
+	PF0(0)
+	PF0(2)
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	BLK64(PF0, LD, i) \
+	BLK64(PF1, XO1, i) \
+	BLK64(PF2, XO2, i) \
+	BLK64(NOP, ST, i) \
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	  unsigned long *p3, unsigned long *p4)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	PF1(i) \
+	PF1(i + 2) \
+	LD(i, 0) \
+	LD(i + 1, 1) \
+	LD(i + 2, 2) \
+	LD(i + 3, 3) \
+	PF2(i) \
+	PF2(i + 2) \
+	XO1(i, 0) \
+	XO1(i + 1, 1) \
+	XO1(i + 2, 2) \
+	XO1(i + 3, 3) \
+	PF3(i) \
+	PF3(i + 2) \
+	PF0(i + 4) \
+	PF0(i + 6) \
+	XO2(i, 0) \
+	XO2(i + 1, 1) \
+	XO2(i + 2, 2) \
+	XO2(i + 3, 3) \
+	XO3(i, 0) \
+	XO3(i + 1, 1) \
+	XO3(i + 2, 2) \
+	XO3(i + 3, 3) \
+	ST(i, 0) \
+	ST(i + 1, 1) \
+	ST(i + 2, 2) \
+	ST(i + 3, 3) \
+
+
+	PF0(0)
+	PF0(2)
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" add %[inc], %[p4] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1),
+	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3, unsigned long *p4)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	BLK64(PF0, LD, i) \
+	BLK64(PF1, XO1, i) \
+	BLK64(PF2, XO2, i) \
+	BLK64(PF3, XO3, i) \
+	BLK64(NOP, ST, i) \
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" add %[inc], %[p4] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1),
+	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	PF1(i) \
+	PF1(i + 2) \
+	LD(i, 0) \
+	LD(i + 1, 1) \
+	LD(i + 2, 2) \
+	LD(i + 3, 3) \
+	PF2(i) \
+	PF2(i + 2) \
+	XO1(i, 0) \
+	XO1(i + 1, 1) \
+	XO1(i + 2, 2) \
+	XO1(i + 3, 3) \
+	PF3(i) \
+	PF3(i + 2) \
+	XO2(i, 0) \
+	XO2(i + 1, 1) \
+	XO2(i + 2, 2) \
+	XO2(i + 3, 3) \
+	PF4(i) \
+	PF4(i + 2) \
+	PF0(i + 4) \
+	PF0(i + 6) \
+	XO3(i, 0) \
+	XO3(i + 1, 1) \
+	XO3(i + 2, 2) \
+	XO3(i + 3, 3) \
+	XO4(i, 0) \
+	XO4(i + 1, 1) \
+	XO4(i + 2, 2) \
+	XO4(i + 3, 3) \
+	ST(i, 0) \
+	ST(i + 1, 1) \
+	ST(i + 2, 2) \
+	ST(i + 3, 3) \
+
+
+	PF0(0)
+	PF0(2)
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" add %[inc], %[p4] ;\n"
+	" add %[inc], %[p5] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	BLK64(PF0, LD, i) \
+	BLK64(PF1, XO1, i) \
+	BLK64(PF2, XO2, i) \
+	BLK64(PF3, XO3, i) \
+	BLK64(PF4, XO4, i) \
+	BLK64(NOP, ST, i) \
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" add %[inc], %[p4] ;\n"
+	" add %[inc], %[p5] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_sse_pf64 = {
+	.name = "prefetch64-sse",
+	.do_2 = xor_sse_2_pf64,
+	.do_3 = xor_sse_3_pf64,
+	.do_4 = xor_sse_4_pf64,
+	.do_5 = xor_sse_5_pf64,
+};
+
+#undef LD
+#undef XO1
+#undef XO2
+#undef XO3
+#undef XO4
+#undef ST
+#undef NOP
+#undef BLK64
+#undef BLOCK
+
+#undef XOR_CONSTANT_CONSTRAINT
+
 #ifdef CONFIG_X86_32
 # include <asm/xor_32.h>
 #else
 # include <asm/xor_64.h>
 #endif
-#endif
+
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	AVX_SELECT(FASTEST)
+
+#endif /* _ASM_X86_XOR_H */
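The unified routines above are easier to follow once the macro layers are mentally expanded: each loop iteration consumes one 256-byte "line" per source buffer, 16 bytes at a time through xmm0-xmm3, with prefetchnta hints issued 256 bytes ahead, and the result is streamed back through p1. A plain-C model of what xor_sse_2() computes, for reference only (not part of the patch):

#include <stddef.h>

/* Reference model: p1 ^= p2 over 'bytes' bytes.  The SSE version does the
 * same work one 256-byte line per loop iteration (4 BLOCKs of 4 movaps/
 * xorps pairs), which is why callers pass sizes that are multiples of 256. */
static void xor_2_reference(unsigned long bytes,
			    unsigned long *p1, const unsigned long *p2)
{
	size_t i, n = bytes / sizeof(unsigned long);

	for (i = 0; i < n; i++)
		p1[i] ^= p2[i];
}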
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index f79cb7ec0e06..ce05722e3c68 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -2,7 +2,7 @@
 #define _ASM_X86_XOR_32_H
 
 /*
- * Optimized RAID-5 checksumming functions for MMX and SSE.
+ * Optimized RAID-5 checksumming functions for MMX.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -529,290 +529,6 @@ static struct xor_block_template xor_block_p5_mmx = {
 	.do_5 = xor_p5_mmx_5,
 };
 
-/*
- * Cache avoiding checksumming functions utilizing KNI instructions
- * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
- */
-
-#define OFFS(x) "16*("#x")"
-#define PF_OFFS(x) "256+16*("#x")"
-#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
-#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
-#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
-#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
-#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
-#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
-#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
-#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
-#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
-#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
-#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
-#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
-#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
-
-
-static void
-xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	LD(i, 0) \
-	LD(i + 1, 1) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO1(i, 0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	ST(i, 0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addl $256, %1 ;\n"
-	" addl $256, %2 ;\n"
-	" decl %0 ;\n"
-	" jnz 1b ;\n"
-	: "+r" (lines),
-	  "+r" (p1), "+r" (p2)
-	:
-	: "memory");
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-	  unsigned long *p3)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i,0) \
-	LD(i + 1, 1) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF2(i) \
-	PF2(i + 2) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO1(i,0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	XO2(i,0) \
-	XO2(i + 1, 1) \
-	XO2(i + 2, 2) \
-	XO2(i + 3, 3) \
-	ST(i,0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addl $256, %1 ;\n"
-	" addl $256, %2 ;\n"
-	" addl $256, %3 ;\n"
-	" decl %0 ;\n"
-	" jnz 1b ;\n"
-	: "+r" (lines),
-	  "+r" (p1), "+r"(p2), "+r"(p3)
-	:
-	: "memory" );
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-	  unsigned long *p3, unsigned long *p4)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i,0) \
-	LD(i + 1, 1) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF2(i) \
-	PF2(i + 2) \
-	XO1(i,0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	PF3(i) \
-	PF3(i + 2) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO2(i,0) \
-	XO2(i + 1, 1) \
-	XO2(i + 2, 2) \
-	XO2(i + 3, 3) \
-	XO3(i,0) \
-	XO3(i + 1, 1) \
-	XO3(i + 2, 2) \
-	XO3(i + 3, 3) \
-	ST(i,0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addl $256, %1 ;\n"
-	" addl $256, %2 ;\n"
-	" addl $256, %3 ;\n"
-	" addl $256, %4 ;\n"
-	" decl %0 ;\n"
-	" jnz 1b ;\n"
-	: "+r" (lines),
-	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
-	:
-	: "memory" );
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	/* Make sure GCC forgets anything it knows about p4 or p5,
-	   such that it won't pass to the asm volatile below a
-	   register that is shared with any other variable. That's
-	   because we modify p4 and p5 there, but we can't mark them
-	   as read/write, otherwise we'd overflow the 10-asm-operands
-	   limit of GCC < 3.1. */
-	asm("" : "+r" (p4), "+r" (p5));
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i,0) \
-	LD(i + 1, 1) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF2(i) \
-	PF2(i + 2) \
-	XO1(i,0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	PF3(i) \
-	PF3(i + 2) \
-	XO2(i,0) \
-	XO2(i + 1, 1) \
-	XO2(i + 2, 2) \
-	XO2(i + 3, 3) \
-	PF4(i) \
-	PF4(i + 2) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO3(i,0) \
-	XO3(i + 1, 1) \
-	XO3(i + 2, 2) \
-	XO3(i + 3, 3) \
-	XO4(i,0) \
-	XO4(i + 1, 1) \
-	XO4(i + 2, 2) \
-	XO4(i + 3, 3) \
-	ST(i,0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addl $256, %1 ;\n"
-	" addl $256, %2 ;\n"
-	" addl $256, %3 ;\n"
-	" addl $256, %4 ;\n"
-	" addl $256, %5 ;\n"
-	" decl %0 ;\n"
-	" jnz 1b ;\n"
-	: "+r" (lines),
-	  "+r" (p1), "+r" (p2), "+r" (p3)
-	: "r" (p4), "r" (p5)
-	: "memory");
-
-	/* p4 and p5 were modified, and now the variables are dead.
-	   Clobber them just to be sure nobody does something stupid
-	   like assuming they have some legal value. */
-	asm("" : "=r" (p4), "=r" (p5));
-
-	kernel_fpu_end();
-}
-
 static struct xor_block_template xor_block_pIII_sse = {
 	.name = "pIII_sse",
 	.do_2 = xor_sse_2,
@@ -827,26 +543,25 @@ static struct xor_block_template xor_block_pIII_sse = {
 /* Also try the generic routines. */
 #include <asm-generic/xor.h>
 
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched. */
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES \
 do { \
-	xor_speed(&xor_block_8regs); \
-	xor_speed(&xor_block_8regs_p); \
-	xor_speed(&xor_block_32regs); \
-	xor_speed(&xor_block_32regs_p); \
 	AVX_XOR_SPEED; \
-	if (cpu_has_xmm) \
+	if (cpu_has_xmm) { \
 		xor_speed(&xor_block_pIII_sse); \
-		if (cpu_has_mmx) { \
+		xor_speed(&xor_block_sse_pf64); \
+	} else if (cpu_has_mmx) { \
 		xor_speed(&xor_block_pII_mmx); \
 		xor_speed(&xor_block_p5_mmx); \
+	} else { \
+		xor_speed(&xor_block_8regs); \
+		xor_speed(&xor_block_8regs_p); \
+		xor_speed(&xor_block_32regs); \
+		xor_speed(&xor_block_32regs_p); \
 	} \
 } while (0)
 
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
-
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 87ac522c4af5..546f1e3b87cc 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -1,301 +1,6 @@
 #ifndef _ASM_X86_XOR_64_H
 #define _ASM_X86_XOR_64_H
 
-/*
- * Optimized RAID-5 checksumming functions for MMX and SSE.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-
-/*
- * Cache avoiding checksumming functions utilizing KNI instructions
- * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
- */
-
-/*
- * Based on
- * High-speed RAID5 checksumming functions utilizing SSE instructions.
- * Copyright (C) 1998 Ingo Molnar.
- */
-
-/*
- * x86-64 changes / gcc fixes from Andi Kleen.
- * Copyright 2002 Andi Kleen, SuSE Labs.
- *
- * This hasn't been optimized for the hammer yet, but there are likely
- * no advantages to be gotten from x86-64 here anyways.
- */
-
-#include <asm/i387.h>
-
-#define OFFS(x) "16*("#x")"
-#define PF_OFFS(x) "256+16*("#x")"
-#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
-#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
-#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
-#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
-#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
-#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
-#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
-#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
-#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
-#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
-#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
-#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
-#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
-
-
-static void
-xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
-{
-	unsigned int lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	LD(i, 0) \
-	LD(i + 1, 1) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO1(i, 0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	ST(i, 0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addq %[inc], %[p1] ;\n"
-	" addq %[inc], %[p2] ;\n"
-	" decl %[cnt] ; jnz 1b"
-	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
-	: [inc] "r" (256UL)
-	: "memory");
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-	  unsigned long *p3)
-{
-	unsigned int lines = bytes >> 8;
-
-	kernel_fpu_begin();
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i, 0) \
-	LD(i + 1, 1) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF2(i) \
-	PF2(i + 2) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO1(i, 0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	XO2(i, 0) \
-	XO2(i + 1, 1) \
-	XO2(i + 2, 2) \
-	XO2(i + 3, 3) \
-	ST(i, 0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addq %[inc], %[p1] ;\n"
-	" addq %[inc], %[p2] ;\n"
-	" addq %[inc], %[p3] ;\n"
-	" decl %[cnt] ; jnz 1b"
-	: [cnt] "+r" (lines),
-	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
-	: [inc] "r" (256UL)
-	: "memory");
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-	  unsigned long *p3, unsigned long *p4)
-{
-	unsigned int lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i, 0) \
-	LD(i + 1, 1) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF2(i) \
-	PF2(i + 2) \
-	XO1(i, 0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	PF3(i) \
-	PF3(i + 2) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO2(i, 0) \
-	XO2(i + 1, 1) \
-	XO2(i + 2, 2) \
-	XO2(i + 3, 3) \
-	XO3(i, 0) \
-	XO3(i + 1, 1) \
-	XO3(i + 2, 2) \
-	XO3(i + 3, 3) \
-	ST(i, 0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addq %[inc], %[p1] ;\n"
-	" addq %[inc], %[p2] ;\n"
-	" addq %[inc], %[p3] ;\n"
-	" addq %[inc], %[p4] ;\n"
-	" decl %[cnt] ; jnz 1b"
-	: [cnt] "+c" (lines),
-	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
-	: [inc] "r" (256UL)
-	: "memory" );
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
-{
-	unsigned int lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i, 0) \
-	LD(i + 1, 1) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF2(i) \
-	PF2(i + 2) \
-	XO1(i, 0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	PF3(i) \
-	PF3(i + 2) \
-	XO2(i, 0) \
-	XO2(i + 1, 1) \
-	XO2(i + 2, 2) \
-	XO2(i + 3, 3) \
-	PF4(i) \
-	PF4(i + 2) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO3(i, 0) \
-	XO3(i + 1, 1) \
-	XO3(i + 2, 2) \
-	XO3(i + 3, 3) \
-	XO4(i, 0) \
-	XO4(i + 1, 1) \
-	XO4(i + 2, 2) \
-	XO4(i + 3, 3) \
-	ST(i, 0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addq %[inc], %[p1] ;\n"
-	" addq %[inc], %[p2] ;\n"
-	" addq %[inc], %[p3] ;\n"
-	" addq %[inc], %[p4] ;\n"
-	" addq %[inc], %[p5] ;\n"
-	" decl %[cnt] ; jnz 1b"
-	: [cnt] "+c" (lines),
-	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
-	  [p5] "+r" (p5)
-	: [inc] "r" (256UL)
-	: "memory");
-
-	kernel_fpu_end();
-}
-
 static struct xor_block_template xor_block_sse = {
 	.name = "generic_sse",
 	.do_2 = xor_sse_2,
@@ -308,17 +13,15 @@ static struct xor_block_template xor_block_sse = {
 /* Also try the AVX routines */
 #include <asm/xor_avx.h>
 
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched. */
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES \
 do { \
 	AVX_XOR_SPEED; \
+	xor_speed(&xor_block_sse_pf64); \
 	xor_speed(&xor_block_sse); \
 } while (0)
 
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	AVX_SELECT(&xor_block_sse)
-
 #endif /* _ASM_X86_XOR_64_H */
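Both the 32-bit and 64-bit XOR_TRY_TEMPLATES lists now register the shared prefetch64-sse template alongside the existing candidates; the RAID xor code benchmarks each registered template, and the XOR_SELECT_TEMPLATE() hook (now AVX_SELECT(FASTEST) in the common header) can still override the benchmark result. A rough usage sketch of the template interface as it appears in this patch; the wrapper name is invented for illustration:

/* Illustrative only: xor_two_pages() is a made-up helper.  A selected
 * xor_block_template is driven through its do_N callbacks; do_2 XORs the
 * second buffer into the first, and the size must be a multiple of the
 * routines' 256-byte working unit. */
static void xor_two_pages(struct xor_block_template *tmpl,
			  unsigned long *dst, unsigned long *src)
{
	tmpl->do_2(PAGE_SIZE, dst, src);	/* dst ^= src */
}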