Diffstat (limited to 'include')

 include/asm-x86/xor_64.h | 294
 1 file changed, 149 insertions(+), 145 deletions(-)
diff --git a/include/asm-x86/xor_64.h b/include/asm-x86/xor_64.h
index 1eee7fcb2420..24957e39ac8a 100644
--- a/include/asm-x86/xor_64.h
+++ b/include/asm-x86/xor_64.h
@@ -24,20 +24,23 @@
  */
 
 /*
  * x86-64 changes / gcc fixes from Andi Kleen.
  * Copyright 2002 Andi Kleen, SuSE Labs.
  *
  * This hasn't been optimized for the hammer yet, but there are likely
  * no advantages to be gotten from x86-64 here anyways.
  */
 
-typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
+typedef struct {
+        unsigned long a, b;
+} __attribute__((aligned(16))) xmm_store_t;
 
 /* Doesn't use gcc to save the XMM registers, because there is no easy way to
    tell it to do a clts before the register saving. */
-#define XMMS_SAVE do {                          \
+#define XMMS_SAVE                               \
+do {                                            \
         preempt_disable();                      \
-        asm volatile (                          \
+        asm volatile(                           \
                 "movq %%cr0,%0          ;\n\t"  \
                 "clts                   ;\n\t"  \
                 "movups %%xmm0,(%1)     ;\n\t"  \
@@ -47,10 +50,11 @@ typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
                 : "=&r" (cr0)                   \
                 : "r" (xmm_save)                \
                 : "memory");                    \
-} while(0)
+} while (0)
 
-#define XMMS_RESTORE do {                       \
-        asm volatile (                          \
+#define XMMS_RESTORE                            \
+do {                                            \
+        asm volatile(                           \
                 "sfence                 ;\n\t"  \
                 "movups (%1),%%xmm0     ;\n\t"  \
                 "movups 0x10(%1),%%xmm1 ;\n\t"  \
@@ -61,72 +65,72 @@ typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
                 : "r" (cr0), "r" (xmm_save)     \
                 : "memory");                    \
         preempt_enable();                       \
-} while(0)
+} while (0)
 
 #define OFFS(x)         "16*("#x")"
 #define PF_OFFS(x)      "256+16*("#x")"
 #define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
-#define LD(x,y)         "       movaps "OFFS(x)"(%[p1]), %%xmm"#y"      ;\n"
-#define ST(x,y)         "       movaps %%xmm"#y", "OFFS(x)"(%[p1])      ;\n"
+#define LD(x, y)        "       movaps "OFFS(x)"(%[p1]), %%xmm"#y"      ;\n"
+#define ST(x, y)        "       movaps %%xmm"#y", "OFFS(x)"(%[p1])      ;\n"
 #define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
 #define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
 #define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
 #define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
 #define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
-#define XO1(x,y)        "       xorps "OFFS(x)"(%[p2]), %%xmm"#y"       ;\n"
-#define XO2(x,y)        "       xorps "OFFS(x)"(%[p3]), %%xmm"#y"       ;\n"
-#define XO3(x,y)        "       xorps "OFFS(x)"(%[p4]), %%xmm"#y"       ;\n"
-#define XO4(x,y)        "       xorps "OFFS(x)"(%[p5]), %%xmm"#y"       ;\n"
-#define XO5(x,y)        "       xorps "OFFS(x)"(%[p6]), %%xmm"#y"       ;\n"
+#define XO1(x, y)       "       xorps "OFFS(x)"(%[p2]), %%xmm"#y"       ;\n"
+#define XO2(x, y)       "       xorps "OFFS(x)"(%[p3]), %%xmm"#y"       ;\n"
+#define XO3(x, y)       "       xorps "OFFS(x)"(%[p4]), %%xmm"#y"       ;\n"
+#define XO4(x, y)       "       xorps "OFFS(x)"(%[p5]), %%xmm"#y"       ;\n"
+#define XO5(x, y)       "       xorps "OFFS(x)"(%[p6]), %%xmm"#y"       ;\n"
 
 
 static void
 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 {
         unsigned int lines = bytes >> 8;
         unsigned long cr0;
         xmm_store_t xmm_save[4];
 
         XMMS_SAVE;
 
-        asm volatile (
+        asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
-        LD(i,0)                 \
-        LD(i+1,1)               \
+        LD(i, 0)                \
+        LD(i + 1, 1)            \
         PF1(i)                  \
-        PF1(i+2)                \
-        LD(i+2,2)               \
-        LD(i+3,3)               \
-        PF0(i+4)                \
-        PF0(i+6)                \
-        XO1(i,0)                \
-        XO1(i+1,1)              \
-        XO1(i+2,2)              \
-        XO1(i+3,3)              \
-        ST(i,0)                 \
-        ST(i+1,1)               \
-        ST(i+2,2)               \
-        ST(i+3,3)               \
+        PF1(i + 2)              \
+        LD(i + 2, 2)            \
+        LD(i + 3, 3)            \
+        PF0(i + 4)              \
+        PF0(i + 6)              \
+        XO1(i, 0)               \
+        XO1(i + 1, 1)           \
+        XO1(i + 2, 2)           \
+        XO1(i + 3, 3)           \
+        ST(i, 0)                \
+        ST(i + 1, 1)            \
+        ST(i + 2, 2)            \
+        ST(i + 3, 3)            \
 
 
         PF0(0)
         PF0(2)
 
         " .align 32                     ;\n"
         " 1:                            ;\n"
 
         BLOCK(0)
         BLOCK(4)
         BLOCK(8)
         BLOCK(12)
 
         " addq %[inc], %[p1]            ;\n"
         " addq %[inc], %[p2]            ;\n"
         " decl %[cnt] ; jnz 1b"
         : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
         : [inc] "r" (256UL)
         : "memory");
 
         XMMS_RESTORE;
 }
@@ -141,52 +145,52 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 
         XMMS_SAVE;
 
-        __asm__ __volatile__ (
+        asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
         PF1(i)                  \
-        PF1(i+2)                \
-        LD(i,0)                 \
-        LD(i+1,1)               \
-        LD(i+2,2)               \
-        LD(i+3,3)               \
+        PF1(i + 2)              \
+        LD(i, 0)                \
+        LD(i + 1, 1)            \
+        LD(i + 2, 2)            \
+        LD(i + 3, 3)            \
         PF2(i)                  \
-        PF2(i+2)                \
-        PF0(i+4)                \
-        PF0(i+6)                \
-        XO1(i,0)                \
-        XO1(i+1,1)              \
-        XO1(i+2,2)              \
-        XO1(i+3,3)              \
-        XO2(i,0)                \
-        XO2(i+1,1)              \
-        XO2(i+2,2)              \
-        XO2(i+3,3)              \
-        ST(i,0)                 \
-        ST(i+1,1)               \
-        ST(i+2,2)               \
-        ST(i+3,3)               \
+        PF2(i + 2)              \
+        PF0(i + 4)              \
+        PF0(i + 6)              \
+        XO1(i, 0)               \
+        XO1(i + 1, 1)           \
+        XO1(i + 2, 2)           \
+        XO1(i + 3, 3)           \
+        XO2(i, 0)               \
+        XO2(i + 1, 1)           \
+        XO2(i + 2, 2)           \
+        XO2(i + 3, 3)           \
+        ST(i, 0)                \
+        ST(i + 1, 1)            \
+        ST(i + 2, 2)            \
+        ST(i + 3, 3)            \
 
 
         PF0(0)
         PF0(2)
 
         " .align 32                     ;\n"
         " 1:                            ;\n"
 
         BLOCK(0)
         BLOCK(4)
         BLOCK(8)
         BLOCK(12)
 
         " addq %[inc], %[p1]            ;\n"
         " addq %[inc], %[p2]            ;\n"
         " addq %[inc], %[p3]            ;\n"
         " decl %[cnt] ; jnz 1b"
         : [cnt] "+r" (lines),
           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
         : [inc] "r" (256UL)
         : "memory");
         XMMS_RESTORE;
 }
 
@@ -195,64 +199,64 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
           unsigned long *p3, unsigned long *p4)
 {
         unsigned int lines = bytes >> 8;
         xmm_store_t xmm_save[4];
         unsigned long cr0;
 
         XMMS_SAVE;
 
-        __asm__ __volatile__ (
+        asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
         PF1(i)                  \
-        PF1(i+2)                \
-        LD(i,0)                 \
-        LD(i+1,1)               \
-        LD(i+2,2)               \
-        LD(i+3,3)               \
+        PF1(i + 2)              \
+        LD(i, 0)                \
+        LD(i + 1, 1)            \
+        LD(i + 2, 2)            \
+        LD(i + 3, 3)            \
         PF2(i)                  \
-        PF2(i+2)                \
-        XO1(i,0)                \
-        XO1(i+1,1)              \
-        XO1(i+2,2)              \
-        XO1(i+3,3)              \
+        PF2(i + 2)              \
+        XO1(i, 0)               \
+        XO1(i + 1, 1)           \
+        XO1(i + 2, 2)           \
+        XO1(i + 3, 3)           \
         PF3(i)                  \
-        PF3(i+2)                \
-        PF0(i+4)                \
-        PF0(i+6)                \
-        XO2(i,0)                \
-        XO2(i+1,1)              \
-        XO2(i+2,2)              \
-        XO2(i+3,3)              \
-        XO3(i,0)                \
-        XO3(i+1,1)              \
-        XO3(i+2,2)              \
-        XO3(i+3,3)              \
-        ST(i,0)                 \
-        ST(i+1,1)               \
-        ST(i+2,2)               \
-        ST(i+3,3)               \
+        PF3(i + 2)              \
+        PF0(i + 4)              \
+        PF0(i + 6)              \
+        XO2(i, 0)               \
+        XO2(i + 1, 1)           \
+        XO2(i + 2, 2)           \
+        XO2(i + 3, 3)           \
+        XO3(i, 0)               \
+        XO3(i + 1, 1)           \
+        XO3(i + 2, 2)           \
+        XO3(i + 3, 3)           \
+        ST(i, 0)                \
+        ST(i + 1, 1)            \
+        ST(i + 2, 2)            \
+        ST(i + 3, 3)            \
 
 
         PF0(0)
         PF0(2)
 
         " .align 32                     ;\n"
         " 1:                            ;\n"
 
         BLOCK(0)
         BLOCK(4)
         BLOCK(8)
         BLOCK(12)
 
         " addq %[inc], %[p1]            ;\n"
         " addq %[inc], %[p2]            ;\n"
         " addq %[inc], %[p3]            ;\n"
         " addq %[inc], %[p4]            ;\n"
         " decl %[cnt] ; jnz 1b"
         : [cnt] "+c" (lines),
           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
         : [inc] "r" (256UL)
         : "memory" );
 
         XMMS_RESTORE;
 }
@@ -261,70 +265,70 @@ static void
 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
           unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
         unsigned int lines = bytes >> 8;
         xmm_store_t xmm_save[4];
         unsigned long cr0;
 
         XMMS_SAVE;
 
-        __asm__ __volatile__ (
+        asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
         PF1(i)                  \
-        PF1(i+2)                \
-        LD(i,0)                 \
-        LD(i+1,1)               \
-        LD(i+2,2)               \
-        LD(i+3,3)               \
+        PF1(i + 2)              \
+        LD(i, 0)                \
+        LD(i + 1, 1)            \
+        LD(i + 2, 2)            \
+        LD(i + 3, 3)            \
         PF2(i)                  \
-        PF2(i+2)                \
-        XO1(i,0)                \
-        XO1(i+1,1)              \
-        XO1(i+2,2)              \
-        XO1(i+3,3)              \
+        PF2(i + 2)              \
+        XO1(i, 0)               \
+        XO1(i + 1, 1)           \
+        XO1(i + 2, 2)           \
+        XO1(i + 3, 3)           \
         PF3(i)                  \
-        PF3(i+2)                \
-        XO2(i,0)                \
-        XO2(i+1,1)              \
-        XO2(i+2,2)              \
-        XO2(i+3,3)              \
+        PF3(i + 2)              \
+        XO2(i, 0)               \
+        XO2(i + 1, 1)           \
+        XO2(i + 2, 2)           \
+        XO2(i + 3, 3)           \
         PF4(i)                  \
-        PF4(i+2)                \
-        PF0(i+4)                \
-        PF0(i+6)                \
-        XO3(i,0)                \
-        XO3(i+1,1)              \
-        XO3(i+2,2)              \
-        XO3(i+3,3)              \
-        XO4(i,0)                \
-        XO4(i+1,1)              \
-        XO4(i+2,2)              \
-        XO4(i+3,3)              \
-        ST(i,0)                 \
-        ST(i+1,1)               \
-        ST(i+2,2)               \
-        ST(i+3,3)               \
+        PF4(i + 2)              \
+        PF0(i + 4)              \
+        PF0(i + 6)              \
+        XO3(i, 0)               \
+        XO3(i + 1, 1)           \
+        XO3(i + 2, 2)           \
+        XO3(i + 3, 3)           \
+        XO4(i, 0)               \
+        XO4(i + 1, 1)           \
+        XO4(i + 2, 2)           \
+        XO4(i + 3, 3)           \
+        ST(i, 0)                \
+        ST(i + 1, 1)            \
+        ST(i + 2, 2)            \
+        ST(i + 3, 3)            \
 
 
         PF0(0)
         PF0(2)
 
         " .align 32                     ;\n"
         " 1:                            ;\n"
 
         BLOCK(0)
         BLOCK(4)
         BLOCK(8)
         BLOCK(12)
 
         " addq %[inc], %[p1]            ;\n"
         " addq %[inc], %[p2]            ;\n"
         " addq %[inc], %[p3]            ;\n"
         " addq %[inc], %[p4]            ;\n"
         " addq %[inc], %[p5]            ;\n"
         " decl %[cnt] ; jnz 1b"
         : [cnt] "+c" (lines),
           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
           [p5] "+r" (p5)
         : [inc] "r" (256UL)
         : "memory");
@@ -333,18 +337,18 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 }
 
 static struct xor_block_template xor_block_sse = {
         .name = "generic_sse",
         .do_2 = xor_sse_2,
         .do_3 = xor_sse_3,
         .do_4 = xor_sse_4,
         .do_5 = xor_sse_5,
 };
 
 #undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES                               \
-        do {                                            \
-                xor_speed(&xor_block_sse);              \
-        } while (0)
+#define XOR_TRY_TEMPLATES                       \
+do {                                            \
+        xor_speed(&xor_block_sse);              \
+} while (0)
 
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
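
For readers following the asm above: each xor_sse_N() routine retires one
256-byte line per loop iteration, XOR-ing the source blocks into p1 sixteen
bytes at a time through %xmm0-%xmm3, with prefetchnta hints issued ahead of
the loads; the file's closing comment credits this streaming style with being
able to write around the L2 cache. The sketch below is a plain-C model of
what xor_sse_2() computes, for illustration only: it is not part of this
patch, the name xor_2_model is invented here, and it omits the prefetching
and the XMMS_SAVE/XMMS_RESTORE (clts plus XMM register spill/reload)
bookkeeping that kernel code needs before touching SSE state.

#include <stddef.h>

/*
 * Plain-C model of xor_sse_2(), for illustration; the name and the
 * scalar loop are this note's invention, not kernel code.  XORs
 * 'bytes' bytes of p2 into p1.  As in the asm, 'bytes' is assumed to
 * be a non-zero multiple of 256, since the loop retires one 256-byte
 * line per iteration (lines = bytes >> 8).
 */
static void xor_2_model(unsigned long bytes, unsigned long *p1,
                        const unsigned long *p2)
{
        unsigned int lines = bytes >> 8;
        const size_t words = 256 / sizeof(unsigned long); /* 32 on x86-64 */

        while (lines--) {
                size_t i;

                /*
                 * The asm does this 16 bytes at a time: movaps the
                 * destination into %xmm0-%xmm3, xorps each source
                 * block in, then movaps the result back out.
                 */
                for (i = 0; i < words; i++)
                        p1[i] ^= p2[i];
                p1 += words;
                p2 += words;
        }
}

The xor_sse_3/4/5 variants extend the same loop with more source pointers
(operands [p3] through [p5]) and additional xorps rounds per 16-byte group,
which is exactly the shape the BLOCK() macros above encode.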