aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-02-19 22:09:42 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2013-02-19 22:09:42 -0500
commita57ed93600f2dab64e859d524c3320fe0922e99d (patch)
treee490392e4b438401b0690a2a7781bb5c88feda34 /arch/x86
parent5800700f66678ea5c85e7d62b138416070bf7f60 (diff)
parent5e2a044daf0c6f897eb69de931e3b29020e874a9 (diff)
Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86/asm changes from Ingo Molnar: "The biggest change (by line count) is the unification of the XOR code and then the introduction of an additional SSE based XOR assembly method. The other bigger change is the head_32.S rework/cleanup by Borislav Petkov. Last but not least there's the usual laundry list of small but dangerous (and hopefully perfectly tested) changes to subtle low level x86 code, plus cleanups." * 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86, head_32: Give the 6 label a real name x86, head_32: Remove second CPUID detection from default_entry x86: Detect CPUID support early at boot x86, head_32: Remove i386 pieces x86: Require MOVBE feature in cpuid when we use it x86: Enable ARCH_USE_BUILTIN_BSWAP x86/xor: Add alternative SSE implementation only prefetching once per 64-byte line x86/xor: Unify SSE-base xor-block routines x86: Fix a typo x86/mm: Fix the argument passed to sync_global_pgds() x86/mm: Convert update_mmu_cache() and update_mmu_cache_pmd() to functions ix86: Tighten asmlinkage_protect() constraints
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/include/asm/linkage.h18
-rw-r--r--arch/x86/include/asm/pgtable.h12
-rw-r--r--arch/x86/include/asm/pgtable_32.h7
-rw-r--r--arch/x86/include/asm/pgtable_64.h3
-rw-r--r--arch/x86/include/asm/required-features.h8
-rw-r--r--arch/x86/include/asm/xor.h491
-rw-r--r--arch/x86/include/asm/xor_32.h309
-rw-r--r--arch/x86/include/asm/xor_64.h305
-rw-r--r--arch/x86/kernel/head_32.S93
-rw-r--r--arch/x86/kernel/sys_x86_64.c2
-rw-r--r--arch/x86/mm/init_64.c4
12 files changed, 574 insertions, 679 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f8130a770653..a9e50ac90838 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -116,6 +116,7 @@ config X86
116 select MODULES_USE_ELF_RELA if X86_64 116 select MODULES_USE_ELF_RELA if X86_64
117 select CLONE_BACKWARDS if X86_32 117 select CLONE_BACKWARDS if X86_32
118 select GENERIC_SIGALTSTACK 118 select GENERIC_SIGALTSTACK
119 select ARCH_USE_BUILTIN_BSWAP
119 120
120config INSTRUCTION_DECODER 121config INSTRUCTION_DECODER
121 def_bool y 122 def_bool y
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 48142971b25d..79327e9483a3 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -27,20 +27,20 @@
27#define __asmlinkage_protect0(ret) \ 27#define __asmlinkage_protect0(ret) \
28 __asmlinkage_protect_n(ret) 28 __asmlinkage_protect_n(ret)
29#define __asmlinkage_protect1(ret, arg1) \ 29#define __asmlinkage_protect1(ret, arg1) \
30 __asmlinkage_protect_n(ret, "g" (arg1)) 30 __asmlinkage_protect_n(ret, "m" (arg1))
31#define __asmlinkage_protect2(ret, arg1, arg2) \ 31#define __asmlinkage_protect2(ret, arg1, arg2) \
32 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2)) 32 __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2))
33#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \ 33#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \
34 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3)) 34 __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3))
35#define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \ 35#define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \
36 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ 36 __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
37 "g" (arg4)) 37 "m" (arg4))
38#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \ 38#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \
39 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ 39 __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
40 "g" (arg4), "g" (arg5)) 40 "m" (arg4), "m" (arg5))
41#define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \ 41#define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \
42 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ 42 __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
43 "g" (arg4), "g" (arg5), "g" (arg6)) 43 "m" (arg4), "m" (arg5), "m" (arg6))
44 44
45#endif /* CONFIG_X86_32 */ 45#endif /* CONFIG_X86_32 */
46 46
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1c1a955e67c0..fc304279b559 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -786,6 +786,18 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
786 memcpy(dst, src, count * sizeof(pgd_t)); 786 memcpy(dst, src, count * sizeof(pgd_t));
787} 787}
788 788
789/*
790 * The x86 doesn't have any external MMU info: the kernel page
791 * tables contain all the necessary information.
792 */
793static inline void update_mmu_cache(struct vm_area_struct *vma,
794 unsigned long addr, pte_t *ptep)
795{
796}
797static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
798 unsigned long addr, pmd_t *pmd)
799{
800}
789 801
790#include <asm-generic/pgtable.h> 802#include <asm-generic/pgtable.h>
791#endif /* __ASSEMBLY__ */ 803#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 8faa215a503e..9ee322103c6d 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -66,13 +66,6 @@ do { \
66 __flush_tlb_one((vaddr)); \ 66 __flush_tlb_one((vaddr)); \
67} while (0) 67} while (0)
68 68
69/*
70 * The i386 doesn't have any external MMU info: the kernel page
71 * tables contain all the necessary information.
72 */
73#define update_mmu_cache(vma, address, ptep) do { } while (0)
74#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
75
76#endif /* !__ASSEMBLY__ */ 69#endif /* !__ASSEMBLY__ */
77 70
78/* 71/*
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 47356f9df82e..615b0c78449f 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -142,9 +142,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
142#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) 142#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
143#define pte_unmap(pte) ((void)(pte))/* NOP */ 143#define pte_unmap(pte) ((void)(pte))/* NOP */
144 144
145#define update_mmu_cache(vma, address, ptep) do { } while (0)
146#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
147
148/* Encode and de-code a swap entry */ 145/* Encode and de-code a swap entry */
149#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE 146#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
150#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) 147#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 6c7fc25f2c34..5c6e4fb370f5 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -47,6 +47,12 @@
47# define NEED_NOPL 0 47# define NEED_NOPL 0
48#endif 48#endif
49 49
50#ifdef CONFIG_MATOM
51# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31))
52#else
53# define NEED_MOVBE 0
54#endif
55
50#ifdef CONFIG_X86_64 56#ifdef CONFIG_X86_64
51#ifdef CONFIG_PARAVIRT 57#ifdef CONFIG_PARAVIRT
52/* Paravirtualized systems may not have PSE or PGE available */ 58/* Paravirtualized systems may not have PSE or PGE available */
@@ -80,7 +86,7 @@
80 86
81#define REQUIRED_MASK2 0 87#define REQUIRED_MASK2 0
82#define REQUIRED_MASK3 (NEED_NOPL) 88#define REQUIRED_MASK3 (NEED_NOPL)
83#define REQUIRED_MASK4 0 89#define REQUIRED_MASK4 (NEED_MOVBE)
84#define REQUIRED_MASK5 0 90#define REQUIRED_MASK5 0
85#define REQUIRED_MASK6 0 91#define REQUIRED_MASK6 0
86#define REQUIRED_MASK7 0 92#define REQUIRED_MASK7 0
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index f8fde90bc45e..d8829751b3f8 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,10 +1,499 @@
1#ifdef CONFIG_KMEMCHECK 1#ifdef CONFIG_KMEMCHECK
2/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ 2/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
3# include <asm-generic/xor.h> 3# include <asm-generic/xor.h>
4#elif !defined(_ASM_X86_XOR_H)
5#define _ASM_X86_XOR_H
6
7/*
8 * Optimized RAID-5 checksumming functions for SSE.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2, or (at your option)
13 * any later version.
14 *
15 * You should have received a copy of the GNU General Public License
16 * (for example /usr/src/linux/COPYING); if not, write to the Free
17 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20/*
21 * Cache avoiding checksumming functions utilizing KNI instructions
22 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
23 */
24
25/*
26 * Based on
27 * High-speed RAID5 checksumming functions utilizing SSE instructions.
28 * Copyright (C) 1998 Ingo Molnar.
29 */
30
31/*
32 * x86-64 changes / gcc fixes from Andi Kleen.
33 * Copyright 2002 Andi Kleen, SuSE Labs.
34 *
35 * This hasn't been optimized for the hammer yet, but there are likely
36 * no advantages to be gotten from x86-64 here anyways.
37 */
38
39#include <asm/i387.h>
40
41#ifdef CONFIG_X86_32
42/* reduce register pressure */
43# define XOR_CONSTANT_CONSTRAINT "i"
4#else 44#else
45# define XOR_CONSTANT_CONSTRAINT "re"
46#endif
47
48#define OFFS(x) "16*("#x")"
49#define PF_OFFS(x) "256+16*("#x")"
50#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
51#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
52#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
53#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
54#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
55#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
56#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
57#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
58#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
59#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
60#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
61#define NOP(x)
62
63#define BLK64(pf, op, i) \
64 pf(i) \
65 op(i, 0) \
66 op(i + 1, 1) \
67 op(i + 2, 2) \
68 op(i + 3, 3)
69
70static void
71xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
72{
73 unsigned long lines = bytes >> 8;
74
75 kernel_fpu_begin();
76
77 asm volatile(
78#undef BLOCK
79#define BLOCK(i) \
80 LD(i, 0) \
81 LD(i + 1, 1) \
82 PF1(i) \
83 PF1(i + 2) \
84 LD(i + 2, 2) \
85 LD(i + 3, 3) \
86 PF0(i + 4) \
87 PF0(i + 6) \
88 XO1(i, 0) \
89 XO1(i + 1, 1) \
90 XO1(i + 2, 2) \
91 XO1(i + 3, 3) \
92 ST(i, 0) \
93 ST(i + 1, 1) \
94 ST(i + 2, 2) \
95 ST(i + 3, 3) \
96
97
98 PF0(0)
99 PF0(2)
100
101 " .align 32 ;\n"
102 " 1: ;\n"
103
104 BLOCK(0)
105 BLOCK(4)
106 BLOCK(8)
107 BLOCK(12)
108
109 " add %[inc], %[p1] ;\n"
110 " add %[inc], %[p2] ;\n"
111 " dec %[cnt] ;\n"
112 " jnz 1b ;\n"
113 : [cnt] "+r" (lines),
114 [p1] "+r" (p1), [p2] "+r" (p2)
115 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
116 : "memory");
117
118 kernel_fpu_end();
119}
120
121static void
122xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
123{
124 unsigned long lines = bytes >> 8;
125
126 kernel_fpu_begin();
127
128 asm volatile(
129#undef BLOCK
130#define BLOCK(i) \
131 BLK64(PF0, LD, i) \
132 BLK64(PF1, XO1, i) \
133 BLK64(NOP, ST, i) \
134
135 " .align 32 ;\n"
136 " 1: ;\n"
137
138 BLOCK(0)
139 BLOCK(4)
140 BLOCK(8)
141 BLOCK(12)
142
143 " add %[inc], %[p1] ;\n"
144 " add %[inc], %[p2] ;\n"
145 " dec %[cnt] ;\n"
146 " jnz 1b ;\n"
147 : [cnt] "+r" (lines),
148 [p1] "+r" (p1), [p2] "+r" (p2)
149 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
150 : "memory");
151
152 kernel_fpu_end();
153}
154
155static void
156xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
157 unsigned long *p3)
158{
159 unsigned long lines = bytes >> 8;
160
161 kernel_fpu_begin();
162
163 asm volatile(
164#undef BLOCK
165#define BLOCK(i) \
166 PF1(i) \
167 PF1(i + 2) \
168 LD(i, 0) \
169 LD(i + 1, 1) \
170 LD(i + 2, 2) \
171 LD(i + 3, 3) \
172 PF2(i) \
173 PF2(i + 2) \
174 PF0(i + 4) \
175 PF0(i + 6) \
176 XO1(i, 0) \
177 XO1(i + 1, 1) \
178 XO1(i + 2, 2) \
179 XO1(i + 3, 3) \
180 XO2(i, 0) \
181 XO2(i + 1, 1) \
182 XO2(i + 2, 2) \
183 XO2(i + 3, 3) \
184 ST(i, 0) \
185 ST(i + 1, 1) \
186 ST(i + 2, 2) \
187 ST(i + 3, 3) \
188
189
190 PF0(0)
191 PF0(2)
192
193 " .align 32 ;\n"
194 " 1: ;\n"
195
196 BLOCK(0)
197 BLOCK(4)
198 BLOCK(8)
199 BLOCK(12)
200
201 " add %[inc], %[p1] ;\n"
202 " add %[inc], %[p2] ;\n"
203 " add %[inc], %[p3] ;\n"
204 " dec %[cnt] ;\n"
205 " jnz 1b ;\n"
206 : [cnt] "+r" (lines),
207 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
208 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
209 : "memory");
210
211 kernel_fpu_end();
212}
213
214static void
215xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
216 unsigned long *p3)
217{
218 unsigned long lines = bytes >> 8;
219
220 kernel_fpu_begin();
221
222 asm volatile(
223#undef BLOCK
224#define BLOCK(i) \
225 BLK64(PF0, LD, i) \
226 BLK64(PF1, XO1, i) \
227 BLK64(PF2, XO2, i) \
228 BLK64(NOP, ST, i) \
229
230 " .align 32 ;\n"
231 " 1: ;\n"
232
233 BLOCK(0)
234 BLOCK(4)
235 BLOCK(8)
236 BLOCK(12)
237
238 " add %[inc], %[p1] ;\n"
239 " add %[inc], %[p2] ;\n"
240 " add %[inc], %[p3] ;\n"
241 " dec %[cnt] ;\n"
242 " jnz 1b ;\n"
243 : [cnt] "+r" (lines),
244 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
245 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
246 : "memory");
247
248 kernel_fpu_end();
249}
250
251static void
252xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
253 unsigned long *p3, unsigned long *p4)
254{
255 unsigned long lines = bytes >> 8;
256
257 kernel_fpu_begin();
258
259 asm volatile(
260#undef BLOCK
261#define BLOCK(i) \
262 PF1(i) \
263 PF1(i + 2) \
264 LD(i, 0) \
265 LD(i + 1, 1) \
266 LD(i + 2, 2) \
267 LD(i + 3, 3) \
268 PF2(i) \
269 PF2(i + 2) \
270 XO1(i, 0) \
271 XO1(i + 1, 1) \
272 XO1(i + 2, 2) \
273 XO1(i + 3, 3) \
274 PF3(i) \
275 PF3(i + 2) \
276 PF0(i + 4) \
277 PF0(i + 6) \
278 XO2(i, 0) \
279 XO2(i + 1, 1) \
280 XO2(i + 2, 2) \
281 XO2(i + 3, 3) \
282 XO3(i, 0) \
283 XO3(i + 1, 1) \
284 XO3(i + 2, 2) \
285 XO3(i + 3, 3) \
286 ST(i, 0) \
287 ST(i + 1, 1) \
288 ST(i + 2, 2) \
289 ST(i + 3, 3) \
290
291
292 PF0(0)
293 PF0(2)
294
295 " .align 32 ;\n"
296 " 1: ;\n"
297
298 BLOCK(0)
299 BLOCK(4)
300 BLOCK(8)
301 BLOCK(12)
302
303 " add %[inc], %[p1] ;\n"
304 " add %[inc], %[p2] ;\n"
305 " add %[inc], %[p3] ;\n"
306 " add %[inc], %[p4] ;\n"
307 " dec %[cnt] ;\n"
308 " jnz 1b ;\n"
309 : [cnt] "+r" (lines), [p1] "+r" (p1),
310 [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
311 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
312 : "memory");
313
314 kernel_fpu_end();
315}
316
317static void
318xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
319 unsigned long *p3, unsigned long *p4)
320{
321 unsigned long lines = bytes >> 8;
322
323 kernel_fpu_begin();
324
325 asm volatile(
326#undef BLOCK
327#define BLOCK(i) \
328 BLK64(PF0, LD, i) \
329 BLK64(PF1, XO1, i) \
330 BLK64(PF2, XO2, i) \
331 BLK64(PF3, XO3, i) \
332 BLK64(NOP, ST, i) \
333
334 " .align 32 ;\n"
335 " 1: ;\n"
336
337 BLOCK(0)
338 BLOCK(4)
339 BLOCK(8)
340 BLOCK(12)
341
342 " add %[inc], %[p1] ;\n"
343 " add %[inc], %[p2] ;\n"
344 " add %[inc], %[p3] ;\n"
345 " add %[inc], %[p4] ;\n"
346 " dec %[cnt] ;\n"
347 " jnz 1b ;\n"
348 : [cnt] "+r" (lines), [p1] "+r" (p1),
349 [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
350 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
351 : "memory");
352
353 kernel_fpu_end();
354}
355
356static void
357xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
358 unsigned long *p3, unsigned long *p4, unsigned long *p5)
359{
360 unsigned long lines = bytes >> 8;
361
362 kernel_fpu_begin();
363
364 asm volatile(
365#undef BLOCK
366#define BLOCK(i) \
367 PF1(i) \
368 PF1(i + 2) \
369 LD(i, 0) \
370 LD(i + 1, 1) \
371 LD(i + 2, 2) \
372 LD(i + 3, 3) \
373 PF2(i) \
374 PF2(i + 2) \
375 XO1(i, 0) \
376 XO1(i + 1, 1) \
377 XO1(i + 2, 2) \
378 XO1(i + 3, 3) \
379 PF3(i) \
380 PF3(i + 2) \
381 XO2(i, 0) \
382 XO2(i + 1, 1) \
383 XO2(i + 2, 2) \
384 XO2(i + 3, 3) \
385 PF4(i) \
386 PF4(i + 2) \
387 PF0(i + 4) \
388 PF0(i + 6) \
389 XO3(i, 0) \
390 XO3(i + 1, 1) \
391 XO3(i + 2, 2) \
392 XO3(i + 3, 3) \
393 XO4(i, 0) \
394 XO4(i + 1, 1) \
395 XO4(i + 2, 2) \
396 XO4(i + 3, 3) \
397 ST(i, 0) \
398 ST(i + 1, 1) \
399 ST(i + 2, 2) \
400 ST(i + 3, 3) \
401
402
403 PF0(0)
404 PF0(2)
405
406 " .align 32 ;\n"
407 " 1: ;\n"
408
409 BLOCK(0)
410 BLOCK(4)
411 BLOCK(8)
412 BLOCK(12)
413
414 " add %[inc], %[p1] ;\n"
415 " add %[inc], %[p2] ;\n"
416 " add %[inc], %[p3] ;\n"
417 " add %[inc], %[p4] ;\n"
418 " add %[inc], %[p5] ;\n"
419 " dec %[cnt] ;\n"
420 " jnz 1b ;\n"
421 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
422 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
423 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
424 : "memory");
425
426 kernel_fpu_end();
427}
428
429static void
430xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
431 unsigned long *p3, unsigned long *p4, unsigned long *p5)
432{
433 unsigned long lines = bytes >> 8;
434
435 kernel_fpu_begin();
436
437 asm volatile(
438#undef BLOCK
439#define BLOCK(i) \
440 BLK64(PF0, LD, i) \
441 BLK64(PF1, XO1, i) \
442 BLK64(PF2, XO2, i) \
443 BLK64(PF3, XO3, i) \
444 BLK64(PF4, XO4, i) \
445 BLK64(NOP, ST, i) \
446
447 " .align 32 ;\n"
448 " 1: ;\n"
449
450 BLOCK(0)
451 BLOCK(4)
452 BLOCK(8)
453 BLOCK(12)
454
455 " add %[inc], %[p1] ;\n"
456 " add %[inc], %[p2] ;\n"
457 " add %[inc], %[p3] ;\n"
458 " add %[inc], %[p4] ;\n"
459 " add %[inc], %[p5] ;\n"
460 " dec %[cnt] ;\n"
461 " jnz 1b ;\n"
462 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
463 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
464 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
465 : "memory");
466
467 kernel_fpu_end();
468}
469
470static struct xor_block_template xor_block_sse_pf64 = {
471 .name = "prefetch64-sse",
472 .do_2 = xor_sse_2_pf64,
473 .do_3 = xor_sse_3_pf64,
474 .do_4 = xor_sse_4_pf64,
475 .do_5 = xor_sse_5_pf64,
476};
477
478#undef LD
479#undef XO1
480#undef XO2
481#undef XO3
482#undef XO4
483#undef ST
484#undef NOP
485#undef BLK64
486#undef BLOCK
487
488#undef XOR_CONSTANT_CONSTRAINT
489
5#ifdef CONFIG_X86_32 490#ifdef CONFIG_X86_32
6# include <asm/xor_32.h> 491# include <asm/xor_32.h>
7#else 492#else
8# include <asm/xor_64.h> 493# include <asm/xor_64.h>
9#endif 494#endif
10#endif 495
496#define XOR_SELECT_TEMPLATE(FASTEST) \
497 AVX_SELECT(FASTEST)
498
499#endif /* _ASM_X86_XOR_H */
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index f79cb7ec0e06..ce05722e3c68 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -2,7 +2,7 @@
2#define _ASM_X86_XOR_32_H 2#define _ASM_X86_XOR_32_H
3 3
4/* 4/*
5 * Optimized RAID-5 checksumming functions for MMX and SSE. 5 * Optimized RAID-5 checksumming functions for MMX.
6 * 6 *
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by 8 * it under the terms of the GNU General Public License as published by
@@ -529,290 +529,6 @@ static struct xor_block_template xor_block_p5_mmx = {
529 .do_5 = xor_p5_mmx_5, 529 .do_5 = xor_p5_mmx_5,
530}; 530};
531 531
532/*
533 * Cache avoiding checksumming functions utilizing KNI instructions
534 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
535 */
536
537#define OFFS(x) "16*("#x")"
538#define PF_OFFS(x) "256+16*("#x")"
539#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
540#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
541#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
542#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
543#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
544#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
545#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
546#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
547#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
548#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
549#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
550#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
551#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
552
553
554static void
555xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
556{
557 unsigned long lines = bytes >> 8;
558
559 kernel_fpu_begin();
560
561 asm volatile(
562#undef BLOCK
563#define BLOCK(i) \
564 LD(i, 0) \
565 LD(i + 1, 1) \
566 PF1(i) \
567 PF1(i + 2) \
568 LD(i + 2, 2) \
569 LD(i + 3, 3) \
570 PF0(i + 4) \
571 PF0(i + 6) \
572 XO1(i, 0) \
573 XO1(i + 1, 1) \
574 XO1(i + 2, 2) \
575 XO1(i + 3, 3) \
576 ST(i, 0) \
577 ST(i + 1, 1) \
578 ST(i + 2, 2) \
579 ST(i + 3, 3) \
580
581
582 PF0(0)
583 PF0(2)
584
585 " .align 32 ;\n"
586 " 1: ;\n"
587
588 BLOCK(0)
589 BLOCK(4)
590 BLOCK(8)
591 BLOCK(12)
592
593 " addl $256, %1 ;\n"
594 " addl $256, %2 ;\n"
595 " decl %0 ;\n"
596 " jnz 1b ;\n"
597 : "+r" (lines),
598 "+r" (p1), "+r" (p2)
599 :
600 : "memory");
601
602 kernel_fpu_end();
603}
604
605static void
606xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
607 unsigned long *p3)
608{
609 unsigned long lines = bytes >> 8;
610
611 kernel_fpu_begin();
612
613 asm volatile(
614#undef BLOCK
615#define BLOCK(i) \
616 PF1(i) \
617 PF1(i + 2) \
618 LD(i,0) \
619 LD(i + 1, 1) \
620 LD(i + 2, 2) \
621 LD(i + 3, 3) \
622 PF2(i) \
623 PF2(i + 2) \
624 PF0(i + 4) \
625 PF0(i + 6) \
626 XO1(i,0) \
627 XO1(i + 1, 1) \
628 XO1(i + 2, 2) \
629 XO1(i + 3, 3) \
630 XO2(i,0) \
631 XO2(i + 1, 1) \
632 XO2(i + 2, 2) \
633 XO2(i + 3, 3) \
634 ST(i,0) \
635 ST(i + 1, 1) \
636 ST(i + 2, 2) \
637 ST(i + 3, 3) \
638
639
640 PF0(0)
641 PF0(2)
642
643 " .align 32 ;\n"
644 " 1: ;\n"
645
646 BLOCK(0)
647 BLOCK(4)
648 BLOCK(8)
649 BLOCK(12)
650
651 " addl $256, %1 ;\n"
652 " addl $256, %2 ;\n"
653 " addl $256, %3 ;\n"
654 " decl %0 ;\n"
655 " jnz 1b ;\n"
656 : "+r" (lines),
657 "+r" (p1), "+r"(p2), "+r"(p3)
658 :
659 : "memory" );
660
661 kernel_fpu_end();
662}
663
664static void
665xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
666 unsigned long *p3, unsigned long *p4)
667{
668 unsigned long lines = bytes >> 8;
669
670 kernel_fpu_begin();
671
672 asm volatile(
673#undef BLOCK
674#define BLOCK(i) \
675 PF1(i) \
676 PF1(i + 2) \
677 LD(i,0) \
678 LD(i + 1, 1) \
679 LD(i + 2, 2) \
680 LD(i + 3, 3) \
681 PF2(i) \
682 PF2(i + 2) \
683 XO1(i,0) \
684 XO1(i + 1, 1) \
685 XO1(i + 2, 2) \
686 XO1(i + 3, 3) \
687 PF3(i) \
688 PF3(i + 2) \
689 PF0(i + 4) \
690 PF0(i + 6) \
691 XO2(i,0) \
692 XO2(i + 1, 1) \
693 XO2(i + 2, 2) \
694 XO2(i + 3, 3) \
695 XO3(i,0) \
696 XO3(i + 1, 1) \
697 XO3(i + 2, 2) \
698 XO3(i + 3, 3) \
699 ST(i,0) \
700 ST(i + 1, 1) \
701 ST(i + 2, 2) \
702 ST(i + 3, 3) \
703
704
705 PF0(0)
706 PF0(2)
707
708 " .align 32 ;\n"
709 " 1: ;\n"
710
711 BLOCK(0)
712 BLOCK(4)
713 BLOCK(8)
714 BLOCK(12)
715
716 " addl $256, %1 ;\n"
717 " addl $256, %2 ;\n"
718 " addl $256, %3 ;\n"
719 " addl $256, %4 ;\n"
720 " decl %0 ;\n"
721 " jnz 1b ;\n"
722 : "+r" (lines),
723 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
724 :
725 : "memory" );
726
727 kernel_fpu_end();
728}
729
730static void
731xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
732 unsigned long *p3, unsigned long *p4, unsigned long *p5)
733{
734 unsigned long lines = bytes >> 8;
735
736 kernel_fpu_begin();
737
738 /* Make sure GCC forgets anything it knows about p4 or p5,
739 such that it won't pass to the asm volatile below a
740 register that is shared with any other variable. That's
741 because we modify p4 and p5 there, but we can't mark them
742 as read/write, otherwise we'd overflow the 10-asm-operands
743 limit of GCC < 3.1. */
744 asm("" : "+r" (p4), "+r" (p5));
745
746 asm volatile(
747#undef BLOCK
748#define BLOCK(i) \
749 PF1(i) \
750 PF1(i + 2) \
751 LD(i,0) \
752 LD(i + 1, 1) \
753 LD(i + 2, 2) \
754 LD(i + 3, 3) \
755 PF2(i) \
756 PF2(i + 2) \
757 XO1(i,0) \
758 XO1(i + 1, 1) \
759 XO1(i + 2, 2) \
760 XO1(i + 3, 3) \
761 PF3(i) \
762 PF3(i + 2) \
763 XO2(i,0) \
764 XO2(i + 1, 1) \
765 XO2(i + 2, 2) \
766 XO2(i + 3, 3) \
767 PF4(i) \
768 PF4(i + 2) \
769 PF0(i + 4) \
770 PF0(i + 6) \
771 XO3(i,0) \
772 XO3(i + 1, 1) \
773 XO3(i + 2, 2) \
774 XO3(i + 3, 3) \
775 XO4(i,0) \
776 XO4(i + 1, 1) \
777 XO4(i + 2, 2) \
778 XO4(i + 3, 3) \
779 ST(i,0) \
780 ST(i + 1, 1) \
781 ST(i + 2, 2) \
782 ST(i + 3, 3) \
783
784
785 PF0(0)
786 PF0(2)
787
788 " .align 32 ;\n"
789 " 1: ;\n"
790
791 BLOCK(0)
792 BLOCK(4)
793 BLOCK(8)
794 BLOCK(12)
795
796 " addl $256, %1 ;\n"
797 " addl $256, %2 ;\n"
798 " addl $256, %3 ;\n"
799 " addl $256, %4 ;\n"
800 " addl $256, %5 ;\n"
801 " decl %0 ;\n"
802 " jnz 1b ;\n"
803 : "+r" (lines),
804 "+r" (p1), "+r" (p2), "+r" (p3)
805 : "r" (p4), "r" (p5)
806 : "memory");
807
808 /* p4 and p5 were modified, and now the variables are dead.
809 Clobber them just to be sure nobody does something stupid
810 like assuming they have some legal value. */
811 asm("" : "=r" (p4), "=r" (p5));
812
813 kernel_fpu_end();
814}
815
816static struct xor_block_template xor_block_pIII_sse = { 532static struct xor_block_template xor_block_pIII_sse = {
817 .name = "pIII_sse", 533 .name = "pIII_sse",
818 .do_2 = xor_sse_2, 534 .do_2 = xor_sse_2,
@@ -827,26 +543,25 @@ static struct xor_block_template xor_block_pIII_sse = {
827/* Also try the generic routines. */ 543/* Also try the generic routines. */
828#include <asm-generic/xor.h> 544#include <asm-generic/xor.h>
829 545
546/* We force the use of the SSE xor block because it can write around L2.
547 We may also be able to load into the L1 only depending on how the cpu
548 deals with a load to a line that is being prefetched. */
830#undef XOR_TRY_TEMPLATES 549#undef XOR_TRY_TEMPLATES
831#define XOR_TRY_TEMPLATES \ 550#define XOR_TRY_TEMPLATES \
832do { \ 551do { \
833 xor_speed(&xor_block_8regs); \
834 xor_speed(&xor_block_8regs_p); \
835 xor_speed(&xor_block_32regs); \
836 xor_speed(&xor_block_32regs_p); \
837 AVX_XOR_SPEED; \ 552 AVX_XOR_SPEED; \
838 if (cpu_has_xmm) \ 553 if (cpu_has_xmm) { \
839 xor_speed(&xor_block_pIII_sse); \ 554 xor_speed(&xor_block_pIII_sse); \
840 if (cpu_has_mmx) { \ 555 xor_speed(&xor_block_sse_pf64); \
556 } else if (cpu_has_mmx) { \
841 xor_speed(&xor_block_pII_mmx); \ 557 xor_speed(&xor_block_pII_mmx); \
842 xor_speed(&xor_block_p5_mmx); \ 558 xor_speed(&xor_block_p5_mmx); \
559 } else { \
560 xor_speed(&xor_block_8regs); \
561 xor_speed(&xor_block_8regs_p); \
562 xor_speed(&xor_block_32regs); \
563 xor_speed(&xor_block_32regs_p); \
843 } \ 564 } \
844} while (0) 565} while (0)
845 566
846/* We force the use of the SSE xor block because it can write around L2.
847 We may also be able to load into the L1 only depending on how the cpu
848 deals with a load to a line that is being prefetched. */
849#define XOR_SELECT_TEMPLATE(FASTEST) \
850 AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
851
852#endif /* _ASM_X86_XOR_32_H */ 567#endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 87ac522c4af5..546f1e3b87cc 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -1,301 +1,6 @@
1#ifndef _ASM_X86_XOR_64_H 1#ifndef _ASM_X86_XOR_64_H
2#define _ASM_X86_XOR_64_H 2#define _ASM_X86_XOR_64_H
3 3
4/*
5 * Optimized RAID-5 checksumming functions for MMX and SSE.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2, or (at your option)
10 * any later version.
11 *
12 * You should have received a copy of the GNU General Public License
13 * (for example /usr/src/linux/COPYING); if not, write to the Free
14 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17
18/*
19 * Cache avoiding checksumming functions utilizing KNI instructions
20 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
21 */
22
23/*
24 * Based on
25 * High-speed RAID5 checksumming functions utilizing SSE instructions.
26 * Copyright (C) 1998 Ingo Molnar.
27 */
28
29/*
30 * x86-64 changes / gcc fixes from Andi Kleen.
31 * Copyright 2002 Andi Kleen, SuSE Labs.
32 *
33 * This hasn't been optimized for the hammer yet, but there are likely
34 * no advantages to be gotten from x86-64 here anyways.
35 */
36
37#include <asm/i387.h>
38
39#define OFFS(x) "16*("#x")"
40#define PF_OFFS(x) "256+16*("#x")"
41#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
42#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
43#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
44#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
45#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
46#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
47#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
48#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
49#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
50#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
51#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
52#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
53#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
54
55
56static void
57xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
58{
59 unsigned int lines = bytes >> 8;
60
61 kernel_fpu_begin();
62
63 asm volatile(
64#undef BLOCK
65#define BLOCK(i) \
66 LD(i, 0) \
67 LD(i + 1, 1) \
68 PF1(i) \
69 PF1(i + 2) \
70 LD(i + 2, 2) \
71 LD(i + 3, 3) \
72 PF0(i + 4) \
73 PF0(i + 6) \
74 XO1(i, 0) \
75 XO1(i + 1, 1) \
76 XO1(i + 2, 2) \
77 XO1(i + 3, 3) \
78 ST(i, 0) \
79 ST(i + 1, 1) \
80 ST(i + 2, 2) \
81 ST(i + 3, 3) \
82
83
84 PF0(0)
85 PF0(2)
86
87 " .align 32 ;\n"
88 " 1: ;\n"
89
90 BLOCK(0)
91 BLOCK(4)
92 BLOCK(8)
93 BLOCK(12)
94
95 " addq %[inc], %[p1] ;\n"
96 " addq %[inc], %[p2] ;\n"
97 " decl %[cnt] ; jnz 1b"
98 : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
99 : [inc] "r" (256UL)
100 : "memory");
101
102 kernel_fpu_end();
103}
104
105static void
106xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
107 unsigned long *p3)
108{
109 unsigned int lines = bytes >> 8;
110
111 kernel_fpu_begin();
112 asm volatile(
113#undef BLOCK
114#define BLOCK(i) \
115 PF1(i) \
116 PF1(i + 2) \
117 LD(i, 0) \
118 LD(i + 1, 1) \
119 LD(i + 2, 2) \
120 LD(i + 3, 3) \
121 PF2(i) \
122 PF2(i + 2) \
123 PF0(i + 4) \
124 PF0(i + 6) \
125 XO1(i, 0) \
126 XO1(i + 1, 1) \
127 XO1(i + 2, 2) \
128 XO1(i + 3, 3) \
129 XO2(i, 0) \
130 XO2(i + 1, 1) \
131 XO2(i + 2, 2) \
132 XO2(i + 3, 3) \
133 ST(i, 0) \
134 ST(i + 1, 1) \
135 ST(i + 2, 2) \
136 ST(i + 3, 3) \
137
138
139 PF0(0)
140 PF0(2)
141
142 " .align 32 ;\n"
143 " 1: ;\n"
144
145 BLOCK(0)
146 BLOCK(4)
147 BLOCK(8)
148 BLOCK(12)
149
150 " addq %[inc], %[p1] ;\n"
151 " addq %[inc], %[p2] ;\n"
152 " addq %[inc], %[p3] ;\n"
153 " decl %[cnt] ; jnz 1b"
154 : [cnt] "+r" (lines),
155 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
156 : [inc] "r" (256UL)
157 : "memory");
158 kernel_fpu_end();
159}
160
161static void
162xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
163 unsigned long *p3, unsigned long *p4)
164{
165 unsigned int lines = bytes >> 8;
166
167 kernel_fpu_begin();
168
169 asm volatile(
170#undef BLOCK
171#define BLOCK(i) \
172 PF1(i) \
173 PF1(i + 2) \
174 LD(i, 0) \
175 LD(i + 1, 1) \
176 LD(i + 2, 2) \
177 LD(i + 3, 3) \
178 PF2(i) \
179 PF2(i + 2) \
180 XO1(i, 0) \
181 XO1(i + 1, 1) \
182 XO1(i + 2, 2) \
183 XO1(i + 3, 3) \
184 PF3(i) \
185 PF3(i + 2) \
186 PF0(i + 4) \
187 PF0(i + 6) \
188 XO2(i, 0) \
189 XO2(i + 1, 1) \
190 XO2(i + 2, 2) \
191 XO2(i + 3, 3) \
192 XO3(i, 0) \
193 XO3(i + 1, 1) \
194 XO3(i + 2, 2) \
195 XO3(i + 3, 3) \
196 ST(i, 0) \
197 ST(i + 1, 1) \
198 ST(i + 2, 2) \
199 ST(i + 3, 3) \
200
201
202 PF0(0)
203 PF0(2)
204
205 " .align 32 ;\n"
206 " 1: ;\n"
207
208 BLOCK(0)
209 BLOCK(4)
210 BLOCK(8)
211 BLOCK(12)
212
213 " addq %[inc], %[p1] ;\n"
214 " addq %[inc], %[p2] ;\n"
215 " addq %[inc], %[p3] ;\n"
216 " addq %[inc], %[p4] ;\n"
217 " decl %[cnt] ; jnz 1b"
218 : [cnt] "+c" (lines),
219 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
220 : [inc] "r" (256UL)
221 : "memory" );
222
223 kernel_fpu_end();
224}
225
226static void
227xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
228 unsigned long *p3, unsigned long *p4, unsigned long *p5)
229{
230 unsigned int lines = bytes >> 8;
231
232 kernel_fpu_begin();
233
234 asm volatile(
235#undef BLOCK
236#define BLOCK(i) \
237 PF1(i) \
238 PF1(i + 2) \
239 LD(i, 0) \
240 LD(i + 1, 1) \
241 LD(i + 2, 2) \
242 LD(i + 3, 3) \
243 PF2(i) \
244 PF2(i + 2) \
245 XO1(i, 0) \
246 XO1(i + 1, 1) \
247 XO1(i + 2, 2) \
248 XO1(i + 3, 3) \
249 PF3(i) \
250 PF3(i + 2) \
251 XO2(i, 0) \
252 XO2(i + 1, 1) \
253 XO2(i + 2, 2) \
254 XO2(i + 3, 3) \
255 PF4(i) \
256 PF4(i + 2) \
257 PF0(i + 4) \
258 PF0(i + 6) \
259 XO3(i, 0) \
260 XO3(i + 1, 1) \
261 XO3(i + 2, 2) \
262 XO3(i + 3, 3) \
263 XO4(i, 0) \
264 XO4(i + 1, 1) \
265 XO4(i + 2, 2) \
266 XO4(i + 3, 3) \
267 ST(i, 0) \
268 ST(i + 1, 1) \
269 ST(i + 2, 2) \
270 ST(i + 3, 3) \
271
272
273 PF0(0)
274 PF0(2)
275
276 " .align 32 ;\n"
277 " 1: ;\n"
278
279 BLOCK(0)
280 BLOCK(4)
281 BLOCK(8)
282 BLOCK(12)
283
284 " addq %[inc], %[p1] ;\n"
285 " addq %[inc], %[p2] ;\n"
286 " addq %[inc], %[p3] ;\n"
287 " addq %[inc], %[p4] ;\n"
288 " addq %[inc], %[p5] ;\n"
289 " decl %[cnt] ; jnz 1b"
290 : [cnt] "+c" (lines),
291 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
292 [p5] "+r" (p5)
293 : [inc] "r" (256UL)
294 : "memory");
295
296 kernel_fpu_end();
297}
298
299static struct xor_block_template xor_block_sse = { 4static struct xor_block_template xor_block_sse = {
300 .name = "generic_sse", 5 .name = "generic_sse",
301 .do_2 = xor_sse_2, 6 .do_2 = xor_sse_2,
@@ -308,17 +13,15 @@ static struct xor_block_template xor_block_sse = {
308/* Also try the AVX routines */ 13/* Also try the AVX routines */
309#include <asm/xor_avx.h> 14#include <asm/xor_avx.h>
310 15
16/* We force the use of the SSE xor block because it can write around L2.
17 We may also be able to load into the L1 only depending on how the cpu
18 deals with a load to a line that is being prefetched. */
311#undef XOR_TRY_TEMPLATES 19#undef XOR_TRY_TEMPLATES
312#define XOR_TRY_TEMPLATES \ 20#define XOR_TRY_TEMPLATES \
313do { \ 21do { \
314 AVX_XOR_SPEED; \ 22 AVX_XOR_SPEED; \
23 xor_speed(&xor_block_sse_pf64); \
315 xor_speed(&xor_block_sse); \ 24 xor_speed(&xor_block_sse); \
316} while (0) 25} while (0)
317 26
318/* We force the use of the SSE xor block because it can write around L2.
319 We may also be able to load into the L1 only depending on how the cpu
320 deals with a load to a line that is being prefetched. */
321#define XOR_SELECT_TEMPLATE(FASTEST) \
322 AVX_SELECT(&xor_block_sse)
323
324#endif /* _ASM_X86_XOR_64_H */ 27#endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c8932c79e78b..3c3f58a0808f 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -307,36 +307,45 @@ default_entry:
307 movl %eax,%cr0 307 movl %eax,%cr0
308 308
309/* 309/*
310 * New page tables may be in 4Mbyte page mode and may 310 * We want to start out with EFLAGS unambiguously cleared. Some BIOSes leave
311 * be using the global pages. 311 * bits like NT set. This would confuse the debugger if this code is traced. So
312 * initialize them properly now before switching to protected mode. That means
313 * DF in particular (even though we have cleared it earlier after copying the
314 * command line) because GCC expects it.
315 */
316 pushl $0
317 popfl
318
319/*
320 * New page tables may be in 4Mbyte page mode and may be using the global pages.
312 * 321 *
313 * NOTE! If we are on a 486 we may have no cr4 at all! 322 * NOTE! If we are on a 486 we may have no cr4 at all! Specifically, cr4 exists
314 * Specifically, cr4 exists if and only if CPUID exists 323 * if and only if CPUID exists and has flags other than the FPU flag set.
315 * and has flags other than the FPU flag set.
316 */ 324 */
325 movl $-1,pa(X86_CPUID) # preset CPUID level
317 movl $X86_EFLAGS_ID,%ecx 326 movl $X86_EFLAGS_ID,%ecx
318 pushl %ecx 327 pushl %ecx
319 popfl 328 popfl # set EFLAGS=ID
320 pushfl 329 pushfl
321 popl %eax 330 popl %eax # get EFLAGS
322 pushl $0 331 testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set?
323 popfl 332 jz enable_paging # hw disallowed setting of ID bit
324 pushfl 333 # which means no CPUID and no CR4
325 popl %edx 334
326 xorl %edx,%eax 335 xorl %eax,%eax
327 testl %ecx,%eax 336 cpuid
328 jz 6f # No ID flag = no CPUID = no CR4 337 movl %eax,pa(X86_CPUID) # save largest std CPUID function
329 338
330 movl $1,%eax 339 movl $1,%eax
331 cpuid 340 cpuid
332 andl $~1,%edx # Ignore CPUID.FPU 341 andl $~1,%edx # Ignore CPUID.FPU
333 jz 6f # No flags or only CPUID.FPU = no CR4 342 jz enable_paging # No flags or only CPUID.FPU = no CR4
334 343
335 movl pa(mmu_cr4_features),%eax 344 movl pa(mmu_cr4_features),%eax
336 movl %eax,%cr4 345 movl %eax,%cr4
337 346
338 testb $X86_CR4_PAE, %al # check if PAE is enabled 347 testb $X86_CR4_PAE, %al # check if PAE is enabled
339 jz 6f 348 jz enable_paging
340 349
341 /* Check if extended functions are implemented */ 350 /* Check if extended functions are implemented */
342 movl $0x80000000, %eax 351 movl $0x80000000, %eax
@@ -344,7 +353,7 @@ default_entry:
344 /* Value must be in the range 0x80000001 to 0x8000ffff */ 353 /* Value must be in the range 0x80000001 to 0x8000ffff */
345 subl $0x80000001, %eax 354 subl $0x80000001, %eax
346 cmpl $(0x8000ffff-0x80000001), %eax 355 cmpl $(0x8000ffff-0x80000001), %eax
347 ja 6f 356 ja enable_paging
348 357
349 /* Clear bogus XD_DISABLE bits */ 358 /* Clear bogus XD_DISABLE bits */
350 call verify_cpu 359 call verify_cpu
@@ -353,7 +362,7 @@ default_entry:
353 cpuid 362 cpuid
354 /* Execute Disable bit supported? */ 363 /* Execute Disable bit supported? */
355 btl $(X86_FEATURE_NX & 31), %edx 364 btl $(X86_FEATURE_NX & 31), %edx
356 jnc 6f 365 jnc enable_paging
357 366
358 /* Setup EFER (Extended Feature Enable Register) */ 367 /* Setup EFER (Extended Feature Enable Register) */
359 movl $MSR_EFER, %ecx 368 movl $MSR_EFER, %ecx
@@ -363,7 +372,7 @@ default_entry:
363 /* Make changes effective */ 372 /* Make changes effective */
364 wrmsr 373 wrmsr
365 374
3666: 375enable_paging:
367 376
368/* 377/*
369 * Enable paging 378 * Enable paging
@@ -378,14 +387,6 @@ default_entry:
378 addl $__PAGE_OFFSET, %esp 387 addl $__PAGE_OFFSET, %esp
379 388
380/* 389/*
381 * Initialize eflags. Some BIOS's leave bits like NT set. This would
382 * confuse the debugger if this code is traced.
383 * XXX - best to initialize before switching to protected mode.
384 */
385 pushl $0
386 popfl
387
388/*
389 * start system 32-bit setup. We need to re-do some of the things done 390 * start system 32-bit setup. We need to re-do some of the things done
390 * in 16-bit mode for the "real" operations. 391 * in 16-bit mode for the "real" operations.
391 */ 392 */
@@ -394,31 +395,11 @@ default_entry:
394 jz 1f # Did we do this already? 395 jz 1f # Did we do this already?
395 call *%eax 396 call *%eax
3961: 3971:
397 398
398/* check if it is 486 or 386. */
399/* 399/*
400 * XXX - this does a lot of unnecessary setup. Alignment checks don't 400 * Check if it is 486
401 * apply at our cpl of 0 and the stack ought to be aligned already, and
402 * we don't need to preserve eflags.
403 */ 401 */
404 movl $-1,X86_CPUID # -1 for no CPUID initially 402 cmpl $-1,X86_CPUID
405 movb $3,X86 # at least 386
406 pushfl # push EFLAGS
407 popl %eax # get EFLAGS
408 movl %eax,%ecx # save original EFLAGS
409 xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
410 pushl %eax # copy to EFLAGS
411 popfl # set EFLAGS
412 pushfl # get new EFLAGS
413 popl %eax # put it in eax
414 xorl %ecx,%eax # change in flags
415 pushl %ecx # restore original EFLAGS
416 popfl
417 testl $0x40000,%eax # check if AC bit changed
418 je is386
419
420 movb $4,X86 # at least 486
421 testl $0x200000,%eax # check if ID bit changed
422 je is486 403 je is486
423 404
424 /* get vendor info */ 405 /* get vendor info */
@@ -444,11 +425,10 @@ default_entry:
444 movb %cl,X86_MASK 425 movb %cl,X86_MASK
445 movl %edx,X86_CAPABILITY 426 movl %edx,X86_CAPABILITY
446 427
447is486: movl $0x50022,%ecx # set AM, WP, NE and MP 428is486:
448 jmp 2f 429 movb $4,X86
449 430 movl $0x50022,%ecx # set AM, WP, NE and MP
450is386: movl $2,%ecx # set MP 431 movl %cr0,%eax
4512: movl %cr0,%eax
452 andl $0x80000011,%eax # Save PG,PE,ET 432 andl $0x80000011,%eax # Save PG,PE,ET
453 orl %ecx,%eax 433 orl %ecx,%eax
454 movl %eax,%cr0 434 movl %eax,%cr0
@@ -473,7 +453,6 @@ is386: movl $2,%ecx # set MP
473 xorl %eax,%eax # Clear LDT 453 xorl %eax,%eax # Clear LDT
474 lldt %ax 454 lldt %ax
475 455
476 cld # gcc2 wants the direction flag cleared at all times
477 pushl $0 # fake return address for unwinder 456 pushl $0 # fake return address for unwinder
478 jmp *(initial_code) 457 jmp *(initial_code)
479 458
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 97ef74b88e0f..dbded5aedb81 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -157,7 +157,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
157 if (flags & MAP_FIXED) 157 if (flags & MAP_FIXED)
158 return addr; 158 return addr;
159 159
160 /* for MAP_32BIT mappings we force the legact mmap base */ 160 /* for MAP_32BIT mappings we force the legacy mmap base */
161 if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) 161 if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT))
162 goto bottomup; 162 goto bottomup;
163 163
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 75c9a6a59697..d6eeead43758 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -605,7 +605,7 @@ kernel_physical_mapping_init(unsigned long start,
605 } 605 }
606 606
607 if (pgd_changed) 607 if (pgd_changed)
608 sync_global_pgds(addr, end); 608 sync_global_pgds(addr, end - 1);
609 609
610 __flush_tlb_all(); 610 __flush_tlb_all();
611 611
@@ -984,7 +984,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
984 } 984 }
985 985
986 } 986 }
987 sync_global_pgds((unsigned long)start_page, end); 987 sync_global_pgds((unsigned long)start_page, end - 1);
988 return 0; 988 return 0;
989} 989}
990 990