Diffstat (limited to 'arch/x86/include')
-rw-r--r--  arch/x86/include/asm/linkage.h            |  18
-rw-r--r--  arch/x86/include/asm/pgtable.h            |  12
-rw-r--r--  arch/x86/include/asm/pgtable_32.h         |   7
-rw-r--r--  arch/x86/include/asm/pgtable_64.h         |   3
-rw-r--r--  arch/x86/include/asm/required-features.h  |   8
-rw-r--r--  arch/x86/include/asm/xor.h                | 491
-rw-r--r--  arch/x86/include/asm/xor_32.h             | 309
-rw-r--r--  arch/x86/include/asm/xor_64.h             | 305
8 files changed, 534 insertions(+), 619 deletions(-)
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 48142971b25d..79327e9483a3 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -27,20 +27,20 @@
 #define __asmlinkage_protect0(ret) \
 	__asmlinkage_protect_n(ret)
 #define __asmlinkage_protect1(ret, arg1) \
-	__asmlinkage_protect_n(ret, "g" (arg1))
+	__asmlinkage_protect_n(ret, "m" (arg1))
 #define __asmlinkage_protect2(ret, arg1, arg2) \
-	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2))
+	__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2))
 #define __asmlinkage_protect3(ret, arg1, arg2, arg3) \
-	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3))
+	__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3))
 #define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \
-	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
-			       "g" (arg4))
+	__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
+			       "m" (arg4))
 #define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \
-	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
-			       "g" (arg4), "g" (arg5))
+	__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
+			       "m" (arg4), "m" (arg5))
 #define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \
-	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
-			       "g" (arg4), "g" (arg5), "g" (arg6))
+	__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
+			       "m" (arg4), "m" (arg5), "m" (arg6))
 
 #endif /* CONFIG_X86_32 */
 
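For context on the constraint switch above: a "g" operand lets the compiler satisfy the input with a register or even an immediate, while "m" forces it to reference the variable's in-memory (stack-slot) form, which is what keeps the on-stack arguments of an asmlinkage function alive across the protected call. A minimal standalone sketch of the difference, not part of the patch (the function name is invented for illustration):

/* Illustrative sketch only -- keep_arg_slot_alive() is a made-up name.
 * The empty asm claims to read both the register copy ("0"/"=r") and the
 * variable's memory slot ("m"), so the compiler must keep arg1's stack
 * slot valid; a "g" constraint could instead be satisfied with a register
 * copy, leaving the slot free to be reused.
 */
static inline long keep_arg_slot_alive(long arg1)
{
	long ret = arg1;

	asm volatile("" : "=r" (ret) : "0" (ret), "m" (arg1));
	return ret;
}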
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1c1a955e67c0..fc304279b559 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -786,6 +786,18 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 	memcpy(dst, src, count * sizeof(pgd_t));
 }
 
+/*
+ * The x86 doesn't have any external MMU info: the kernel page
+ * tables contain all the necessary information.
+ */
+static inline void update_mmu_cache(struct vm_area_struct *vma,
+		unsigned long addr, pte_t *ptep)
+{
+}
+static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
+		unsigned long addr, pmd_t *pmd)
+{
+}
 
 #include <asm-generic/pgtable.h>
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 8faa215a503e..9ee322103c6d 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -66,13 +66,6 @@ do { \
 	__flush_tlb_one((vaddr)); \
 } while (0)
 
-/*
- * The i386 doesn't have any external MMU info: the kernel page
- * tables contain all the necessary information.
- */
-#define update_mmu_cache(vma, address, ptep) do { } while (0)
-#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
-
 #endif /* !__ASSEMBLY__ */
 
 /*
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 47356f9df82e..615b0c78449f 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -142,9 +142,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
 #define pte_unmap(pte) ((void)(pte))/* NOP */
 
-#define update_mmu_cache(vma, address, ptep) do { } while (0)
-#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
-
 /* Encode and de-code a swap entry */
 #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
 #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 6c7fc25f2c34..5c6e4fb370f5 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -47,6 +47,12 @@
 # define NEED_NOPL 0
 #endif
 
+#ifdef CONFIG_MATOM
+# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31))
+#else
+# define NEED_MOVBE 0
+#endif
+
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_PARAVIRT
 /* Paravirtualized systems may not have PSE or PGE available */
@@ -80,7 +86,7 @@
 
 #define REQUIRED_MASK2 0
 #define REQUIRED_MASK3 (NEED_NOPL)
-#define REQUIRED_MASK4 0
+#define REQUIRED_MASK4 (NEED_MOVBE)
 #define REQUIRED_MASK5 0
 #define REQUIRED_MASK6 0
 #define REQUIRED_MASK7 0
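The NEED_MOVBE plumbing follows the usual required-features pattern: the feature constant encodes a (capability word, bit) pair, the `& 31` keeps only the bit index, and REQUIRED_MASK4 is the mask checked against capability word 4 during early CPU verification. A worked sketch of the arithmetic, assuming the conventional encoding of MOVBE as CPUID leaf 1, ECX bit 22 (capability word 4) -- the exact value is not shown in this patch:

/* Worked example with an assumed encoding (names prefixed EXAMPLE_ to
 * make clear they are illustrative, not the kernel's definitions). */
#define EXAMPLE_X86_FEATURE_MOVBE	(4 * 32 + 22)	/* word 4, bit 22 */
#define EXAMPLE_NEED_MOVBE	(1 << (EXAMPLE_X86_FEATURE_MOVBE & 31))	/* == 1 << 22 */
/* With CONFIG_MATOM, REQUIRED_MASK4 therefore carries bit 22, so the
 * early feature check rejects booting an Atom-tuned kernel on a CPU
 * that lacks MOVBE. */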
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index f8fde90bc45e..d8829751b3f8 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,10 +1,499 @@
 #ifdef CONFIG_KMEMCHECK
 /* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
 # include <asm-generic/xor.h>
+#elif !defined(_ASM_X86_XOR_H)
+#define _ASM_X86_XOR_H
+
+/*
+ * Optimized RAID-5 checksumming functions for SSE.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Cache avoiding checksumming functions utilizing KNI instructions
+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
+ */
+
+/*
+ * Based on
+ * High-speed RAID5 checksumming functions utilizing SSE instructions.
+ * Copyright (C) 1998 Ingo Molnar.
+ */
+
+/*
+ * x86-64 changes / gcc fixes from Andi Kleen.
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ *
+ * This hasn't been optimized for the hammer yet, but there are likely
+ * no advantages to be gotten from x86-64 here anyways.
+ */
+
+#include <asm/i387.h>
+
+#ifdef CONFIG_X86_32
+/* reduce register pressure */
+# define XOR_CONSTANT_CONSTRAINT "i"
 #else
+# define XOR_CONSTANT_CONSTRAINT "re"
+#endif
+
+#define OFFS(x) "16*("#x")"
+#define PF_OFFS(x) "256+16*("#x")"
+#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
+#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
+#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
+#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
+#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
+#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
+#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
+#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
+#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
+#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
+#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
+#define NOP(x)
+
+#define BLK64(pf, op, i) \
+	pf(i) \
+	op(i, 0) \
+	op(i + 1, 1) \
+	op(i + 2, 2) \
+	op(i + 3, 3)
+
+static void
+xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	LD(i, 0) \
+	LD(i + 1, 1) \
+	PF1(i) \
+	PF1(i + 2) \
+	LD(i + 2, 2) \
+	LD(i + 3, 3) \
+	PF0(i + 4) \
+	PF0(i + 6) \
+	XO1(i, 0) \
+	XO1(i + 1, 1) \
+	XO1(i + 2, 2) \
+	XO1(i + 3, 3) \
+	ST(i, 0) \
+	ST(i + 1, 1) \
+	ST(i + 2, 2) \
+	ST(i + 3, 3) \
+
+
+	PF0(0)
+	PF0(2)
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	BLK64(PF0, LD, i) \
+	BLK64(PF1, XO1, i) \
+	BLK64(NOP, ST, i) \
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	  unsigned long *p3)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	PF1(i) \
+	PF1(i + 2) \
+	LD(i, 0) \
+	LD(i + 1, 1) \
+	LD(i + 2, 2) \
+	LD(i + 3, 3) \
+	PF2(i) \
+	PF2(i + 2) \
+	PF0(i + 4) \
+	PF0(i + 6) \
+	XO1(i, 0) \
+	XO1(i + 1, 1) \
+	XO1(i + 2, 2) \
+	XO1(i + 3, 3) \
+	XO2(i, 0) \
+	XO2(i + 1, 1) \
+	XO2(i + 2, 2) \
+	XO2(i + 3, 3) \
+	ST(i, 0) \
+	ST(i + 1, 1) \
+	ST(i + 2, 2) \
+	ST(i + 3, 3) \
+
+
+	PF0(0)
+	PF0(2)
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	BLK64(PF0, LD, i) \
+	BLK64(PF1, XO1, i) \
+	BLK64(PF2, XO2, i) \
+	BLK64(NOP, ST, i) \
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	  unsigned long *p3, unsigned long *p4)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	PF1(i) \
+	PF1(i + 2) \
+	LD(i, 0) \
+	LD(i + 1, 1) \
+	LD(i + 2, 2) \
+	LD(i + 3, 3) \
+	PF2(i) \
+	PF2(i + 2) \
+	XO1(i, 0) \
+	XO1(i + 1, 1) \
+	XO1(i + 2, 2) \
+	XO1(i + 3, 3) \
+	PF3(i) \
+	PF3(i + 2) \
+	PF0(i + 4) \
+	PF0(i + 6) \
+	XO2(i, 0) \
+	XO2(i + 1, 1) \
+	XO2(i + 2, 2) \
+	XO2(i + 3, 3) \
+	XO3(i, 0) \
+	XO3(i + 1, 1) \
+	XO3(i + 2, 2) \
+	XO3(i + 3, 3) \
+	ST(i, 0) \
+	ST(i + 1, 1) \
+	ST(i + 2, 2) \
+	ST(i + 3, 3) \
+
+
+	PF0(0)
+	PF0(2)
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" add %[inc], %[p4] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1),
+	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3, unsigned long *p4)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	BLK64(PF0, LD, i) \
+	BLK64(PF1, XO1, i) \
+	BLK64(PF2, XO2, i) \
+	BLK64(PF3, XO3, i) \
+	BLK64(NOP, ST, i) \
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" add %[inc], %[p4] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1),
+	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	PF1(i) \
+	PF1(i + 2) \
+	LD(i, 0) \
+	LD(i + 1, 1) \
+	LD(i + 2, 2) \
+	LD(i + 3, 3) \
+	PF2(i) \
+	PF2(i + 2) \
+	XO1(i, 0) \
+	XO1(i + 1, 1) \
+	XO1(i + 2, 2) \
+	XO1(i + 3, 3) \
+	PF3(i) \
+	PF3(i + 2) \
+	XO2(i, 0) \
+	XO2(i + 1, 1) \
+	XO2(i + 2, 2) \
+	XO2(i + 3, 3) \
+	PF4(i) \
+	PF4(i + 2) \
+	PF0(i + 4) \
+	PF0(i + 6) \
+	XO3(i, 0) \
+	XO3(i + 1, 1) \
+	XO3(i + 2, 2) \
+	XO3(i + 3, 3) \
+	XO4(i, 0) \
+	XO4(i + 1, 1) \
+	XO4(i + 2, 2) \
+	XO4(i + 3, 3) \
+	ST(i, 0) \
+	ST(i + 1, 1) \
+	ST(i + 2, 2) \
+	ST(i + 3, 3) \
+
+
+	PF0(0)
+	PF0(2)
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" add %[inc], %[p4] ;\n"
+	" add %[inc], %[p5] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+	BLK64(PF0, LD, i) \
+	BLK64(PF1, XO1, i) \
+	BLK64(PF2, XO2, i) \
+	BLK64(PF3, XO3, i) \
+	BLK64(PF4, XO4, i) \
+	BLK64(NOP, ST, i) \
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+	BLOCK(0)
+	BLOCK(4)
+	BLOCK(8)
+	BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" add %[inc], %[p4] ;\n"
+	" add %[inc], %[p5] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_sse_pf64 = {
+	.name = "prefetch64-sse",
+	.do_2 = xor_sse_2_pf64,
+	.do_3 = xor_sse_3_pf64,
+	.do_4 = xor_sse_4_pf64,
+	.do_5 = xor_sse_5_pf64,
+};
+
+#undef LD
+#undef XO1
+#undef XO2
+#undef XO3
+#undef XO4
+#undef ST
+#undef NOP
+#undef BLK64
+#undef BLOCK
+
+#undef XOR_CONSTANT_CONSTRAINT
+
 #ifdef CONFIG_X86_32
 # include <asm/xor_32.h>
 #else
 # include <asm/xor_64.h>
 #endif
-#endif
+
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	AVX_SELECT(FASTEST)
+
+#endif /* _ASM_X86_XOR_H */
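The unified routines above are easier to follow once the macro layers are mentally expanded: each loop iteration consumes one 256-byte "line" per source buffer, 16 bytes at a time through xmm0-xmm3, with prefetchnta hints issued 256 bytes ahead, and the result is streamed back through p1. A plain-C model of what xor_sse_2() computes, for reference only (not part of the patch):

#include <stddef.h>

/* Reference model: p1 ^= p2 over 'bytes' bytes.  The SSE version does the
 * same work one 256-byte line per loop iteration (4 BLOCKs of 4 movaps/
 * xorps pairs), which is why callers pass sizes that are multiples of 256. */
static void xor_2_reference(unsigned long bytes,
			    unsigned long *p1, const unsigned long *p2)
{
	size_t i, n = bytes / sizeof(unsigned long);

	for (i = 0; i < n; i++)
		p1[i] ^= p2[i];
}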
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index f79cb7ec0e06..ce05722e3c68 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -2,7 +2,7 @@
 #define _ASM_X86_XOR_32_H
 
 /*
- * Optimized RAID-5 checksumming functions for MMX and SSE.
+ * Optimized RAID-5 checksumming functions for MMX.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -529,290 +529,6 @@ static struct xor_block_template xor_block_p5_mmx = {
 	.do_5 = xor_p5_mmx_5,
 };
 
-/*
- * Cache avoiding checksumming functions utilizing KNI instructions
- * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
- */
-
-#define OFFS(x) "16*("#x")"
-#define PF_OFFS(x) "256+16*("#x")"
-#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
-#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
-#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
-#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
-#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
-#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
-#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
-#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
-#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
-#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
-#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
-#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
-#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
-
-
-static void
-xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	LD(i, 0) \
-	LD(i + 1, 1) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO1(i, 0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	ST(i, 0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addl $256, %1 ;\n"
-	" addl $256, %2 ;\n"
-	" decl %0 ;\n"
-	" jnz 1b ;\n"
-	: "+r" (lines),
-	  "+r" (p1), "+r" (p2)
-	:
-	: "memory");
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-	  unsigned long *p3)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i,0) \
-	LD(i + 1, 1) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF2(i) \
-	PF2(i + 2) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO1(i,0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	XO2(i,0) \
-	XO2(i + 1, 1) \
-	XO2(i + 2, 2) \
-	XO2(i + 3, 3) \
-	ST(i,0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addl $256, %1 ;\n"
-	" addl $256, %2 ;\n"
-	" addl $256, %3 ;\n"
-	" decl %0 ;\n"
-	" jnz 1b ;\n"
-	: "+r" (lines),
-	  "+r" (p1), "+r"(p2), "+r"(p3)
-	:
-	: "memory" );
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-	  unsigned long *p3, unsigned long *p4)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i,0) \
-	LD(i + 1, 1) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF2(i) \
-	PF2(i + 2) \
-	XO1(i,0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	PF3(i) \
-	PF3(i + 2) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO2(i,0) \
-	XO2(i + 1, 1) \
-	XO2(i + 2, 2) \
-	XO2(i + 3, 3) \
-	XO3(i,0) \
-	XO3(i + 1, 1) \
-	XO3(i + 2, 2) \
-	XO3(i + 3, 3) \
-	ST(i,0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addl $256, %1 ;\n"
-	" addl $256, %2 ;\n"
-	" addl $256, %3 ;\n"
-	" addl $256, %4 ;\n"
-	" decl %0 ;\n"
-	" jnz 1b ;\n"
-	: "+r" (lines),
-	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
-	:
-	: "memory" );
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	/* Make sure GCC forgets anything it knows about p4 or p5,
-	   such that it won't pass to the asm volatile below a
-	   register that is shared with any other variable. That's
-	   because we modify p4 and p5 there, but we can't mark them
-	   as read/write, otherwise we'd overflow the 10-asm-operands
-	   limit of GCC < 3.1. */
-	asm("" : "+r" (p4), "+r" (p5));
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i,0) \
-	LD(i + 1, 1) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF2(i) \
-	PF2(i + 2) \
-	XO1(i,0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	PF3(i) \
-	PF3(i + 2) \
-	XO2(i,0) \
-	XO2(i + 1, 1) \
-	XO2(i + 2, 2) \
-	XO2(i + 3, 3) \
-	PF4(i) \
-	PF4(i + 2) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO3(i,0) \
-	XO3(i + 1, 1) \
-	XO3(i + 2, 2) \
-	XO3(i + 3, 3) \
-	XO4(i,0) \
-	XO4(i + 1, 1) \
-	XO4(i + 2, 2) \
-	XO4(i + 3, 3) \
-	ST(i,0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addl $256, %1 ;\n"
-	" addl $256, %2 ;\n"
-	" addl $256, %3 ;\n"
-	" addl $256, %4 ;\n"
-	" addl $256, %5 ;\n"
-	" decl %0 ;\n"
-	" jnz 1b ;\n"
-	: "+r" (lines),
-	  "+r" (p1), "+r" (p2), "+r" (p3)
-	: "r" (p4), "r" (p5)
-	: "memory");
-
-	/* p4 and p5 were modified, and now the variables are dead.
-	   Clobber them just to be sure nobody does something stupid
-	   like assuming they have some legal value. */
-	asm("" : "=r" (p4), "=r" (p5));
-
-	kernel_fpu_end();
-}
-
 static struct xor_block_template xor_block_pIII_sse = {
 	.name = "pIII_sse",
 	.do_2 = xor_sse_2,
@@ -827,26 +543,25 @@ static struct xor_block_template xor_block_pIII_sse = {
 /* Also try the generic routines. */
 #include <asm-generic/xor.h>
 
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched. */
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES \
 do { \
-	xor_speed(&xor_block_8regs); \
-	xor_speed(&xor_block_8regs_p); \
-	xor_speed(&xor_block_32regs); \
-	xor_speed(&xor_block_32regs_p); \
 	AVX_XOR_SPEED; \
-	if (cpu_has_xmm) \
+	if (cpu_has_xmm) { \
 		xor_speed(&xor_block_pIII_sse); \
-		if (cpu_has_mmx) { \
+		xor_speed(&xor_block_sse_pf64); \
+	} else if (cpu_has_mmx) { \
 		xor_speed(&xor_block_pII_mmx); \
 		xor_speed(&xor_block_p5_mmx); \
+	} else { \
+		xor_speed(&xor_block_8regs); \
+		xor_speed(&xor_block_8regs_p); \
+		xor_speed(&xor_block_32regs); \
+		xor_speed(&xor_block_32regs_p); \
 	} \
 } while (0)
 
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
-
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 87ac522c4af5..546f1e3b87cc 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -1,301 +1,6 @@
 #ifndef _ASM_X86_XOR_64_H
 #define _ASM_X86_XOR_64_H
 
-/*
- * Optimized RAID-5 checksumming functions for MMX and SSE.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-
-/*
- * Cache avoiding checksumming functions utilizing KNI instructions
- * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
- */
-
-/*
- * Based on
- * High-speed RAID5 checksumming functions utilizing SSE instructions.
- * Copyright (C) 1998 Ingo Molnar.
- */
-
-/*
- * x86-64 changes / gcc fixes from Andi Kleen.
- * Copyright 2002 Andi Kleen, SuSE Labs.
- *
- * This hasn't been optimized for the hammer yet, but there are likely
- * no advantages to be gotten from x86-64 here anyways.
- */
-
-#include <asm/i387.h>
-
-#define OFFS(x) "16*("#x")"
-#define PF_OFFS(x) "256+16*("#x")"
-#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
-#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
-#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
-#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
-#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
-#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
-#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
-#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
-#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
-#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
-#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
-#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
-#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
-
-
-static void
-xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
-{
-	unsigned int lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	LD(i, 0) \
-	LD(i + 1, 1) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO1(i, 0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	ST(i, 0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addq %[inc], %[p1] ;\n"
-	" addq %[inc], %[p2] ;\n"
-	" decl %[cnt] ; jnz 1b"
-	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
-	: [inc] "r" (256UL)
-	: "memory");
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-	  unsigned long *p3)
-{
-	unsigned int lines = bytes >> 8;
-
-	kernel_fpu_begin();
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i, 0) \
-	LD(i + 1, 1) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF2(i) \
-	PF2(i + 2) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO1(i, 0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	XO2(i, 0) \
-	XO2(i + 1, 1) \
-	XO2(i + 2, 2) \
-	XO2(i + 3, 3) \
-	ST(i, 0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addq %[inc], %[p1] ;\n"
-	" addq %[inc], %[p2] ;\n"
-	" addq %[inc], %[p3] ;\n"
-	" decl %[cnt] ; jnz 1b"
-	: [cnt] "+r" (lines),
-	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
-	: [inc] "r" (256UL)
-	: "memory");
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-	  unsigned long *p3, unsigned long *p4)
-{
-	unsigned int lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i, 0) \
-	LD(i + 1, 1) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF2(i) \
-	PF2(i + 2) \
-	XO1(i, 0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	PF3(i) \
-	PF3(i + 2) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO2(i, 0) \
-	XO2(i + 1, 1) \
-	XO2(i + 2, 2) \
-	XO2(i + 3, 3) \
-	XO3(i, 0) \
-	XO3(i + 1, 1) \
-	XO3(i + 2, 2) \
-	XO3(i + 3, 3) \
-	ST(i, 0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addq %[inc], %[p1] ;\n"
-	" addq %[inc], %[p2] ;\n"
-	" addq %[inc], %[p3] ;\n"
-	" addq %[inc], %[p4] ;\n"
-	" decl %[cnt] ; jnz 1b"
-	: [cnt] "+c" (lines),
-	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
-	: [inc] "r" (256UL)
-	: "memory" );
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
-{
-	unsigned int lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-	PF1(i) \
-	PF1(i + 2) \
-	LD(i, 0) \
-	LD(i + 1, 1) \
-	LD(i + 2, 2) \
-	LD(i + 3, 3) \
-	PF2(i) \
-	PF2(i + 2) \
-	XO1(i, 0) \
-	XO1(i + 1, 1) \
-	XO1(i + 2, 2) \
-	XO1(i + 3, 3) \
-	PF3(i) \
-	PF3(i + 2) \
-	XO2(i, 0) \
-	XO2(i + 1, 1) \
-	XO2(i + 2, 2) \
-	XO2(i + 3, 3) \
-	PF4(i) \
-	PF4(i + 2) \
-	PF0(i + 4) \
-	PF0(i + 6) \
-	XO3(i, 0) \
-	XO3(i + 1, 1) \
-	XO3(i + 2, 2) \
-	XO3(i + 3, 3) \
-	XO4(i, 0) \
-	XO4(i + 1, 1) \
-	XO4(i + 2, 2) \
-	XO4(i + 3, 3) \
-	ST(i, 0) \
-	ST(i + 1, 1) \
-	ST(i + 2, 2) \
-	ST(i + 3, 3) \
-
-
-	PF0(0)
-	PF0(2)
-
-	" .align 32 ;\n"
-	" 1: ;\n"
-
-	BLOCK(0)
-	BLOCK(4)
-	BLOCK(8)
-	BLOCK(12)
-
-	" addq %[inc], %[p1] ;\n"
-	" addq %[inc], %[p2] ;\n"
-	" addq %[inc], %[p3] ;\n"
-	" addq %[inc], %[p4] ;\n"
-	" addq %[inc], %[p5] ;\n"
-	" decl %[cnt] ; jnz 1b"
-	: [cnt] "+c" (lines),
-	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
-	  [p5] "+r" (p5)
-	: [inc] "r" (256UL)
-	: "memory");
-
-	kernel_fpu_end();
-}
-
 static struct xor_block_template xor_block_sse = {
 	.name = "generic_sse",
 	.do_2 = xor_sse_2,
@@ -308,17 +13,15 @@ static struct xor_block_template xor_block_sse = {
 /* Also try the AVX routines */
 #include <asm/xor_avx.h>
 
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched. */
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES \
 do { \
 	AVX_XOR_SPEED; \
+	xor_speed(&xor_block_sse_pf64); \
 	xor_speed(&xor_block_sse); \
 } while (0)
 
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	AVX_SELECT(&xor_block_sse)
-
 #endif /* _ASM_X86_XOR_64_H */
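Both the 32-bit and 64-bit XOR_TRY_TEMPLATES lists now register the shared prefetch64-sse template alongside the existing candidates; the RAID xor code benchmarks each registered template, and the XOR_SELECT_TEMPLATE() hook (now AVX_SELECT(FASTEST) in the common header) can still override the benchmark result. A rough usage sketch of the template interface as it appears in this patch; the wrapper name is invented for illustration:

/* Illustrative only: xor_two_pages() is a made-up helper.  A selected
 * xor_block_template is driven through its do_N callbacks; do_2 XORs the
 * second buffer into the first, and the size must be a multiple of the
 * routines' 256-byte working unit. */
static void xor_two_pages(struct xor_block_template *tmpl,
			  unsigned long *dst, unsigned long *src)
{
	tmpl->do_2(PAGE_SIZE, dst, src);	/* dst ^= src */
}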