x86, um: ... and asm-x86 move

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
author: Al Viro <viro@zeniv.linux.org.uk> 2008-08-17 21:05:42 -0400
committer: H. Peter Anvin <hpa@zytor.com> 2008-10-23 01:55:20 -0400
commit: bb8985586b7a906e116db835c64773b7a7d51663 (patch)
tree: de93ae58e88cc563d95cc124a73f3930594c6100 /include/asm-x86/xor_64.h
parent: 8ede0bdb63305d3353efd97e9af6210afb05734e (diff)
1 files changed, 0 insertions, 361 deletions
diff --git a/include/asm-x86/xor_64.h b/include/asm-x86/xor_64.h
deleted file mode 100644
index 2d3a18de295b..000000000000
--- a/include/asm-x86/xor_64.h
+++ /dev/null
@@ -1,361 +0,0 @@
-#ifndef ASM_X86__XOR_64_H
-#define ASM_X86__XOR_64_H
-/*
- * Optimized RAID-5 checksumming functions for MMX and SSE.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-/*
- * Cache avoiding checksumming functions utilizing KNI instructions
- * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
- */
-/*
- * Based on
- * High-speed RAID5 checksumming functions utilizing SSE instructions.
- * Copyright (C) 1998 Ingo Molnar.
- */
-/*
- * x86-64 changes / gcc fixes from Andi Kleen.
- * Copyright 2002 Andi Kleen, SuSE Labs.
- *
- * This hasn't been optimized for the hammer yet, but there are likely
- * no advantages to be gotten from x86-64 here anyways.
- */
-typedef struct {
-        unsigned long a, b;
-} __attribute__((aligned(16))) xmm_store_t;
-/* Doesn't use gcc to save the XMM registers, because there is no easy way to
-   tell it to do a clts before the register saving. */
-#define XMMS_SAVE                               \
-do {                                            \
-        preempt_disable();                      \
-        asm volatile(                           \
-                "movq %%cr0,%0          ;\n\t"  \
-                "clts                   ;\n\t"  \
-                "movups %%xmm0,(%1)     ;\n\t"  \
-                "movups %%xmm1,0x10(%1) ;\n\t"  \
-                "movups %%xmm2,0x20(%1) ;\n\t"  \
-                "movups %%xmm3,0x30(%1) ;\n\t"  \
-                : "=&r" (cr0)                   \
-                : "r" (xmm_save)                \
-                : "memory");                    \
-} while (0)
-#define XMMS_RESTORE                            \
-do {                                            \
-        asm volatile(                           \
-                "sfence                 ;\n\t"  \
-                "movups (%1),%%xmm0     ;\n\t"  \
-                "movups 0x10(%1),%%xmm1 ;\n\t"  \
-                "movups 0x20(%1),%%xmm2 ;\n\t"  \
-                "movups 0x30(%1),%%xmm3 ;\n\t"  \
-                "movq   %0,%%cr0        ;\n\t"  \
-                :                               \
-                : "r" (cr0), "r" (xmm_save)     \
-                : "memory");                    \
-        preempt_enable();                       \
-} while (0)
-#define OFFS(x)         "16*("#x")"
-#define PF_OFFS(x)      "256+16*("#x")"
-#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
-#define LD(x, y)        "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
-#define ST(x, y)        "       movaps %%xmm"#y",   "OFFS(x)"(%[p1])    ;\n"
-#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
-#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
-#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
-#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
-#define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
-#define XO1(x, y)       "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
-#define XO2(x, y)       "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
-#define XO3(x, y)       "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
-#define XO4(x, y)       "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
-#define XO5(x, y)       "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"
-static void
-xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
-{
-        unsigned int lines = bytes >> 8;
-        unsigned long cr0;
-        xmm_store_t xmm_save[4];
-        XMMS_SAVE;
-        asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-                LD(i, 0)                                \
-                        LD(i + 1, 1)                    \
-                PF1(i)                                  \
-                                PF1(i + 2)              \
-                                LD(i + 2, 2)            \
-                                        LD(i + 3, 3)    \
-                PF0(i + 4)                              \
-                                PF0(i + 6)              \
-                XO1(i, 0)                               \
-                        XO1(i + 1, 1)                   \
-                                XO1(i + 2, 2)           \
-                                        XO1(i + 3, 3)   \
-                ST(i, 0)                                \
-                        ST(i + 1, 1)                    \
-                                ST(i + 2, 2)            \
-                                        ST(i + 3, 3)    \
-                PF0(0)
-                                PF0(2)
-        " .align 32                     ;\n"
-        " 1:                            ;\n"
-                BLOCK(0)
-                BLOCK(4)
-                BLOCK(8)
-                BLOCK(12)
-        "       addq %[inc], %[p1]           ;\n"
-        "       addq %[inc], %[p2]           ;\n"
-                "               decl %[cnt] ; jnz 1b"
-        : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
-        : [inc] "r" (256UL)
-        : "memory");
-        XMMS_RESTORE;
-}
-static void
-xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-          unsigned long *p3)
-{
-        unsigned int lines = bytes >> 8;
-        xmm_store_t xmm_save[4];
-        unsigned long cr0;
-        XMMS_SAVE;
-        asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-                PF1(i)                                  \
-                                PF1(i + 2)              \
-                LD(i, 0)                                        \
-                        LD(i + 1, 1)                    \
-                                LD(i + 2, 2)            \
-                                        LD(i + 3, 3)    \
-                PF2(i)                                  \
-                                PF2(i + 2)              \
-                PF0(i + 4)                              \
-                                PF0(i + 6)              \
-                XO1(i, 0)                               \
-                        XO1(i + 1, 1)                   \
-                                XO1(i + 2, 2)           \
-                                        XO1(i + 3, 3)   \
-                XO2(i, 0)                               \
-                        XO2(i + 1, 1)                   \
-                                XO2(i + 2, 2)           \
-                                        XO2(i + 3, 3)   \
-                ST(i, 0)                                \
-                        ST(i + 1, 1)                    \
-                                ST(i + 2, 2)            \
-                                        ST(i + 3, 3)    \
-                PF0(0)
-                                PF0(2)
-        " .align 32                     ;\n"
-        " 1:                            ;\n"
-                BLOCK(0)
-                BLOCK(4)
-                BLOCK(8)
-                BLOCK(12)
-        "       addq %[inc], %[p1]           ;\n"
-        "       addq %[inc], %[p2]          ;\n"
-        "       addq %[inc], %[p3]           ;\n"
-                "               decl %[cnt] ; jnz 1b"
-        : [cnt] "+r" (lines),
-          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
-        : [inc] "r" (256UL)
-        : "memory");
-        XMMS_RESTORE;
-}
-static void
-xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-          unsigned long *p3, unsigned long *p4)
-{
-        unsigned int lines = bytes >> 8;
-        xmm_store_t xmm_save[4];
-        unsigned long cr0;
-        XMMS_SAVE;
-        asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-                PF1(i)                                  \
-                                PF1(i + 2)              \
-                LD(i, 0)                                \
-                        LD(i + 1, 1)                    \
-                                LD(i + 2, 2)            \
-                                        LD(i + 3, 3)    \
-                PF2(i)                                  \
-                                PF2(i + 2)              \
-                XO1(i, 0)                               \
-                        XO1(i + 1, 1)                   \
-                                XO1(i + 2, 2)           \
-                                        XO1(i + 3, 3)   \
-                PF3(i)                                  \
-                                PF3(i + 2)              \
-                PF0(i + 4)                              \
-                                PF0(i + 6)              \
-                XO2(i, 0)                               \
-                        XO2(i + 1, 1)                   \
-                                XO2(i + 2, 2)           \
-                                        XO2(i + 3, 3)   \
-                XO3(i, 0)                               \
-                        XO3(i + 1, 1)                   \
-                                XO3(i + 2, 2)           \
-                                        XO3(i + 3, 3)   \
-                ST(i, 0)                                \
-                        ST(i + 1, 1)                    \
-                                ST(i + 2, 2)            \
-                                        ST(i + 3, 3)    \
-                PF0(0)
-                                PF0(2)
-        " .align 32                     ;\n"
-        " 1:                            ;\n"
-                BLOCK(0)
-                BLOCK(4)
-                BLOCK(8)
-                BLOCK(12)
-        "       addq %[inc], %[p1]           ;\n"
-        "       addq %[inc], %[p2]           ;\n"
-        "       addq %[inc], %[p3]           ;\n"
-        "       addq %[inc], %[p4]           ;\n"
-        "       decl %[cnt] ; jnz 1b"
-        : [cnt] "+c" (lines),
-          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
-        : [inc] "r" (256UL)
-        : "memory" );
-        XMMS_RESTORE;
-}
-static void
-xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
-          unsigned long *p3, unsigned long *p4, unsigned long *p5)
-{
-        unsigned int lines = bytes >> 8;
-        xmm_store_t xmm_save[4];
-        unsigned long cr0;
-        XMMS_SAVE;
-        asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-                PF1(i)                                  \
-                                PF1(i + 2)              \
-                LD(i, 0)                                \
-                        LD(i + 1, 1)                    \
-                                LD(i + 2, 2)            \
-                                        LD(i + 3, 3)    \
-                PF2(i)                                  \
-                                PF2(i + 2)              \
-                XO1(i, 0)                               \
-                        XO1(i + 1, 1)                   \
-                                XO1(i + 2, 2)           \
-                                        XO1(i + 3, 3)   \
-                PF3(i)                                  \
-                                PF3(i + 2)              \
-                XO2(i, 0)                               \
-                        XO2(i + 1, 1)                   \
-                                XO2(i + 2, 2)           \
-                                        XO2(i + 3, 3)   \
-                PF4(i)                                  \
-                                PF4(i + 2)              \
-                PF0(i + 4)                              \
-                                PF0(i + 6)              \
-                XO3(i, 0)                               \
-                        XO3(i + 1, 1)                   \
-                                XO3(i + 2, 2)           \
-                                        XO3(i + 3, 3)   \
-                XO4(i, 0)                               \
-                        XO4(i + 1, 1)                   \
-                                XO4(i + 2, 2)           \
-                                        XO4(i + 3, 3)   \
-                ST(i, 0)                                \
-                        ST(i + 1, 1)                    \
-                                ST(i + 2, 2)            \
-                                        ST(i + 3, 3)    \
-                PF0(0)
-                                PF0(2)
-        " .align 32                     ;\n"
-        " 1:                            ;\n"
-                BLOCK(0)
-                BLOCK(4)
-                BLOCK(8)
-                BLOCK(12)
-        "       addq %[inc], %[p1]           ;\n"
-        "       addq %[inc], %[p2]           ;\n"
-        "       addq %[inc], %[p3]           ;\n"
-        "       addq %[inc], %[p4]           ;\n"
-        "       addq %[inc], %[p5]           ;\n"
-        "       decl %[cnt] ; jnz 1b"
-        : [cnt] "+c" (lines),
-          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
-          [p5] "+r" (p5)
-        : [inc] "r" (256UL)
-        : "memory");
-        XMMS_RESTORE;
-}
-static struct xor_block_template xor_block_sse = {
-        .name = "generic_sse",
-        .do_2 = xor_sse_2,
-        .do_3 = xor_sse_3,
-        .do_4 = xor_sse_4,
-        .do_5 = xor_sse_5,
-};
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES                       \
-do {                                            \
-        xor_speed(&xor_block_sse);              \
-} while (0)
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched.  */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
-#endif /* ASM_X86__XOR_64_H */
author	Al Viro <viro@zeniv.linux.org.uk>	2008-08-17 21:05:42 -0400
committer	H. Peter Anvin <hpa@zytor.com>	2008-10-23 01:55:20 -0400
commit	bb8985586b7a906e116db835c64773b7a7d51663 (patch)
tree	de93ae58e88cc563d95cc124a73f3930594c6100 /include/asm-x86/xor_64.h
parent	8ede0bdb63305d3353efd97e9af6210afb05734e (diff)

diff --git a/include/asm-x86/xor_64.h b/include/asm-x86/xor_64.h
deleted file mode 100644
index 2d3a18de295b..000000000000
--- a/include/asm-x86/xor_64.h
+++ /dev/null
@@ -1,361 +0,0 @@
1	#ifndef ASM_X86__XOR_64_H
2	#define ASM_X86__XOR_64_H
3
4	/*
5	* Optimized RAID-5 checksumming functions for MMX and SSE.
6	*
7	* This program is free software; you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation; either version 2, or (at your option)
10	* any later version.
11	*
12	* You should have received a copy of the GNU General Public License
13	* (for example /usr/src/linux/COPYING); if not, write to the Free
14	* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17
18	/*
19	* Cache avoiding checksumming functions utilizing KNI instructions
20	* Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
21	*/
22
23	/*
24	* Based on
25	* High-speed RAID5 checksumming functions utilizing SSE instructions.
26	* Copyright (C) 1998 Ingo Molnar.
27	*/
28
29	/*
30	* x86-64 changes / gcc fixes from Andi Kleen.
31	* Copyright 2002 Andi Kleen, SuSE Labs.
32	*
33	* This hasn't been optimized for the hammer yet, but there are likely
34	* no advantages to be gotten from x86-64 here anyways.
35	*/
36
37	typedef struct {
38	unsigned long a, b;
39	} __attribute__((aligned(16))) xmm_store_t;
40
41	/* Doesn't use gcc to save the XMM registers, because there is no easy way to
42	tell it to do a clts before the register saving. */
43	#define XMMS_SAVE \
44	do { \
45	preempt_disable(); \
46	asm volatile( \
47	"movq %%cr0,%0 ;\n\t" \
48	"clts ;\n\t" \
49	"movups %%xmm0,(%1) ;\n\t" \
50	"movups %%xmm1,0x10(%1) ;\n\t" \
51	"movups %%xmm2,0x20(%1) ;\n\t" \
52	"movups %%xmm3,0x30(%1) ;\n\t" \
53	: "=&r" (cr0) \
54	: "r" (xmm_save) \
55	: "memory"); \
56	} while (0)
57
58	#define XMMS_RESTORE \
59	do { \
60	asm volatile( \
61	"sfence ;\n\t" \
62	"movups (%1),%%xmm0 ;\n\t" \
63	"movups 0x10(%1),%%xmm1 ;\n\t" \
64	"movups 0x20(%1),%%xmm2 ;\n\t" \
65	"movups 0x30(%1),%%xmm3 ;\n\t" \
66	"movq %0,%%cr0 ;\n\t" \
67	: \
68	: "r" (cr0), "r" (xmm_save) \
69	: "memory"); \
70	preempt_enable(); \
71	} while (0)
72
73	#define OFFS(x) "16*("#x")"
74	#define PF_OFFS(x) "256+16*("#x")"
75	#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
76	#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
77	#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
78	#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
79	#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
80	#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
81	#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
82	#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
83	#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
84	#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
85	#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
86	#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
87	#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
88
89
90	static void
91	xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
92	{
93	unsigned int lines = bytes >> 8;
94	unsigned long cr0;
95	xmm_store_t xmm_save[4];
96
97	XMMS_SAVE;
98
99	asm volatile(
100	#undef BLOCK
101	#define BLOCK(i) \
102	LD(i, 0) \
103	LD(i + 1, 1) \
104	PF1(i) \
105	PF1(i + 2) \
106	LD(i + 2, 2) \
107	LD(i + 3, 3) \
108	PF0(i + 4) \
109	PF0(i + 6) \
110	XO1(i, 0) \
111	XO1(i + 1, 1) \
112	XO1(i + 2, 2) \
113	XO1(i + 3, 3) \
114	ST(i, 0) \
115	ST(i + 1, 1) \
116	ST(i + 2, 2) \
117	ST(i + 3, 3) \
118
119
120	PF0(0)
121	PF0(2)
122
123	" .align 32 ;\n"
124	" 1: ;\n"
125
126	BLOCK(0)
127	BLOCK(4)
128	BLOCK(8)
129	BLOCK(12)
130
131	" addq %[inc], %[p1] ;\n"
132	" addq %[inc], %[p2] ;\n"
133	" decl %[cnt] ; jnz 1b"
134	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
135	: [inc] "r" (256UL)
136	: "memory");
137
138	XMMS_RESTORE;
139	}
140
141	static void
142	xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
143	unsigned long *p3)
144	{
145	unsigned int lines = bytes >> 8;
146	xmm_store_t xmm_save[4];
147	unsigned long cr0;
148
149	XMMS_SAVE;
150
151	asm volatile(
152	#undef BLOCK
153	#define BLOCK(i) \
154	PF1(i) \
155	PF1(i + 2) \
156	LD(i, 0) \
157	LD(i + 1, 1) \
158	LD(i + 2, 2) \
159	LD(i + 3, 3) \
160	PF2(i) \
161	PF2(i + 2) \
162	PF0(i + 4) \
163	PF0(i + 6) \
164	XO1(i, 0) \
165	XO1(i + 1, 1) \
166	XO1(i + 2, 2) \
167	XO1(i + 3, 3) \
168	XO2(i, 0) \
169	XO2(i + 1, 1) \
170	XO2(i + 2, 2) \
171	XO2(i + 3, 3) \
172	ST(i, 0) \
173	ST(i + 1, 1) \
174	ST(i + 2, 2) \
175	ST(i + 3, 3) \
176
177
178	PF0(0)
179	PF0(2)
180
181	" .align 32 ;\n"
182	" 1: ;\n"
183
184	BLOCK(0)
185	BLOCK(4)
186	BLOCK(8)
187	BLOCK(12)
188
189	" addq %[inc], %[p1] ;\n"
190	" addq %[inc], %[p2] ;\n"
191	" addq %[inc], %[p3] ;\n"
192	" decl %[cnt] ; jnz 1b"
193	: [cnt] "+r" (lines),
194	[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
195	: [inc] "r" (256UL)
196	: "memory");
197	XMMS_RESTORE;
198	}
199
200	static void
201	xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
202	unsigned long *p3, unsigned long *p4)
203	{
204	unsigned int lines = bytes >> 8;
205	xmm_store_t xmm_save[4];
206	unsigned long cr0;
207
208	XMMS_SAVE;
209
210	asm volatile(
211	#undef BLOCK
212	#define BLOCK(i) \
213	PF1(i) \
214	PF1(i + 2) \
215	LD(i, 0) \
216	LD(i + 1, 1) \
217	LD(i + 2, 2) \
218	LD(i + 3, 3) \
219	PF2(i) \
220	PF2(i + 2) \
221	XO1(i, 0) \
222	XO1(i + 1, 1) \
223	XO1(i + 2, 2) \
224	XO1(i + 3, 3) \
225	PF3(i) \
226	PF3(i + 2) \
227	PF0(i + 4) \
228	PF0(i + 6) \
229	XO2(i, 0) \
230	XO2(i + 1, 1) \
231	XO2(i + 2, 2) \
232	XO2(i + 3, 3) \
233	XO3(i, 0) \
234	XO3(i + 1, 1) \
235	XO3(i + 2, 2) \
236	XO3(i + 3, 3) \
237	ST(i, 0) \
238	ST(i + 1, 1) \
239	ST(i + 2, 2) \
240	ST(i + 3, 3) \
241
242
243	PF0(0)
244	PF0(2)
245
246	" .align 32 ;\n"
247	" 1: ;\n"
248
249	BLOCK(0)
250	BLOCK(4)
251	BLOCK(8)
252	BLOCK(12)
253
254	" addq %[inc], %[p1] ;\n"
255	" addq %[inc], %[p2] ;\n"
256	" addq %[inc], %[p3] ;\n"
257	" addq %[inc], %[p4] ;\n"
258	" decl %[cnt] ; jnz 1b"
259	: [cnt] "+c" (lines),
260	[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
261	: [inc] "r" (256UL)
262	: "memory" );
263
264	XMMS_RESTORE;
265	}
266
267	static void
268	xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
269	unsigned long *p3, unsigned long *p4, unsigned long *p5)
270	{
271	unsigned int lines = bytes >> 8;
272	xmm_store_t xmm_save[4];
273	unsigned long cr0;
274
275	XMMS_SAVE;
276
277	asm volatile(
278	#undef BLOCK
279	#define BLOCK(i) \
280	PF1(i) \
281	PF1(i + 2) \
282	LD(i, 0) \
283	LD(i + 1, 1) \
284	LD(i + 2, 2) \
285	LD(i + 3, 3) \
286	PF2(i) \
287	PF2(i + 2) \
288	XO1(i, 0) \
289	XO1(i + 1, 1) \
290	XO1(i + 2, 2) \
291	XO1(i + 3, 3) \
292	PF3(i) \
293	PF3(i + 2) \
294	XO2(i, 0) \
295	XO2(i + 1, 1) \
296	XO2(i + 2, 2) \
297	XO2(i + 3, 3) \
298	PF4(i) \
299	PF4(i + 2) \
300	PF0(i + 4) \
301	PF0(i + 6) \
302	XO3(i, 0) \
303	XO3(i + 1, 1) \
304	XO3(i + 2, 2) \
305	XO3(i + 3, 3) \
306	XO4(i, 0) \
307	XO4(i + 1, 1) \
308	XO4(i + 2, 2) \
309	XO4(i + 3, 3) \
310	ST(i, 0) \
311	ST(i + 1, 1) \
312	ST(i + 2, 2) \
313	ST(i + 3, 3) \
314
315
316	PF0(0)
317	PF0(2)
318
319	" .align 32 ;\n"
320	" 1: ;\n"
321
322	BLOCK(0)
323	BLOCK(4)
324	BLOCK(8)
325	BLOCK(12)
326
327	" addq %[inc], %[p1] ;\n"
328	" addq %[inc], %[p2] ;\n"
329	" addq %[inc], %[p3] ;\n"
330	" addq %[inc], %[p4] ;\n"
331	" addq %[inc], %[p5] ;\n"
332	" decl %[cnt] ; jnz 1b"
333	: [cnt] "+c" (lines),
334	[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
335	[p5] "+r" (p5)
336	: [inc] "r" (256UL)
337	: "memory");
338
339	XMMS_RESTORE;
340	}
341
342	static struct xor_block_template xor_block_sse = {
343	.name = "generic_sse",
344	.do_2 = xor_sse_2,
345	.do_3 = xor_sse_3,
346	.do_4 = xor_sse_4,
347	.do_5 = xor_sse_5,
348	};
349
350	#undef XOR_TRY_TEMPLATES
351	#define XOR_TRY_TEMPLATES \
352	do { \
353	xor_speed(&xor_block_sse); \
354	} while (0)
355
356	/* We force the use of the SSE xor block because it can write around L2.
357	We may also be able to load into the L1 only depending on how the cpu
358	deals with a load to a line that is being prefetched. */
359	#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
360
361	#endif /* ASM_X86__XOR_64_H */