aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/include/asm/xor_avx.h
diff options
context:
space:
mode:
author: Jim Kukunas <james.t.kukunas@linux.intel.com>, 2012-05-21 23:54:04 -0400
committer: NeilBrown <neilb@suse.de>, 2012-05-21 23:54:04 -0400
commit: ea4d26ae24e58fbd2c61de9242adab053cb982d8 (patch)
tree: 3115dd168f0cf1eb1eb5dd6aecc385cfa0e8bc05 /arch/x86/include/asm/xor_avx.h
parent: 56a519913eeba2bdae4d7ee39e80fab442c3836c (diff)
raid5: add AVX optimized RAID5 checksumming
Optimize RAID5 XOR checksumming by taking advantage of the 256-bit YMM registers introduced in AVX.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'arch/x86/include/asm/xor_avx.h')
-rw-r--r-- arch/x86/include/asm/xor_avx.h | 214
1 files changed, 214 insertions, 0 deletions
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 000000000000..2510d35f480e
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,214 @@
1#ifndef _ASM_X86_XOR_AVX_H
2#define _ASM_X86_XOR_AVX_H
3
4/*
5 * Optimized RAID-5 checksumming functions for AVX
6 *
7 * Copyright (C) 2012 Intel Corporation
8 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
9 *
10 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; version 2
15 * of the License.
16 */
17
18#ifdef CONFIG_AS_AVX
19
20#include <linux/compiler.h>
21#include <asm/i387.h>
22
/* Attribute applied to the ymm_save buffers so each 32-byte slot is aligned. */
#define ALIGN32 __aligned(32)

/* Number of YMM registers (ymm0-ymm3) saved and restored around the XOR loops. */
#define YMM_SAVED_REGS 4

/*
 * YMMS_SAVE - prepare for using ymm0-ymm3 in the calling function.
 *
 * Expects two locals in the caller's scope: `cr0` (unsigned long) and
 * `ymm_save` (a 32-byte aligned char array of 32 * YMM_SAVED_REGS bytes).
 *
 * Disables preemption, remembers the current CR0 value in `cr0`, executes
 * clts, and then stores ymm0..ymm3 into ymm_save at byte offsets
 * 0, 32, 64 and 96.  Must be paired with YMMS_RESTORE.
 *
 * The expansion deliberately ends in ';' because the callers in this file
 * invoke the macro without a trailing semicolon.
 */
#define YMMS_SAVE \
do { \
	preempt_disable(); \
	cr0 = read_cr0(); \
	clts(); \
	asm volatile("vmovaps %%ymm0, %0\n\t" \
		     "vmovaps %%ymm1, %1\n\t" \
		     "vmovaps %%ymm2, %2\n\t" \
		     "vmovaps %%ymm3, %3" \
		     : "=m" (ymm_save[0]), "=m" (ymm_save[32]), \
		       "=m" (ymm_save[64]), "=m" (ymm_save[96]) \
		     : \
		     : "memory"); \
} while (0);
37
/*
 * YMMS_RESTORE - undo YMMS_SAVE.
 *
 * Issues an sfence, reloads ymm3..ymm0 from the caller's `ymm_save` buffer
 * (byte offsets 96, 64, 32 and 0), writes the saved `cr0` value back to CR0
 * and re-enables preemption.
 *
 * The expansion deliberately ends in ';' because the callers in this file
 * invoke the macro without a trailing semicolon.
 */
#define YMMS_RESTORE \
do { \
	asm volatile("sfence\n\t" \
		     "vmovaps %0, %%ymm3\n\t" \
		     "vmovaps %1, %%ymm2\n\t" \
		     "vmovaps %2, %%ymm1\n\t" \
		     "vmovaps %3, %%ymm0" \
		     : \
		     : "m" (ymm_save[96]), "m" (ymm_save[64]), \
		       "m" (ymm_save[32]), "m" (ymm_save[0]) \
		     : "memory"); \
	write_cr0(cr0); \
	preempt_enable(); \
} while (0);
48
/*
 * BLOCK4(i) - expand BLOCK() four times, for the 32-byte slots i .. i + 3.
 *
 * Each expansion passes BLOCK() a byte offset (32 * slot) and a register
 * number 0..3.  BLOCK() itself is (re)defined by each user of this macro
 * and must expand to a complete statement, because no separators are
 * inserted between the four invocations.
 *
 * The argument is fully parenthesized so that an expression such as
 * BLOCK4(a + b) yields the intended offsets.
 */
#define BLOCK4(i) \
		BLOCK(32 * (i), 0) \
		BLOCK(32 * ((i) + 1), 1) \
		BLOCK(32 * ((i) + 2), 2) \
		BLOCK(32 * ((i) + 3), 3)

/*
 * BLOCK16() - expand BLOCK() for sixteen consecutive 32-byte slots,
 * i.e. byte offsets 0 .. 480, covering one 512-byte chunk.
 */
#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)
60
61static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
62{
63 unsigned long cr0, lines = bytes >> 9;
64 char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
65
66 YMMS_SAVE
67
68 while (lines--) {
69#undef BLOCK
70#define BLOCK(i, reg) \
71do { \
72 asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
73 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
74 "m" (p0[i / sizeof(*p0)])); \
75 asm volatile("vmovdqa %%ymm" #reg ", %0" : \
76 "=m" (p0[i / sizeof(*p0)])); \
77} while (0);
78
79 BLOCK16()
80
81 p0 = (unsigned long *)((uintptr_t)p0 + 512);
82 p1 = (unsigned long *)((uintptr_t)p1 + 512);
83 }
84
85 YMMS_RESTORE
86}
87
88static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
89 unsigned long *p2)
90{
91 unsigned long cr0, lines = bytes >> 9;
92 char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
93
94 YMMS_SAVE
95
96 while (lines--) {
97#undef BLOCK
98#define BLOCK(i, reg) \
99do { \
100 asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
101 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
102 "m" (p1[i / sizeof(*p1)])); \
103 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
104 "m" (p0[i / sizeof(*p0)])); \
105 asm volatile("vmovdqa %%ymm" #reg ", %0" : \
106 "=m" (p0[i / sizeof(*p0)])); \
107} while (0);
108
109 BLOCK16()
110
111 p0 = (unsigned long *)((uintptr_t)p0 + 512);
112 p1 = (unsigned long *)((uintptr_t)p1 + 512);
113 p2 = (unsigned long *)((uintptr_t)p2 + 512);
114 }
115
116 YMMS_RESTORE
117}
118
119static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
120 unsigned long *p2, unsigned long *p3)
121{
122 unsigned long cr0, lines = bytes >> 9;
123 char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
124
125 YMMS_SAVE
126
127 while (lines--) {
128#undef BLOCK
129#define BLOCK(i, reg) \
130do { \
131 asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
132 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
133 "m" (p2[i / sizeof(*p2)])); \
134 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
135 "m" (p1[i / sizeof(*p1)])); \
136 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
137 "m" (p0[i / sizeof(*p0)])); \
138 asm volatile("vmovdqa %%ymm" #reg ", %0" : \
139 "=m" (p0[i / sizeof(*p0)])); \
140} while (0);
141
142 BLOCK16();
143
144 p0 = (unsigned long *)((uintptr_t)p0 + 512);
145 p1 = (unsigned long *)((uintptr_t)p1 + 512);
146 p2 = (unsigned long *)((uintptr_t)p2 + 512);
147 p3 = (unsigned long *)((uintptr_t)p3 + 512);
148 }
149
150 YMMS_RESTORE
151}
152
153static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
154 unsigned long *p2, unsigned long *p3, unsigned long *p4)
155{
156 unsigned long cr0, lines = bytes >> 9;
157 char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
158
159 YMMS_SAVE
160
161 while (lines--) {
162#undef BLOCK
163#define BLOCK(i, reg) \
164do { \
165 asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
166 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
167 "m" (p3[i / sizeof(*p3)])); \
168 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
169 "m" (p2[i / sizeof(*p2)])); \
170 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
171 "m" (p1[i / sizeof(*p1)])); \
172 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
173 "m" (p0[i / sizeof(*p0)])); \
174 asm volatile("vmovdqa %%ymm" #reg ", %0" : \
175 "=m" (p0[i / sizeof(*p0)])); \
176} while (0);
177
178 BLOCK16()
179
180 p0 = (unsigned long *)((uintptr_t)p0 + 512);
181 p1 = (unsigned long *)((uintptr_t)p1 + 512);
182 p2 = (unsigned long *)((uintptr_t)p2 + 512);
183 p3 = (unsigned long *)((uintptr_t)p3 + 512);
184 p4 = (unsigned long *)((uintptr_t)p4 + 512);
185 }
186
187 YMMS_RESTORE
188}
189
190static struct xor_block_template xor_block_avx = {
191 .name = "avx",
192 .do_2 = xor_avx_2,
193 .do_3 = xor_avx_3,
194 .do_4 = xor_avx_4,
195 .do_5 = xor_avx_5,
196};
197
/*
 * AVX_XOR_SPEED - pass the AVX template to xor_speed() when the CPU
 * reports AVX support (cpu_has_avx); otherwise do nothing.
 * Usable as a single statement: AVX_XOR_SPEED;
 */
#define AVX_XOR_SPEED \
do { \
	if (cpu_has_avx) \
		xor_speed(&xor_block_avx); \
} while (0)

/*
 * AVX_SELECT(FASTEST) - evaluate to &xor_block_avx when the CPU reports AVX
 * support, otherwise to FASTEST.  The argument is parenthesized so that
 * low-precedence expressions (e.g. an assignment) are accepted.
 */
#define AVX_SELECT(FASTEST) \
	(cpu_has_avx ? &xor_block_avx : (FASTEST))
206
207#else
208
/*
 * Fallbacks used when the assembler lacks AVX support (CONFIG_AS_AVX unset).
 *
 * AVX_XOR_SPEED expands to an empty do/while so that "AVX_XOR_SPEED;" is a
 * single statement everywhere, including as the body of an if/else.
 */
#define AVX_XOR_SPEED do { } while (0)

/* Without AVX support the caller's choice is passed through unchanged. */
#define AVX_SELECT(FASTEST) (FASTEST)
212
213#endif
214#endif