author     Jim Kukunas <james.t.kukunas@linux.intel.com>    2012-05-21 23:54:04 -0400
committer  NeilBrown <neilb@suse.de>                        2012-05-21 23:54:04 -0400
commit     ea4d26ae24e58fbd2c61de9242adab053cb982d8 (patch)
tree       3115dd168f0cf1eb1eb5dd6aecc385cfa0e8bc05 /arch/x86/include/asm/xor_avx.h
parent     56a519913eeba2bdae4d7ee39e80fab442c3836c (diff)
raid5: add AVX optimized RAID5 checksumming
Optimize RAID5 xor checksumming by taking advantage of
256-bit YMM registers introduced in AVX.
Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
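
For reference, each xor_avx_N() routine added below XORs N-1 source buffers into the first buffer, which is how RAID5 parity is accumulated; the AVX versions do this 32 bytes at a time in YMM registers, 512 bytes per loop iteration. A plain C equivalent of the three-buffer case might look like the sketch below (an illustration only, not part of this patch; xor_scalar_3 is a hypothetical name):

static void xor_scalar_3(unsigned long bytes, unsigned long *p0,
        unsigned long *p1, unsigned long *p2)
{
        unsigned long i;

        /* bytes is assumed to be a multiple of sizeof(unsigned long) */
        for (i = 0; i < bytes / sizeof(*p0); i++)
                p0[i] ^= p1[i] ^ p2[i];
}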
Diffstat (limited to 'arch/x86/include/asm/xor_avx.h')
-rw-r--r--   arch/x86/include/asm/xor_avx.h   214
1 file changed, 214 insertions, 0 deletions
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 000000000000..2510d35f480e
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,214 @@
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#ifdef CONFIG_AS_AVX

#include <linux/compiler.h>
#include <asm/i387.h>

#define ALIGN32 __aligned(32)

#define YMM_SAVED_REGS 4

#define YMMS_SAVE \
do { \
        preempt_disable(); \
        cr0 = read_cr0(); \
        clts(); \
        asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
        asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
        asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
        asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
} while (0);

#define YMMS_RESTORE \
do { \
        asm volatile("sfence" : : : "memory"); \
        asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
        asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
        asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
        asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
        write_cr0(cr0); \
        preempt_enable(); \
} while (0);

#define BLOCK4(i) \
        BLOCK(32 * i, 0) \
        BLOCK(32 * (i + 1), 1) \
        BLOCK(32 * (i + 2), 2) \
        BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
        BLOCK4(0) \
        BLOCK4(4) \
        BLOCK4(8) \
        BLOCK4(12)

static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
        unsigned long cr0, lines = bytes >> 9;
        char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

        YMMS_SAVE

        while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p0[i / sizeof(*p0)])); \
        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
                "=m" (p0[i / sizeof(*p0)])); \
} while (0);

                BLOCK16()

                p0 = (unsigned long *)((uintptr_t)p0 + 512);
                p1 = (unsigned long *)((uintptr_t)p1 + 512);
        }

        YMMS_RESTORE
}

static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
        unsigned long *p2)
{
        unsigned long cr0, lines = bytes >> 9;
        char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

        YMMS_SAVE

        while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p1[i / sizeof(*p1)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p0[i / sizeof(*p0)])); \
        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
                "=m" (p0[i / sizeof(*p0)])); \
} while (0);

                BLOCK16()

                p0 = (unsigned long *)((uintptr_t)p0 + 512);
                p1 = (unsigned long *)((uintptr_t)p1 + 512);
                p2 = (unsigned long *)((uintptr_t)p2 + 512);
        }

        YMMS_RESTORE
}

static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
        unsigned long *p2, unsigned long *p3)
{
        unsigned long cr0, lines = bytes >> 9;
        char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

        YMMS_SAVE

        while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p2[i / sizeof(*p2)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p1[i / sizeof(*p1)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p0[i / sizeof(*p0)])); \
        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
                "=m" (p0[i / sizeof(*p0)])); \
} while (0);

                BLOCK16();

                p0 = (unsigned long *)((uintptr_t)p0 + 512);
                p1 = (unsigned long *)((uintptr_t)p1 + 512);
                p2 = (unsigned long *)((uintptr_t)p2 + 512);
                p3 = (unsigned long *)((uintptr_t)p3 + 512);
        }

        YMMS_RESTORE
}

static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
        unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
        unsigned long cr0, lines = bytes >> 9;
        char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

        YMMS_SAVE

        while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p3[i / sizeof(*p3)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p2[i / sizeof(*p2)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p1[i / sizeof(*p1)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p0[i / sizeof(*p0)])); \
        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
                "=m" (p0[i / sizeof(*p0)])); \
} while (0);

                BLOCK16()

                p0 = (unsigned long *)((uintptr_t)p0 + 512);
                p1 = (unsigned long *)((uintptr_t)p1 + 512);
                p2 = (unsigned long *)((uintptr_t)p2 + 512);
                p3 = (unsigned long *)((uintptr_t)p3 + 512);
                p4 = (unsigned long *)((uintptr_t)p4 + 512);
        }

        YMMS_RESTORE
}

static struct xor_block_template xor_block_avx = {
        .name = "avx",
        .do_2 = xor_avx_2,
        .do_3 = xor_avx_3,
        .do_4 = xor_avx_4,
        .do_5 = xor_avx_5,
};

#define AVX_XOR_SPEED \
do { \
        if (cpu_has_avx) \
                xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
        (cpu_has_avx ? &xor_block_avx : FASTEST)

#else

#define AVX_XOR_SPEED {}

#define AVX_SELECT(FASTEST) (FASTEST)

#endif
#endif
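
The xor_block_avx template and the AVX_XOR_SPEED / AVX_SELECT macros are consumed by the x86 xor template selection in arch/x86/include/asm/xor_32.h and xor_64.h, which lie outside this filtered diff. A rough sketch of that hook-up, under the assumption that it follows the XOR_TRY_TEMPLATES / XOR_SELECT_TEMPLATE convention used by crypto/xor.c, could look like:

/* Illustrative sketch only -- the real wiring lives in the arch xor headers
 * and benchmarks several other candidate templates as well. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                       \
do {                                            \
        AVX_XOR_SPEED;                          \
        /* ... xor_speed() on the other candidate templates ... */ \
} while (0)

/* Prefer the AVX template when the CPU supports it, otherwise keep the
 * benchmark winner. */
#define XOR_SELECT_TEMPLATE(FASTEST)            \
        AVX_SELECT(FASTEST)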