author     Jim Kukunas <james.t.kukunas@linux.intel.com>   2012-05-21 23:54:04 -0400
committer  NeilBrown <neilb@suse.de>                       2012-05-21 23:54:04 -0400
commit     ea4d26ae24e58fbd2c61de9242adab053cb982d8
tree       3115dd168f0cf1eb1eb5dd6aecc385cfa0e8bc05
parent     56a519913eeba2bdae4d7ee39e80fab442c3836c
raid5: add AVX optimized RAID5 checksumming
Optimize RAID5 xor checksumming by taking advantage of
256-bit YMM registers introduced in AVX.
Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
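
[Editor's note] The core idea of the patch is a plain bitwise XOR carried out 32 bytes at a time, so each vxorps operates on a full YMM register; vxorps is nominally a packed-single instruction, but its effect is a bitwise XOR, which is why baseline AVX (without AVX2 integer ops) is enough. Below is a minimal userspace sketch of that idea using AVX intrinsics. It is not the kernel code added by this patch (which must use inline asm, save/restore YMM state by hand, and disable preemption); the function name xor_buf_avx, the buffer-alignment assumption, and the length restriction are illustrative only.

/* Hypothetical userspace illustration; build with: gcc -mavx -c xor_sketch.c */
#include <immintrin.h>
#include <stddef.h>

/*
 * XOR 'src' into 'dst', 32 bytes (one YMM register) per step.
 * Assumes both buffers are 32-byte aligned and len is a multiple of 32,
 * mirroring the fixed-size blocks the kernel routines walk through.
 */
static void xor_buf_avx(void *dst, const void *src, size_t len)
{
	float *d = dst;
	const float *s = src;

	for (size_t i = 0; i < len / sizeof(float); i += 8) {
		__m256 a = _mm256_load_ps(d + i);
		__m256 b = _mm256_load_ps(s + i);
		/* _mm256_xor_ps compiles to vxorps: a bitwise XOR of two 256-bit values */
		_mm256_store_ps(d + i, _mm256_xor_ps(a, b));
	}
}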
-rw-r--r--  arch/x86/Makefile              |   5
-rw-r--r--  arch/x86/include/asm/xor_32.h  |   6
-rw-r--r--  arch/x86/include/asm/xor_64.h  |   8
-rw-r--r--  arch/x86/include/asm/xor_avx.h | 214
4 files changed, 229 insertions, 4 deletions
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 41a7237606a3..7a1cc9ee5c8a 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
 
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a0f495..454570891bdc 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
 	.do_5 = xor_sse_5,
 };
 
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 /* Also try the generic routines. */
 #include <asm-generic/xor.h>
 
@@ -871,6 +874,7 @@ do { \
 	xor_speed(&xor_block_8regs_p); \
 	xor_speed(&xor_block_32regs); \
 	xor_speed(&xor_block_32regs_p); \
+	AVX_XOR_SPEED; \
 	if (cpu_has_xmm) \
 		xor_speed(&xor_block_pIII_sse); \
 	if (cpu_has_mmx) { \
@@ -883,6 +887,6 @@ do { \
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched. */
 #define XOR_SELECT_TEMPLATE(FASTEST) \
-	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
 
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e261f6..b9b2323e90fe 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = {
 	.do_5 = xor_sse_5,
 };
 
+
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES \
 do { \
+	AVX_XOR_SPEED; \
 	xor_speed(&xor_block_sse); \
 } while (0)
 
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	AVX_SELECT(&xor_block_sse)
 
 #endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 000000000000..2510d35f480e
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,214 @@
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifdef CONFIG_AS_AVX
+
+#include <linux/compiler.h>
+#include <asm/i387.h>
+
+#define ALIGN32 __aligned(32)
+
+#define YMM_SAVED_REGS 4
+
+#define YMMS_SAVE \
+do { \
+	preempt_disable(); \
+	cr0 = read_cr0(); \
+	clts(); \
+	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
+	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
+	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
+	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
+} while (0);
+
+#define YMMS_RESTORE \
+do { \
+	asm volatile("sfence" : : : "memory"); \
+	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
+	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
+	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
+	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
+	write_cr0(cr0); \
+	preempt_enable(); \
+} while (0);
+
+#define BLOCK4(i) \
+	BLOCK(32 * i, 0) \
+	BLOCK(32 * (i + 1), 1) \
+	BLOCK(32 * (i + 2), 2) \
+	BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+	BLOCK4(0) \
+	BLOCK4(4) \
+	BLOCK4(8) \
+	BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16();
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+		p4 = (unsigned long *)((uintptr_t)p4 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static struct xor_block_template xor_block_avx = {
+	.name = "avx",
+	.do_2 = xor_avx_2,
+	.do_3 = xor_avx_3,
+	.do_4 = xor_avx_4,
+	.do_5 = xor_avx_5,
+};
+
+#define AVX_XOR_SPEED \
+do { \
+	if (cpu_has_avx) \
+		xor_speed(&xor_block_avx); \
+} while (0)
+
+#define AVX_SELECT(FASTEST) \
+	(cpu_has_avx ? &xor_block_avx : FASTEST)
+
+#else
+
+#define AVX_XOR_SPEED {}
+
+#define AVX_SELECT(FASTEST) (FASTEST)
+
+#endif
+#endif