author	Jim Kukunas <james.t.kukunas@linux.intel.com>	2012-05-21 23:54:04 -0400
committer	NeilBrown <neilb@suse.de>	2012-05-21 23:54:04 -0400
commit	ea4d26ae24e58fbd2c61de9242adab053cb982d8
tree	3115dd168f0cf1eb1eb5dd6aecc385cfa0e8bc05
parent	56a519913eeba2bdae4d7ee39e80fab442c3836c
raid5: add AVX optimized RAID5 checksumming
Optimize RAID5 xor checksumming by taking advantage of 256-bit YMM registers introduced in AVX.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r--	arch/x86/Makefile	5
-rw-r--r--	arch/x86/include/asm/xor_32.h	6
-rw-r--r--	arch/x86/include/asm/xor_64.h	8
-rw-r--r--	arch/x86/include/asm/xor_avx.h	214
4 files changed, 229 insertions, 4 deletions
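The core of the patch can be sketched outside the kernel with compiler intrinsics. The snippet below is an illustration only, not part of the commit: the name xor_block_avx_demo is made up for this note, and it XORs through _mm256_xor_ps (the intrinsic form of vxorps) because plain AVX has no 256-bit integer XOR, which is also why the kernel routines below use vxorps. The real code additionally disables preemption and saves/restores YMM state, which a userspace sketch does not need.

#include <immintrin.h>
#include <stddef.h>

/* Userspace illustration only: XOR 'bytes' bytes of src into dst, one 32-byte
 * YMM register at a time.  Buffers must be 32-byte aligned, matching the
 * aligned vmovdqa/vmovaps accesses in the patch.  Build with -mavx. */
static void xor_block_avx_demo(float *dst, const float *src, size_t bytes)
{
	size_t i;

	/* 8 floats = 32 bytes per step; the kernel routines unroll 16 such
	 * blocks (BLOCK16) to cover 512 bytes per pass of the outer loop. */
	for (i = 0; i < bytes / sizeof(float); i += 8) {
		__m256 d = _mm256_load_ps(dst + i);
		__m256 s = _mm256_load_ps(src + i);

		/* vxorps: bitwise XOR across the full 256-bit register */
		_mm256_store_ps(dst + i, _mm256_xor_ps(d, s));
	}
}

Compared with the SSE paths already in xor_32.h/xor_64.h, each load/XOR/store now moves 32 bytes instead of 16, which is what the commit message means by taking advantage of the 256-bit YMM registers.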
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 41a7237606a3..7a1cc9ee5c8a 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
 
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a0f495..454570891bdc 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
 	.do_5 = xor_sse_5,
 };
 
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 /* Also try the generic routines. */
 #include <asm-generic/xor.h>
 
@@ -871,6 +874,7 @@ do { \
 	xor_speed(&xor_block_8regs_p); \
 	xor_speed(&xor_block_32regs); \
 	xor_speed(&xor_block_32regs_p); \
+	AVX_XOR_SPEED; \
 	if (cpu_has_xmm) \
 		xor_speed(&xor_block_pIII_sse); \
 	if (cpu_has_mmx) { \
@@ -883,6 +887,6 @@ do { \
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched. */
 #define XOR_SELECT_TEMPLATE(FASTEST) \
-	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
 
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e261f6..b9b2323e90fe 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = {
 	.do_5 = xor_sse_5,
 };
 
+
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES \
 do { \
+	AVX_XOR_SPEED; \
 	xor_speed(&xor_block_sse); \
 } while (0)
 
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	AVX_SELECT(&xor_block_sse)
 
 #endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 000000000000..2510d35f480e
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,214 @@
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifdef CONFIG_AS_AVX
+
+#include <linux/compiler.h>
+#include <asm/i387.h>
+
+#define ALIGN32 __aligned(32)
+
+#define YMM_SAVED_REGS 4
+
+#define YMMS_SAVE \
+do { \
+	preempt_disable(); \
+	cr0 = read_cr0(); \
+	clts(); \
+	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
+	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
+	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
+	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
+} while (0);
+
+#define YMMS_RESTORE \
+do { \
+	asm volatile("sfence" : : : "memory"); \
+	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
+	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
+	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
+	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
+	write_cr0(cr0); \
+	preempt_enable(); \
+} while (0);
+
+#define BLOCK4(i) \
+	BLOCK(32 * i, 0) \
+	BLOCK(32 * (i + 1), 1) \
+	BLOCK(32 * (i + 2), 2) \
+	BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+	BLOCK4(0) \
+	BLOCK4(4) \
+	BLOCK4(8) \
+	BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16();
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+		p4 = (unsigned long *)((uintptr_t)p4 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static struct xor_block_template xor_block_avx = {
+	.name = "avx",
+	.do_2 = xor_avx_2,
+	.do_3 = xor_avx_3,
+	.do_4 = xor_avx_4,
+	.do_5 = xor_avx_5,
+};
+
+#define AVX_XOR_SPEED \
+do { \
+	if (cpu_has_avx) \
+		xor_speed(&xor_block_avx); \
+} while (0)
+
+#define AVX_SELECT(FASTEST) \
+	(cpu_has_avx ? &xor_block_avx : FASTEST)
+
+#else
+
+#define AVX_XOR_SPEED {}
+
+#define AVX_SELECT(FASTEST) (FASTEST)
+
+#endif
+#endif
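AVX_XOR_SPEED and AVX_SELECT() above are the hooks the xor_32.h/xor_64.h hunks use: the AVX template is benchmarked and selected only when the CPU advertises AVX, and otherwise everything falls back to the existing SSE/generic templates. Below is a hedged userspace sketch of that selection pattern; the names pick_xor, xor_scalar_2 and xor_avx_2_demo are invented for this note, and GCC's __builtin_cpu_supports() stands in for the kernel's cpu_has_avx.

typedef void (*xor_fn)(unsigned long bytes, unsigned long *p0,
		       unsigned long *p1);

/* Portable fallback, analogous to the generic routines in <asm-generic/xor.h>. */
static void xor_scalar_2(unsigned long bytes, unsigned long *p0,
			 unsigned long *p1)
{
	unsigned long i;

	for (i = 0; i < bytes / sizeof(unsigned long); i++)
		p0[i] ^= p1[i];
}

/* Stand-in for xor_avx_2() from the patch; it reuses the scalar loop so the
 * sketch compiles and runs anywhere, AVX or not. */
static void xor_avx_2_demo(unsigned long bytes, unsigned long *p0,
			   unsigned long *p1)
{
	xor_scalar_2(bytes, p0, p1);
}

/* Mirrors AVX_SELECT(): prefer the AVX template only when the CPU has AVX. */
static xor_fn pick_xor(void)
{
	return __builtin_cpu_supports("avx") ? xor_avx_2_demo : xor_scalar_2;
}

In the kernel, xor_speed() additionally benchmarks the candidate templates and XOR_SELECT_TEMPLATE() picks the winner; the sketch only shows the capability gate that AVX_SELECT() adds.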