author     Linus Torvalds <torvalds@linux-foundation.org>   2012-05-23 20:08:40 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-05-23 20:08:40 -0400
commit     c80ddb526331a72c9e9d1480f85f6fd7c74e3d2d (patch)
tree       0212803a009f171990032abb94fad84156baa153 /arch
parent     2c13bc0f8f0d3e13b42be70bf74fec8e56b58324 (diff)
parent     1dff2b87a34a1ac1d1898ea109bf97ed396aca53 (diff)
Merge tag 'md-3.5' of git://neil.brown.name/md
Pull md updates from NeilBrown:
 "It's been a busy cycle for md - lots of fun stuff here.. if you like
  this kind of thing :-)

  Main features:
   - RAID10 arrays can be reshaped - adding and removing devices and
     changing chunks (not 'far' array though)
   - allow RAID5 arrays to be reshaped with a backup file (not tested
     yet, but the principle works fine for RAID10).
   - arrays can be reshaped while a bitmap is present - you no longer
     need to remove it first
   - SSSE3 support for RAID6 syndrome calculations

  and of course a number of minor fixes etc."

* tag 'md-3.5' of git://neil.brown.name/md: (56 commits)
  md/bitmap: record the space available for the bitmap in the superblock.
  md/raid10: Remove extras after reshape to smaller number of devices.
  md/raid5: improve removal of extra devices after reshape.
  md: check the return of mddev_find()
  MD RAID1: Further conditionalize 'fullsync'
  DM RAID: Use md_error() in place of simply setting Faulty bit
  DM RAID: Record and handle missing devices
  DM RAID: Set recovery flags on resume
  md/raid5: Allow reshape while a bitmap is present.
  md/raid10: resize bitmap when required during reshape.
  md: allow array to be resized while bitmap is present.
  md/bitmap: make sure reshape request are reflected in superblock.
  md/bitmap: add bitmap_resize function to allow bitmap resizing.
  md/bitmap: use DIV_ROUND_UP instead of open-code
  md/bitmap: create a 'struct bitmap_counts' substructure of 'struct bitmap'
  md/bitmap: make bitmap bitops atomic.
  md/bitmap: make _page_attr bitops atomic.
  md/bitmap: merge bitmap_file_unmap and bitmap_file_put.
  md/bitmap: remove async freeing of bitmap file.
  md/bitmap: convert some spin_lock_irqsave to spin_lock_irq
  ...
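One of the bitmap cleanups in the shortlog above ("md/bitmap: use DIV_ROUND_UP instead of open-code") replaces hand-written rounding-up division with the kernel's DIV_ROUND_UP helper. As a reminder of what that idiom does, here is a minimal user-space sketch; the macro definition matches the kernel's, but the bitmap sizes used are invented for illustration:

#include <stdio.h>

/* Same definition as the kernel helper in <linux/kernel.h>. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* e.g. how many 4096-byte pages are needed to hold 10000 bitmap bits */
	unsigned long bits = 10000, bits_per_page = 4096 * 8;

	printf("%lu page(s)\n", DIV_ROUND_UP(bits, bits_per_page));
	return 0;
}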
Diffstat (limited to 'arch')
-rw-r--r--   arch/x86/Makefile                  5
-rw-r--r--   arch/x86/include/asm/xor_32.h      6
-rw-r--r--   arch/x86/include/asm/xor_64.h      8
-rw-r--r--   arch/x86/include/asm/xor_avx.h   214
4 files changed, 229 insertions(+), 4 deletions(-)
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index dc611a40a336..1f2521434554 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
 
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
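The avx_instr probe only adds -DCONFIG_AS_AVX=1 when the assembler accepts an AVX instruction; the new xor_avx.h below compiles its AVX paths only under that define. A rough user-space analogue of what is being tested (a sketch, not part of the patch): a translation unit that builds only if the toolchain's assembler understands AVX encodings.

/* Builds (e.g. `gcc -c avx_probe.c`) only when the assembler knows the
 * vxorps encoding; this mirrors the as-instr check above, which defines
 * CONFIG_AS_AVX=1 on success and leaves it undefined otherwise. */
void avx_probe(void)
{
	asm volatile("vxorps %ymm0, %ymm1, %ymm2");
}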
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a0f495..454570891bdc 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
 	.do_5 = xor_sse_5,
 };
 
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 /* Also try the generic routines. */
 #include <asm-generic/xor.h>
 
@@ -871,6 +874,7 @@ do { \
 	xor_speed(&xor_block_8regs_p); \
 	xor_speed(&xor_block_32regs); \
 	xor_speed(&xor_block_32regs_p); \
+	AVX_XOR_SPEED; \
 	if (cpu_has_xmm) \
 		xor_speed(&xor_block_pIII_sse); \
 	if (cpu_has_mmx) { \
@@ -883,6 +887,6 @@ do { \
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched. */
 #define XOR_SELECT_TEMPLATE(FASTEST) \
-	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
 
 #endif /* _ASM_X86_XOR_32_H */
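For clarity, the new 32-bit selector composes as follows once xor_avx.h is included: AVX_SELECT wraps the old expression, so an AVX-capable CPU picks the avx template, an SSE-only CPU keeps pIII_sse, and anything else keeps the benchmarked FASTEST. A small user-space sketch with stubbed-out feature flags and template structs (the stubs are invented for illustration; only the two macros follow the patch):

#include <stdio.h>

struct xor_block_template { const char *name; };

static struct xor_block_template xor_block_avx      = { "avx" };
static struct xor_block_template xor_block_pIII_sse = { "pIII_sse" };
static struct xor_block_template xor_block_8regs    = { "8regs" };

/* The two macros as they compose in xor_32.h after this patch;
 * cpu_has_avx/cpu_has_xmm are plain ints here instead of cpufeature tests. */
#define AVX_SELECT(FASTEST) (cpu_has_avx ? &xor_block_avx : (FASTEST))
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : (FASTEST))

int main(void)
{
	int cpu_has_avx = 1, cpu_has_xmm = 1;	/* pretend CPU flags */

	printf("selected: %s\n", XOR_SELECT_TEMPLATE(&xor_block_8regs)->name);
	return 0;
}

With both flags set this prints "selected: avx"; clearing cpu_has_avx falls back to pIII_sse, and clearing both leaves the generic 8regs template.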
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e261f6..b9b2323e90fe 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = {
 	.do_5 = xor_sse_5,
 };
 
+
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES \
 do { \
+	AVX_XOR_SPEED; \
 	xor_speed(&xor_block_sse); \
 } while (0)
 
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	AVX_SELECT(&xor_block_sse)
 
 #endif /* _ASM_X86_XOR_64_H */
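Both AVX_XOR_SPEED and AVX_SELECT additionally gate on cpu_has_avx at runtime, on top of the build-time CONFIG_AS_AVX check. As a user-space analogue only (the kernel uses its own cpufeature flags, not this builtin), the same runtime decision can be sketched with the compiler's CPU-feature test:

#include <stdio.h>

int main(void)
{
	/* GCC/Clang builtin probing CPUID at runtime; stands in for the
	 * kernel's cpu_has_avx check that decides whether the avx xor
	 * template is benchmarked and selectable at all. */
	if (__builtin_cpu_supports("avx"))
		printf("AVX present: avx template would compete in xor_speed()\n");
	else
		printf("no AVX: SSE/MMX/generic templates only\n");
	return 0;
}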
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 000000000000..2510d35f480e
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,214 @@
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifdef CONFIG_AS_AVX
+
+#include <linux/compiler.h>
+#include <asm/i387.h>
+
+#define ALIGN32 __aligned(32)
+
+#define YMM_SAVED_REGS 4
+
+#define YMMS_SAVE \
+do { \
+	preempt_disable(); \
+	cr0 = read_cr0(); \
+	clts(); \
+	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
+	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
+	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
+	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
+} while (0);
+
+#define YMMS_RESTORE \
+do { \
+	asm volatile("sfence" : : : "memory"); \
+	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
+	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
+	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
+	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
+	write_cr0(cr0); \
+	preempt_enable(); \
+} while (0);
+
+#define BLOCK4(i) \
+		BLOCK(32 * i, 0) \
+		BLOCK(32 * (i + 1), 1) \
+		BLOCK(32 * (i + 2), 2) \
+		BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+		BLOCK4(0) \
+		BLOCK4(4) \
+		BLOCK4(8) \
+		BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16();
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+		p4 = (unsigned long *)((uintptr_t)p4 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static struct xor_block_template xor_block_avx = {
+	.name = "avx",
+	.do_2 = xor_avx_2,
+	.do_3 = xor_avx_3,
+	.do_4 = xor_avx_4,
+	.do_5 = xor_avx_5,
+};
+
+#define AVX_XOR_SPEED \
+do { \
+	if (cpu_has_avx) \
+		xor_speed(&xor_block_avx); \
+} while (0)
+
+#define AVX_SELECT(FASTEST) \
+	(cpu_has_avx ? &xor_block_avx : FASTEST)
+
+#else
+
+#define AVX_XOR_SPEED {}
+
+#define AVX_SELECT(FASTEST) (FASTEST)
+
+#endif
+#endif
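For reference, what xor_avx_2 computes once the AVX and FPU-save machinery is stripped away: p0 ^= p1 over `bytes` bytes, walked in the same 512-byte lines (bytes >> 9) that the BLOCK16 unrolling covers as sixteen 32-byte ymm blocks. A plain-C sketch, for illustration only and not part of the patch:

#include <stddef.h>

/* Scalar equivalent of xor_avx_2: XOR p1 into p0, 512 bytes per
 * iteration, matching the lines = bytes >> 9 blocking above.  The AVX
 * version additionally requires 32-byte-aligned buffers (vmovdqa) and
 * saves/restores ymm0-ymm3 around the loop via YMMS_SAVE/YMMS_RESTORE. */
void xor_ref_2(unsigned long bytes, unsigned long *p0,
	       const unsigned long *p1)
{
	unsigned long lines = bytes >> 9;
	size_t words = 512 / sizeof(*p0);
	size_t i;

	while (lines--) {
		for (i = 0; i < words; i++)
			p0[i] ^= p1[i];
		p0 += words;
		p1 += words;
	}
}

The 3-, 4- and 5-source variants follow the same pattern, folding one extra source buffer per additional vxorps in each BLOCK.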