-rw-r--r--  arch/x86/Makefile               |    5
-rw-r--r--  arch/x86/include/asm/xor_32.h   |    6
-rw-r--r--  arch/x86/include/asm/xor_64.h   |    8
-rw-r--r--  arch/x86/include/asm/xor_avx.h  |  214
-rw-r--r--  crypto/xor.c                    |   13
-rw-r--r--  drivers/md/bitmap.c             | 1100
-rw-r--r--  drivers/md/bitmap.h             |   60
-rw-r--r--  drivers/md/dm-raid.c            |   22
-rw-r--r--  drivers/md/md.c                 |  370
-rw-r--r--  drivers/md/md.h                 |   12
-rw-r--r--  drivers/md/raid1.c              |   22
-rw-r--r--  drivers/md/raid10.c             | 1281
-rw-r--r--  drivers/md/raid10.h             |   34
-rw-r--r--  drivers/md/raid5.c              |  252
-rw-r--r--  drivers/md/raid5.h              |    7
-rw-r--r--  include/linux/raid/md_p.h       |   15
-rw-r--r--  include/linux/raid/pq.h         |   18
-rw-r--r--  lib/raid6/Makefile              |    2
-rw-r--r--  lib/raid6/algos.c               |  127
-rw-r--r--  lib/raid6/mktables.c            |   25
-rw-r--r--  lib/raid6/recov.c               |   15
-rw-r--r--  lib/raid6/recov_ssse3.c         |  335
-rw-r--r--  lib/raid6/test/Makefile         |    2
-rw-r--r--  lib/raid6/test/test.c           |   32
-rw-r--r--  lib/raid6/x86.h                 |   15
25 files changed, 3124 insertions, 868 deletions
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index dc611a40a336..1f2521434554 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
 
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
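
Note: the as-instr probe above only defines CONFIG_AS_AVX when binutils can assemble a sample AVX instruction; the new xor_avx.h below compiles its AVX routines only under that symbol. A hedged illustration of the same guard-plus-fallback pattern (xor_blocks_avx and XOR_BLOCKS are hypothetical names, not part of this patch):

/*
 * Illustrative only: code that emits AVX must compile only when the
 * CONFIG_AS_AVX probe succeeded, and keep a plain C fallback for old
 * binutils that cannot encode AVX instructions.
 */
#ifdef CONFIG_AS_AVX
extern void xor_blocks_avx(unsigned long bytes, unsigned long *p0,
			   const unsigned long *p1);
#define XOR_BLOCKS(bytes, p0, p1)	xor_blocks_avx(bytes, p0, p1)
#else
static inline void xor_blocks_generic(unsigned long bytes, unsigned long *p0,
				      const unsigned long *p1)
{
	unsigned long i;

	/* Word-at-a-time XOR; slow but buildable with any assembler. */
	for (i = 0; i < bytes / sizeof(*p0); i++)
		p0[i] ^= p1[i];
}
#define XOR_BLOCKS(bytes, p0, p1)	xor_blocks_generic(bytes, p0, p1)
#endif
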
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a0f495..454570891bdc 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
 	.do_5 = xor_sse_5,
 };
 
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 /* Also try the generic routines. */
 #include <asm-generic/xor.h>
 
@@ -871,6 +874,7 @@ do { \
 	xor_speed(&xor_block_8regs_p); \
 	xor_speed(&xor_block_32regs); \
 	xor_speed(&xor_block_32regs_p); \
+	AVX_XOR_SPEED; \
 	if (cpu_has_xmm) \
 		xor_speed(&xor_block_pIII_sse); \
 	if (cpu_has_mmx) { \
@@ -883,6 +887,6 @@ do { \
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched. */
 #define XOR_SELECT_TEMPLATE(FASTEST) \
-	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
 
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e261f6..b9b2323e90fe 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = {
 	.do_5 = xor_sse_5,
 };
 
+
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES \
 do { \
+	AVX_XOR_SPEED; \
 	xor_speed(&xor_block_sse); \
 } while (0)
 
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	AVX_SELECT(&xor_block_sse)
 
 #endif /* _ASM_X86_XOR_64_H */
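
For orientation: XOR_TRY_TEMPLATES registers candidates with the benchmarking code in crypto/xor.c, and XOR_SELECT_TEMPLATE() lets the architecture override the measured winner; with this patch AVX_SELECT() only overrides when the CPU actually has AVX. A standalone toy model of that selection step (every name, speed figure and the cpu_has_avx flag below is made up for illustration):

#include <stdio.h>

/* Toy model only: not kernel code. */
struct xor_tmpl {
	const char *name;
	int speed;	/* benchmarked throughput, MB/s */
};

static struct xor_tmpl tmpl_sse = { "generic_sse", 0 };
static struct xor_tmpl tmpl_avx = { "avx", 0 };

/* Mirrors the intent of AVX_SELECT(): prefer AVX only if the CPU has it. */
static struct xor_tmpl *avx_select(int cpu_has_avx, struct xor_tmpl *fastest)
{
	return cpu_has_avx ? &tmpl_avx : fastest;
}

int main(void)
{
	struct xor_tmpl *fastest;

	/* Pretend the xor_speed() benchmark produced these numbers. */
	tmpl_sse.speed = 8000;
	tmpl_avx.speed = 12000;

	fastest = (tmpl_avx.speed > tmpl_sse.speed) ? &tmpl_avx : &tmpl_sse;
	fastest = avx_select(/* cpu_has_avx = */ 1, fastest);

	printf("using %s (%d MB/s)\n", fastest->name, fastest->speed);
	return 0;
}
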
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 000000000000..2510d35f480e
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,214 @@
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifdef CONFIG_AS_AVX
+
+#include <linux/compiler.h>
+#include <asm/i387.h>
+
+#define ALIGN32 __aligned(32)
+
+#define YMM_SAVED_REGS 4
+
+#define YMMS_SAVE \
+do { \
+	preempt_disable(); \
+	cr0 = read_cr0(); \
+	clts(); \
+	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
+	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
+	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
+	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
+} while (0);
+
+#define YMMS_RESTORE \
+do { \
+	asm volatile("sfence" : : : "memory"); \
+	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
+	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
+	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
+	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
+	write_cr0(cr0); \
+	preempt_enable(); \
+} while (0);
+
+#define BLOCK4(i) \
+	BLOCK(32 * i, 0) \
+	BLOCK(32 * (i + 1), 1) \
+	BLOCK(32 * (i + 2), 2) \
+	BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+	BLOCK4(0) \
+	BLOCK4(4) \
+	BLOCK4(8) \
+	BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16();
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+		p4 = (unsigned long *)((uintptr_t)p4 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static struct xor_block_template xor_block_avx = {
+	.name = "avx",
+	.do_2 = xor_avx_2,
+	.do_3 = xor_avx_3,
+	.do_4 = xor_avx_4,
+	.do_5 = xor_avx_5,
+};
+
+#define AVX_XOR_SPEED \
+do { \
+	if (cpu_has_avx) \
+		xor_speed(&xor_block_avx); \
+} while (0)
+
+#define AVX_SELECT(FASTEST) \
+	(cpu_has_avx ? &xor_block_avx : FASTEST)
+
+#else
+
+#define AVX_XOR_SPEED {}
+
+#define AVX_SELECT(FASTEST) (FASTEST)
+
+#endif
+#endif
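
Stripped of the YMM save/restore and the inline assembly, each routine above walks the buffers in 512-byte lines (16 blocks of 32 bytes) and XORs every source buffer into p0. The 2-source case is functionally equivalent to this plain-C reference model (useful for reading the asm, not a replacement for it):

#include <stdint.h>
#include <stddef.h>

/*
 * Reference model of xor_avx_2(): XOR p1 into p0, 512 bytes (one "line",
 * i.e. 16 x 32-byte YMM blocks) per iteration of the outer loop.
 */
static void xor_2_reference(unsigned long bytes, unsigned long *p0,
			    const unsigned long *p1)
{
	unsigned long lines = bytes >> 9;	/* 512-byte lines */

	while (lines--) {
		size_t i;

		for (i = 0; i < 512 / sizeof(*p0); i++)
			p0[i] ^= p1[i];

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (const unsigned long *)((uintptr_t)p1 + 512);
	}
}
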
diff --git a/crypto/xor.c b/crypto/xor.c
index 664b6dfa9e2c..65c7b416b4a3 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -21,6 +21,7 @@
 #include <linux/gfp.h>
 #include <linux/raid/xor.h>
 #include <linux/jiffies.h>
+#include <linux/preempt.h>
 #include <asm/xor.h>
 
 /* The xor routines to use. */
@@ -63,12 +64,14 @@ static void
 do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 {
 	int speed;
-	unsigned long now;
+	unsigned long now, j;
 	int i, count, max;
 
 	tmpl->next = template_list;
 	template_list = tmpl;
 
+	preempt_disable();
+
 	/*
 	 * Count the number of XORs done during a whole jiffy, and use
 	 * this to calculate the speed of checksumming. We use a 2-page
@@ -76,9 +79,11 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 	 */
 	max = 0;
 	for (i = 0; i < 5; i++) {
-		now = jiffies;
+		j = jiffies;
 		count = 0;
-		while (jiffies == now) {
+		while ((now = jiffies) == j)
+			cpu_relax();
+		while (time_before(jiffies, now + 1)) {
 			mb(); /* prevent loop optimzation */
 			tmpl->do_2(BENCH_SIZE, b1, b2);
 			mb();
@@ -89,6 +94,8 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 			max = count;
 	}
 
+	preempt_enable();
+
 	speed = max * (HZ * BENCH_SIZE / 1024);
 	tmpl->speed = speed;
 
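
The do_xor_speed() change does two things: it disables preemption around the measurement, and it first spins until jiffies ticks over so each of the five samples covers a full jiffy instead of the unknown remainder of the current one. The same edge-synchronised pattern, reduced to a hedged userspace sketch with one-second ticks standing in for jiffies:

#include <stdio.h>
#include <time.h>

/*
 * Userspace analogue of the patched measurement loop: wait for the clock
 * to tick over, then count how many operations fit in exactly one tick.
 * (The kernel uses jiffies and disables preemption; this sketch uses
 * one-second ticks from time() purely for illustration.)
 */
static unsigned long ops_per_tick(void (*op)(void))
{
	time_t j, now;
	unsigned long count = 0;

	j = time(NULL);
	while ((now = time(NULL)) == j)
		;				/* sync to the start of a tick */
	while (time(NULL) < now + 1) {		/* measure one full tick */
		op();
		count++;
	}
	return count;
}

static void dummy_op(void) { }

int main(void)
{
	printf("ops in one tick: %lu\n", ops_per_tick(dummy_op));
	return 0;
}
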
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 17e2b472e16d..15dbe03117e4 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -45,7 +45,7 @@ static inline char *bmname(struct bitmap *bitmap) | |||
45 | * if we find our page, we increment the page's refcount so that it stays | 45 | * if we find our page, we increment the page's refcount so that it stays |
46 | * allocated while we're using it | 46 | * allocated while we're using it |
47 | */ | 47 | */ |
48 | static int bitmap_checkpage(struct bitmap *bitmap, | 48 | static int bitmap_checkpage(struct bitmap_counts *bitmap, |
49 | unsigned long page, int create) | 49 | unsigned long page, int create) |
50 | __releases(bitmap->lock) | 50 | __releases(bitmap->lock) |
51 | __acquires(bitmap->lock) | 51 | __acquires(bitmap->lock) |
@@ -76,8 +76,7 @@ __acquires(bitmap->lock) | |||
76 | spin_lock_irq(&bitmap->lock); | 76 | spin_lock_irq(&bitmap->lock); |
77 | 77 | ||
78 | if (mappage == NULL) { | 78 | if (mappage == NULL) { |
79 | pr_debug("%s: bitmap map page allocation failed, hijacking\n", | 79 | pr_debug("md/bitmap: map page allocation failed, hijacking\n"); |
80 | bmname(bitmap)); | ||
81 | /* failed - set the hijacked flag so that we can use the | 80 | /* failed - set the hijacked flag so that we can use the |
82 | * pointer as a counter */ | 81 | * pointer as a counter */ |
83 | if (!bitmap->bp[page].map) | 82 | if (!bitmap->bp[page].map) |
@@ -100,7 +99,7 @@ __acquires(bitmap->lock) | |||
100 | /* if page is completely empty, put it back on the free list, or dealloc it */ | 99 | /* if page is completely empty, put it back on the free list, or dealloc it */ |
101 | /* if page was hijacked, unmark the flag so it might get alloced next time */ | 100 | /* if page was hijacked, unmark the flag so it might get alloced next time */ |
102 | /* Note: lock should be held when calling this */ | 101 | /* Note: lock should be held when calling this */ |
103 | static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) | 102 | static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page) |
104 | { | 103 | { |
105 | char *ptr; | 104 | char *ptr; |
106 | 105 | ||
@@ -130,22 +129,14 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) | |||
130 | */ | 129 | */ |
131 | 130 | ||
132 | /* IO operations when bitmap is stored near all superblocks */ | 131 | /* IO operations when bitmap is stored near all superblocks */ |
133 | static struct page *read_sb_page(struct mddev *mddev, loff_t offset, | 132 | static int read_sb_page(struct mddev *mddev, loff_t offset, |
134 | struct page *page, | 133 | struct page *page, |
135 | unsigned long index, int size) | 134 | unsigned long index, int size) |
136 | { | 135 | { |
137 | /* choose a good rdev and read the page from there */ | 136 | /* choose a good rdev and read the page from there */ |
138 | 137 | ||
139 | struct md_rdev *rdev; | 138 | struct md_rdev *rdev; |
140 | sector_t target; | 139 | sector_t target; |
141 | int did_alloc = 0; | ||
142 | |||
143 | if (!page) { | ||
144 | page = alloc_page(GFP_KERNEL); | ||
145 | if (!page) | ||
146 | return ERR_PTR(-ENOMEM); | ||
147 | did_alloc = 1; | ||
148 | } | ||
149 | 140 | ||
150 | rdev_for_each(rdev, mddev) { | 141 | rdev_for_each(rdev, mddev) { |
151 | if (! test_bit(In_sync, &rdev->flags) | 142 | if (! test_bit(In_sync, &rdev->flags) |
@@ -158,15 +149,10 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset, | |||
158 | roundup(size, bdev_logical_block_size(rdev->bdev)), | 149 | roundup(size, bdev_logical_block_size(rdev->bdev)), |
159 | page, READ, true)) { | 150 | page, READ, true)) { |
160 | page->index = index; | 151 | page->index = index; |
161 | attach_page_buffers(page, NULL); /* so that free_buffer will | 152 | return 0; |
162 | * quietly no-op */ | ||
163 | return page; | ||
164 | } | 153 | } |
165 | } | 154 | } |
166 | if (did_alloc) | 155 | return -EIO; |
167 | put_page(page); | ||
168 | return ERR_PTR(-EIO); | ||
169 | |||
170 | } | 156 | } |
171 | 157 | ||
172 | static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) | 158 | static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) |
@@ -208,6 +194,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
208 | struct md_rdev *rdev = NULL; | 194 | struct md_rdev *rdev = NULL; |
209 | struct block_device *bdev; | 195 | struct block_device *bdev; |
210 | struct mddev *mddev = bitmap->mddev; | 196 | struct mddev *mddev = bitmap->mddev; |
197 | struct bitmap_storage *store = &bitmap->storage; | ||
211 | 198 | ||
212 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { | 199 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { |
213 | int size = PAGE_SIZE; | 200 | int size = PAGE_SIZE; |
@@ -215,9 +202,13 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
215 | 202 | ||
216 | bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; | 203 | bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; |
217 | 204 | ||
218 | if (page->index == bitmap->file_pages-1) | 205 | if (page->index == store->file_pages-1) { |
219 | size = roundup(bitmap->last_page_size, | 206 | int last_page_size = store->bytes & (PAGE_SIZE-1); |
207 | if (last_page_size == 0) | ||
208 | last_page_size = PAGE_SIZE; | ||
209 | size = roundup(last_page_size, | ||
220 | bdev_logical_block_size(bdev)); | 210 | bdev_logical_block_size(bdev)); |
211 | } | ||
221 | /* Just make sure we aren't corrupting data or | 212 | /* Just make sure we aren't corrupting data or |
222 | * metadata | 213 | * metadata |
223 | */ | 214 | */ |
@@ -276,10 +267,10 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) | |||
276 | { | 267 | { |
277 | struct buffer_head *bh; | 268 | struct buffer_head *bh; |
278 | 269 | ||
279 | if (bitmap->file == NULL) { | 270 | if (bitmap->storage.file == NULL) { |
280 | switch (write_sb_page(bitmap, page, wait)) { | 271 | switch (write_sb_page(bitmap, page, wait)) { |
281 | case -EINVAL: | 272 | case -EINVAL: |
282 | bitmap->flags |= BITMAP_WRITE_ERROR; | 273 | set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); |
283 | } | 274 | } |
284 | } else { | 275 | } else { |
285 | 276 | ||
@@ -297,20 +288,16 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) | |||
297 | wait_event(bitmap->write_wait, | 288 | wait_event(bitmap->write_wait, |
298 | atomic_read(&bitmap->pending_writes)==0); | 289 | atomic_read(&bitmap->pending_writes)==0); |
299 | } | 290 | } |
300 | if (bitmap->flags & BITMAP_WRITE_ERROR) | 291 | if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) |
301 | bitmap_file_kick(bitmap); | 292 | bitmap_file_kick(bitmap); |
302 | } | 293 | } |
303 | 294 | ||
304 | static void end_bitmap_write(struct buffer_head *bh, int uptodate) | 295 | static void end_bitmap_write(struct buffer_head *bh, int uptodate) |
305 | { | 296 | { |
306 | struct bitmap *bitmap = bh->b_private; | 297 | struct bitmap *bitmap = bh->b_private; |
307 | unsigned long flags; | ||
308 | 298 | ||
309 | if (!uptodate) { | 299 | if (!uptodate) |
310 | spin_lock_irqsave(&bitmap->lock, flags); | 300 | set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); |
311 | bitmap->flags |= BITMAP_WRITE_ERROR; | ||
312 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
313 | } | ||
314 | if (atomic_dec_and_test(&bitmap->pending_writes)) | 301 | if (atomic_dec_and_test(&bitmap->pending_writes)) |
315 | wake_up(&bitmap->write_wait); | 302 | wake_up(&bitmap->write_wait); |
316 | } | 303 | } |
@@ -325,8 +312,12 @@ __clear_page_buffers(struct page *page) | |||
325 | } | 312 | } |
326 | static void free_buffers(struct page *page) | 313 | static void free_buffers(struct page *page) |
327 | { | 314 | { |
328 | struct buffer_head *bh = page_buffers(page); | 315 | struct buffer_head *bh; |
329 | 316 | ||
317 | if (!PagePrivate(page)) | ||
318 | return; | ||
319 | |||
320 | bh = page_buffers(page); | ||
330 | while (bh) { | 321 | while (bh) { |
331 | struct buffer_head *next = bh->b_this_page; | 322 | struct buffer_head *next = bh->b_this_page; |
332 | free_buffer_head(bh); | 323 | free_buffer_head(bh); |
@@ -343,11 +334,12 @@ static void free_buffers(struct page *page) | |||
343 | * This usage is similar to how swap files are handled, and allows us | 334 | * This usage is similar to how swap files are handled, and allows us |
344 | * to write to a file with no concerns of memory allocation failing. | 335 | * to write to a file with no concerns of memory allocation failing. |
345 | */ | 336 | */ |
346 | static struct page *read_page(struct file *file, unsigned long index, | 337 | static int read_page(struct file *file, unsigned long index, |
347 | struct bitmap *bitmap, | 338 | struct bitmap *bitmap, |
348 | unsigned long count) | 339 | unsigned long count, |
340 | struct page *page) | ||
349 | { | 341 | { |
350 | struct page *page = NULL; | 342 | int ret = 0; |
351 | struct inode *inode = file->f_path.dentry->d_inode; | 343 | struct inode *inode = file->f_path.dentry->d_inode; |
352 | struct buffer_head *bh; | 344 | struct buffer_head *bh; |
353 | sector_t block; | 345 | sector_t block; |
@@ -355,16 +347,9 @@ static struct page *read_page(struct file *file, unsigned long index, | |||
355 | pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, | 347 | pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, |
356 | (unsigned long long)index << PAGE_SHIFT); | 348 | (unsigned long long)index << PAGE_SHIFT); |
357 | 349 | ||
358 | page = alloc_page(GFP_KERNEL); | ||
359 | if (!page) | ||
360 | page = ERR_PTR(-ENOMEM); | ||
361 | if (IS_ERR(page)) | ||
362 | goto out; | ||
363 | |||
364 | bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); | 350 | bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); |
365 | if (!bh) { | 351 | if (!bh) { |
366 | put_page(page); | 352 | ret = -ENOMEM; |
367 | page = ERR_PTR(-ENOMEM); | ||
368 | goto out; | 353 | goto out; |
369 | } | 354 | } |
370 | attach_page_buffers(page, bh); | 355 | attach_page_buffers(page, bh); |
@@ -376,8 +361,7 @@ static struct page *read_page(struct file *file, unsigned long index, | |||
376 | bh->b_blocknr = bmap(inode, block); | 361 | bh->b_blocknr = bmap(inode, block); |
377 | if (bh->b_blocknr == 0) { | 362 | if (bh->b_blocknr == 0) { |
378 | /* Cannot use this file! */ | 363 | /* Cannot use this file! */ |
379 | free_buffers(page); | 364 | ret = -EINVAL; |
380 | page = ERR_PTR(-EINVAL); | ||
381 | goto out; | 365 | goto out; |
382 | } | 366 | } |
383 | bh->b_bdev = inode->i_sb->s_bdev; | 367 | bh->b_bdev = inode->i_sb->s_bdev; |
@@ -400,17 +384,15 @@ static struct page *read_page(struct file *file, unsigned long index, | |||
400 | 384 | ||
401 | wait_event(bitmap->write_wait, | 385 | wait_event(bitmap->write_wait, |
402 | atomic_read(&bitmap->pending_writes)==0); | 386 | atomic_read(&bitmap->pending_writes)==0); |
403 | if (bitmap->flags & BITMAP_WRITE_ERROR) { | 387 | if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) |
404 | free_buffers(page); | 388 | ret = -EIO; |
405 | page = ERR_PTR(-EIO); | ||
406 | } | ||
407 | out: | 389 | out: |
408 | if (IS_ERR(page)) | 390 | if (ret) |
409 | printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %ld\n", | 391 | printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n", |
410 | (int)PAGE_SIZE, | 392 | (int)PAGE_SIZE, |
411 | (unsigned long long)index << PAGE_SHIFT, | 393 | (unsigned long long)index << PAGE_SHIFT, |
412 | PTR_ERR(page)); | 394 | ret); |
413 | return page; | 395 | return ret; |
414 | } | 396 | } |
415 | 397 | ||
416 | /* | 398 | /* |
@@ -426,9 +408,9 @@ void bitmap_update_sb(struct bitmap *bitmap) | |||
426 | return; | 408 | return; |
427 | if (bitmap->mddev->bitmap_info.external) | 409 | if (bitmap->mddev->bitmap_info.external) |
428 | return; | 410 | return; |
429 | if (!bitmap->sb_page) /* no superblock */ | 411 | if (!bitmap->storage.sb_page) /* no superblock */ |
430 | return; | 412 | return; |
431 | sb = kmap_atomic(bitmap->sb_page); | 413 | sb = kmap_atomic(bitmap->storage.sb_page); |
432 | sb->events = cpu_to_le64(bitmap->mddev->events); | 414 | sb->events = cpu_to_le64(bitmap->mddev->events); |
433 | if (bitmap->mddev->events < bitmap->events_cleared) | 415 | if (bitmap->mddev->events < bitmap->events_cleared) |
434 | /* rocking back to read-only */ | 416 | /* rocking back to read-only */ |
@@ -438,8 +420,13 @@ void bitmap_update_sb(struct bitmap *bitmap) | |||
438 | /* Just in case these have been changed via sysfs: */ | 420 | /* Just in case these have been changed via sysfs: */ |
439 | sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); | 421 | sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); |
440 | sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); | 422 | sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); |
423 | /* This might have been changed by a reshape */ | ||
424 | sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); | ||
425 | sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); | ||
426 | sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> | ||
427 | bitmap_info.space); | ||
441 | kunmap_atomic(sb); | 428 | kunmap_atomic(sb); |
442 | write_page(bitmap, bitmap->sb_page, 1); | 429 | write_page(bitmap, bitmap->storage.sb_page, 1); |
443 | } | 430 | } |
444 | 431 | ||
445 | /* print out the bitmap file superblock */ | 432 | /* print out the bitmap file superblock */ |
@@ -447,9 +434,9 @@ void bitmap_print_sb(struct bitmap *bitmap) | |||
447 | { | 434 | { |
448 | bitmap_super_t *sb; | 435 | bitmap_super_t *sb; |
449 | 436 | ||
450 | if (!bitmap || !bitmap->sb_page) | 437 | if (!bitmap || !bitmap->storage.sb_page) |
451 | return; | 438 | return; |
452 | sb = kmap_atomic(bitmap->sb_page); | 439 | sb = kmap_atomic(bitmap->storage.sb_page); |
453 | printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); | 440 | printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); |
454 | printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); | 441 | printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); |
455 | printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); | 442 | printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); |
@@ -488,15 +475,15 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) | |||
488 | unsigned long chunksize, daemon_sleep, write_behind; | 475 | unsigned long chunksize, daemon_sleep, write_behind; |
489 | int err = -EINVAL; | 476 | int err = -EINVAL; |
490 | 477 | ||
491 | bitmap->sb_page = alloc_page(GFP_KERNEL); | 478 | bitmap->storage.sb_page = alloc_page(GFP_KERNEL); |
492 | if (IS_ERR(bitmap->sb_page)) { | 479 | if (IS_ERR(bitmap->storage.sb_page)) { |
493 | err = PTR_ERR(bitmap->sb_page); | 480 | err = PTR_ERR(bitmap->storage.sb_page); |
494 | bitmap->sb_page = NULL; | 481 | bitmap->storage.sb_page = NULL; |
495 | return err; | 482 | return err; |
496 | } | 483 | } |
497 | bitmap->sb_page->index = 0; | 484 | bitmap->storage.sb_page->index = 0; |
498 | 485 | ||
499 | sb = kmap_atomic(bitmap->sb_page); | 486 | sb = kmap_atomic(bitmap->storage.sb_page); |
500 | 487 | ||
501 | sb->magic = cpu_to_le32(BITMAP_MAGIC); | 488 | sb->magic = cpu_to_le32(BITMAP_MAGIC); |
502 | sb->version = cpu_to_le32(BITMAP_MAJOR_HI); | 489 | sb->version = cpu_to_le32(BITMAP_MAJOR_HI); |
@@ -534,8 +521,8 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) | |||
534 | 521 | ||
535 | memcpy(sb->uuid, bitmap->mddev->uuid, 16); | 522 | memcpy(sb->uuid, bitmap->mddev->uuid, 16); |
536 | 523 | ||
537 | bitmap->flags |= BITMAP_STALE; | 524 | set_bit(BITMAP_STALE, &bitmap->flags); |
538 | sb->state |= cpu_to_le32(BITMAP_STALE); | 525 | sb->state = cpu_to_le32(bitmap->flags); |
539 | bitmap->events_cleared = bitmap->mddev->events; | 526 | bitmap->events_cleared = bitmap->mddev->events; |
540 | sb->events_cleared = cpu_to_le64(bitmap->mddev->events); | 527 | sb->events_cleared = cpu_to_le64(bitmap->mddev->events); |
541 | 528 | ||
@@ -551,31 +538,45 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
551 | bitmap_super_t *sb; | 538 | bitmap_super_t *sb; |
552 | unsigned long chunksize, daemon_sleep, write_behind; | 539 | unsigned long chunksize, daemon_sleep, write_behind; |
553 | unsigned long long events; | 540 | unsigned long long events; |
541 | unsigned long sectors_reserved = 0; | ||
554 | int err = -EINVAL; | 542 | int err = -EINVAL; |
543 | struct page *sb_page; | ||
555 | 544 | ||
545 | if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { | ||
546 | chunksize = 128 * 1024 * 1024; | ||
547 | daemon_sleep = 5 * HZ; | ||
548 | write_behind = 0; | ||
549 | set_bit(BITMAP_STALE, &bitmap->flags); | ||
550 | err = 0; | ||
551 | goto out_no_sb; | ||
552 | } | ||
556 | /* page 0 is the superblock, read it... */ | 553 | /* page 0 is the superblock, read it... */ |
557 | if (bitmap->file) { | 554 | sb_page = alloc_page(GFP_KERNEL); |
558 | loff_t isize = i_size_read(bitmap->file->f_mapping->host); | 555 | if (!sb_page) |
556 | return -ENOMEM; | ||
557 | bitmap->storage.sb_page = sb_page; | ||
558 | |||
559 | if (bitmap->storage.file) { | ||
560 | loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); | ||
559 | int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; | 561 | int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; |
560 | 562 | ||
561 | bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); | 563 | err = read_page(bitmap->storage.file, 0, |
564 | bitmap, bytes, sb_page); | ||
562 | } else { | 565 | } else { |
563 | bitmap->sb_page = read_sb_page(bitmap->mddev, | 566 | err = read_sb_page(bitmap->mddev, |
564 | bitmap->mddev->bitmap_info.offset, | 567 | bitmap->mddev->bitmap_info.offset, |
565 | NULL, | 568 | sb_page, |
566 | 0, sizeof(bitmap_super_t)); | 569 | 0, sizeof(bitmap_super_t)); |
567 | } | 570 | } |
568 | if (IS_ERR(bitmap->sb_page)) { | 571 | if (err) |
569 | err = PTR_ERR(bitmap->sb_page); | ||
570 | bitmap->sb_page = NULL; | ||
571 | return err; | 572 | return err; |
572 | } | ||
573 | 573 | ||
574 | sb = kmap_atomic(bitmap->sb_page); | 574 | sb = kmap_atomic(sb_page); |
575 | 575 | ||
576 | chunksize = le32_to_cpu(sb->chunksize); | 576 | chunksize = le32_to_cpu(sb->chunksize); |
577 | daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; | 577 | daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; |
578 | write_behind = le32_to_cpu(sb->write_behind); | 578 | write_behind = le32_to_cpu(sb->write_behind); |
579 | sectors_reserved = le32_to_cpu(sb->sectors_reserved); | ||
579 | 580 | ||
580 | /* verify that the bitmap-specific fields are valid */ | 581 | /* verify that the bitmap-specific fields are valid */ |
581 | if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) | 582 | if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) |
@@ -618,60 +619,32 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
618 | "-- forcing full recovery\n", | 619 | "-- forcing full recovery\n", |
619 | bmname(bitmap), events, | 620 | bmname(bitmap), events, |
620 | (unsigned long long) bitmap->mddev->events); | 621 | (unsigned long long) bitmap->mddev->events); |
621 | sb->state |= cpu_to_le32(BITMAP_STALE); | 622 | set_bit(BITMAP_STALE, &bitmap->flags); |
622 | } | 623 | } |
623 | } | 624 | } |
624 | 625 | ||
625 | /* assign fields using values from superblock */ | 626 | /* assign fields using values from superblock */ |
626 | bitmap->mddev->bitmap_info.chunksize = chunksize; | ||
627 | bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; | ||
628 | bitmap->mddev->bitmap_info.max_write_behind = write_behind; | ||
629 | bitmap->flags |= le32_to_cpu(sb->state); | 627 | bitmap->flags |= le32_to_cpu(sb->state); |
630 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) | 628 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) |
631 | bitmap->flags |= BITMAP_HOSTENDIAN; | 629 | set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); |
632 | bitmap->events_cleared = le64_to_cpu(sb->events_cleared); | 630 | bitmap->events_cleared = le64_to_cpu(sb->events_cleared); |
633 | if (bitmap->flags & BITMAP_STALE) | ||
634 | bitmap->events_cleared = bitmap->mddev->events; | ||
635 | err = 0; | 631 | err = 0; |
636 | out: | 632 | out: |
637 | kunmap_atomic(sb); | 633 | kunmap_atomic(sb); |
634 | out_no_sb: | ||
635 | if (test_bit(BITMAP_STALE, &bitmap->flags)) | ||
636 | bitmap->events_cleared = bitmap->mddev->events; | ||
637 | bitmap->mddev->bitmap_info.chunksize = chunksize; | ||
638 | bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; | ||
639 | bitmap->mddev->bitmap_info.max_write_behind = write_behind; | ||
640 | if (bitmap->mddev->bitmap_info.space == 0 || | ||
641 | bitmap->mddev->bitmap_info.space > sectors_reserved) | ||
642 | bitmap->mddev->bitmap_info.space = sectors_reserved; | ||
638 | if (err) | 643 | if (err) |
639 | bitmap_print_sb(bitmap); | 644 | bitmap_print_sb(bitmap); |
640 | return err; | 645 | return err; |
641 | } | 646 | } |
642 | 647 | ||
643 | enum bitmap_mask_op { | ||
644 | MASK_SET, | ||
645 | MASK_UNSET | ||
646 | }; | ||
647 | |||
648 | /* record the state of the bitmap in the superblock. Return the old value */ | ||
649 | static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, | ||
650 | enum bitmap_mask_op op) | ||
651 | { | ||
652 | bitmap_super_t *sb; | ||
653 | int old; | ||
654 | |||
655 | if (!bitmap->sb_page) /* can't set the state */ | ||
656 | return 0; | ||
657 | sb = kmap_atomic(bitmap->sb_page); | ||
658 | old = le32_to_cpu(sb->state) & bits; | ||
659 | switch (op) { | ||
660 | case MASK_SET: | ||
661 | sb->state |= cpu_to_le32(bits); | ||
662 | bitmap->flags |= bits; | ||
663 | break; | ||
664 | case MASK_UNSET: | ||
665 | sb->state &= cpu_to_le32(~bits); | ||
666 | bitmap->flags &= ~bits; | ||
667 | break; | ||
668 | default: | ||
669 | BUG(); | ||
670 | } | ||
671 | kunmap_atomic(sb); | ||
672 | return old; | ||
673 | } | ||
674 | |||
675 | /* | 648 | /* |
676 | * general bitmap file operations | 649 | * general bitmap file operations |
677 | */ | 650 | */ |
@@ -683,17 +656,19 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, | |||
683 | * file a page at a time. There's a superblock at the start of the file. | 656 | * file a page at a time. There's a superblock at the start of the file. |
684 | */ | 657 | */ |
685 | /* calculate the index of the page that contains this bit */ | 658 | /* calculate the index of the page that contains this bit */ |
686 | static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk) | 659 | static inline unsigned long file_page_index(struct bitmap_storage *store, |
660 | unsigned long chunk) | ||
687 | { | 661 | { |
688 | if (!bitmap->mddev->bitmap_info.external) | 662 | if (store->sb_page) |
689 | chunk += sizeof(bitmap_super_t) << 3; | 663 | chunk += sizeof(bitmap_super_t) << 3; |
690 | return chunk >> PAGE_BIT_SHIFT; | 664 | return chunk >> PAGE_BIT_SHIFT; |
691 | } | 665 | } |
692 | 666 | ||
693 | /* calculate the (bit) offset of this bit within a page */ | 667 | /* calculate the (bit) offset of this bit within a page */ |
694 | static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk) | 668 | static inline unsigned long file_page_offset(struct bitmap_storage *store, |
669 | unsigned long chunk) | ||
695 | { | 670 | { |
696 | if (!bitmap->mddev->bitmap_info.external) | 671 | if (store->sb_page) |
697 | chunk += sizeof(bitmap_super_t) << 3; | 672 | chunk += sizeof(bitmap_super_t) << 3; |
698 | return chunk & (PAGE_BITS - 1); | 673 | return chunk & (PAGE_BITS - 1); |
699 | } | 674 | } |
@@ -705,57 +680,86 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon | |||
705 | * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page | 680 | * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page |
706 | * 0 or page 1 | 681 | * 0 or page 1 |
707 | */ | 682 | */ |
708 | static inline struct page *filemap_get_page(struct bitmap *bitmap, | 683 | static inline struct page *filemap_get_page(struct bitmap_storage *store, |
709 | unsigned long chunk) | 684 | unsigned long chunk) |
710 | { | 685 | { |
711 | if (file_page_index(bitmap, chunk) >= bitmap->file_pages) | 686 | if (file_page_index(store, chunk) >= store->file_pages) |
712 | return NULL; | 687 | return NULL; |
713 | return bitmap->filemap[file_page_index(bitmap, chunk) | 688 | return store->filemap[file_page_index(store, chunk) |
714 | - file_page_index(bitmap, 0)]; | 689 | - file_page_index(store, 0)]; |
715 | } | 690 | } |
716 | 691 | ||
717 | static void bitmap_file_unmap(struct bitmap *bitmap) | 692 | static int bitmap_storage_alloc(struct bitmap_storage *store, |
693 | unsigned long chunks, int with_super) | ||
694 | { | ||
695 | int pnum; | ||
696 | unsigned long num_pages; | ||
697 | unsigned long bytes; | ||
698 | |||
699 | bytes = DIV_ROUND_UP(chunks, 8); | ||
700 | if (with_super) | ||
701 | bytes += sizeof(bitmap_super_t); | ||
702 | |||
703 | num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); | ||
704 | |||
705 | store->filemap = kmalloc(sizeof(struct page *) | ||
706 | * num_pages, GFP_KERNEL); | ||
707 | if (!store->filemap) | ||
708 | return -ENOMEM; | ||
709 | |||
710 | if (with_super && !store->sb_page) { | ||
711 | store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); | ||
712 | if (store->sb_page == NULL) | ||
713 | return -ENOMEM; | ||
714 | store->sb_page->index = 0; | ||
715 | } | ||
716 | pnum = 0; | ||
717 | if (store->sb_page) { | ||
718 | store->filemap[0] = store->sb_page; | ||
719 | pnum = 1; | ||
720 | } | ||
721 | for ( ; pnum < num_pages; pnum++) { | ||
722 | store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); | ||
723 | if (!store->filemap[pnum]) { | ||
724 | store->file_pages = pnum; | ||
725 | return -ENOMEM; | ||
726 | } | ||
727 | store->filemap[pnum]->index = pnum; | ||
728 | } | ||
729 | store->file_pages = pnum; | ||
730 | |||
731 | /* We need 4 bits per page, rounded up to a multiple | ||
732 | * of sizeof(unsigned long) */ | ||
733 | store->filemap_attr = kzalloc( | ||
734 | roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), | ||
735 | GFP_KERNEL); | ||
736 | if (!store->filemap_attr) | ||
737 | return -ENOMEM; | ||
738 | |||
739 | store->bytes = bytes; | ||
740 | |||
741 | return 0; | ||
742 | } | ||
743 | |||
744 | static void bitmap_file_unmap(struct bitmap_storage *store) | ||
718 | { | 745 | { |
719 | struct page **map, *sb_page; | 746 | struct page **map, *sb_page; |
720 | unsigned long *attr; | ||
721 | int pages; | 747 | int pages; |
722 | unsigned long flags; | 748 | struct file *file; |
723 | 749 | ||
724 | spin_lock_irqsave(&bitmap->lock, flags); | 750 | file = store->file; |
725 | map = bitmap->filemap; | 751 | map = store->filemap; |
726 | bitmap->filemap = NULL; | 752 | pages = store->file_pages; |
727 | attr = bitmap->filemap_attr; | 753 | sb_page = store->sb_page; |
728 | bitmap->filemap_attr = NULL; | ||
729 | pages = bitmap->file_pages; | ||
730 | bitmap->file_pages = 0; | ||
731 | sb_page = bitmap->sb_page; | ||
732 | bitmap->sb_page = NULL; | ||
733 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
734 | 754 | ||
735 | while (pages--) | 755 | while (pages--) |
736 | if (map[pages] != sb_page) /* 0 is sb_page, release it below */ | 756 | if (map[pages] != sb_page) /* 0 is sb_page, release it below */ |
737 | free_buffers(map[pages]); | 757 | free_buffers(map[pages]); |
738 | kfree(map); | 758 | kfree(map); |
739 | kfree(attr); | 759 | kfree(store->filemap_attr); |
740 | 760 | ||
741 | if (sb_page) | 761 | if (sb_page) |
742 | free_buffers(sb_page); | 762 | free_buffers(sb_page); |
743 | } | ||
744 | |||
745 | static void bitmap_file_put(struct bitmap *bitmap) | ||
746 | { | ||
747 | struct file *file; | ||
748 | unsigned long flags; | ||
749 | |||
750 | spin_lock_irqsave(&bitmap->lock, flags); | ||
751 | file = bitmap->file; | ||
752 | bitmap->file = NULL; | ||
753 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
754 | |||
755 | if (file) | ||
756 | wait_event(bitmap->write_wait, | ||
757 | atomic_read(&bitmap->pending_writes)==0); | ||
758 | bitmap_file_unmap(bitmap); | ||
759 | 763 | ||
760 | if (file) { | 764 | if (file) { |
761 | struct inode *inode = file->f_path.dentry->d_inode; | 765 | struct inode *inode = file->f_path.dentry->d_inode; |
@@ -773,14 +777,14 @@ static void bitmap_file_kick(struct bitmap *bitmap) | |||
773 | { | 777 | { |
774 | char *path, *ptr = NULL; | 778 | char *path, *ptr = NULL; |
775 | 779 | ||
776 | if (bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET) == 0) { | 780 | if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { |
777 | bitmap_update_sb(bitmap); | 781 | bitmap_update_sb(bitmap); |
778 | 782 | ||
779 | if (bitmap->file) { | 783 | if (bitmap->storage.file) { |
780 | path = kmalloc(PAGE_SIZE, GFP_KERNEL); | 784 | path = kmalloc(PAGE_SIZE, GFP_KERNEL); |
781 | if (path) | 785 | if (path) |
782 | ptr = d_path(&bitmap->file->f_path, path, | 786 | ptr = d_path(&bitmap->storage.file->f_path, |
783 | PAGE_SIZE); | 787 | path, PAGE_SIZE); |
784 | 788 | ||
785 | printk(KERN_ALERT | 789 | printk(KERN_ALERT |
786 | "%s: kicking failed bitmap file %s from array!\n", | 790 | "%s: kicking failed bitmap file %s from array!\n", |
@@ -792,10 +796,6 @@ static void bitmap_file_kick(struct bitmap *bitmap) | |||
792 | "%s: disabling internal bitmap due to errors\n", | 796 | "%s: disabling internal bitmap due to errors\n", |
793 | bmname(bitmap)); | 797 | bmname(bitmap)); |
794 | } | 798 | } |
795 | |||
796 | bitmap_file_put(bitmap); | ||
797 | |||
798 | return; | ||
799 | } | 799 | } |
800 | 800 | ||
801 | enum bitmap_page_attr { | 801 | enum bitmap_page_attr { |
@@ -805,24 +805,30 @@ enum bitmap_page_attr { | |||
805 | BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ | 805 | BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ |
806 | }; | 806 | }; |
807 | 807 | ||
808 | static inline void set_page_attr(struct bitmap *bitmap, struct page *page, | 808 | static inline void set_page_attr(struct bitmap *bitmap, int pnum, |
809 | enum bitmap_page_attr attr) | 809 | enum bitmap_page_attr attr) |
810 | { | 810 | { |
811 | __set_bit((page->index<<2) + attr, bitmap->filemap_attr); | 811 | set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); |
812 | } | 812 | } |
813 | 813 | ||
814 | static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, | 814 | static inline void clear_page_attr(struct bitmap *bitmap, int pnum, |
815 | enum bitmap_page_attr attr) | 815 | enum bitmap_page_attr attr) |
816 | { | 816 | { |
817 | __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); | 817 | clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); |
818 | } | 818 | } |
819 | 819 | ||
820 | static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, | 820 | static inline int test_page_attr(struct bitmap *bitmap, int pnum, |
821 | enum bitmap_page_attr attr) | 821 | enum bitmap_page_attr attr) |
822 | { | 822 | { |
823 | return test_bit((page->index<<2) + attr, bitmap->filemap_attr); | 823 | return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); |
824 | } | 824 | } |
825 | 825 | ||
826 | static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum, | ||
827 | enum bitmap_page_attr attr) | ||
828 | { | ||
829 | return test_and_clear_bit((pnum<<2) + attr, | ||
830 | bitmap->storage.filemap_attr); | ||
831 | } | ||
826 | /* | 832 | /* |
827 | * bitmap_file_set_bit -- called before performing a write to the md device | 833 | * bitmap_file_set_bit -- called before performing a write to the md device |
828 | * to set (and eventually sync) a particular bit in the bitmap file | 834 | * to set (and eventually sync) a particular bit in the bitmap file |
@@ -835,26 +841,46 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) | |||
835 | unsigned long bit; | 841 | unsigned long bit; |
836 | struct page *page; | 842 | struct page *page; |
837 | void *kaddr; | 843 | void *kaddr; |
838 | unsigned long chunk = block >> bitmap->chunkshift; | 844 | unsigned long chunk = block >> bitmap->counts.chunkshift; |
839 | 845 | ||
840 | if (!bitmap->filemap) | 846 | page = filemap_get_page(&bitmap->storage, chunk); |
841 | return; | ||
842 | |||
843 | page = filemap_get_page(bitmap, chunk); | ||
844 | if (!page) | 847 | if (!page) |
845 | return; | 848 | return; |
846 | bit = file_page_offset(bitmap, chunk); | 849 | bit = file_page_offset(&bitmap->storage, chunk); |
847 | 850 | ||
848 | /* set the bit */ | 851 | /* set the bit */ |
849 | kaddr = kmap_atomic(page); | 852 | kaddr = kmap_atomic(page); |
850 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 853 | if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) |
851 | set_bit(bit, kaddr); | 854 | set_bit(bit, kaddr); |
852 | else | 855 | else |
853 | __set_bit_le(bit, kaddr); | 856 | test_and_set_bit_le(bit, kaddr); |
854 | kunmap_atomic(kaddr); | 857 | kunmap_atomic(kaddr); |
855 | pr_debug("set file bit %lu page %lu\n", bit, page->index); | 858 | pr_debug("set file bit %lu page %lu\n", bit, page->index); |
856 | /* record page number so it gets flushed to disk when unplug occurs */ | 859 | /* record page number so it gets flushed to disk when unplug occurs */ |
857 | set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); | 860 | set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY); |
861 | } | ||
862 | |||
863 | static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) | ||
864 | { | ||
865 | unsigned long bit; | ||
866 | struct page *page; | ||
867 | void *paddr; | ||
868 | unsigned long chunk = block >> bitmap->counts.chunkshift; | ||
869 | |||
870 | page = filemap_get_page(&bitmap->storage, chunk); | ||
871 | if (!page) | ||
872 | return; | ||
873 | bit = file_page_offset(&bitmap->storage, chunk); | ||
874 | paddr = kmap_atomic(page); | ||
875 | if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) | ||
876 | clear_bit(bit, paddr); | ||
877 | else | ||
878 | test_and_clear_bit_le(bit, paddr); | ||
879 | kunmap_atomic(paddr); | ||
880 | if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) { | ||
881 | set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING); | ||
882 | bitmap->allclean = 0; | ||
883 | } | ||
858 | } | 884 | } |
859 | 885 | ||
860 | /* this gets called when the md device is ready to unplug its underlying | 886 | /* this gets called when the md device is ready to unplug its underlying |
@@ -862,42 +888,37 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) | |||
862 | * sync the dirty pages of the bitmap file to disk */ | 888 | * sync the dirty pages of the bitmap file to disk */ |
863 | void bitmap_unplug(struct bitmap *bitmap) | 889 | void bitmap_unplug(struct bitmap *bitmap) |
864 | { | 890 | { |
865 | unsigned long i, flags; | 891 | unsigned long i; |
866 | int dirty, need_write; | 892 | int dirty, need_write; |
867 | struct page *page; | ||
868 | int wait = 0; | 893 | int wait = 0; |
869 | 894 | ||
870 | if (!bitmap) | 895 | if (!bitmap || !bitmap->storage.filemap || |
896 | test_bit(BITMAP_STALE, &bitmap->flags)) | ||
871 | return; | 897 | return; |
872 | 898 | ||
873 | /* look at each page to see if there are any set bits that need to be | 899 | /* look at each page to see if there are any set bits that need to be |
874 | * flushed out to disk */ | 900 | * flushed out to disk */ |
875 | for (i = 0; i < bitmap->file_pages; i++) { | 901 | for (i = 0; i < bitmap->storage.file_pages; i++) { |
876 | spin_lock_irqsave(&bitmap->lock, flags); | 902 | if (!bitmap->storage.filemap) |
877 | if (!bitmap->filemap) { | ||
878 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
879 | return; | 903 | return; |
904 | dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); | ||
905 | need_write = test_and_clear_page_attr(bitmap, i, | ||
906 | BITMAP_PAGE_NEEDWRITE); | ||
907 | if (dirty || need_write) { | ||
908 | clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); | ||
909 | write_page(bitmap, bitmap->storage.filemap[i], 0); | ||
880 | } | 910 | } |
881 | page = bitmap->filemap[i]; | ||
882 | dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); | ||
883 | need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); | ||
884 | clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); | ||
885 | clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); | ||
886 | if (dirty) | 911 | if (dirty) |
887 | wait = 1; | 912 | wait = 1; |
888 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
889 | |||
890 | if (dirty || need_write) | ||
891 | write_page(bitmap, page, 0); | ||
892 | } | 913 | } |
893 | if (wait) { /* if any writes were performed, we need to wait on them */ | 914 | if (wait) { /* if any writes were performed, we need to wait on them */ |
894 | if (bitmap->file) | 915 | if (bitmap->storage.file) |
895 | wait_event(bitmap->write_wait, | 916 | wait_event(bitmap->write_wait, |
896 | atomic_read(&bitmap->pending_writes)==0); | 917 | atomic_read(&bitmap->pending_writes)==0); |
897 | else | 918 | else |
898 | md_super_wait(bitmap->mddev); | 919 | md_super_wait(bitmap->mddev); |
899 | } | 920 | } |
900 | if (bitmap->flags & BITMAP_WRITE_ERROR) | 921 | if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) |
901 | bitmap_file_kick(bitmap); | 922 | bitmap_file_kick(bitmap); |
902 | } | 923 | } |
903 | EXPORT_SYMBOL(bitmap_unplug); | 924 | EXPORT_SYMBOL(bitmap_unplug); |
@@ -917,98 +938,77 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n | |||
917 | static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | 938 | static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) |
918 | { | 939 | { |
919 | unsigned long i, chunks, index, oldindex, bit; | 940 | unsigned long i, chunks, index, oldindex, bit; |
920 | struct page *page = NULL, *oldpage = NULL; | 941 | struct page *page = NULL; |
921 | unsigned long num_pages, bit_cnt = 0; | 942 | unsigned long bit_cnt = 0; |
922 | struct file *file; | 943 | struct file *file; |
923 | unsigned long bytes, offset; | 944 | unsigned long offset; |
924 | int outofdate; | 945 | int outofdate; |
925 | int ret = -ENOSPC; | 946 | int ret = -ENOSPC; |
926 | void *paddr; | 947 | void *paddr; |
948 | struct bitmap_storage *store = &bitmap->storage; | ||
927 | 949 | ||
928 | chunks = bitmap->chunks; | 950 | chunks = bitmap->counts.chunks; |
929 | file = bitmap->file; | 951 | file = store->file; |
930 | 952 | ||
931 | BUG_ON(!file && !bitmap->mddev->bitmap_info.offset); | 953 | if (!file && !bitmap->mddev->bitmap_info.offset) { |
954 | /* No permanent bitmap - fill with '1s'. */ | ||
955 | store->filemap = NULL; | ||
956 | store->file_pages = 0; | ||
957 | for (i = 0; i < chunks ; i++) { | ||
958 | /* if the disk bit is set, set the memory bit */ | ||
959 | int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift) | ||
960 | >= start); | ||
961 | bitmap_set_memory_bits(bitmap, | ||
962 | (sector_t)i << bitmap->counts.chunkshift, | ||
963 | needed); | ||
964 | } | ||
965 | return 0; | ||
966 | } | ||
932 | 967 | ||
933 | outofdate = bitmap->flags & BITMAP_STALE; | 968 | outofdate = test_bit(BITMAP_STALE, &bitmap->flags); |
934 | if (outofdate) | 969 | if (outofdate) |
935 | printk(KERN_INFO "%s: bitmap file is out of date, doing full " | 970 | printk(KERN_INFO "%s: bitmap file is out of date, doing full " |
936 | "recovery\n", bmname(bitmap)); | 971 | "recovery\n", bmname(bitmap)); |
937 | 972 | ||
938 | bytes = DIV_ROUND_UP(bitmap->chunks, 8); | 973 | if (file && i_size_read(file->f_mapping->host) < store->bytes) { |
939 | if (!bitmap->mddev->bitmap_info.external) | ||
940 | bytes += sizeof(bitmap_super_t); | ||
941 | |||
942 | num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); | ||
943 | |||
944 | if (file && i_size_read(file->f_mapping->host) < bytes) { | ||
945 | printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", | 974 | printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", |
946 | bmname(bitmap), | 975 | bmname(bitmap), |
947 | (unsigned long) i_size_read(file->f_mapping->host), | 976 | (unsigned long) i_size_read(file->f_mapping->host), |
948 | bytes); | 977 | store->bytes); |
949 | goto err; | 978 | goto err; |
950 | } | 979 | } |
951 | 980 | ||
952 | ret = -ENOMEM; | ||
953 | |||
954 | bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); | ||
955 | if (!bitmap->filemap) | ||
956 | goto err; | ||
957 | |||
958 | /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */ | ||
959 | bitmap->filemap_attr = kzalloc( | ||
960 | roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), | ||
961 | GFP_KERNEL); | ||
962 | if (!bitmap->filemap_attr) | ||
963 | goto err; | ||
964 | |||
965 | oldindex = ~0L; | 981 | oldindex = ~0L; |
982 | offset = 0; | ||
983 | if (!bitmap->mddev->bitmap_info.external) | ||
984 | offset = sizeof(bitmap_super_t); | ||
966 | 985 | ||
967 | for (i = 0; i < chunks; i++) { | 986 | for (i = 0; i < chunks; i++) { |
968 | int b; | 987 | int b; |
969 | index = file_page_index(bitmap, i); | 988 | index = file_page_index(&bitmap->storage, i); |
970 | bit = file_page_offset(bitmap, i); | 989 | bit = file_page_offset(&bitmap->storage, i); |
971 | if (index != oldindex) { /* this is a new page, read it in */ | 990 | if (index != oldindex) { /* this is a new page, read it in */ |
972 | int count; | 991 | int count; |
973 | /* unmap the old page, we're done with it */ | 992 | /* unmap the old page, we're done with it */ |
974 | if (index == num_pages-1) | 993 | if (index == store->file_pages-1) |
975 | count = bytes - index * PAGE_SIZE; | 994 | count = store->bytes - index * PAGE_SIZE; |
976 | else | 995 | else |
977 | count = PAGE_SIZE; | 996 | count = PAGE_SIZE; |
978 | if (index == 0 && bitmap->sb_page) { | 997 | page = store->filemap[index]; |
979 | /* | 998 | if (file) |
980 | * if we're here then the superblock page | 999 | ret = read_page(file, index, bitmap, |
981 | * contains some bits (PAGE_SIZE != sizeof sb) | 1000 | count, page); |
982 | * we've already read it in, so just use it | 1001 | else |
983 | */ | 1002 | ret = read_sb_page( |
984 | page = bitmap->sb_page; | 1003 | bitmap->mddev, |
985 | offset = sizeof(bitmap_super_t); | 1004 | bitmap->mddev->bitmap_info.offset, |
986 | if (!file) | 1005 | page, |
987 | page = read_sb_page( | 1006 | index, count); |
988 | bitmap->mddev, | 1007 | |
989 | bitmap->mddev->bitmap_info.offset, | 1008 | if (ret) |
990 | page, | ||
991 | index, count); | ||
992 | } else if (file) { | ||
993 | page = read_page(file, index, bitmap, count); | ||
994 | offset = 0; | ||
995 | } else { | ||
996 | page = read_sb_page(bitmap->mddev, | ||
997 | bitmap->mddev->bitmap_info.offset, | ||
998 | NULL, | ||
999 | index, count); | ||
1000 | offset = 0; | ||
1001 | } | ||
1002 | if (IS_ERR(page)) { /* read error */ | ||
1003 | ret = PTR_ERR(page); | ||
1004 | goto err; | 1009 | goto err; |
1005 | } | ||
1006 | 1010 | ||
1007 | oldindex = index; | 1011 | oldindex = index; |
1008 | oldpage = page; | ||
1009 | |||
1010 | bitmap->filemap[bitmap->file_pages++] = page; | ||
1011 | bitmap->last_page_size = count; | ||
1012 | 1012 | ||
1013 | if (outofdate) { | 1013 | if (outofdate) { |
1014 | /* | 1014 | /* |
@@ -1022,39 +1022,33 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
1022 | write_page(bitmap, page, 1); | 1022 | write_page(bitmap, page, 1); |
1023 | 1023 | ||
1024 | ret = -EIO; | 1024 | ret = -EIO; |
1025 | if (bitmap->flags & BITMAP_WRITE_ERROR) | 1025 | if (test_bit(BITMAP_WRITE_ERROR, |
1026 | &bitmap->flags)) | ||
1026 | goto err; | 1027 | goto err; |
1027 | } | 1028 | } |
1028 | } | 1029 | } |
1029 | paddr = kmap_atomic(page); | 1030 | paddr = kmap_atomic(page); |
1030 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 1031 | if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) |
1031 | b = test_bit(bit, paddr); | 1032 | b = test_bit(bit, paddr); |
1032 | else | 1033 | else |
1033 | b = test_bit_le(bit, paddr); | 1034 | b = test_bit_le(bit, paddr); |
1034 | kunmap_atomic(paddr); | 1035 | kunmap_atomic(paddr); |
1035 | if (b) { | 1036 | if (b) { |
1036 | /* if the disk bit is set, set the memory bit */ | 1037 | /* if the disk bit is set, set the memory bit */ |
1037 | int needed = ((sector_t)(i+1) << bitmap->chunkshift | 1038 | int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift |
1038 | >= start); | 1039 | >= start); |
1039 | bitmap_set_memory_bits(bitmap, | 1040 | bitmap_set_memory_bits(bitmap, |
1040 | (sector_t)i << bitmap->chunkshift, | 1041 | (sector_t)i << bitmap->counts.chunkshift, |
1041 | needed); | 1042 | needed); |
1042 | bit_cnt++; | 1043 | bit_cnt++; |
1043 | } | 1044 | } |
1044 | } | 1045 | offset = 0; |
1045 | |||
1046 | /* everything went OK */ | ||
1047 | ret = 0; | ||
1048 | bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); | ||
1049 | |||
1050 | if (bit_cnt) { /* Kick recovery if any bits were set */ | ||
1051 | set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); | ||
1052 | md_wakeup_thread(bitmap->mddev->thread); | ||
1053 | } | 1046 | } |
1054 | 1047 | ||
1055 | printk(KERN_INFO "%s: bitmap initialized from disk: " | 1048 | printk(KERN_INFO "%s: bitmap initialized from disk: " |
1056 | "read %lu/%lu pages, set %lu of %lu bits\n", | 1049 | "read %lu pages, set %lu of %lu bits\n", |
1057 | bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks); | 1050 | bmname(bitmap), store->file_pages, |
1051 | bit_cnt, chunks); | ||
1058 | 1052 | ||
1059 | return 0; | 1053 | return 0; |
1060 | 1054 | ||
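A minimal userspace sketch (not the driver's code; the chunkshift and start values are invented) of the per-chunk "needed" test used in the hunks above, where a chunk is flagged for resync only if it ends at or beyond the 'start' sector:

#include <stdio.h>

typedef unsigned long long sector_t;

/* A chunk covers sectors [chunk << shift, (chunk + 1) << shift).
 * It needs resync only if it ends at or beyond 'start', i.e. it was not
 * already known to be in-sync when the bitmap was read in.
 */
static int chunk_needed(sector_t chunk, unsigned int chunkshift, sector_t start)
{
	return ((chunk + 1) << chunkshift) >= start;
}

int main(void)
{
	unsigned int chunkshift = 7;	/* hypothetical: 128 sectors per chunk */
	sector_t start = 1000;
	sector_t c;

	for (c = 0; c < 10; c++)
		printf("chunk %llu: needed=%d\n", c, chunk_needed(c, chunkshift, start));
	return 0;			/* chunks 0-6 print 0, chunks 7-9 print 1 */
}

With start == 0, as for a bitmap created from scratch, the expression is true for every chunk, so every counter starts out needing resync.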
@@ -1071,22 +1065,38 @@ void bitmap_write_all(struct bitmap *bitmap) | |||
1071 | */ | 1065 | */ |
1072 | int i; | 1066 | int i; |
1073 | 1067 | ||
1074 | spin_lock_irq(&bitmap->lock); | 1068 | if (!bitmap || !bitmap->storage.filemap) |
1075 | for (i = 0; i < bitmap->file_pages; i++) | 1069 | return; |
1076 | set_page_attr(bitmap, bitmap->filemap[i], | 1070 | if (bitmap->storage.file) |
1071 | /* Only one copy, so nothing needed */ | ||
1072 | return; | ||
1073 | |||
1074 | for (i = 0; i < bitmap->storage.file_pages; i++) | ||
1075 | set_page_attr(bitmap, i, | ||
1077 | BITMAP_PAGE_NEEDWRITE); | 1076 | BITMAP_PAGE_NEEDWRITE); |
1078 | bitmap->allclean = 0; | 1077 | bitmap->allclean = 0; |
1079 | spin_unlock_irq(&bitmap->lock); | ||
1080 | } | 1078 | } |
1081 | 1079 | ||
1082 | static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) | 1080 | static void bitmap_count_page(struct bitmap_counts *bitmap, |
1081 | sector_t offset, int inc) | ||
1083 | { | 1082 | { |
1084 | sector_t chunk = offset >> bitmap->chunkshift; | 1083 | sector_t chunk = offset >> bitmap->chunkshift; |
1085 | unsigned long page = chunk >> PAGE_COUNTER_SHIFT; | 1084 | unsigned long page = chunk >> PAGE_COUNTER_SHIFT; |
1086 | bitmap->bp[page].count += inc; | 1085 | bitmap->bp[page].count += inc; |
1087 | bitmap_checkfree(bitmap, page); | 1086 | bitmap_checkfree(bitmap, page); |
1088 | } | 1087 | } |
1089 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, | 1088 | |
1089 | static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset) | ||
1090 | { | ||
1091 | sector_t chunk = offset >> bitmap->chunkshift; | ||
1092 | unsigned long page = chunk >> PAGE_COUNTER_SHIFT; | ||
1093 | struct bitmap_page *bp = &bitmap->bp[page]; | ||
1094 | |||
1095 | if (!bp->pending) | ||
1096 | bp->pending = 1; | ||
1097 | } | ||
1098 | |||
1099 | static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, | ||
1090 | sector_t offset, sector_t *blocks, | 1100 | sector_t offset, sector_t *blocks, |
1091 | int create); | 1101 | int create); |
1092 | 1102 | ||
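The 'pending' flag set by bitmap_set_pending() above (one bit per counter page, declared in the bitmap.h hunk further down) is what lets the reworked daemon loop skip whole pages of counters. A rough, compilable userspace model of that skip, assuming a hypothetical PAGE_COUNTER_SHIFT of 11 (the real value is derived from PAGE_SIZE and the counter size):

#include <stdio.h>

#define PAGE_COUNTER_SHIFT 11			/* assumed for illustration only */
#define PAGE_COUNTER_RATIO (1UL << PAGE_COUNTER_SHIFT)
#define PAGE_COUNTER_MASK  (PAGE_COUNTER_RATIO - 1)

struct page_model {
	unsigned int pending:1;
};

/* Count how many counters a daemon-style scan actually inspects:
 * a page whose pending bit is clear is stepped over in one jump.
 */
static unsigned long scan(const struct page_model *bp, unsigned long chunks)
{
	unsigned long j, visited = 0;

	for (j = 0; j < chunks; j++) {
		if ((j & PAGE_COUNTER_MASK) == 0 &&
		    !bp[j >> PAGE_COUNTER_SHIFT].pending) {
			j |= PAGE_COUNTER_MASK;	/* jump to the last chunk of this page */
			continue;
		}
		visited++;			/* the driver would examine *bmc here */
	}
	return visited;
}

int main(void)
{
	struct page_model bp[4] = { {0}, {1}, {0}, {0} };
	unsigned long chunks = 4 * PAGE_COUNTER_RATIO;

	printf("inspected %lu of %lu counters\n", scan(bp, chunks), chunks);
	return 0;				/* only page 1 is walked: 2048 of 8192 */
}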
@@ -1099,10 +1109,9 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
1099 | { | 1109 | { |
1100 | struct bitmap *bitmap; | 1110 | struct bitmap *bitmap; |
1101 | unsigned long j; | 1111 | unsigned long j; |
1102 | unsigned long flags; | 1112 | unsigned long nextpage; |
1103 | struct page *page = NULL, *lastpage = NULL; | ||
1104 | sector_t blocks; | 1113 | sector_t blocks; |
1105 | void *paddr; | 1114 | struct bitmap_counts *counts; |
1106 | 1115 | ||
1107 | /* Use a mutex to guard daemon_work against | 1116 | /* Use a mutex to guard daemon_work against |
1108 | * bitmap_destroy. | 1117 | * bitmap_destroy. |
@@ -1124,112 +1133,90 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
1124 | } | 1133 | } |
1125 | bitmap->allclean = 1; | 1134 | bitmap->allclean = 1; |
1126 | 1135 | ||
1127 | spin_lock_irqsave(&bitmap->lock, flags); | 1136 | /* Any file-page which is PENDING now needs to be written. |
1128 | for (j = 0; j < bitmap->chunks; j++) { | 1137 | * So set NEEDWRITE now, then after we make any last-minute changes |
1138 | * we will write it. | ||
1139 | */ | ||
1140 | for (j = 0; j < bitmap->storage.file_pages; j++) | ||
1141 | if (test_and_clear_page_attr(bitmap, j, | ||
1142 | BITMAP_PAGE_PENDING)) | ||
1143 | set_page_attr(bitmap, j, | ||
1144 | BITMAP_PAGE_NEEDWRITE); | ||
1145 | |||
1146 | if (bitmap->need_sync && | ||
1147 | mddev->bitmap_info.external == 0) { | ||
1148 | /* Arrange for superblock update as well as | ||
1149 | * other changes */ | ||
1150 | bitmap_super_t *sb; | ||
1151 | bitmap->need_sync = 0; | ||
1152 | if (bitmap->storage.filemap) { | ||
1153 | sb = kmap_atomic(bitmap->storage.sb_page); | ||
1154 | sb->events_cleared = | ||
1155 | cpu_to_le64(bitmap->events_cleared); | ||
1156 | kunmap_atomic(sb); | ||
1157 | set_page_attr(bitmap, 0, | ||
1158 | BITMAP_PAGE_NEEDWRITE); | ||
1159 | } | ||
1160 | } | ||
1161 | /* Now look at the bitmap counters and if any are '2' or '1', | ||
1162 | * decrement and handle accordingly. | ||
1163 | */ | ||
1164 | counts = &bitmap->counts; | ||
1165 | spin_lock_irq(&counts->lock); | ||
1166 | nextpage = 0; | ||
1167 | for (j = 0; j < counts->chunks; j++) { | ||
1129 | bitmap_counter_t *bmc; | 1168 | bitmap_counter_t *bmc; |
1130 | if (!bitmap->filemap) | 1169 | sector_t block = (sector_t)j << counts->chunkshift; |
1131 | /* error or shutdown */ | ||
1132 | break; | ||
1133 | 1170 | ||
1134 | page = filemap_get_page(bitmap, j); | 1171 | if (j == nextpage) { |
1135 | 1172 | nextpage += PAGE_COUNTER_RATIO; | |
1136 | if (page != lastpage) { | 1173 | if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) { |
1137 | /* skip this page unless it's marked as needing cleaning */ | 1174 | j |= PAGE_COUNTER_MASK; |
1138 | if (!test_page_attr(bitmap, page, BITMAP_PAGE_PENDING)) { | ||
1139 | int need_write = test_page_attr(bitmap, page, | ||
1140 | BITMAP_PAGE_NEEDWRITE); | ||
1141 | if (need_write) | ||
1142 | clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); | ||
1143 | |||
1144 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
1145 | if (need_write) | ||
1146 | write_page(bitmap, page, 0); | ||
1147 | spin_lock_irqsave(&bitmap->lock, flags); | ||
1148 | j |= (PAGE_BITS - 1); | ||
1149 | continue; | 1175 | continue; |
1150 | } | 1176 | } |
1151 | 1177 | counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0; | |
1152 | /* grab the new page, sync and release the old */ | ||
1153 | if (lastpage != NULL) { | ||
1154 | if (test_page_attr(bitmap, lastpage, | ||
1155 | BITMAP_PAGE_NEEDWRITE)) { | ||
1156 | clear_page_attr(bitmap, lastpage, | ||
1157 | BITMAP_PAGE_NEEDWRITE); | ||
1158 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
1159 | write_page(bitmap, lastpage, 0); | ||
1160 | } else { | ||
1161 | set_page_attr(bitmap, lastpage, | ||
1162 | BITMAP_PAGE_NEEDWRITE); | ||
1163 | bitmap->allclean = 0; | ||
1164 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
1165 | } | ||
1166 | } else | ||
1167 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
1168 | lastpage = page; | ||
1169 | |||
1170 | /* We are possibly going to clear some bits, so make | ||
1171 | * sure that events_cleared is up-to-date. | ||
1172 | */ | ||
1173 | if (bitmap->need_sync && | ||
1174 | mddev->bitmap_info.external == 0) { | ||
1175 | bitmap_super_t *sb; | ||
1176 | bitmap->need_sync = 0; | ||
1177 | sb = kmap_atomic(bitmap->sb_page); | ||
1178 | sb->events_cleared = | ||
1179 | cpu_to_le64(bitmap->events_cleared); | ||
1180 | kunmap_atomic(sb); | ||
1181 | write_page(bitmap, bitmap->sb_page, 1); | ||
1182 | } | ||
1183 | spin_lock_irqsave(&bitmap->lock, flags); | ||
1184 | if (!bitmap->need_sync) | ||
1185 | clear_page_attr(bitmap, page, BITMAP_PAGE_PENDING); | ||
1186 | else | ||
1187 | bitmap->allclean = 0; | ||
1188 | } | 1178 | } |
1189 | bmc = bitmap_get_counter(bitmap, | 1179 | bmc = bitmap_get_counter(counts, |
1190 | (sector_t)j << bitmap->chunkshift, | 1180 | block, |
1191 | &blocks, 0); | 1181 | &blocks, 0); |
1192 | if (!bmc) | 1182 | |
1183 | if (!bmc) { | ||
1193 | j |= PAGE_COUNTER_MASK; | 1184 | j |= PAGE_COUNTER_MASK; |
1194 | else if (*bmc) { | 1185 | continue; |
1195 | if (*bmc == 1 && !bitmap->need_sync) { | ||
1196 | /* we can clear the bit */ | ||
1197 | *bmc = 0; | ||
1198 | bitmap_count_page(bitmap, | ||
1199 | (sector_t)j << bitmap->chunkshift, | ||
1200 | -1); | ||
1201 | |||
1202 | /* clear the bit */ | ||
1203 | paddr = kmap_atomic(page); | ||
1204 | if (bitmap->flags & BITMAP_HOSTENDIAN) | ||
1205 | clear_bit(file_page_offset(bitmap, j), | ||
1206 | paddr); | ||
1207 | else | ||
1208 | __clear_bit_le( | ||
1209 | file_page_offset(bitmap, | ||
1210 | j), | ||
1211 | paddr); | ||
1212 | kunmap_atomic(paddr); | ||
1213 | } else if (*bmc <= 2) { | ||
1214 | *bmc = 1; /* maybe clear the bit next time */ | ||
1215 | set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); | ||
1216 | bitmap->allclean = 0; | ||
1217 | } | ||
1218 | } | 1186 | } |
1219 | } | 1187 | if (*bmc == 1 && !bitmap->need_sync) { |
1220 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1188 | /* We can clear the bit */ |
1221 | 1189 | *bmc = 0; | |
1222 | /* now sync the final page */ | 1190 | bitmap_count_page(counts, block, -1); |
1223 | if (lastpage != NULL) { | 1191 | bitmap_file_clear_bit(bitmap, block); |
1224 | spin_lock_irqsave(&bitmap->lock, flags); | 1192 | } else if (*bmc && *bmc <= 2) { |
1225 | if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { | 1193 | *bmc = 1; |
1226 | clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); | 1194 | bitmap_set_pending(counts, block); |
1227 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
1228 | write_page(bitmap, lastpage, 0); | ||
1229 | } else { | ||
1230 | set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); | ||
1231 | bitmap->allclean = 0; | 1195 | bitmap->allclean = 0; |
1232 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1196 | } |
1197 | } | ||
1198 | spin_unlock_irq(&counts->lock); | ||
1199 | |||
1200 | /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. | ||
1201 | * DIRTY pages need to be written by bitmap_unplug so it can wait | ||
1202 | * for them. | ||
1203 | * If we find any DIRTY page we stop there and let bitmap_unplug | ||
1204 | * handle all the rest. This is important in the case where | ||
1205 | * the first page holds the superblock and it has been updated. | ||
1206 | * We mustn't write any other blocks before the superblock. | ||
1207 | */ | ||
1208 | for (j = 0; | ||
1209 | j < bitmap->storage.file_pages | ||
1210 | && !test_bit(BITMAP_STALE, &bitmap->flags); | ||
1211 | j++) { | ||
1212 | |||
1213 | if (test_page_attr(bitmap, j, | ||
1214 | BITMAP_PAGE_DIRTY)) | ||
1215 | /* bitmap_unplug will handle the rest */ | ||
1216 | break; | ||
1217 | if (test_and_clear_page_attr(bitmap, j, | ||
1218 | BITMAP_PAGE_NEEDWRITE)) { | ||
1219 | write_page(bitmap, bitmap->storage.filemap[j], 0); | ||
1233 | } | 1220 | } |
1234 | } | 1221 | } |
1235 | 1222 | ||
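The counter handling in this hunk is a two-pass decay: a counter of 2 (no more writers, on-disk bit still set) drops to 1 on one daemon pass, and a counter of 1 drops to 0 on the next pass, at which point bitmap_file_clear_bit() clears the on-disk bit. A compilable sketch of only that decay rule (the helper name and the standalone types are invented):

#include <stdio.h>

typedef unsigned short bitmap_counter_t;

/* Returns 1 when the on-disk bit for this chunk should be cleared now. */
static int daemon_decay(bitmap_counter_t *bmc, int need_sync)
{
	if (*bmc == 1 && !need_sync) {
		*bmc = 0;		/* clean: clear the file bit */
		return 1;
	}
	if (*bmc && *bmc <= 2) {
		*bmc = 1;		/* candidate: re-check on the next pass */
		return 0;
	}
	return 0;			/* zero, active writers, or resync flags set */
}

int main(void)
{
	bitmap_counter_t c = 2;
	int pass;

	for (pass = 1; pass <= 3; pass++)
		printf("pass %d: clear=%d counter=%u\n", pass, daemon_decay(&c, 0), c);
	return 0;			/* pass 1: 2 -> 1, pass 2: 1 -> 0 and clear */
}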
@@ -1240,7 +1227,7 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
1240 | mutex_unlock(&mddev->bitmap_info.mutex); | 1227 | mutex_unlock(&mddev->bitmap_info.mutex); |
1241 | } | 1228 | } |
1242 | 1229 | ||
1243 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, | 1230 | static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, |
1244 | sector_t offset, sector_t *blocks, | 1231 | sector_t offset, sector_t *blocks, |
1245 | int create) | 1232 | int create) |
1246 | __releases(bitmap->lock) | 1233 | __releases(bitmap->lock) |
@@ -1302,10 +1289,10 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
1302 | sector_t blocks; | 1289 | sector_t blocks; |
1303 | bitmap_counter_t *bmc; | 1290 | bitmap_counter_t *bmc; |
1304 | 1291 | ||
1305 | spin_lock_irq(&bitmap->lock); | 1292 | spin_lock_irq(&bitmap->counts.lock); |
1306 | bmc = bitmap_get_counter(bitmap, offset, &blocks, 1); | 1293 | bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); |
1307 | if (!bmc) { | 1294 | if (!bmc) { |
1308 | spin_unlock_irq(&bitmap->lock); | 1295 | spin_unlock_irq(&bitmap->counts.lock); |
1309 | return 0; | 1296 | return 0; |
1310 | } | 1297 | } |
1311 | 1298 | ||
@@ -1317,7 +1304,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
1317 | */ | 1304 | */ |
1318 | prepare_to_wait(&bitmap->overflow_wait, &__wait, | 1305 | prepare_to_wait(&bitmap->overflow_wait, &__wait, |
1319 | TASK_UNINTERRUPTIBLE); | 1306 | TASK_UNINTERRUPTIBLE); |
1320 | spin_unlock_irq(&bitmap->lock); | 1307 | spin_unlock_irq(&bitmap->counts.lock); |
1321 | io_schedule(); | 1308 | io_schedule(); |
1322 | finish_wait(&bitmap->overflow_wait, &__wait); | 1309 | finish_wait(&bitmap->overflow_wait, &__wait); |
1323 | continue; | 1310 | continue; |
@@ -1326,7 +1313,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
1326 | switch (*bmc) { | 1313 | switch (*bmc) { |
1327 | case 0: | 1314 | case 0: |
1328 | bitmap_file_set_bit(bitmap, offset); | 1315 | bitmap_file_set_bit(bitmap, offset); |
1329 | bitmap_count_page(bitmap, offset, 1); | 1316 | bitmap_count_page(&bitmap->counts, offset, 1); |
1330 | /* fall through */ | 1317 | /* fall through */ |
1331 | case 1: | 1318 | case 1: |
1332 | *bmc = 2; | 1319 | *bmc = 2; |
@@ -1334,7 +1321,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
1334 | 1321 | ||
1335 | (*bmc)++; | 1322 | (*bmc)++; |
1336 | 1323 | ||
1337 | spin_unlock_irq(&bitmap->lock); | 1324 | spin_unlock_irq(&bitmap->counts.lock); |
1338 | 1325 | ||
1339 | offset += blocks; | 1326 | offset += blocks; |
1340 | if (sectors > blocks) | 1327 | if (sectors > blocks) |
@@ -1364,10 +1351,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1364 | unsigned long flags; | 1351 | unsigned long flags; |
1365 | bitmap_counter_t *bmc; | 1352 | bitmap_counter_t *bmc; |
1366 | 1353 | ||
1367 | spin_lock_irqsave(&bitmap->lock, flags); | 1354 | spin_lock_irqsave(&bitmap->counts.lock, flags); |
1368 | bmc = bitmap_get_counter(bitmap, offset, &blocks, 0); | 1355 | bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 0); |
1369 | if (!bmc) { | 1356 | if (!bmc) { |
1370 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1357 | spin_unlock_irqrestore(&bitmap->counts.lock, flags); |
1371 | return; | 1358 | return; |
1372 | } | 1359 | } |
1373 | 1360 | ||
@@ -1386,14 +1373,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1386 | 1373 | ||
1387 | (*bmc)--; | 1374 | (*bmc)--; |
1388 | if (*bmc <= 2) { | 1375 | if (*bmc <= 2) { |
1389 | set_page_attr(bitmap, | 1376 | bitmap_set_pending(&bitmap->counts, offset); |
1390 | filemap_get_page( | ||
1391 | bitmap, | ||
1392 | offset >> bitmap->chunkshift), | ||
1393 | BITMAP_PAGE_PENDING); | ||
1394 | bitmap->allclean = 0; | 1377 | bitmap->allclean = 0; |
1395 | } | 1378 | } |
1396 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1379 | spin_unlock_irqrestore(&bitmap->counts.lock, flags); |
1397 | offset += blocks; | 1380 | offset += blocks; |
1398 | if (sectors > blocks) | 1381 | if (sectors > blocks) |
1399 | sectors -= blocks; | 1382 | sectors -= blocks; |
@@ -1412,8 +1395,8 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t | |||
1412 | *blocks = 1024; | 1395 | *blocks = 1024; |
1413 | return 1; /* always resync if no bitmap */ | 1396 | return 1; /* always resync if no bitmap */ |
1414 | } | 1397 | } |
1415 | spin_lock_irq(&bitmap->lock); | 1398 | spin_lock_irq(&bitmap->counts.lock); |
1416 | bmc = bitmap_get_counter(bitmap, offset, blocks, 0); | 1399 | bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); |
1417 | rv = 0; | 1400 | rv = 0; |
1418 | if (bmc) { | 1401 | if (bmc) { |
1419 | /* locked */ | 1402 | /* locked */ |
@@ -1427,7 +1410,7 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t | |||
1427 | } | 1410 | } |
1428 | } | 1411 | } |
1429 | } | 1412 | } |
1430 | spin_unlock_irq(&bitmap->lock); | 1413 | spin_unlock_irq(&bitmap->counts.lock); |
1431 | return rv; | 1414 | return rv; |
1432 | } | 1415 | } |
1433 | 1416 | ||
@@ -1464,8 +1447,8 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i | |||
1464 | *blocks = 1024; | 1447 | *blocks = 1024; |
1465 | return; | 1448 | return; |
1466 | } | 1449 | } |
1467 | spin_lock_irqsave(&bitmap->lock, flags); | 1450 | spin_lock_irqsave(&bitmap->counts.lock, flags); |
1468 | bmc = bitmap_get_counter(bitmap, offset, blocks, 0); | 1451 | bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); |
1469 | if (bmc == NULL) | 1452 | if (bmc == NULL) |
1470 | goto unlock; | 1453 | goto unlock; |
1471 | /* locked */ | 1454 | /* locked */ |
@@ -1476,15 +1459,13 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i | |||
1476 | *bmc |= NEEDED_MASK; | 1459 | *bmc |= NEEDED_MASK; |
1477 | else { | 1460 | else { |
1478 | if (*bmc <= 2) { | 1461 | if (*bmc <= 2) { |
1479 | set_page_attr(bitmap, | 1462 | bitmap_set_pending(&bitmap->counts, offset); |
1480 | filemap_get_page(bitmap, offset >> bitmap->chunkshift), | ||
1481 | BITMAP_PAGE_PENDING); | ||
1482 | bitmap->allclean = 0; | 1463 | bitmap->allclean = 0; |
1483 | } | 1464 | } |
1484 | } | 1465 | } |
1485 | } | 1466 | } |
1486 | unlock: | 1467 | unlock: |
1487 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1468 | spin_unlock_irqrestore(&bitmap->counts.lock, flags); |
1488 | } | 1469 | } |
1489 | EXPORT_SYMBOL(bitmap_end_sync); | 1470 | EXPORT_SYMBOL(bitmap_end_sync); |
1490 | 1471 | ||
@@ -1524,7 +1505,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) | |||
1524 | 1505 | ||
1525 | bitmap->mddev->curr_resync_completed = sector; | 1506 | bitmap->mddev->curr_resync_completed = sector; |
1526 | set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); | 1507 | set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); |
1527 | sector &= ~((1ULL << bitmap->chunkshift) - 1); | 1508 | sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); |
1528 | s = 0; | 1509 | s = 0; |
1529 | while (s < sector && s < bitmap->mddev->resync_max_sectors) { | 1510 | while (s < sector && s < bitmap->mddev->resync_max_sectors) { |
1530 | bitmap_end_sync(bitmap, s, &blocks, 0); | 1511 | bitmap_end_sync(bitmap, s, &blocks, 0); |
@@ -1538,27 +1519,25 @@ EXPORT_SYMBOL(bitmap_cond_end_sync); | |||
1538 | static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) | 1519 | static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) |
1539 | { | 1520 | { |
1540 | /* For each chunk covered by any of these sectors, set the | 1521 | /* For each chunk covered by any of these sectors, set the |
1541 | * counter to 1 and set resync_needed. They should all | 1522 | * counter to 2 and possibly set resync_needed. They should all |
1542 | * be 0 at this point | 1523 | * be 0 at this point |
1543 | */ | 1524 | */ |
1544 | 1525 | ||
1545 | sector_t secs; | 1526 | sector_t secs; |
1546 | bitmap_counter_t *bmc; | 1527 | bitmap_counter_t *bmc; |
1547 | spin_lock_irq(&bitmap->lock); | 1528 | spin_lock_irq(&bitmap->counts.lock); |
1548 | bmc = bitmap_get_counter(bitmap, offset, &secs, 1); | 1529 | bmc = bitmap_get_counter(&bitmap->counts, offset, &secs, 1); |
1549 | if (!bmc) { | 1530 | if (!bmc) { |
1550 | spin_unlock_irq(&bitmap->lock); | 1531 | spin_unlock_irq(&bitmap->counts.lock); |
1551 | return; | 1532 | return; |
1552 | } | 1533 | } |
1553 | if (!*bmc) { | 1534 | if (!*bmc) { |
1554 | struct page *page; | ||
1555 | *bmc = 2 | (needed ? NEEDED_MASK : 0); | 1535 | *bmc = 2 | (needed ? NEEDED_MASK : 0); |
1556 | bitmap_count_page(bitmap, offset, 1); | 1536 | bitmap_count_page(&bitmap->counts, offset, 1); |
1557 | page = filemap_get_page(bitmap, offset >> bitmap->chunkshift); | 1537 | bitmap_set_pending(&bitmap->counts, offset); |
1558 | set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); | ||
1559 | bitmap->allclean = 0; | 1538 | bitmap->allclean = 0; |
1560 | } | 1539 | } |
1561 | spin_unlock_irq(&bitmap->lock); | 1540 | spin_unlock_irq(&bitmap->counts.lock); |
1562 | } | 1541 | } |
1563 | 1542 | ||
1564 | /* dirty the memory and file bits for bitmap chunks "s" to "e" */ | 1543 | /* dirty the memory and file bits for bitmap chunks "s" to "e" */ |
@@ -1567,11 +1546,9 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) | |||
1567 | unsigned long chunk; | 1546 | unsigned long chunk; |
1568 | 1547 | ||
1569 | for (chunk = s; chunk <= e; chunk++) { | 1548 | for (chunk = s; chunk <= e; chunk++) { |
1570 | sector_t sec = (sector_t)chunk << bitmap->chunkshift; | 1549 | sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; |
1571 | bitmap_set_memory_bits(bitmap, sec, 1); | 1550 | bitmap_set_memory_bits(bitmap, sec, 1); |
1572 | spin_lock_irq(&bitmap->lock); | ||
1573 | bitmap_file_set_bit(bitmap, sec); | 1551 | bitmap_file_set_bit(bitmap, sec); |
1574 | spin_unlock_irq(&bitmap->lock); | ||
1575 | if (sec < bitmap->mddev->recovery_cp) | 1552 | if (sec < bitmap->mddev->recovery_cp) |
1576 | /* We are asserting that the array is dirty, | 1553 | /* We are asserting that the array is dirty, |
1577 | * so move the recovery_cp address back so | 1554 | * so move the recovery_cp address back so |
@@ -1616,11 +1593,15 @@ static void bitmap_free(struct bitmap *bitmap) | |||
1616 | if (!bitmap) /* there was no bitmap */ | 1593 | if (!bitmap) /* there was no bitmap */ |
1617 | return; | 1594 | return; |
1618 | 1595 | ||
1619 | /* release the bitmap file and kill the daemon */ | 1596 | /* Shouldn't be needed - but just in case.... */ |
1620 | bitmap_file_put(bitmap); | 1597 | wait_event(bitmap->write_wait, |
1598 | atomic_read(&bitmap->pending_writes) == 0); | ||
1599 | |||
1600 | /* release the bitmap file */ | ||
1601 | bitmap_file_unmap(&bitmap->storage); | ||
1621 | 1602 | ||
1622 | bp = bitmap->bp; | 1603 | bp = bitmap->counts.bp; |
1623 | pages = bitmap->pages; | 1604 | pages = bitmap->counts.pages; |
1624 | 1605 | ||
1625 | /* free all allocated memory */ | 1606 | /* free all allocated memory */ |
1626 | 1607 | ||
@@ -1659,25 +1640,19 @@ int bitmap_create(struct mddev *mddev) | |||
1659 | { | 1640 | { |
1660 | struct bitmap *bitmap; | 1641 | struct bitmap *bitmap; |
1661 | sector_t blocks = mddev->resync_max_sectors; | 1642 | sector_t blocks = mddev->resync_max_sectors; |
1662 | unsigned long chunks; | ||
1663 | unsigned long pages; | ||
1664 | struct file *file = mddev->bitmap_info.file; | 1643 | struct file *file = mddev->bitmap_info.file; |
1665 | int err; | 1644 | int err; |
1666 | struct sysfs_dirent *bm = NULL; | 1645 | struct sysfs_dirent *bm = NULL; |
1667 | 1646 | ||
1668 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); | 1647 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); |
1669 | 1648 | ||
1670 | if (!file | ||
1671 | && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ | ||
1672 | return 0; | ||
1673 | |||
1674 | BUG_ON(file && mddev->bitmap_info.offset); | 1649 | BUG_ON(file && mddev->bitmap_info.offset); |
1675 | 1650 | ||
1676 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); | 1651 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); |
1677 | if (!bitmap) | 1652 | if (!bitmap) |
1678 | return -ENOMEM; | 1653 | return -ENOMEM; |
1679 | 1654 | ||
1680 | spin_lock_init(&bitmap->lock); | 1655 | spin_lock_init(&bitmap->counts.lock); |
1681 | atomic_set(&bitmap->pending_writes, 0); | 1656 | atomic_set(&bitmap->pending_writes, 0); |
1682 | init_waitqueue_head(&bitmap->write_wait); | 1657 | init_waitqueue_head(&bitmap->write_wait); |
1683 | init_waitqueue_head(&bitmap->overflow_wait); | 1658 | init_waitqueue_head(&bitmap->overflow_wait); |
@@ -1693,7 +1668,7 @@ int bitmap_create(struct mddev *mddev) | |||
1693 | } else | 1668 | } else |
1694 | bitmap->sysfs_can_clear = NULL; | 1669 | bitmap->sysfs_can_clear = NULL; |
1695 | 1670 | ||
1696 | bitmap->file = file; | 1671 | bitmap->storage.file = file; |
1697 | if (file) { | 1672 | if (file) { |
1698 | get_file(file); | 1673 | get_file(file); |
1699 | /* As future accesses to this file will use bmap, | 1674 | /* As future accesses to this file will use bmap, |
@@ -1724,32 +1699,15 @@ int bitmap_create(struct mddev *mddev) | |||
1724 | goto error; | 1699 | goto error; |
1725 | 1700 | ||
1726 | bitmap->daemon_lastrun = jiffies; | 1701 | bitmap->daemon_lastrun = jiffies; |
1727 | bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize) | 1702 | err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); |
1728 | - BITMAP_BLOCK_SHIFT); | 1703 | if (err) |
1729 | |||
1730 | chunks = (blocks + (1 << bitmap->chunkshift) - 1) >> | ||
1731 | bitmap->chunkshift; | ||
1732 | pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; | ||
1733 | |||
1734 | BUG_ON(!pages); | ||
1735 | |||
1736 | bitmap->chunks = chunks; | ||
1737 | bitmap->pages = pages; | ||
1738 | bitmap->missing_pages = pages; | ||
1739 | |||
1740 | bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL); | ||
1741 | |||
1742 | err = -ENOMEM; | ||
1743 | if (!bitmap->bp) | ||
1744 | goto error; | 1704 | goto error; |
1745 | 1705 | ||
1746 | printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", | 1706 | printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", |
1747 | pages, bmname(bitmap)); | 1707 | bitmap->counts.pages, bmname(bitmap)); |
1748 | 1708 | ||
1749 | mddev->bitmap = bitmap; | 1709 | mddev->bitmap = bitmap; |
1750 | 1710 | return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; | |
1751 | |||
1752 | return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0; | ||
1753 | 1711 | ||
1754 | error: | 1712 | error: |
1755 | bitmap_free(bitmap); | 1713 | bitmap_free(bitmap); |
@@ -1790,13 +1748,17 @@ int bitmap_load(struct mddev *mddev) | |||
1790 | 1748 | ||
1791 | if (err) | 1749 | if (err) |
1792 | goto out; | 1750 | goto out; |
1751 | clear_bit(BITMAP_STALE, &bitmap->flags); | ||
1752 | |||
1753 | /* Kick recovery in case any bits were set */ | ||
1754 | set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); | ||
1793 | 1755 | ||
1794 | mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; | 1756 | mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; |
1795 | md_wakeup_thread(mddev->thread); | 1757 | md_wakeup_thread(mddev->thread); |
1796 | 1758 | ||
1797 | bitmap_update_sb(bitmap); | 1759 | bitmap_update_sb(bitmap); |
1798 | 1760 | ||
1799 | if (bitmap->flags & BITMAP_WRITE_ERROR) | 1761 | if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) |
1800 | err = -EIO; | 1762 | err = -EIO; |
1801 | out: | 1763 | out: |
1802 | return err; | 1764 | return err; |
@@ -1806,30 +1768,194 @@ EXPORT_SYMBOL_GPL(bitmap_load); | |||
1806 | void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) | 1768 | void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) |
1807 | { | 1769 | { |
1808 | unsigned long chunk_kb; | 1770 | unsigned long chunk_kb; |
1809 | unsigned long flags; | 1771 | struct bitmap_counts *counts; |
1810 | 1772 | ||
1811 | if (!bitmap) | 1773 | if (!bitmap) |
1812 | return; | 1774 | return; |
1813 | 1775 | ||
1814 | spin_lock_irqsave(&bitmap->lock, flags); | 1776 | counts = &bitmap->counts; |
1777 | |||
1815 | chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; | 1778 | chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; |
1816 | seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " | 1779 | seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " |
1817 | "%lu%s chunk", | 1780 | "%lu%s chunk", |
1818 | bitmap->pages - bitmap->missing_pages, | 1781 | counts->pages - counts->missing_pages, |
1819 | bitmap->pages, | 1782 | counts->pages, |
1820 | (bitmap->pages - bitmap->missing_pages) | 1783 | (counts->pages - counts->missing_pages) |
1821 | << (PAGE_SHIFT - 10), | 1784 | << (PAGE_SHIFT - 10), |
1822 | chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, | 1785 | chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, |
1823 | chunk_kb ? "KB" : "B"); | 1786 | chunk_kb ? "KB" : "B"); |
1824 | if (bitmap->file) { | 1787 | if (bitmap->storage.file) { |
1825 | seq_printf(seq, ", file: "); | 1788 | seq_printf(seq, ", file: "); |
1826 | seq_path(seq, &bitmap->file->f_path, " \t\n"); | 1789 | seq_path(seq, &bitmap->storage.file->f_path, " \t\n"); |
1827 | } | 1790 | } |
1828 | 1791 | ||
1829 | seq_printf(seq, "\n"); | 1792 | seq_printf(seq, "\n"); |
1830 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
1831 | } | 1793 | } |
1832 | 1794 | ||
1795 | int bitmap_resize(struct bitmap *bitmap, sector_t blocks, | ||
1796 | int chunksize, int init) | ||
1797 | { | ||
1798 | /* If chunk_size is 0, choose an appropriate chunk size. | ||
1799 | * Then possibly allocate new storage space. | ||
1800 | * Then quiesce, copy bits, replace bitmap, and re-start | ||
1801 | * | ||
1802 | * This function is called both to set up the initial bitmap | ||
1803 | * and to resize the bitmap while the array is active. | ||
1804 | * If this happens as a result of the array being resized, | ||
1805 | * chunksize will be zero, and we need to choose a suitable | ||
1806 | * chunksize, otherwise we use what we are given. | ||
1807 | */ | ||
1808 | struct bitmap_storage store; | ||
1809 | struct bitmap_counts old_counts; | ||
1810 | unsigned long chunks; | ||
1811 | sector_t block; | ||
1812 | sector_t old_blocks, new_blocks; | ||
1813 | int chunkshift; | ||
1814 | int ret = 0; | ||
1815 | long pages; | ||
1816 | struct bitmap_page *new_bp; | ||
1817 | |||
1818 | if (chunksize == 0) { | ||
1819 | /* If there is enough space, leave the chunk size unchanged, | ||
1820 | * else increase by a factor of two until there is enough space. | ||
1821 | */ | ||
1822 | long bytes; | ||
1823 | long space = bitmap->mddev->bitmap_info.space; | ||
1824 | |||
1825 | if (space == 0) { | ||
1826 | /* We don't know how much space there is, so limit | ||
1827 | * to current size - in sectors. | ||
1828 | */ | ||
1829 | bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8); | ||
1830 | if (!bitmap->mddev->bitmap_info.external) | ||
1831 | bytes += sizeof(bitmap_super_t); | ||
1832 | space = DIV_ROUND_UP(bytes, 512); | ||
1833 | bitmap->mddev->bitmap_info.space = space; | ||
1834 | } | ||
1835 | chunkshift = bitmap->counts.chunkshift; | ||
1836 | chunkshift--; | ||
1837 | do { | ||
1838 | /* 'chunkshift' is shift from block size to chunk size */ | ||
1839 | chunkshift++; | ||
1840 | chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); | ||
1841 | bytes = DIV_ROUND_UP(chunks, 8); | ||
1842 | if (!bitmap->mddev->bitmap_info.external) | ||
1843 | bytes += sizeof(bitmap_super_t); | ||
1844 | } while (bytes > (space << 9)); | ||
1845 | } else | ||
1846 | chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT; | ||
1847 | |||
1848 | chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); | ||
1849 | memset(&store, 0, sizeof(store)); | ||
1850 | if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) | ||
1851 | ret = bitmap_storage_alloc(&store, chunks, | ||
1852 | !bitmap->mddev->bitmap_info.external); | ||
1853 | if (ret) | ||
1854 | goto err; | ||
1855 | |||
1856 | pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); | ||
1857 | |||
1858 | new_bp = kzalloc(pages * sizeof(*new_bp), GFP_KERNEL); | ||
1859 | ret = -ENOMEM; | ||
1860 | if (!new_bp) { | ||
1861 | bitmap_file_unmap(&store); | ||
1862 | goto err; | ||
1863 | } | ||
1864 | |||
1865 | if (!init) | ||
1866 | bitmap->mddev->pers->quiesce(bitmap->mddev, 1); | ||
1867 | |||
1868 | store.file = bitmap->storage.file; | ||
1869 | bitmap->storage.file = NULL; | ||
1870 | |||
1871 | if (store.sb_page && bitmap->storage.sb_page) | ||
1872 | memcpy(page_address(store.sb_page), | ||
1873 | page_address(bitmap->storage.sb_page), | ||
1874 | sizeof(bitmap_super_t)); | ||
1875 | bitmap_file_unmap(&bitmap->storage); | ||
1876 | bitmap->storage = store; | ||
1877 | |||
1878 | old_counts = bitmap->counts; | ||
1879 | bitmap->counts.bp = new_bp; | ||
1880 | bitmap->counts.pages = pages; | ||
1881 | bitmap->counts.missing_pages = pages; | ||
1882 | bitmap->counts.chunkshift = chunkshift; | ||
1883 | bitmap->counts.chunks = chunks; | ||
1884 | bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift + | ||
1885 | BITMAP_BLOCK_SHIFT); | ||
1886 | |||
1887 | blocks = min(old_counts.chunks << old_counts.chunkshift, | ||
1888 | chunks << chunkshift); | ||
1889 | |||
1890 | spin_lock_irq(&bitmap->counts.lock); | ||
1891 | for (block = 0; block < blocks; ) { | ||
1892 | bitmap_counter_t *bmc_old, *bmc_new; | ||
1893 | int set; | ||
1894 | |||
1895 | bmc_old = bitmap_get_counter(&old_counts, block, | ||
1896 | &old_blocks, 0); | ||
1897 | set = bmc_old && NEEDED(*bmc_old); | ||
1898 | |||
1899 | if (set) { | ||
1900 | bmc_new = bitmap_get_counter(&bitmap->counts, block, | ||
1901 | &new_blocks, 1); | ||
1902 | if (*bmc_new == 0) { | ||
1903 | /* need to set on-disk bits too. */ | ||
1904 | sector_t end = block + new_blocks; | ||
1905 | sector_t start = block >> chunkshift; | ||
1906 | start <<= chunkshift; | ||
1907 | while (start < end) { | ||
1908 | bitmap_file_set_bit(bitmap, block); | ||
1909 | start += 1 << chunkshift; | ||
1910 | } | ||
1911 | *bmc_new = 2; | ||
1912 | bitmap_count_page(&bitmap->counts, | ||
1913 | block, 1); | ||
1914 | bitmap_set_pending(&bitmap->counts, | ||
1915 | block); | ||
1916 | } | ||
1917 | *bmc_new |= NEEDED_MASK; | ||
1918 | if (new_blocks < old_blocks) | ||
1919 | old_blocks = new_blocks; | ||
1920 | } | ||
1921 | block += old_blocks; | ||
1922 | } | ||
1923 | |||
1924 | if (!init) { | ||
1925 | int i; | ||
1926 | while (block < (chunks << chunkshift)) { | ||
1927 | bitmap_counter_t *bmc; | ||
1928 | bmc = bitmap_get_counter(&bitmap->counts, block, | ||
1929 | &new_blocks, 1); | ||
1930 | if (bmc) { | ||
1931 | /* new space. It needs to be resynced, so | ||
1932 | * we set NEEDED_MASK. | ||
1933 | */ | ||
1934 | if (*bmc == 0) { | ||
1935 | *bmc = NEEDED_MASK | 2; | ||
1936 | bitmap_count_page(&bitmap->counts, | ||
1937 | block, 1); | ||
1938 | bitmap_set_pending(&bitmap->counts, | ||
1939 | block); | ||
1940 | } | ||
1941 | } | ||
1942 | block += new_blocks; | ||
1943 | } | ||
1944 | for (i = 0; i < bitmap->storage.file_pages; i++) | ||
1945 | set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); | ||
1946 | } | ||
1947 | spin_unlock_irq(&bitmap->counts.lock); | ||
1948 | |||
1949 | if (!init) { | ||
1950 | bitmap_unplug(bitmap); | ||
1951 | bitmap->mddev->pers->quiesce(bitmap->mddev, 0); | ||
1952 | } | ||
1953 | ret = 0; | ||
1954 | err: | ||
1955 | return ret; | ||
1956 | } | ||
1957 | EXPORT_SYMBOL_GPL(bitmap_resize); | ||
1958 | |||
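When bitmap_resize() is handed chunksize == 0 (the array-resize case), the loop above grows the chunk size by powers of two until one bit per chunk, plus the 256-byte superblock for internal bitmaps, fits in the recorded space. A standalone model of that sizing loop; DIV_ROUND_UP, the superblock size and the sample numbers in main() are the only assumptions:

#include <stdio.h>

typedef unsigned long long sector_t;

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define SB_BYTES		256	/* sizeof(bitmap_super_t) */

/* Smallest chunkshift >= old_shift such that the bitmap fits in 'space'
 * 512-byte sectors: one bit per chunk, plus the superblock when the
 * bitmap is stored internally.
 */
static int pick_chunkshift(sector_t blocks, int old_shift, long space, int external)
{
	int chunkshift = old_shift - 1;
	long bytes;
	sector_t chunks;

	do {
		chunkshift++;
		chunks = DIV_ROUND_UP(blocks, (sector_t)1 << chunkshift);
		bytes = DIV_ROUND_UP(chunks, 8);
		if (!external)
			bytes += SB_BYTES;
	} while (bytes > (space << 9));

	return chunkshift;
}

int main(void)
{
	sector_t blocks = 4ULL << 30;	/* hypothetical 2TiB array, in 512-byte sectors */

	printf("chosen chunkshift = %d\n", pick_chunkshift(blocks, 7, 8, 0));
	return 0;			/* only 8 sectors of space forces much larger chunks */
}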
1833 | static ssize_t | 1959 | static ssize_t |
1834 | location_show(struct mddev *mddev, char *page) | 1960 | location_show(struct mddev *mddev, char *page) |
1835 | { | 1961 | { |
@@ -1923,6 +2049,43 @@ location_store(struct mddev *mddev, const char *buf, size_t len) | |||
1923 | static struct md_sysfs_entry bitmap_location = | 2049 | static struct md_sysfs_entry bitmap_location = |
1924 | __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); | 2050 | __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); |
1925 | 2051 | ||
2052 | /* 'bitmap/space' is the space available at 'location' for the | ||
2053 | * bitmap. This allows the kernel to know when it is safe to | ||
2054 | * resize the bitmap to match a resized array. | ||
2055 | */ | ||
2056 | static ssize_t | ||
2057 | space_show(struct mddev *mddev, char *page) | ||
2058 | { | ||
2059 | return sprintf(page, "%lu\n", mddev->bitmap_info.space); | ||
2060 | } | ||
2061 | |||
2062 | static ssize_t | ||
2063 | space_store(struct mddev *mddev, const char *buf, size_t len) | ||
2064 | { | ||
2065 | unsigned long sectors; | ||
2066 | int rv; | ||
2067 | |||
2068 | rv = kstrtoul(buf, 10, &sectors); | ||
2069 | if (rv) | ||
2070 | return rv; | ||
2071 | |||
2072 | if (sectors == 0) | ||
2073 | return -EINVAL; | ||
2074 | |||
2075 | if (mddev->bitmap && | ||
2076 | sectors < (mddev->bitmap->storage.bytes + 511) >> 9) | ||
2077 | return -EFBIG; /* Bitmap is too big for this small space */ | ||
2078 | |||
2079 | /* could make sure it isn't too big, but that isn't really | ||
2080 | * needed - user-space should be careful. | ||
2081 | */ | ||
2082 | mddev->bitmap_info.space = sectors; | ||
2083 | return len; | ||
2084 | } | ||
2085 | |||
2086 | static struct md_sysfs_entry bitmap_space = | ||
2087 | __ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store); | ||
2088 | |||
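space_store() above rejects a value that would not cover the bitmap's current on-disk footprint; the check is a plain bytes-to-sector round-up against 512-byte sectors. A tiny illustration with arbitrary numbers:

#include <stdio.h>

/* Model of the -EFBIG test: the new space, in 512-byte sectors, must be
 * at least the bitmap's current size in bytes rounded up to sectors.
 */
static int space_ok(unsigned long new_sectors, unsigned long bitmap_bytes)
{
	return new_sectors >= ((bitmap_bytes + 511) >> 9);
}

int main(void)
{
	printf("%d\n", space_ok(8, 4096));	/* 1: 4096 bytes fit in 8 sectors */
	printf("%d\n", space_ok(7, 4096));	/* 0: would return -EFBIG */
	return 0;
}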
1926 | static ssize_t | 2089 | static ssize_t |
1927 | timeout_show(struct mddev *mddev, char *page) | 2090 | timeout_show(struct mddev *mddev, char *page) |
1928 | { | 2091 | { |
@@ -2098,6 +2261,7 @@ __ATTR(max_backlog_used, S_IRUGO | S_IWUSR, | |||
2098 | 2261 | ||
2099 | static struct attribute *md_bitmap_attrs[] = { | 2262 | static struct attribute *md_bitmap_attrs[] = { |
2100 | &bitmap_location.attr, | 2263 | &bitmap_location.attr, |
2264 | &bitmap_space.attr, | ||
2101 | &bitmap_timeout.attr, | 2265 | &bitmap_timeout.attr, |
2102 | &bitmap_backlog.attr, | 2266 | &bitmap_backlog.attr, |
2103 | &bitmap_chunksize.attr, | 2267 | &bitmap_chunksize.attr, |
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index b44b0aba2d47..df4aeb6ac6f0 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h | |||
@@ -111,9 +111,9 @@ typedef __u16 bitmap_counter_t; | |||
111 | 111 | ||
112 | /* use these for bitmap->flags and bitmap->sb->state bit-fields */ | 112 | /* use these for bitmap->flags and bitmap->sb->state bit-fields */ |
113 | enum bitmap_state { | 113 | enum bitmap_state { |
114 | BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ | 114 | BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ |
115 | BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ | 115 | BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ |
116 | BITMAP_HOSTENDIAN = 0x8000, | 116 | BITMAP_HOSTENDIAN =15, |
117 | }; | 117 | }; |
118 | 118 | ||
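The flag values change from masks (0x002, 0x004, 0x8000) to bit numbers (1, 2, 15) because bitmap->flags is now driven through set_bit()/clear_bit()/test_bit(), which take an index; the numbers are chosen so that 1 << bit still equals the old mask, so the sb->state words written to disk keep the same layout. A small userspace contrast using plain shifts in place of the kernel bitops:

#include <stdio.h>

enum bitmap_state_model {	/* bit numbers, as in the new header */
	MODEL_STALE = 1,
	MODEL_WRITE_ERROR = 2,
	MODEL_HOSTENDIAN = 15,
};

int main(void)
{
	unsigned long flags = 0;

	/* old style: flags |= 0x002;  new style: set_bit(BITMAP_STALE, &flags) */
	flags |= 1UL << MODEL_STALE;				/* models set_bit()  */
	printf("stale=%d\n", !!(flags & (1UL << MODEL_STALE)));	/* models test_bit() */
	printf("mask=0x%03lx\n", 1UL << MODEL_STALE);		/* 0x002, same as before */
	return 0;
}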
119 | /* the superblock at the front of the bitmap file -- little endian */ | 119 | /* the superblock at the front of the bitmap file -- little endian */ |
@@ -128,8 +128,10 @@ typedef struct bitmap_super_s { | |||
128 | __le32 chunksize; /* 52 the bitmap chunk size in bytes */ | 128 | __le32 chunksize; /* 52 the bitmap chunk size in bytes */ |
129 | __le32 daemon_sleep; /* 56 seconds between disk flushes */ | 129 | __le32 daemon_sleep; /* 56 seconds between disk flushes */ |
130 | __le32 write_behind; /* 60 number of outstanding write-behind writes */ | 130 | __le32 write_behind; /* 60 number of outstanding write-behind writes */ |
131 | __le32 sectors_reserved; /* 64 number of 512-byte sectors that are | ||
132 | * reserved for the bitmap. */ | ||
131 | 133 | ||
132 | __u8 pad[256 - 64]; /* set to zero */ | 134 | __u8 pad[256 - 68]; /* set to zero */ |
133 | } bitmap_super_t; | 135 | } bitmap_super_t; |
134 | 136 | ||
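sectors_reserved is a __le32 at byte offset 64, so the trailing pad shrinks by four bytes and bitmap_super_t stays exactly 256 bytes, which the BUILD_BUG_ON in bitmap_create() continues to enforce. A trivial standalone check of that arithmetic, with the layout reduced to offsets only:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const size_t field_end = 64 + sizeof(uint32_t);	/* end of sectors_reserved */
	const size_t pad = 256 - 68;			/* new pad[] length */

	assert(field_end + pad == 256);
	printf("superblock size unchanged: %zu + %zu = 256\n", field_end, pad);
	return 0;
}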
135 | /* notes: | 137 | /* notes: |
@@ -160,35 +162,48 @@ struct bitmap_page { | |||
160 | */ | 162 | */ |
161 | unsigned int hijacked:1; | 163 | unsigned int hijacked:1; |
162 | /* | 164 | /* |
165 | * If any counter in this page is '1' or '2' - and so could be | ||
166 | * cleared then that page is marked as 'pending' | ||
167 | */ | ||
168 | unsigned int pending:1; | ||
169 | /* | ||
163 | * count of dirty bits on the page | 170 | * count of dirty bits on the page |
164 | */ | 171 | */ |
165 | unsigned int count:31; | 172 | unsigned int count:30; |
166 | }; | 173 | }; |
167 | 174 | ||
168 | /* the main bitmap structure - one per mddev */ | 175 | /* the main bitmap structure - one per mddev */ |
169 | struct bitmap { | 176 | struct bitmap { |
170 | struct bitmap_page *bp; | ||
171 | unsigned long pages; /* total number of pages in the bitmap */ | ||
172 | unsigned long missing_pages; /* number of pages not yet allocated */ | ||
173 | 177 | ||
174 | struct mddev *mddev; /* the md device that the bitmap is for */ | 178 | struct bitmap_counts { |
179 | spinlock_t lock; | ||
180 | struct bitmap_page *bp; | ||
181 | unsigned long pages; /* total number of pages | ||
182 | * in the bitmap */ | ||
183 | unsigned long missing_pages; /* number of pages | ||
184 | * not yet allocated */ | ||
185 | unsigned long chunkshift; /* chunksize = 2^chunkshift | ||
186 | * (for bitops) */ | ||
187 | unsigned long chunks; /* Total number of data | ||
188 | * chunks for the array */ | ||
189 | } counts; | ||
175 | 190 | ||
176 | /* bitmap chunksize -- how much data does each bit represent? */ | 191 | struct mddev *mddev; /* the md device that the bitmap is for */ |
177 | unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */ | ||
178 | unsigned long chunks; /* total number of data chunks for the array */ | ||
179 | 192 | ||
180 | __u64 events_cleared; | 193 | __u64 events_cleared; |
181 | int need_sync; | 194 | int need_sync; |
182 | 195 | ||
183 | /* bitmap spinlock */ | 196 | struct bitmap_storage { |
184 | spinlock_t lock; | 197 | struct file *file; /* backing disk file */ |
185 | 198 | struct page *sb_page; /* cached copy of the bitmap | |
186 | struct file *file; /* backing disk file */ | 199 | * file superblock */ |
187 | struct page *sb_page; /* cached copy of the bitmap file superblock */ | 200 | struct page **filemap; /* list of cache pages for |
188 | struct page **filemap; /* list of cache pages for the file */ | 201 | * the file */ |
189 | unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ | 202 | unsigned long *filemap_attr; /* attributes associated |
190 | unsigned long file_pages; /* number of pages in the file */ | 203 | * w/ filemap pages */ |
191 | int last_page_size; /* bytes in the last page */ | 204 | unsigned long file_pages; /* number of pages in the file*/ |
205 | unsigned long bytes; /* total bytes in the bitmap */ | ||
206 | } storage; | ||
192 | 207 | ||
193 | unsigned long flags; | 208 | unsigned long flags; |
194 | 209 | ||
@@ -242,6 +257,9 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); | |||
242 | 257 | ||
243 | void bitmap_unplug(struct bitmap *bitmap); | 258 | void bitmap_unplug(struct bitmap *bitmap); |
244 | void bitmap_daemon_work(struct mddev *mddev); | 259 | void bitmap_daemon_work(struct mddev *mddev); |
260 | |||
261 | int bitmap_resize(struct bitmap *bitmap, sector_t blocks, | ||
262 | int chunksize, int init); | ||
245 | #endif | 263 | #endif |
246 | 264 | ||
247 | #endif | 265 | #endif |
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 68965e663248..017c34d78d61 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c | |||
@@ -155,10 +155,7 @@ static void context_free(struct raid_set *rs) | |||
155 | for (i = 0; i < rs->md.raid_disks; i++) { | 155 | for (i = 0; i < rs->md.raid_disks; i++) { |
156 | if (rs->dev[i].meta_dev) | 156 | if (rs->dev[i].meta_dev) |
157 | dm_put_device(rs->ti, rs->dev[i].meta_dev); | 157 | dm_put_device(rs->ti, rs->dev[i].meta_dev); |
158 | if (rs->dev[i].rdev.sb_page) | 158 | md_rdev_clear(&rs->dev[i].rdev); |
159 | put_page(rs->dev[i].rdev.sb_page); | ||
160 | rs->dev[i].rdev.sb_page = NULL; | ||
161 | rs->dev[i].rdev.sb_loaded = 0; | ||
162 | if (rs->dev[i].data_dev) | 159 | if (rs->dev[i].data_dev) |
163 | dm_put_device(rs->ti, rs->dev[i].data_dev); | 160 | dm_put_device(rs->ti, rs->dev[i].data_dev); |
164 | } | 161 | } |
@@ -606,7 +603,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size) | |||
606 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { | 603 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { |
607 | DMERR("Failed to read superblock of device at position %d", | 604 | DMERR("Failed to read superblock of device at position %d", |
608 | rdev->raid_disk); | 605 | rdev->raid_disk); |
609 | set_bit(Faulty, &rdev->flags); | 606 | md_error(rdev->mddev, rdev); |
610 | return -EINVAL; | 607 | return -EINVAL; |
611 | } | 608 | } |
612 | 609 | ||
@@ -617,16 +614,18 @@ static int read_disk_sb(struct md_rdev *rdev, int size) | |||
617 | 614 | ||
618 | static void super_sync(struct mddev *mddev, struct md_rdev *rdev) | 615 | static void super_sync(struct mddev *mddev, struct md_rdev *rdev) |
619 | { | 616 | { |
620 | struct md_rdev *r; | 617 | int i; |
621 | uint64_t failed_devices; | 618 | uint64_t failed_devices; |
622 | struct dm_raid_superblock *sb; | 619 | struct dm_raid_superblock *sb; |
620 | struct raid_set *rs = container_of(mddev, struct raid_set, md); | ||
623 | 621 | ||
624 | sb = page_address(rdev->sb_page); | 622 | sb = page_address(rdev->sb_page); |
625 | failed_devices = le64_to_cpu(sb->failed_devices); | 623 | failed_devices = le64_to_cpu(sb->failed_devices); |
626 | 624 | ||
627 | rdev_for_each(r, mddev) | 625 | for (i = 0; i < mddev->raid_disks; i++) |
628 | if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) | 626 | if (!rs->dev[i].data_dev || |
629 | failed_devices |= (1ULL << r->raid_disk); | 627 | test_bit(Faulty, &(rs->dev[i].rdev.flags))) |
628 | failed_devices |= (1ULL << i); | ||
630 | 629 | ||
631 | memset(sb, 0, sizeof(*sb)); | 630 | memset(sb, 0, sizeof(*sb)); |
632 | 631 | ||
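super_sync() now walks the raid_set's own slot array instead of the mddev rdev list, so slots with no data device are recorded as failed alongside devices marked Faulty. A toy model of how the failed_devices word is assembled (the struct and sample values are invented):

#include <stdio.h>
#include <stdint.h>

struct slot_model {
	int present;	/* stands in for rs->dev[i].data_dev != NULL */
	int faulty;	/* stands in for test_bit(Faulty, &rdev.flags) */
};

static uint64_t build_failed_devices(const struct slot_model *dev, int raid_disks)
{
	uint64_t failed = 0;
	int i;

	for (i = 0; i < raid_disks; i++)
		if (!dev[i].present || dev[i].faulty)
			failed |= 1ULL << i;
	return failed;
}

int main(void)
{
	struct slot_model dev[4] = { {1, 0}, {0, 0}, {1, 1}, {1, 0} };

	printf("failed_devices = 0x%llx\n",
	       (unsigned long long)build_failed_devices(dev, 4));
	return 0;	/* bits 1 (missing) and 2 (faulty) set: 0x6 */
}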
@@ -1252,12 +1251,13 @@ static void raid_resume(struct dm_target *ti) | |||
1252 | { | 1251 | { |
1253 | struct raid_set *rs = ti->private; | 1252 | struct raid_set *rs = ti->private; |
1254 | 1253 | ||
1254 | set_bit(MD_CHANGE_DEVS, &rs->md.flags); | ||
1255 | if (!rs->bitmap_loaded) { | 1255 | if (!rs->bitmap_loaded) { |
1256 | bitmap_load(&rs->md); | 1256 | bitmap_load(&rs->md); |
1257 | rs->bitmap_loaded = 1; | 1257 | rs->bitmap_loaded = 1; |
1258 | } else | 1258 | } |
1259 | md_wakeup_thread(rs->md.thread); | ||
1260 | 1259 | ||
1260 | clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); | ||
1261 | mddev_resume(&rs->md); | 1261 | mddev_resume(&rs->md); |
1262 | } | 1262 | } |
1263 | 1263 | ||
diff --git a/drivers/md/md.c b/drivers/md/md.c index 01233d855eb2..1c2f9048e1ae 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -402,6 +402,7 @@ void mddev_resume(struct mddev *mddev) | |||
402 | wake_up(&mddev->sb_wait); | 402 | wake_up(&mddev->sb_wait); |
403 | mddev->pers->quiesce(mddev, 0); | 403 | mddev->pers->quiesce(mddev, 0); |
404 | 404 | ||
405 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
405 | md_wakeup_thread(mddev->thread); | 406 | md_wakeup_thread(mddev->thread); |
406 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ | 407 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ |
407 | } | 408 | } |
@@ -452,7 +453,7 @@ static void submit_flushes(struct work_struct *ws) | |||
452 | atomic_inc(&rdev->nr_pending); | 453 | atomic_inc(&rdev->nr_pending); |
453 | atomic_inc(&rdev->nr_pending); | 454 | atomic_inc(&rdev->nr_pending); |
454 | rcu_read_unlock(); | 455 | rcu_read_unlock(); |
455 | bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev); | 456 | bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); |
456 | bi->bi_end_io = md_end_flush; | 457 | bi->bi_end_io = md_end_flush; |
457 | bi->bi_private = rdev; | 458 | bi->bi_private = rdev; |
458 | bi->bi_bdev = rdev->bdev; | 459 | bi->bi_bdev = rdev->bdev; |
@@ -607,6 +608,7 @@ void mddev_init(struct mddev *mddev) | |||
607 | init_waitqueue_head(&mddev->sb_wait); | 608 | init_waitqueue_head(&mddev->sb_wait); |
608 | init_waitqueue_head(&mddev->recovery_wait); | 609 | init_waitqueue_head(&mddev->recovery_wait); |
609 | mddev->reshape_position = MaxSector; | 610 | mddev->reshape_position = MaxSector; |
611 | mddev->reshape_backwards = 0; | ||
610 | mddev->resync_min = 0; | 612 | mddev->resync_min = 0; |
611 | mddev->resync_max = MaxSector; | 613 | mddev->resync_max = MaxSector; |
612 | mddev->level = LEVEL_NONE; | 614 | mddev->level = LEVEL_NONE; |
@@ -802,7 +804,7 @@ static int alloc_disk_sb(struct md_rdev * rdev) | |||
802 | return 0; | 804 | return 0; |
803 | } | 805 | } |
804 | 806 | ||
805 | static void free_disk_sb(struct md_rdev * rdev) | 807 | void md_rdev_clear(struct md_rdev *rdev) |
806 | { | 808 | { |
807 | if (rdev->sb_page) { | 809 | if (rdev->sb_page) { |
808 | put_page(rdev->sb_page); | 810 | put_page(rdev->sb_page); |
@@ -815,8 +817,10 @@ static void free_disk_sb(struct md_rdev * rdev) | |||
815 | put_page(rdev->bb_page); | 817 | put_page(rdev->bb_page); |
816 | rdev->bb_page = NULL; | 818 | rdev->bb_page = NULL; |
817 | } | 819 | } |
820 | kfree(rdev->badblocks.page); | ||
821 | rdev->badblocks.page = NULL; | ||
818 | } | 822 | } |
819 | 823 | EXPORT_SYMBOL_GPL(md_rdev_clear); | |
820 | 824 | ||
821 | static void super_written(struct bio *bio, int error) | 825 | static void super_written(struct bio *bio, int error) |
822 | { | 826 | { |
@@ -887,6 +891,10 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, | |||
887 | rdev->meta_bdev : rdev->bdev; | 891 | rdev->meta_bdev : rdev->bdev; |
888 | if (metadata_op) | 892 | if (metadata_op) |
889 | bio->bi_sector = sector + rdev->sb_start; | 893 | bio->bi_sector = sector + rdev->sb_start; |
894 | else if (rdev->mddev->reshape_position != MaxSector && | ||
895 | (rdev->mddev->reshape_backwards == | ||
896 | (sector >= rdev->mddev->reshape_position))) | ||
897 | bio->bi_sector = sector + rdev->new_data_offset; | ||
890 | else | 898 | else |
891 | bio->bi_sector = sector + rdev->data_offset; | 899 | bio->bi_sector = sector + rdev->data_offset; |
892 | bio_add_page(bio, page, size, 0); | 900 | bio_add_page(bio, page, size, 0); |
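The branch added to sync_page_io() sends non-metadata I/O to new_data_offset for whichever side of reshape_position has already been reshaped, and reshape_backwards says which side that is. A compilable model of just that predicate; the sector values in main() are made up:

#include <stdio.h>

typedef unsigned long long sector_t;
#define MaxSector (~(sector_t)0)

/* Mirrors the new condition: use new_data_offset when a reshape is active
 * and this sector lies on the already-reshaped side of reshape_position.
 */
static int use_new_offset(sector_t sector, sector_t reshape_position,
			  int reshape_backwards)
{
	if (reshape_position == MaxSector)
		return 0;	/* no reshape in progress */
	return reshape_backwards == (sector >= reshape_position);
}

int main(void)
{
	sector_t pos = 1ULL << 20;	/* hypothetical reshape progress marker */

	printf("forward,  sector below pos: %d\n", use_new_offset(100, pos, 0));     /* 1 */
	printf("forward,  sector above pos: %d\n", use_new_offset(pos + 1, pos, 0)); /* 0 */
	printf("backward, sector above pos: %d\n", use_new_offset(pos + 1, pos, 1)); /* 1 */
	return 0;
}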
@@ -1034,12 +1042,17 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) | |||
1034 | struct super_type { | 1042 | struct super_type { |
1035 | char *name; | 1043 | char *name; |
1036 | struct module *owner; | 1044 | struct module *owner; |
1037 | int (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev, | 1045 | int (*load_super)(struct md_rdev *rdev, |
1046 | struct md_rdev *refdev, | ||
1038 | int minor_version); | 1047 | int minor_version); |
1039 | int (*validate_super)(struct mddev *mddev, struct md_rdev *rdev); | 1048 | int (*validate_super)(struct mddev *mddev, |
1040 | void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); | 1049 | struct md_rdev *rdev); |
1050 | void (*sync_super)(struct mddev *mddev, | ||
1051 | struct md_rdev *rdev); | ||
1041 | unsigned long long (*rdev_size_change)(struct md_rdev *rdev, | 1052 | unsigned long long (*rdev_size_change)(struct md_rdev *rdev, |
1042 | sector_t num_sectors); | 1053 | sector_t num_sectors); |
1054 | int (*allow_new_offset)(struct md_rdev *rdev, | ||
1055 | unsigned long long new_offset); | ||
1043 | }; | 1056 | }; |
1044 | 1057 | ||
1045 | /* | 1058 | /* |
@@ -1111,6 +1124,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor | |||
1111 | 1124 | ||
1112 | rdev->preferred_minor = sb->md_minor; | 1125 | rdev->preferred_minor = sb->md_minor; |
1113 | rdev->data_offset = 0; | 1126 | rdev->data_offset = 0; |
1127 | rdev->new_data_offset = 0; | ||
1114 | rdev->sb_size = MD_SB_BYTES; | 1128 | rdev->sb_size = MD_SB_BYTES; |
1115 | rdev->badblocks.shift = -1; | 1129 | rdev->badblocks.shift = -1; |
1116 | 1130 | ||
@@ -1184,7 +1198,11 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1184 | mddev->dev_sectors = ((sector_t)sb->size) * 2; | 1198 | mddev->dev_sectors = ((sector_t)sb->size) * 2; |
1185 | mddev->events = ev1; | 1199 | mddev->events = ev1; |
1186 | mddev->bitmap_info.offset = 0; | 1200 | mddev->bitmap_info.offset = 0; |
1201 | mddev->bitmap_info.space = 0; | ||
1202 | /* bitmap can use 60 K after the 4K superblocks */ | ||
1187 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; | 1203 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
1204 | mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); | ||
1205 | mddev->reshape_backwards = 0; | ||
1188 | 1206 | ||
1189 | if (mddev->minor_version >= 91) { | 1207 | if (mddev->minor_version >= 91) { |
1190 | mddev->reshape_position = sb->reshape_position; | 1208 | mddev->reshape_position = sb->reshape_position; |
@@ -1192,6 +1210,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1192 | mddev->new_level = sb->new_level; | 1210 | mddev->new_level = sb->new_level; |
1193 | mddev->new_layout = sb->new_layout; | 1211 | mddev->new_layout = sb->new_layout; |
1194 | mddev->new_chunk_sectors = sb->new_chunk >> 9; | 1212 | mddev->new_chunk_sectors = sb->new_chunk >> 9; |
1213 | if (mddev->delta_disks < 0) | ||
1214 | mddev->reshape_backwards = 1; | ||
1195 | } else { | 1215 | } else { |
1196 | mddev->reshape_position = MaxSector; | 1216 | mddev->reshape_position = MaxSector; |
1197 | mddev->delta_disks = 0; | 1217 | mddev->delta_disks = 0; |
@@ -1218,9 +1238,12 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1218 | mddev->max_disks = MD_SB_DISKS; | 1238 | mddev->max_disks = MD_SB_DISKS; |
1219 | 1239 | ||
1220 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && | 1240 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && |
1221 | mddev->bitmap_info.file == NULL) | 1241 | mddev->bitmap_info.file == NULL) { |
1222 | mddev->bitmap_info.offset = | 1242 | mddev->bitmap_info.offset = |
1223 | mddev->bitmap_info.default_offset; | 1243 | mddev->bitmap_info.default_offset; |
1244 | mddev->bitmap_info.space = | ||
1245 | mddev->bitmap_info.default_space; | ||
1246 | } | ||
1224 | 1247 | ||
1225 | } else if (mddev->pers == NULL) { | 1248 | } else if (mddev->pers == NULL) { |
1226 | /* Insist on good event counter while assembling, except | 1249 | /* Insist on good event counter while assembling, except |
@@ -1434,6 +1457,12 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) | |||
1434 | return num_sectors; | 1457 | return num_sectors; |
1435 | } | 1458 | } |
1436 | 1459 | ||
1460 | static int | ||
1461 | super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) | ||
1462 | { | ||
1463 | /* non-zero offset changes not possible with v0.90 */ | ||
1464 | return new_offset == 0; | ||
1465 | } | ||
1437 | 1466 | ||
1438 | /* | 1467 | /* |
1439 | * version 1 superblock | 1468 | * version 1 superblock |
@@ -1469,6 +1498,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ | |||
1469 | struct mdp_superblock_1 *sb; | 1498 | struct mdp_superblock_1 *sb; |
1470 | int ret; | 1499 | int ret; |
1471 | sector_t sb_start; | 1500 | sector_t sb_start; |
1501 | sector_t sectors; | ||
1472 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; | 1502 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; |
1473 | int bmask; | 1503 | int bmask; |
1474 | 1504 | ||
@@ -1523,9 +1553,18 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ | |||
1523 | bdevname(rdev->bdev,b)); | 1553 | bdevname(rdev->bdev,b)); |
1524 | return -EINVAL; | 1554 | return -EINVAL; |
1525 | } | 1555 | } |
1556 | if (sb->pad0 || | ||
1557 | sb->pad3[0] || | ||
1558 | memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) | ||
1559 | /* Some padding is non-zero, might be a new feature */ | ||
1560 | return -EINVAL; | ||
1526 | 1561 | ||
1527 | rdev->preferred_minor = 0xffff; | 1562 | rdev->preferred_minor = 0xffff; |
1528 | rdev->data_offset = le64_to_cpu(sb->data_offset); | 1563 | rdev->data_offset = le64_to_cpu(sb->data_offset); |
1564 | rdev->new_data_offset = rdev->data_offset; | ||
1565 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && | ||
1566 | (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) | ||
1567 | rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); | ||
1529 | atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); | 1568 | atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); |
1530 | 1569 | ||
1531 | rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; | 1570 | rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; |
@@ -1536,6 +1575,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ | |||
1536 | if (minor_version | 1575 | if (minor_version |
1537 | && rdev->data_offset < sb_start + (rdev->sb_size/512)) | 1576 | && rdev->data_offset < sb_start + (rdev->sb_size/512)) |
1538 | return -EINVAL; | 1577 | return -EINVAL; |
1578 | if (minor_version | ||
1579 | && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) | ||
1580 | return -EINVAL; | ||
1539 | 1581 | ||
1540 | if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) | 1582 | if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) |
1541 | rdev->desc_nr = -1; | 1583 | rdev->desc_nr = -1; |
@@ -1607,16 +1649,14 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ | |||
1607 | else | 1649 | else |
1608 | ret = 0; | 1650 | ret = 0; |
1609 | } | 1651 | } |
1610 | if (minor_version) | 1652 | if (minor_version) { |
1611 | rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - | 1653 | sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); |
1612 | le64_to_cpu(sb->data_offset); | 1654 | sectors -= rdev->data_offset; |
1613 | else | 1655 | } else |
1614 | rdev->sectors = rdev->sb_start; | 1656 | sectors = rdev->sb_start; |
1615 | if (rdev->sectors < le64_to_cpu(sb->data_size)) | 1657 | if (sectors < le64_to_cpu(sb->data_size)) |
1616 | return -EINVAL; | 1658 | return -EINVAL; |
1617 | rdev->sectors = le64_to_cpu(sb->data_size); | 1659 | rdev->sectors = le64_to_cpu(sb->data_size); |
1618 | if (le64_to_cpu(sb->size) > rdev->sectors) | ||
1619 | return -EINVAL; | ||
1620 | return ret; | 1660 | return ret; |
1621 | } | 1661 | } |
1622 | 1662 | ||
@@ -1644,17 +1684,37 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1644 | mddev->dev_sectors = le64_to_cpu(sb->size); | 1684 | mddev->dev_sectors = le64_to_cpu(sb->size); |
1645 | mddev->events = ev1; | 1685 | mddev->events = ev1; |
1646 | mddev->bitmap_info.offset = 0; | 1686 | mddev->bitmap_info.offset = 0; |
1687 | mddev->bitmap_info.space = 0; | ||
1688 | /* Default location for bitmap is 1K after superblock | ||
1689 | * using 3K - total of 4K | ||
1690 | */ | ||
1647 | mddev->bitmap_info.default_offset = 1024 >> 9; | 1691 | mddev->bitmap_info.default_offset = 1024 >> 9; |
1648 | 1692 | mddev->bitmap_info.default_space = (4096-1024) >> 9; | |
1693 | mddev->reshape_backwards = 0; | ||
1694 | |||
1649 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); | 1695 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); |
1650 | memcpy(mddev->uuid, sb->set_uuid, 16); | 1696 | memcpy(mddev->uuid, sb->set_uuid, 16); |
1651 | 1697 | ||
1652 | mddev->max_disks = (4096-256)/2; | 1698 | mddev->max_disks = (4096-256)/2; |
1653 | 1699 | ||
1654 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && | 1700 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && |
1655 | mddev->bitmap_info.file == NULL ) | 1701 | mddev->bitmap_info.file == NULL) { |
1656 | mddev->bitmap_info.offset = | 1702 | mddev->bitmap_info.offset = |
1657 | (__s32)le32_to_cpu(sb->bitmap_offset); | 1703 | (__s32)le32_to_cpu(sb->bitmap_offset); |
1704 | /* Metadata doesn't record how much space is available. | ||
1705 | * For 1.0, we assume we can use up to the superblock | ||
1706 | * if before, else to 4K beyond superblock. | ||
1707 | * For others, assume no change is possible. | ||
1708 | */ | ||
1709 | if (mddev->minor_version > 0) | ||
1710 | mddev->bitmap_info.space = 0; | ||
1711 | else if (mddev->bitmap_info.offset > 0) | ||
1712 | mddev->bitmap_info.space = | ||
1713 | 8 - mddev->bitmap_info.offset; | ||
1714 | else | ||
1715 | mddev->bitmap_info.space = | ||
1716 | -mddev->bitmap_info.offset; | ||
1717 | } | ||
1658 | 1718 | ||
1659 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { | 1719 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { |
1660 | mddev->reshape_position = le64_to_cpu(sb->reshape_position); | 1720 | mddev->reshape_position = le64_to_cpu(sb->reshape_position); |
@@ -1662,6 +1722,11 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1662 | mddev->new_level = le32_to_cpu(sb->new_level); | 1722 | mddev->new_level = le32_to_cpu(sb->new_level); |
1663 | mddev->new_layout = le32_to_cpu(sb->new_layout); | 1723 | mddev->new_layout = le32_to_cpu(sb->new_layout); |
1664 | mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); | 1724 | mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); |
1725 | if (mddev->delta_disks < 0 || | ||
1726 | (mddev->delta_disks == 0 && | ||
1727 | (le32_to_cpu(sb->feature_map) | ||
1728 | & MD_FEATURE_RESHAPE_BACKWARDS))) | ||
1729 | mddev->reshape_backwards = 1; | ||
1665 | } else { | 1730 | } else { |
1666 | mddev->reshape_position = MaxSector; | 1731 | mddev->reshape_position = MaxSector; |
1667 | mddev->delta_disks = 0; | 1732 | mddev->delta_disks = 0; |
@@ -1735,7 +1800,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
1735 | sb->feature_map = 0; | 1800 | sb->feature_map = 0; |
1736 | sb->pad0 = 0; | 1801 | sb->pad0 = 0; |
1737 | sb->recovery_offset = cpu_to_le64(0); | 1802 | sb->recovery_offset = cpu_to_le64(0); |
1738 | memset(sb->pad1, 0, sizeof(sb->pad1)); | ||
1739 | memset(sb->pad3, 0, sizeof(sb->pad3)); | 1803 | memset(sb->pad3, 0, sizeof(sb->pad3)); |
1740 | 1804 | ||
1741 | sb->utime = cpu_to_le64((__u64)mddev->utime); | 1805 | sb->utime = cpu_to_le64((__u64)mddev->utime); |
@@ -1757,6 +1821,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
1757 | sb->devflags |= WriteMostly1; | 1821 | sb->devflags |= WriteMostly1; |
1758 | else | 1822 | else |
1759 | sb->devflags &= ~WriteMostly1; | 1823 | sb->devflags &= ~WriteMostly1; |
1824 | sb->data_offset = cpu_to_le64(rdev->data_offset); | ||
1825 | sb->data_size = cpu_to_le64(rdev->sectors); | ||
1760 | 1826 | ||
1761 | if (mddev->bitmap && mddev->bitmap_info.file == NULL) { | 1827 | if (mddev->bitmap && mddev->bitmap_info.file == NULL) { |
1762 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); | 1828 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); |
@@ -1781,6 +1847,16 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
1781 | sb->delta_disks = cpu_to_le32(mddev->delta_disks); | 1847 | sb->delta_disks = cpu_to_le32(mddev->delta_disks); |
1782 | sb->new_level = cpu_to_le32(mddev->new_level); | 1848 | sb->new_level = cpu_to_le32(mddev->new_level); |
1783 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); | 1849 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); |
1850 | if (mddev->delta_disks == 0 && | ||
1851 | mddev->reshape_backwards) | ||
1852 | sb->feature_map | ||
1853 | |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); | ||
1854 | if (rdev->new_data_offset != rdev->data_offset) { | ||
1855 | sb->feature_map | ||
1856 | |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); | ||
1857 | sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset | ||
1858 | - rdev->data_offset)); | ||
1859 | } | ||
1784 | } | 1860 | } |
1785 | 1861 | ||
1786 | if (rdev->badblocks.count == 0) | 1862 | if (rdev->badblocks.count == 0) |
@@ -1857,6 +1933,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) | |||
1857 | sector_t max_sectors; | 1933 | sector_t max_sectors; |
1858 | if (num_sectors && num_sectors < rdev->mddev->dev_sectors) | 1934 | if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
1859 | return 0; /* component must fit device */ | 1935 | return 0; /* component must fit device */ |
1936 | if (rdev->data_offset != rdev->new_data_offset) | ||
1937 | return 0; /* too confusing */ | ||
1860 | if (rdev->sb_start < rdev->data_offset) { | 1938 | if (rdev->sb_start < rdev->data_offset) { |
1861 | /* minor versions 1 and 2; superblock before data */ | 1939 | /* minor versions 1 and 2; superblock before data */ |
1862 | max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; | 1940 | max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; |
@@ -1884,6 +1962,40 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) | |||
1884 | rdev->sb_page); | 1962 | rdev->sb_page); |
1885 | md_super_wait(rdev->mddev); | 1963 | md_super_wait(rdev->mddev); |
1886 | return num_sectors; | 1964 | return num_sectors; |
1965 | |||
1966 | } | ||
1967 | |||
1968 | static int | ||
1969 | super_1_allow_new_offset(struct md_rdev *rdev, | ||
1970 | unsigned long long new_offset) | ||
1971 | { | ||
1972 | /* All necessary checks on new >= old have been done */ | ||
1973 | struct bitmap *bitmap; | ||
1974 | if (new_offset >= rdev->data_offset) | ||
1975 | return 1; | ||
1976 | |||
1977 | /* with 1.0 metadata, there is no metadata to tread on | ||
1978 | * so we can always move back */ | ||
1979 | if (rdev->mddev->minor_version == 0) | ||
1980 | return 1; | ||
1981 | |||
1982 | /* otherwise we must be sure not to step on | ||
1983 | * any metadata, so stay: | ||
1984 | * 36K beyond start of superblock | ||
1985 | * beyond end of badblocks | ||
1986 | * beyond write-intent bitmap | ||
1987 | */ | ||
1988 | if (rdev->sb_start + (32+4)*2 > new_offset) | ||
1989 | return 0; | ||
1990 | bitmap = rdev->mddev->bitmap; | ||
1991 | if (bitmap && !rdev->mddev->bitmap_info.file && | ||
1992 | rdev->sb_start + rdev->mddev->bitmap_info.offset + | ||
1993 | bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) | ||
1994 | return 0; | ||
1995 | if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) | ||
1996 | return 0; | ||
1997 | |||
1998 | return 1; | ||
1887 | } | 1999 | } |
1888 | 2000 | ||
1889 | static struct super_type super_types[] = { | 2001 | static struct super_type super_types[] = { |
@@ -1894,6 +2006,7 @@ static struct super_type super_types[] = { | |||
1894 | .validate_super = super_90_validate, | 2006 | .validate_super = super_90_validate, |
1895 | .sync_super = super_90_sync, | 2007 | .sync_super = super_90_sync, |
1896 | .rdev_size_change = super_90_rdev_size_change, | 2008 | .rdev_size_change = super_90_rdev_size_change, |
2009 | .allow_new_offset = super_90_allow_new_offset, | ||
1897 | }, | 2010 | }, |
1898 | [1] = { | 2011 | [1] = { |
1899 | .name = "md-1", | 2012 | .name = "md-1", |
@@ -1902,6 +2015,7 @@ static struct super_type super_types[] = { | |||
1902 | .validate_super = super_1_validate, | 2015 | .validate_super = super_1_validate, |
1903 | .sync_super = super_1_sync, | 2016 | .sync_super = super_1_sync, |
1904 | .rdev_size_change = super_1_rdev_size_change, | 2017 | .rdev_size_change = super_1_rdev_size_change, |
2018 | .allow_new_offset = super_1_allow_new_offset, | ||
1905 | }, | 2019 | }, |
1906 | }; | 2020 | }; |
1907 | 2021 | ||
@@ -2105,9 +2219,7 @@ static void unbind_rdev_from_array(struct md_rdev * rdev) | |||
2105 | sysfs_remove_link(&rdev->kobj, "block"); | 2219 | sysfs_remove_link(&rdev->kobj, "block"); |
2106 | sysfs_put(rdev->sysfs_state); | 2220 | sysfs_put(rdev->sysfs_state); |
2107 | rdev->sysfs_state = NULL; | 2221 | rdev->sysfs_state = NULL; |
2108 | kfree(rdev->badblocks.page); | ||
2109 | rdev->badblocks.count = 0; | 2222 | rdev->badblocks.count = 0; |
2110 | rdev->badblocks.page = NULL; | ||
2111 | /* We need to delay this, otherwise we can deadlock when | 2223 | /* We need to delay this, otherwise we can deadlock when |
2112 | * writing to 'remove' to "dev/state". We also need | 2224 | * writing to 'remove' to "dev/state". We also need |
2113 | * to delay it due to rcu usage. | 2225 | * to delay it due to rcu usage. |
@@ -2158,7 +2270,7 @@ static void export_rdev(struct md_rdev * rdev) | |||
2158 | bdevname(rdev->bdev,b)); | 2270 | bdevname(rdev->bdev,b)); |
2159 | if (rdev->mddev) | 2271 | if (rdev->mddev) |
2160 | MD_BUG(); | 2272 | MD_BUG(); |
2161 | free_disk_sb(rdev); | 2273 | md_rdev_clear(rdev); |
2162 | #ifndef MODULE | 2274 | #ifndef MODULE |
2163 | if (test_bit(AutoDetected, &rdev->flags)) | 2275 | if (test_bit(AutoDetected, &rdev->flags)) |
2164 | md_autodetect_dev(rdev->bdev->bd_dev); | 2276 | md_autodetect_dev(rdev->bdev->bd_dev); |
@@ -2809,9 +2921,8 @@ offset_show(struct md_rdev *rdev, char *page) | |||
2809 | static ssize_t | 2921 | static ssize_t |
2810 | offset_store(struct md_rdev *rdev, const char *buf, size_t len) | 2922 | offset_store(struct md_rdev *rdev, const char *buf, size_t len) |
2811 | { | 2923 | { |
2812 | char *e; | 2924 | unsigned long long offset; |
2813 | unsigned long long offset = simple_strtoull(buf, &e, 10); | 2925 | if (strict_strtoull(buf, 10, &offset) < 0) |
2814 | if (e==buf || (*e && *e != '\n')) | ||
2815 | return -EINVAL; | 2926 | return -EINVAL; |
2816 | if (rdev->mddev->pers && rdev->raid_disk >= 0) | 2927 | if (rdev->mddev->pers && rdev->raid_disk >= 0) |
2817 | return -EBUSY; | 2928 | return -EBUSY; |
@@ -2826,6 +2937,63 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2826 | static struct rdev_sysfs_entry rdev_offset = | 2937 | static struct rdev_sysfs_entry rdev_offset = |
2827 | __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); | 2938 | __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); |
2828 | 2939 | ||
2940 | static ssize_t new_offset_show(struct md_rdev *rdev, char *page) | ||
2941 | { | ||
2942 | return sprintf(page, "%llu\n", | ||
2943 | (unsigned long long)rdev->new_data_offset); | ||
2944 | } | ||
2945 | |||
2946 | static ssize_t new_offset_store(struct md_rdev *rdev, | ||
2947 | const char *buf, size_t len) | ||
2948 | { | ||
2949 | unsigned long long new_offset; | ||
2950 | struct mddev *mddev = rdev->mddev; | ||
2951 | |||
2952 | if (strict_strtoull(buf, 10, &new_offset) < 0) | ||
2953 | return -EINVAL; | ||
2954 | |||
2955 | if (mddev->sync_thread) | ||
2956 | return -EBUSY; | ||
2957 | if (new_offset == rdev->data_offset) | ||
2958 | /* reset is always permitted */ | ||
2959 | ; | ||
2960 | else if (new_offset > rdev->data_offset) { | ||
2961 | /* must not push array size beyond rdev_sectors */ | ||
2962 | if (new_offset - rdev->data_offset | ||
2963 | + mddev->dev_sectors > rdev->sectors) | ||
2964 | return -E2BIG; | ||
2965 | } | ||
2966 | /* Metadata worries about other space details. */ | ||
2967 | |||
2968 | /* decreasing the offset is inconsistent with a backwards | ||
2969 | * reshape. | ||
2970 | */ | ||
2971 | if (new_offset < rdev->data_offset && | ||
2972 | mddev->reshape_backwards) | ||
2973 | return -EINVAL; | ||
2974 | /* Increasing offset is inconsistent with forwards | ||
2975 | * reshape. reshape_direction should be set to | ||
2976 | * 'backwards' first. | ||
2977 | */ | ||
2978 | if (new_offset > rdev->data_offset && | ||
2979 | !mddev->reshape_backwards) | ||
2980 | return -EINVAL; | ||
2981 | |||
2982 | if (mddev->pers && mddev->persistent && | ||
2983 | !super_types[mddev->major_version] | ||
2984 | .allow_new_offset(rdev, new_offset)) | ||
2985 | return -E2BIG; | ||
2986 | rdev->new_data_offset = new_offset; | ||
2987 | if (new_offset > rdev->data_offset) | ||
2988 | mddev->reshape_backwards = 1; | ||
2989 | else if (new_offset < rdev->data_offset) | ||
2990 | mddev->reshape_backwards = 0; | ||
2991 | |||
2992 | return len; | ||
2993 | } | ||
2994 | static struct rdev_sysfs_entry rdev_new_offset = | ||
2995 | __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); | ||
2996 | |||
2829 | static ssize_t | 2997 | static ssize_t |
2830 | rdev_size_show(struct md_rdev *rdev, char *page) | 2998 | rdev_size_show(struct md_rdev *rdev, char *page) |
2831 | { | 2999 | { |
@@ -2870,6 +3038,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2870 | 3038 | ||
2871 | if (strict_blocks_to_sectors(buf, §ors) < 0) | 3039 | if (strict_blocks_to_sectors(buf, §ors) < 0) |
2872 | return -EINVAL; | 3040 | return -EINVAL; |
3041 | if (rdev->data_offset != rdev->new_data_offset) | ||
3042 | return -EINVAL; /* too confusing */ | ||
2873 | if (my_mddev->pers && rdev->raid_disk >= 0) { | 3043 | if (my_mddev->pers && rdev->raid_disk >= 0) { |
2874 | if (my_mddev->persistent) { | 3044 | if (my_mddev->persistent) { |
2875 | sectors = super_types[my_mddev->major_version]. | 3045 | sectors = super_types[my_mddev->major_version]. |
@@ -3006,6 +3176,7 @@ static struct attribute *rdev_default_attrs[] = { | |||
3006 | &rdev_errors.attr, | 3176 | &rdev_errors.attr, |
3007 | &rdev_slot.attr, | 3177 | &rdev_slot.attr, |
3008 | &rdev_offset.attr, | 3178 | &rdev_offset.attr, |
3179 | &rdev_new_offset.attr, | ||
3009 | &rdev_size.attr, | 3180 | &rdev_size.attr, |
3010 | &rdev_recovery_start.attr, | 3181 | &rdev_recovery_start.attr, |
3011 | &rdev_bad_blocks.attr, | 3182 | &rdev_bad_blocks.attr, |
@@ -3080,6 +3251,7 @@ int md_rdev_init(struct md_rdev *rdev) | |||
3080 | rdev->raid_disk = -1; | 3251 | rdev->raid_disk = -1; |
3081 | rdev->flags = 0; | 3252 | rdev->flags = 0; |
3082 | rdev->data_offset = 0; | 3253 | rdev->data_offset = 0; |
3254 | rdev->new_data_offset = 0; | ||
3083 | rdev->sb_events = 0; | 3255 | rdev->sb_events = 0; |
3084 | rdev->last_read_error.tv_sec = 0; | 3256 | rdev->last_read_error.tv_sec = 0; |
3085 | rdev->last_read_error.tv_nsec = 0; | 3257 | rdev->last_read_error.tv_nsec = 0; |
@@ -3178,8 +3350,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe | |||
3178 | abort_free: | 3350 | abort_free: |
3179 | if (rdev->bdev) | 3351 | if (rdev->bdev) |
3180 | unlock_rdev(rdev); | 3352 | unlock_rdev(rdev); |
3181 | free_disk_sb(rdev); | 3353 | md_rdev_clear(rdev); |
3182 | kfree(rdev->badblocks.page); | ||
3183 | kfree(rdev); | 3354 | kfree(rdev); |
3184 | return ERR_PTR(err); | 3355 | return ERR_PTR(err); |
3185 | } | 3356 | } |
@@ -3419,6 +3590,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3419 | mddev->new_chunk_sectors = mddev->chunk_sectors; | 3590 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
3420 | mddev->raid_disks -= mddev->delta_disks; | 3591 | mddev->raid_disks -= mddev->delta_disks; |
3421 | mddev->delta_disks = 0; | 3592 | mddev->delta_disks = 0; |
3593 | mddev->reshape_backwards = 0; | ||
3422 | module_put(pers->owner); | 3594 | module_put(pers->owner); |
3423 | printk(KERN_WARNING "md: %s: %s would not accept array\n", | 3595 | printk(KERN_WARNING "md: %s: %s would not accept array\n", |
3424 | mdname(mddev), clevel); | 3596 | mdname(mddev), clevel); |
@@ -3492,6 +3664,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3492 | mddev->layout = mddev->new_layout; | 3664 | mddev->layout = mddev->new_layout; |
3493 | mddev->chunk_sectors = mddev->new_chunk_sectors; | 3665 | mddev->chunk_sectors = mddev->new_chunk_sectors; |
3494 | mddev->delta_disks = 0; | 3666 | mddev->delta_disks = 0; |
3667 | mddev->reshape_backwards = 0; | ||
3495 | mddev->degraded = 0; | 3668 | mddev->degraded = 0; |
3496 | if (mddev->pers->sync_request == NULL) { | 3669 | if (mddev->pers->sync_request == NULL) { |
3497 | /* this is now an array without redundancy, so | 3670 | /* this is now an array without redundancy, so |
@@ -3501,10 +3674,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3501 | del_timer_sync(&mddev->safemode_timer); | 3674 | del_timer_sync(&mddev->safemode_timer); |
3502 | } | 3675 | } |
3503 | pers->run(mddev); | 3676 | pers->run(mddev); |
3504 | mddev_resume(mddev); | ||
3505 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 3677 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
3506 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 3678 | mddev_resume(mddev); |
3507 | md_wakeup_thread(mddev->thread); | ||
3508 | sysfs_notify(&mddev->kobj, NULL, "level"); | 3679 | sysfs_notify(&mddev->kobj, NULL, "level"); |
3509 | md_new_event(mddev); | 3680 | md_new_event(mddev); |
3510 | return rv; | 3681 | return rv; |
@@ -3582,9 +3753,20 @@ raid_disks_store(struct mddev *mddev, const char *buf, size_t len) | |||
3582 | if (mddev->pers) | 3753 | if (mddev->pers) |
3583 | rv = update_raid_disks(mddev, n); | 3754 | rv = update_raid_disks(mddev, n); |
3584 | else if (mddev->reshape_position != MaxSector) { | 3755 | else if (mddev->reshape_position != MaxSector) { |
3756 | struct md_rdev *rdev; | ||
3585 | int olddisks = mddev->raid_disks - mddev->delta_disks; | 3757 | int olddisks = mddev->raid_disks - mddev->delta_disks; |
3758 | |||
3759 | rdev_for_each(rdev, mddev) { | ||
3760 | if (olddisks < n && | ||
3761 | rdev->data_offset < rdev->new_data_offset) | ||
3762 | return -EINVAL; | ||
3763 | if (olddisks > n && | ||
3764 | rdev->data_offset > rdev->new_data_offset) | ||
3765 | return -EINVAL; | ||
3766 | } | ||
3586 | mddev->delta_disks = n - olddisks; | 3767 | mddev->delta_disks = n - olddisks; |
3587 | mddev->raid_disks = n; | 3768 | mddev->raid_disks = n; |
3769 | mddev->reshape_backwards = (mddev->delta_disks < 0); | ||
3588 | } else | 3770 | } else |
3589 | mddev->raid_disks = n; | 3771 | mddev->raid_disks = n; |
3590 | return rv ? rv : len; | 3772 | return rv ? rv : len; |
@@ -4266,7 +4448,8 @@ sync_completed_show(struct mddev *mddev, char *page) | |||
4266 | if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 4448 | if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
4267 | return sprintf(page, "none\n"); | 4449 | return sprintf(page, "none\n"); |
4268 | 4450 | ||
4269 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 4451 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || |
4452 | test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | ||
4270 | max_sectors = mddev->resync_max_sectors; | 4453 | max_sectors = mddev->resync_max_sectors; |
4271 | else | 4454 | else |
4272 | max_sectors = mddev->dev_sectors; | 4455 | max_sectors = mddev->dev_sectors; |
@@ -4428,6 +4611,7 @@ reshape_position_show(struct mddev *mddev, char *page) | |||
4428 | static ssize_t | 4611 | static ssize_t |
4429 | reshape_position_store(struct mddev *mddev, const char *buf, size_t len) | 4612 | reshape_position_store(struct mddev *mddev, const char *buf, size_t len) |
4430 | { | 4613 | { |
4614 | struct md_rdev *rdev; | ||
4431 | char *e; | 4615 | char *e; |
4432 | unsigned long long new = simple_strtoull(buf, &e, 10); | 4616 | unsigned long long new = simple_strtoull(buf, &e, 10); |
4433 | if (mddev->pers) | 4617 | if (mddev->pers) |
@@ -4436,9 +4620,12 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len) | |||
4436 | return -EINVAL; | 4620 | return -EINVAL; |
4437 | mddev->reshape_position = new; | 4621 | mddev->reshape_position = new; |
4438 | mddev->delta_disks = 0; | 4622 | mddev->delta_disks = 0; |
4623 | mddev->reshape_backwards = 0; | ||
4439 | mddev->new_level = mddev->level; | 4624 | mddev->new_level = mddev->level; |
4440 | mddev->new_layout = mddev->layout; | 4625 | mddev->new_layout = mddev->layout; |
4441 | mddev->new_chunk_sectors = mddev->chunk_sectors; | 4626 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
4627 | rdev_for_each(rdev, mddev) | ||
4628 | rdev->new_data_offset = rdev->data_offset; | ||
4442 | return len; | 4629 | return len; |
4443 | } | 4630 | } |
4444 | 4631 | ||
@@ -4447,6 +4634,42 @@ __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, | |||
4447 | reshape_position_store); | 4634 | reshape_position_store); |
4448 | 4635 | ||
4449 | static ssize_t | 4636 | static ssize_t |
4637 | reshape_direction_show(struct mddev *mddev, char *page) | ||
4638 | { | ||
4639 | return sprintf(page, "%s\n", | ||
4640 | mddev->reshape_backwards ? "backwards" : "forwards"); | ||
4641 | } | ||
4642 | |||
4643 | static ssize_t | ||
4644 | reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) | ||
4645 | { | ||
4646 | int backwards = 0; | ||
4647 | if (cmd_match(buf, "forwards")) | ||
4648 | backwards = 0; | ||
4649 | else if (cmd_match(buf, "backwards")) | ||
4650 | backwards = 1; | ||
4651 | else | ||
4652 | return -EINVAL; | ||
4653 | if (mddev->reshape_backwards == backwards) | ||
4654 | return len; | ||
4655 | |||
4656 | /* check if we are allowed to change */ | ||
4657 | if (mddev->delta_disks) | ||
4658 | return -EBUSY; | ||
4659 | |||
4660 | if (mddev->persistent && | ||
4661 | mddev->major_version == 0) | ||
4662 | return -EINVAL; | ||
4663 | |||
4664 | mddev->reshape_backwards = backwards; | ||
4665 | return len; | ||
4666 | } | ||
4667 | |||
4668 | static struct md_sysfs_entry md_reshape_direction = | ||
4669 | __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, | ||
4670 | reshape_direction_store); | ||
4671 | |||
4672 | static ssize_t | ||
4450 | array_size_show(struct mddev *mddev, char *page) | 4673 | array_size_show(struct mddev *mddev, char *page) |
4451 | { | 4674 | { |
4452 | if (mddev->external_size) | 4675 | if (mddev->external_size) |
@@ -4501,6 +4724,7 @@ static struct attribute *md_default_attrs[] = { | |||
4501 | &md_safe_delay.attr, | 4724 | &md_safe_delay.attr, |
4502 | &md_array_state.attr, | 4725 | &md_array_state.attr, |
4503 | &md_reshape_position.attr, | 4726 | &md_reshape_position.attr, |
4727 | &md_reshape_direction.attr, | ||
4504 | &md_array_size.attr, | 4728 | &md_array_size.attr, |
4505 | &max_corr_read_errors.attr, | 4729 | &max_corr_read_errors.attr, |
4506 | NULL, | 4730 | NULL, |
@@ -4914,7 +5138,8 @@ int md_run(struct mddev *mddev) | |||
4914 | err = -EINVAL; | 5138 | err = -EINVAL; |
4915 | mddev->pers->stop(mddev); | 5139 | mddev->pers->stop(mddev); |
4916 | } | 5140 | } |
4917 | if (err == 0 && mddev->pers->sync_request) { | 5141 | if (err == 0 && mddev->pers->sync_request && |
5142 | (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { | ||
4918 | err = bitmap_create(mddev); | 5143 | err = bitmap_create(mddev); |
4919 | if (err) { | 5144 | if (err) { |
4920 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", | 5145 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", |
@@ -5064,6 +5289,7 @@ static void md_clean(struct mddev *mddev) | |||
5064 | mddev->events = 0; | 5289 | mddev->events = 0; |
5065 | mddev->can_decrease_events = 0; | 5290 | mddev->can_decrease_events = 0; |
5066 | mddev->delta_disks = 0; | 5291 | mddev->delta_disks = 0; |
5292 | mddev->reshape_backwards = 0; | ||
5067 | mddev->new_level = LEVEL_NONE; | 5293 | mddev->new_level = LEVEL_NONE; |
5068 | mddev->new_layout = 0; | 5294 | mddev->new_layout = 0; |
5069 | mddev->new_chunk_sectors = 0; | 5295 | mddev->new_chunk_sectors = 0; |
@@ -5079,6 +5305,7 @@ static void md_clean(struct mddev *mddev) | |||
5079 | mddev->merge_check_needed = 0; | 5305 | mddev->merge_check_needed = 0; |
5080 | mddev->bitmap_info.offset = 0; | 5306 | mddev->bitmap_info.offset = 0; |
5081 | mddev->bitmap_info.default_offset = 0; | 5307 | mddev->bitmap_info.default_offset = 0; |
5308 | mddev->bitmap_info.default_space = 0; | ||
5082 | mddev->bitmap_info.chunksize = 0; | 5309 | mddev->bitmap_info.chunksize = 0; |
5083 | mddev->bitmap_info.daemon_sleep = 0; | 5310 | mddev->bitmap_info.daemon_sleep = 0; |
5084 | mddev->bitmap_info.max_write_behind = 0; | 5311 | mddev->bitmap_info.max_write_behind = 0; |
@@ -5421,7 +5648,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg) | |||
5421 | goto out; | 5648 | goto out; |
5422 | 5649 | ||
5423 | /* bitmap disabled, zero the first byte and copy out */ | 5650 | /* bitmap disabled, zero the first byte and copy out */ |
5424 | if (!mddev->bitmap || !mddev->bitmap->file) { | 5651 | if (!mddev->bitmap || !mddev->bitmap->storage.file) { |
5425 | file->pathname[0] = '\0'; | 5652 | file->pathname[0] = '\0'; |
5426 | goto copy_out; | 5653 | goto copy_out; |
5427 | } | 5654 | } |
@@ -5430,7 +5657,8 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg) | |||
5430 | if (!buf) | 5657 | if (!buf) |
5431 | goto out; | 5658 | goto out; |
5432 | 5659 | ||
5433 | ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); | 5660 | ptr = d_path(&mddev->bitmap->storage.file->f_path, |
5661 | buf, sizeof(file->pathname)); | ||
5434 | if (IS_ERR(ptr)) | 5662 | if (IS_ERR(ptr)) |
5435 | goto out; | 5663 | goto out; |
5436 | 5664 | ||
@@ -5875,6 +6103,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) | |||
5875 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 6103 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
5876 | 6104 | ||
5877 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; | 6105 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
6106 | mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); | ||
5878 | mddev->bitmap_info.offset = 0; | 6107 | mddev->bitmap_info.offset = 0; |
5879 | 6108 | ||
5880 | mddev->reshape_position = MaxSector; | 6109 | mddev->reshape_position = MaxSector; |
@@ -5888,6 +6117,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) | |||
5888 | mddev->new_chunk_sectors = mddev->chunk_sectors; | 6117 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
5889 | mddev->new_layout = mddev->layout; | 6118 | mddev->new_layout = mddev->layout; |
5890 | mddev->delta_disks = 0; | 6119 | mddev->delta_disks = 0; |
6120 | mddev->reshape_backwards = 0; | ||
5891 | 6121 | ||
5892 | return 0; | 6122 | return 0; |
5893 | } | 6123 | } |
@@ -5922,11 +6152,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) | |||
5922 | */ | 6152 | */ |
5923 | if (mddev->sync_thread) | 6153 | if (mddev->sync_thread) |
5924 | return -EBUSY; | 6154 | return -EBUSY; |
5925 | if (mddev->bitmap) | 6155 | |
5926 | /* Sorry, cannot grow a bitmap yet, just remove it, | ||
5927 | * grow, and re-add. | ||
5928 | */ | ||
5929 | return -EBUSY; | ||
5930 | rdev_for_each(rdev, mddev) { | 6156 | rdev_for_each(rdev, mddev) { |
5931 | sector_t avail = rdev->sectors; | 6157 | sector_t avail = rdev->sectors; |
5932 | 6158 | ||
@@ -5944,6 +6170,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) | |||
5944 | static int update_raid_disks(struct mddev *mddev, int raid_disks) | 6170 | static int update_raid_disks(struct mddev *mddev, int raid_disks) |
5945 | { | 6171 | { |
5946 | int rv; | 6172 | int rv; |
6173 | struct md_rdev *rdev; | ||
5947 | /* change the number of raid disks */ | 6174 | /* change the number of raid disks */ |
5948 | if (mddev->pers->check_reshape == NULL) | 6175 | if (mddev->pers->check_reshape == NULL) |
5949 | return -EINVAL; | 6176 | return -EINVAL; |
@@ -5952,11 +6179,27 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) | |||
5952 | return -EINVAL; | 6179 | return -EINVAL; |
5953 | if (mddev->sync_thread || mddev->reshape_position != MaxSector) | 6180 | if (mddev->sync_thread || mddev->reshape_position != MaxSector) |
5954 | return -EBUSY; | 6181 | return -EBUSY; |
6182 | |||
6183 | rdev_for_each(rdev, mddev) { | ||
6184 | if (mddev->raid_disks < raid_disks && | ||
6185 | rdev->data_offset < rdev->new_data_offset) | ||
6186 | return -EINVAL; | ||
6187 | if (mddev->raid_disks > raid_disks && | ||
6188 | rdev->data_offset > rdev->new_data_offset) | ||
6189 | return -EINVAL; | ||
6190 | } | ||
6191 | |||
5955 | mddev->delta_disks = raid_disks - mddev->raid_disks; | 6192 | mddev->delta_disks = raid_disks - mddev->raid_disks; |
6193 | if (mddev->delta_disks < 0) | ||
6194 | mddev->reshape_backwards = 1; | ||
6195 | else if (mddev->delta_disks > 0) | ||
6196 | mddev->reshape_backwards = 0; | ||
5956 | 6197 | ||
5957 | rv = mddev->pers->check_reshape(mddev); | 6198 | rv = mddev->pers->check_reshape(mddev); |
5958 | if (rv < 0) | 6199 | if (rv < 0) { |
5959 | mddev->delta_disks = 0; | 6200 | mddev->delta_disks = 0; |
6201 | mddev->reshape_backwards = 0; | ||
6202 | } | ||
5960 | return rv; | 6203 | return rv; |
5961 | } | 6204 | } |
5962 | 6205 | ||
@@ -6039,6 +6282,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
6039 | return -EINVAL; | 6282 | return -EINVAL; |
6040 | mddev->bitmap_info.offset = | 6283 | mddev->bitmap_info.offset = |
6041 | mddev->bitmap_info.default_offset; | 6284 | mddev->bitmap_info.default_offset; |
6285 | mddev->bitmap_info.space = | ||
6286 | mddev->bitmap_info.default_space; | ||
6042 | mddev->pers->quiesce(mddev, 1); | 6287 | mddev->pers->quiesce(mddev, 1); |
6043 | rv = bitmap_create(mddev); | 6288 | rv = bitmap_create(mddev); |
6044 | if (!rv) | 6289 | if (!rv) |
@@ -6050,7 +6295,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
6050 | /* remove the bitmap */ | 6295 | /* remove the bitmap */ |
6051 | if (!mddev->bitmap) | 6296 | if (!mddev->bitmap) |
6052 | return -ENOENT; | 6297 | return -ENOENT; |
6053 | if (mddev->bitmap->file) | 6298 | if (mddev->bitmap->storage.file) |
6054 | return -EINVAL; | 6299 | return -EINVAL; |
6055 | mddev->pers->quiesce(mddev, 1); | 6300 | mddev->pers->quiesce(mddev, 1); |
6056 | bitmap_destroy(mddev); | 6301 | bitmap_destroy(mddev); |
@@ -6373,6 +6618,9 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
6373 | struct mddev *mddev = mddev_find(bdev->bd_dev); | 6618 | struct mddev *mddev = mddev_find(bdev->bd_dev); |
6374 | int err; | 6619 | int err; |
6375 | 6620 | ||
6621 | if (!mddev) | ||
6622 | return -ENODEV; | ||
6623 | |||
6376 | if (mddev->gendisk != bdev->bd_disk) { | 6624 | if (mddev->gendisk != bdev->bd_disk) { |
6377 | /* we are racing with mddev_put which is discarding this | 6625 | /* we are racing with mddev_put which is discarding this |
6378 | * bd_disk. | 6626 | * bd_disk. |
@@ -6584,7 +6832,8 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev) | |||
6584 | 6832 | ||
6585 | resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); | 6833 | resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); |
6586 | 6834 | ||
6587 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 6835 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || |
6836 | test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | ||
6588 | max_sectors = mddev->resync_max_sectors; | 6837 | max_sectors = mddev->resync_max_sectors; |
6589 | else | 6838 | else |
6590 | max_sectors = mddev->dev_sectors; | 6839 | max_sectors = mddev->dev_sectors; |
@@ -7147,7 +7396,7 @@ void md_do_sync(struct mddev *mddev) | |||
7147 | j = mddev->recovery_cp; | 7396 | j = mddev->recovery_cp; |
7148 | 7397 | ||
7149 | } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | 7398 | } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) |
7150 | max_sectors = mddev->dev_sectors; | 7399 | max_sectors = mddev->resync_max_sectors; |
7151 | else { | 7400 | else { |
7152 | /* recovery follows the physical size of devices */ | 7401 | /* recovery follows the physical size of devices */ |
7153 | max_sectors = mddev->dev_sectors; | 7402 | max_sectors = mddev->dev_sectors; |
@@ -7598,7 +7847,7 @@ void md_check_recovery(struct mddev *mddev) | |||
7598 | goto unlock; | 7847 | goto unlock; |
7599 | 7848 | ||
7600 | if (mddev->pers->sync_request) { | 7849 | if (mddev->pers->sync_request) { |
7601 | if (spares && mddev->bitmap && ! mddev->bitmap->file) { | 7850 | if (spares) { |
7602 | /* We are adding a device or devices to an array | 7851 | /* We are adding a device or devices to an array |
7603 | * which has the bitmap stored on all devices. | 7852 | * which has the bitmap stored on all devices. |
7604 | * So make sure all bitmap pages get written | 7853 | * So make sure all bitmap pages get written |
@@ -7646,6 +7895,20 @@ void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) | |||
7646 | } | 7895 | } |
7647 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); | 7896 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); |
7648 | 7897 | ||
7898 | void md_finish_reshape(struct mddev *mddev) | ||
7899 | { | ||
7900 | /* called by personality module when reshape completes. */ | ||
7901 | struct md_rdev *rdev; | ||
7902 | |||
7903 | rdev_for_each(rdev, mddev) { | ||
7904 | if (rdev->data_offset > rdev->new_data_offset) | ||
7905 | rdev->sectors += rdev->data_offset - rdev->new_data_offset; | ||
7906 | else | ||
7907 | rdev->sectors -= rdev->new_data_offset - rdev->data_offset; | ||
7908 | rdev->data_offset = rdev->new_data_offset; | ||
7909 | } | ||
7910 | } | ||
7911 | EXPORT_SYMBOL(md_finish_reshape); | ||
7649 | 7912 | ||
7650 | /* Bad block management. | 7913 | /* Bad block management. |
7651 | * We can record which blocks on each device are 'bad' and so just | 7914 | * We can record which blocks on each device are 'bad' and so just |
@@ -7894,10 +8157,15 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, | |||
7894 | } | 8157 | } |
7895 | 8158 | ||
7896 | int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, | 8159 | int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
7897 | int acknowledged) | 8160 | int is_new) |
7898 | { | 8161 | { |
7899 | int rv = md_set_badblocks(&rdev->badblocks, | 8162 | int rv; |
7900 | s + rdev->data_offset, sectors, acknowledged); | 8163 | if (is_new) |
8164 | s += rdev->new_data_offset; | ||
8165 | else | ||
8166 | s += rdev->data_offset; | ||
8167 | rv = md_set_badblocks(&rdev->badblocks, | ||
8168 | s, sectors, 0); | ||
7901 | if (rv) { | 8169 | if (rv) { |
7902 | /* Make sure they get written out promptly */ | 8170 | /* Make sure they get written out promptly */ |
7903 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 8171 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
@@ -8003,11 +8271,15 @@ out: | |||
8003 | return rv; | 8271 | return rv; |
8004 | } | 8272 | } |
8005 | 8273 | ||
8006 | int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors) | 8274 | int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
8275 | int is_new) | ||
8007 | { | 8276 | { |
8277 | if (is_new) | ||
8278 | s += rdev->new_data_offset; | ||
8279 | else | ||
8280 | s += rdev->data_offset; | ||
8008 | return md_clear_badblocks(&rdev->badblocks, | 8281 | return md_clear_badblocks(&rdev->badblocks, |
8009 | s + rdev->data_offset, | 8282 | s, sectors); |
8010 | sectors); | ||
8011 | } | 8283 | } |
8012 | EXPORT_SYMBOL_GPL(rdev_clear_badblocks); | 8284 | EXPORT_SYMBOL_GPL(rdev_clear_badblocks); |
8013 | 8285 | ||
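The md.c hunks above change rdev_set_badblocks()/rdev_clear_badblocks() to take an is_new flag, so callers can record bad blocks against either the current or the post-reshape data offset. The following is a minimal userspace sketch of that mapping only; the struct is a stand-in, not the real md_rdev.

#include <stdio.h>

typedef unsigned long long sector_t;

/* Stand-in for the two offsets the patch keeps in struct md_rdev. */
struct toy_rdev {
	sector_t data_offset;      /* where data starts today */
	sector_t new_data_offset;  /* where data will start after reshape */
};

/* Mirrors the "s += ..." arithmetic added to rdev_set_badblocks(). */
static sector_t bb_device_sector(const struct toy_rdev *rdev,
				 sector_t s, int is_new)
{
	return s + (is_new ? rdev->new_data_offset : rdev->data_offset);
}

int main(void)
{
	struct toy_rdev rdev = { .data_offset = 2048, .new_data_offset = 4096 };

	/* The same array-relative sector maps to different device sectors
	 * depending on which layout the I/O belongs to during a reshape. */
	printf("old layout: sector %llu\n", bb_device_sector(&rdev, 100, 0));
	printf("new layout: sector %llu\n", bb_device_sector(&rdev, 100, 1));
	return 0;
}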
diff --git a/drivers/md/md.h b/drivers/md/md.h index 1c2063ccf48e..7b4a3c318cae 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -55,6 +55,7 @@ struct md_rdev { | |||
55 | int sb_loaded; | 55 | int sb_loaded; |
56 | __u64 sb_events; | 56 | __u64 sb_events; |
57 | sector_t data_offset; /* start of data in array */ | 57 | sector_t data_offset; /* start of data in array */ |
58 | sector_t new_data_offset;/* only relevant while reshaping */ | ||
58 | sector_t sb_start; /* offset of the super block (in 512byte sectors) */ | 59 | sector_t sb_start; /* offset of the super block (in 512byte sectors) */ |
59 | int sb_size; /* bytes in the superblock */ | 60 | int sb_size; /* bytes in the superblock */ |
60 | int preferred_minor; /* autorun support */ | 61 | int preferred_minor; /* autorun support */ |
@@ -193,8 +194,9 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, | |||
193 | return 0; | 194 | return 0; |
194 | } | 195 | } |
195 | extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, | 196 | extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
196 | int acknowledged); | 197 | int is_new); |
197 | extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors); | 198 | extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
199 | int is_new); | ||
198 | extern void md_ack_all_badblocks(struct badblocks *bb); | 200 | extern void md_ack_all_badblocks(struct badblocks *bb); |
199 | 201 | ||
200 | struct mddev { | 202 | struct mddev { |
@@ -262,6 +264,7 @@ struct mddev { | |||
262 | sector_t reshape_position; | 264 | sector_t reshape_position; |
263 | int delta_disks, new_level, new_layout; | 265 | int delta_disks, new_level, new_layout; |
264 | int new_chunk_sectors; | 266 | int new_chunk_sectors; |
267 | int reshape_backwards; | ||
265 | 268 | ||
266 | atomic_t plug_cnt; /* If device is expecting | 269 | atomic_t plug_cnt; /* If device is expecting |
267 | * more bios soon. | 270 | * more bios soon. |
@@ -390,10 +393,13 @@ struct mddev { | |||
390 | * For external metadata, offset | 393 | * For external metadata, offset |
391 | * from start of device. | 394 | * from start of device. |
392 | */ | 395 | */ |
396 | unsigned long space; /* space available at this offset */ | ||
393 | loff_t default_offset; /* this is the offset to use when | 397 | loff_t default_offset; /* this is the offset to use when |
394 | * hot-adding a bitmap. It should | 398 | * hot-adding a bitmap. It should |
395 | * eventually be settable by sysfs. | 399 | * eventually be settable by sysfs. |
396 | */ | 400 | */ |
401 | unsigned long default_space; /* space available at | ||
402 | * default offset */ | ||
397 | struct mutex mutex; | 403 | struct mutex mutex; |
398 | unsigned long chunksize; | 404 | unsigned long chunksize; |
399 | unsigned long daemon_sleep; /* how many jiffies between updates? */ | 405 | unsigned long daemon_sleep; /* how many jiffies between updates? */ |
@@ -591,6 +597,7 @@ extern void md_write_start(struct mddev *mddev, struct bio *bi); | |||
591 | extern void md_write_end(struct mddev *mddev); | 597 | extern void md_write_end(struct mddev *mddev); |
592 | extern void md_done_sync(struct mddev *mddev, int blocks, int ok); | 598 | extern void md_done_sync(struct mddev *mddev, int blocks, int ok); |
593 | extern void md_error(struct mddev *mddev, struct md_rdev *rdev); | 599 | extern void md_error(struct mddev *mddev, struct md_rdev *rdev); |
600 | extern void md_finish_reshape(struct mddev *mddev); | ||
594 | 601 | ||
595 | extern int mddev_congested(struct mddev *mddev, int bits); | 602 | extern int mddev_congested(struct mddev *mddev, int bits); |
596 | extern void md_flush_request(struct mddev *mddev, struct bio *bio); | 603 | extern void md_flush_request(struct mddev *mddev, struct bio *bio); |
@@ -615,6 +622,7 @@ extern int md_run(struct mddev *mddev); | |||
615 | extern void md_stop(struct mddev *mddev); | 622 | extern void md_stop(struct mddev *mddev); |
616 | extern void md_stop_writes(struct mddev *mddev); | 623 | extern void md_stop_writes(struct mddev *mddev); |
617 | extern int md_rdev_init(struct md_rdev *rdev); | 624 | extern int md_rdev_init(struct md_rdev *rdev); |
625 | extern void md_rdev_clear(struct md_rdev *rdev); | ||
618 | 626 | ||
619 | extern void mddev_suspend(struct mddev *mddev); | 627 | extern void mddev_suspend(struct mddev *mddev); |
620 | extern void mddev_resume(struct mddev *mddev); | 628 | extern void mddev_resume(struct mddev *mddev); |
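To illustrate how the new allow_new_offset method slots into the super_types[] dispatch used by new_offset_store() above, here is a hedged, self-contained sketch; the two policy functions are simplified stand-ins for super_90_allow_new_offset() and super_1_allow_new_offset(), not the full checks.

#include <stdio.h>

struct toy_rdev {
	unsigned long long data_offset;
};

struct toy_super_type {
	const char *name;
	int (*allow_new_offset)(struct toy_rdev *rdev,
				unsigned long long new_offset);
};

/* v0.90 has no room to relocate data: only "no change" is acceptable. */
static int allow_v090(struct toy_rdev *rdev, unsigned long long new_offset)
{
	return new_offset == 0;
}

/* Simplified v1.x rule: moving the data start later is always safe; the
 * real code also lets 1.0 arrays move it earlier after further checks. */
static int allow_v1(struct toy_rdev *rdev, unsigned long long new_offset)
{
	return new_offset >= rdev->data_offset;
}

static struct toy_super_type toy_super_types[] = {
	{ .name = "0.90.0", .allow_new_offset = allow_v090 },
	{ .name = "md-1",   .allow_new_offset = allow_v1 },
};

int main(void)
{
	struct toy_rdev rdev = { .data_offset = 2048 };
	int major_version = 1;	/* index into the table, as md.c does */

	if (toy_super_types[major_version].allow_new_offset(&rdev, 262144))
		printf("new data offset accepted\n");
	else
		printf("new data offset rejected\n");
	return 0;
}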
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 15dd59b84e94..835de7168cd3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -1859,7 +1859,9 @@ static void fix_read_error(struct r1conf *conf, int read_disk, | |||
1859 | 1859 | ||
1860 | rdev = conf->mirrors[d].rdev; | 1860 | rdev = conf->mirrors[d].rdev; |
1861 | if (rdev && | 1861 | if (rdev && |
1862 | test_bit(In_sync, &rdev->flags) && | 1862 | (test_bit(In_sync, &rdev->flags) || |
1863 | (!test_bit(Faulty, &rdev->flags) && | ||
1864 | rdev->recovery_offset >= sect + s)) && | ||
1863 | is_badblock(rdev, sect, s, | 1865 | is_badblock(rdev, sect, s, |
1864 | &first_bad, &bad_sectors) == 0 && | 1866 | &first_bad, &bad_sectors) == 0 && |
1865 | sync_page_io(rdev, sect, s<<9, | 1867 | sync_page_io(rdev, sect, s<<9, |
@@ -2024,7 +2026,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio | |||
2024 | continue; | 2026 | continue; |
2025 | if (test_bit(BIO_UPTODATE, &bio->bi_flags) && | 2027 | if (test_bit(BIO_UPTODATE, &bio->bi_flags) && |
2026 | test_bit(R1BIO_MadeGood, &r1_bio->state)) { | 2028 | test_bit(R1BIO_MadeGood, &r1_bio->state)) { |
2027 | rdev_clear_badblocks(rdev, r1_bio->sector, s); | 2029 | rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); |
2028 | } | 2030 | } |
2029 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && | 2031 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && |
2030 | test_bit(R1BIO_WriteError, &r1_bio->state)) { | 2032 | test_bit(R1BIO_WriteError, &r1_bio->state)) { |
@@ -2044,7 +2046,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) | |||
2044 | struct md_rdev *rdev = conf->mirrors[m].rdev; | 2046 | struct md_rdev *rdev = conf->mirrors[m].rdev; |
2045 | rdev_clear_badblocks(rdev, | 2047 | rdev_clear_badblocks(rdev, |
2046 | r1_bio->sector, | 2048 | r1_bio->sector, |
2047 | r1_bio->sectors); | 2049 | r1_bio->sectors, 0); |
2048 | rdev_dec_pending(rdev, conf->mddev); | 2050 | rdev_dec_pending(rdev, conf->mddev); |
2049 | } else if (r1_bio->bios[m] != NULL) { | 2051 | } else if (r1_bio->bios[m] != NULL) { |
2050 | /* This drive got a write error. We need to | 2052 | /* This drive got a write error. We need to |
@@ -2598,7 +2600,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2598 | if (!disk->rdev || | 2600 | if (!disk->rdev || |
2599 | !test_bit(In_sync, &disk->rdev->flags)) { | 2601 | !test_bit(In_sync, &disk->rdev->flags)) { |
2600 | disk->head_position = 0; | 2602 | disk->head_position = 0; |
2601 | if (disk->rdev) | 2603 | if (disk->rdev && |
2604 | (disk->rdev->saved_raid_disk < 0)) | ||
2602 | conf->fullsync = 1; | 2605 | conf->fullsync = 1; |
2603 | } else if (conf->last_used < 0) | 2606 | } else if (conf->last_used < 0) |
2604 | /* | 2607 | /* |
@@ -2750,9 +2753,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) | |||
2750 | * any io in the removed space completes, but it hardly seems | 2753 | * any io in the removed space completes, but it hardly seems |
2751 | * worth it. | 2754 | * worth it. |
2752 | */ | 2755 | */ |
2753 | md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0)); | 2756 | sector_t newsize = raid1_size(mddev, sectors, 0); |
2754 | if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) | 2757 | if (mddev->external_size && |
2758 | mddev->array_sectors > newsize) | ||
2755 | return -EINVAL; | 2759 | return -EINVAL; |
2760 | if (mddev->bitmap) { | ||
2761 | int ret = bitmap_resize(mddev->bitmap, newsize, 0, 0); | ||
2762 | if (ret) | ||
2763 | return ret; | ||
2764 | } | ||
2765 | md_set_array_sectors(mddev, newsize); | ||
2756 | set_capacity(mddev->gendisk, mddev->array_sectors); | 2766 | set_capacity(mddev->gendisk, mddev->array_sectors); |
2757 | revalidate_disk(mddev->gendisk); | 2767 | revalidate_disk(mddev->gendisk); |
2758 | if (sectors > mddev->dev_sectors && | 2768 | if (sectors > mddev->dev_sectors && |
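The raid1_resize() hunk above reorders the resize path so the write-intent bitmap is resized before the new array size is committed. A rough sketch of that ordering follows, with stub functions standing in for bitmap_resize() and md_set_array_sectors().

#include <stdio.h>

typedef unsigned long long sector_t;

/* Stub standing in for bitmap_resize(); the kernel returns -errno. */
static int toy_bitmap_resize(sector_t blocks)
{
	printf("resizing bitmap to cover %llu sectors\n", blocks);
	return 0;
}

/* Stub standing in for md_set_array_sectors()/set_capacity(). */
static void toy_set_array_sectors(sector_t sectors)
{
	printf("array size committed: %llu sectors\n", sectors);
}

/* Mirrors the new ordering: refuse external-size conflicts, grow the
 * bitmap first, and only then publish the new array size. */
static int toy_raid1_resize(sector_t newsize, int external_size,
			    sector_t array_sectors, int have_bitmap)
{
	if (external_size && array_sectors > newsize)
		return -1;	/* -EINVAL in the kernel */
	if (have_bitmap && toy_bitmap_resize(newsize) != 0)
		return -1;
	toy_set_array_sectors(newsize);
	return 0;
}

int main(void)
{
	return toy_raid1_resize(1 << 20, 0, 1 << 19, 1) ? 1 : 0;
}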
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3f91c2e1dfe7..987db37cb875 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
25 | #include <linux/seq_file.h> | 25 | #include <linux/seq_file.h> |
26 | #include <linux/ratelimit.h> | 26 | #include <linux/ratelimit.h> |
27 | #include <linux/kthread.h> | ||
27 | #include "md.h" | 28 | #include "md.h" |
28 | #include "raid10.h" | 29 | #include "raid10.h" |
29 | #include "raid0.h" | 30 | #include "raid0.h" |
@@ -68,6 +69,11 @@ static int max_queued_requests = 1024; | |||
68 | static void allow_barrier(struct r10conf *conf); | 69 | static void allow_barrier(struct r10conf *conf); |
69 | static void lower_barrier(struct r10conf *conf); | 70 | static void lower_barrier(struct r10conf *conf); |
70 | static int enough(struct r10conf *conf, int ignore); | 71 | static int enough(struct r10conf *conf, int ignore); |
72 | static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, | ||
73 | int *skipped); | ||
74 | static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); | ||
75 | static void end_reshape_write(struct bio *bio, int error); | ||
76 | static void end_reshape(struct r10conf *conf); | ||
71 | 77 | ||
72 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) | 78 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) |
73 | { | 79 | { |
@@ -112,7 +118,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
112 | if (!r10_bio) | 118 | if (!r10_bio) |
113 | return NULL; | 119 | return NULL; |
114 | 120 | ||
115 | if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) | 121 | if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || |
122 | test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) | ||
116 | nalloc = conf->copies; /* resync */ | 123 | nalloc = conf->copies; /* resync */ |
117 | else | 124 | else |
118 | nalloc = 2; /* recovery */ | 125 | nalloc = 2; /* recovery */ |
@@ -140,9 +147,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
140 | struct bio *rbio = r10_bio->devs[j].repl_bio; | 147 | struct bio *rbio = r10_bio->devs[j].repl_bio; |
141 | bio = r10_bio->devs[j].bio; | 148 | bio = r10_bio->devs[j].bio; |
142 | for (i = 0; i < RESYNC_PAGES; i++) { | 149 | for (i = 0; i < RESYNC_PAGES; i++) { |
143 | if (j == 1 && !test_bit(MD_RECOVERY_SYNC, | 150 | if (j > 0 && !test_bit(MD_RECOVERY_SYNC, |
144 | &conf->mddev->recovery)) { | 151 | &conf->mddev->recovery)) { |
145 | /* we can share bv_page's during recovery */ | 152 | /* we can share bv_page's during recovery |
153 | * and reshape */ | ||
146 | struct bio *rbio = r10_bio->devs[0].bio; | 154 | struct bio *rbio = r10_bio->devs[0].bio; |
147 | page = rbio->bi_io_vec[i].bv_page; | 155 | page = rbio->bi_io_vec[i].bv_page; |
148 | get_page(page); | 156 | get_page(page); |
@@ -165,10 +173,11 @@ out_free_pages: | |||
165 | while (j--) | 173 | while (j--) |
166 | for (i = 0; i < RESYNC_PAGES ; i++) | 174 | for (i = 0; i < RESYNC_PAGES ; i++) |
167 | safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); | 175 | safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); |
168 | j = -1; | 176 | j = 0; |
169 | out_free_bio: | 177 | out_free_bio: |
170 | while (++j < nalloc) { | 178 | for ( ; j < nalloc; j++) { |
171 | bio_put(r10_bio->devs[j].bio); | 179 | if (r10_bio->devs[j].bio) |
180 | bio_put(r10_bio->devs[j].bio); | ||
172 | if (r10_bio->devs[j].repl_bio) | 181 | if (r10_bio->devs[j].repl_bio) |
173 | bio_put(r10_bio->devs[j].repl_bio); | 182 | bio_put(r10_bio->devs[j].repl_bio); |
174 | } | 183 | } |
@@ -504,79 +513,96 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
504 | * sector offset to a virtual address | 513 | * sector offset to a virtual address |
505 | */ | 514 | */ |
506 | 515 | ||
507 | static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) | 516 | static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) |
508 | { | 517 | { |
509 | int n,f; | 518 | int n,f; |
510 | sector_t sector; | 519 | sector_t sector; |
511 | sector_t chunk; | 520 | sector_t chunk; |
512 | sector_t stripe; | 521 | sector_t stripe; |
513 | int dev; | 522 | int dev; |
514 | |||
515 | int slot = 0; | 523 | int slot = 0; |
516 | 524 | ||
517 | /* now calculate first sector/dev */ | 525 | /* now calculate first sector/dev */ |
518 | chunk = r10bio->sector >> conf->chunk_shift; | 526 | chunk = r10bio->sector >> geo->chunk_shift; |
519 | sector = r10bio->sector & conf->chunk_mask; | 527 | sector = r10bio->sector & geo->chunk_mask; |
520 | 528 | ||
521 | chunk *= conf->near_copies; | 529 | chunk *= geo->near_copies; |
522 | stripe = chunk; | 530 | stripe = chunk; |
523 | dev = sector_div(stripe, conf->raid_disks); | 531 | dev = sector_div(stripe, geo->raid_disks); |
524 | if (conf->far_offset) | 532 | if (geo->far_offset) |
525 | stripe *= conf->far_copies; | 533 | stripe *= geo->far_copies; |
526 | 534 | ||
527 | sector += stripe << conf->chunk_shift; | 535 | sector += stripe << geo->chunk_shift; |
528 | 536 | ||
529 | /* and calculate all the others */ | 537 | /* and calculate all the others */ |
530 | for (n=0; n < conf->near_copies; n++) { | 538 | for (n = 0; n < geo->near_copies; n++) { |
531 | int d = dev; | 539 | int d = dev; |
532 | sector_t s = sector; | 540 | sector_t s = sector; |
533 | r10bio->devs[slot].addr = sector; | 541 | r10bio->devs[slot].addr = sector; |
534 | r10bio->devs[slot].devnum = d; | 542 | r10bio->devs[slot].devnum = d; |
535 | slot++; | 543 | slot++; |
536 | 544 | ||
537 | for (f = 1; f < conf->far_copies; f++) { | 545 | for (f = 1; f < geo->far_copies; f++) { |
538 | d += conf->near_copies; | 546 | d += geo->near_copies; |
539 | if (d >= conf->raid_disks) | 547 | if (d >= geo->raid_disks) |
540 | d -= conf->raid_disks; | 548 | d -= geo->raid_disks; |
541 | s += conf->stride; | 549 | s += geo->stride; |
542 | r10bio->devs[slot].devnum = d; | 550 | r10bio->devs[slot].devnum = d; |
543 | r10bio->devs[slot].addr = s; | 551 | r10bio->devs[slot].addr = s; |
544 | slot++; | 552 | slot++; |
545 | } | 553 | } |
546 | dev++; | 554 | dev++; |
547 | if (dev >= conf->raid_disks) { | 555 | if (dev >= geo->raid_disks) { |
548 | dev = 0; | 556 | dev = 0; |
549 | sector += (conf->chunk_mask + 1); | 557 | sector += (geo->chunk_mask + 1); |
550 | } | 558 | } |
551 | } | 559 | } |
552 | BUG_ON(slot != conf->copies); | 560 | } |
561 | |||
562 | static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) | ||
563 | { | ||
564 | struct geom *geo = &conf->geo; | ||
565 | |||
566 | if (conf->reshape_progress != MaxSector && | ||
567 | ((r10bio->sector >= conf->reshape_progress) != | ||
568 | conf->mddev->reshape_backwards)) { | ||
569 | set_bit(R10BIO_Previous, &r10bio->state); | ||
570 | geo = &conf->prev; | ||
571 | } else | ||
572 | clear_bit(R10BIO_Previous, &r10bio->state); | ||
573 | |||
574 | __raid10_find_phys(geo, r10bio); | ||
553 | } | 575 | } |
554 | 576 | ||
555 | static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) | 577 | static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) |
556 | { | 578 | { |
557 | sector_t offset, chunk, vchunk; | 579 | sector_t offset, chunk, vchunk; |
580 | /* Never use conf->prev as this is only called during resync | ||
581 | * or recovery, so reshape isn't happening | ||
582 | */ | ||
583 | struct geom *geo = &conf->geo; | ||
558 | 584 | ||
559 | offset = sector & conf->chunk_mask; | 585 | offset = sector & geo->chunk_mask; |
560 | if (conf->far_offset) { | 586 | if (geo->far_offset) { |
561 | int fc; | 587 | int fc; |
562 | chunk = sector >> conf->chunk_shift; | 588 | chunk = sector >> geo->chunk_shift; |
563 | fc = sector_div(chunk, conf->far_copies); | 589 | fc = sector_div(chunk, geo->far_copies); |
564 | dev -= fc * conf->near_copies; | 590 | dev -= fc * geo->near_copies; |
565 | if (dev < 0) | 591 | if (dev < 0) |
566 | dev += conf->raid_disks; | 592 | dev += geo->raid_disks; |
567 | } else { | 593 | } else { |
568 | while (sector >= conf->stride) { | 594 | while (sector >= geo->stride) { |
569 | sector -= conf->stride; | 595 | sector -= geo->stride; |
570 | if (dev < conf->near_copies) | 596 | if (dev < geo->near_copies) |
571 | dev += conf->raid_disks - conf->near_copies; | 597 | dev += geo->raid_disks - geo->near_copies; |
572 | else | 598 | else |
573 | dev -= conf->near_copies; | 599 | dev -= geo->near_copies; |
574 | } | 600 | } |
575 | chunk = sector >> conf->chunk_shift; | 601 | chunk = sector >> geo->chunk_shift; |
576 | } | 602 | } |
577 | vchunk = chunk * conf->raid_disks + dev; | 603 | vchunk = chunk * geo->raid_disks + dev; |
578 | sector_div(vchunk, conf->near_copies); | 604 | sector_div(vchunk, geo->near_copies); |
579 | return (vchunk << conf->chunk_shift) + offset; | 605 | return (vchunk << geo->chunk_shift) + offset; |
580 | } | 606 | } |
581 | 607 | ||
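The hunk above threads a struct geom through the address-mapping helpers so that __raid10_find_phys() and raid10_find_virt() can work against either the old or the new layout during a reshape. The fragment below is a minimal userspace model of the 'near' mapping those helpers implement, for readers who want to see the arithmetic in isolation; the struct name, the main() harness and the example geometry (4 disks, 2 near copies, 64 KiB chunks) are illustrative assumptions, and far copies are deliberately left out.

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-in for the kernel's struct geom; 'near' layout only. */
struct geom_model {
    int raid_disks;
    int near_copies;        /* far_copies assumed to be 1 */
    int chunk_shift;        /* chunk size in sectors is 1 << chunk_shift */
    uint64_t chunk_mask;    /* (1 << chunk_shift) - 1 */
};

/* Map one virtual sector to the (device, device-sector) pair of each copy,
 * following the same arithmetic as __raid10_find_phys(). */
static void find_phys(const struct geom_model *g, uint64_t vsect)
{
    uint64_t chunk  = vsect >> g->chunk_shift;
    uint64_t offset = vsect & g->chunk_mask;
    uint64_t stripe = chunk * g->near_copies;
    int dev = (int)(stripe % g->raid_disks);
    uint64_t dsect = ((stripe / g->raid_disks) << g->chunk_shift) + offset;

    for (int n = 0; n < g->near_copies; n++) {
        printf("  copy %d -> disk %d, sector %llu\n",
               n, dev, (unsigned long long)dsect);
        if (++dev >= g->raid_disks) {
            dev = 0;
            dsect += g->chunk_mask + 1;   /* wrap to the next chunk row */
        }
    }
}

int main(void)
{
    struct geom_model g = {
        .raid_disks = 4, .near_copies = 2,
        .chunk_shift = 7, .chunk_mask = (1 << 7) - 1,  /* 64 KiB chunks */
    };
    for (uint64_t s = 0; s < 1000; s += 300) {
        printf("virtual sector %llu:\n", (unsigned long long)s);
        find_phys(&g, s);
    }
    return 0;
}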
582 | /** | 608 | /** |
@@ -597,10 +623,17 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
597 | struct r10conf *conf = mddev->private; | 623 | struct r10conf *conf = mddev->private; |
598 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 624 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
599 | int max; | 625 | int max; |
600 | unsigned int chunk_sectors = mddev->chunk_sectors; | 626 | unsigned int chunk_sectors; |
601 | unsigned int bio_sectors = bvm->bi_size >> 9; | 627 | unsigned int bio_sectors = bvm->bi_size >> 9; |
628 | struct geom *geo = &conf->geo; | ||
629 | |||
630 | chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1; | ||
631 | if (conf->reshape_progress != MaxSector && | ||
632 | ((sector >= conf->reshape_progress) != | ||
633 | conf->mddev->reshape_backwards)) | ||
634 | geo = &conf->prev; | ||
602 | 635 | ||
603 | if (conf->near_copies < conf->raid_disks) { | 636 | if (geo->near_copies < geo->raid_disks) { |
604 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) | 637 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) |
605 | + bio_sectors)) << 9; | 638 | + bio_sectors)) << 9; |
606 | if (max < 0) | 639 | if (max < 0) |
@@ -614,6 +647,12 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
614 | if (mddev->merge_check_needed) { | 647 | if (mddev->merge_check_needed) { |
615 | struct r10bio r10_bio; | 648 | struct r10bio r10_bio; |
616 | int s; | 649 | int s; |
650 | if (conf->reshape_progress != MaxSector) { | ||
651 | /* Cannot give any guidance during reshape */ | ||
652 | if (max <= biovec->bv_len && bio_sectors == 0) | ||
653 | return biovec->bv_len; | ||
654 | return 0; | ||
655 | } | ||
617 | r10_bio.sector = sector; | 656 | r10_bio.sector = sector; |
618 | raid10_find_phys(conf, &r10_bio); | 657 | raid10_find_phys(conf, &r10_bio); |
619 | rcu_read_lock(); | 658 | rcu_read_lock(); |
@@ -681,6 +720,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, | |||
681 | struct md_rdev *rdev, *best_rdev; | 720 | struct md_rdev *rdev, *best_rdev; |
682 | int do_balance; | 721 | int do_balance; |
683 | int best_slot; | 722 | int best_slot; |
723 | struct geom *geo = &conf->geo; | ||
684 | 724 | ||
685 | raid10_find_phys(conf, r10_bio); | 725 | raid10_find_phys(conf, r10_bio); |
686 | rcu_read_lock(); | 726 | rcu_read_lock(); |
@@ -761,11 +801,11 @@ retry: | |||
761 | * sequential read speed for 'far copies' arrays. So only | 801 | * sequential read speed for 'far copies' arrays. So only |
762 | * keep it for 'near' arrays, and review those later. | 802 | * keep it for 'near' arrays, and review those later. |
763 | */ | 803 | */ |
764 | if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) | 804 | if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) |
765 | break; | 805 | break; |
766 | 806 | ||
767 | /* for far > 1 always use the lowest address */ | 807 | /* for far > 1 always use the lowest address */ |
768 | if (conf->far_copies > 1) | 808 | if (geo->far_copies > 1) |
769 | new_distance = r10_bio->devs[slot].addr; | 809 | new_distance = r10_bio->devs[slot].addr; |
770 | else | 810 | else |
771 | new_distance = abs(r10_bio->devs[slot].addr - | 811 | new_distance = abs(r10_bio->devs[slot].addr - |
@@ -812,7 +852,10 @@ static int raid10_congested(void *data, int bits) | |||
812 | if (mddev_congested(mddev, bits)) | 852 | if (mddev_congested(mddev, bits)) |
813 | return 1; | 853 | return 1; |
814 | rcu_read_lock(); | 854 | rcu_read_lock(); |
815 | for (i = 0; i < conf->raid_disks && ret == 0; i++) { | 855 | for (i = 0; |
856 | (i < conf->geo.raid_disks || i < conf->prev.raid_disks) | ||
857 | && ret == 0; | ||
858 | i++) { | ||
816 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); | 859 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); |
817 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 860 | if (rdev && !test_bit(Faulty, &rdev->flags)) { |
818 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 861 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
@@ -973,13 +1016,24 @@ static void unfreeze_array(struct r10conf *conf) | |||
973 | spin_unlock_irq(&conf->resync_lock); | 1016 | spin_unlock_irq(&conf->resync_lock); |
974 | } | 1017 | } |
975 | 1018 | ||
1019 | static sector_t choose_data_offset(struct r10bio *r10_bio, | ||
1020 | struct md_rdev *rdev) | ||
1021 | { | ||
1022 | if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) || | ||
1023 | test_bit(R10BIO_Previous, &r10_bio->state)) | ||
1024 | return rdev->data_offset; | ||
1025 | else | ||
1026 | return rdev->new_data_offset; | ||
1027 | } | ||
1028 | |||
976 | static void make_request(struct mddev *mddev, struct bio * bio) | 1029 | static void make_request(struct mddev *mddev, struct bio * bio) |
977 | { | 1030 | { |
978 | struct r10conf *conf = mddev->private; | 1031 | struct r10conf *conf = mddev->private; |
979 | struct r10bio *r10_bio; | 1032 | struct r10bio *r10_bio; |
980 | struct bio *read_bio; | 1033 | struct bio *read_bio; |
981 | int i; | 1034 | int i; |
982 | int chunk_sects = conf->chunk_mask + 1; | 1035 | sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); |
1036 | int chunk_sects = chunk_mask + 1; | ||
983 | const int rw = bio_data_dir(bio); | 1037 | const int rw = bio_data_dir(bio); |
984 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); | 1038 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); |
985 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); | 1039 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); |
@@ -988,6 +1042,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
988 | int plugged; | 1042 | int plugged; |
989 | int sectors_handled; | 1043 | int sectors_handled; |
990 | int max_sectors; | 1044 | int max_sectors; |
1045 | int sectors; | ||
991 | 1046 | ||
992 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { | 1047 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
993 | md_flush_request(mddev, bio); | 1048 | md_flush_request(mddev, bio); |
@@ -997,9 +1052,10 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
997 | /* If this request crosses a chunk boundary, we need to | 1052 | /* If this request crosses a chunk boundary, we need to |
998 | * split it. This will only happen for 1 PAGE (or less) requests. | 1053 | * split it. This will only happen for 1 PAGE (or less) requests. |
999 | */ | 1054 | */ |
1000 | if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) | 1055 | if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9) |
1001 | > chunk_sects && | 1056 | > chunk_sects |
1002 | conf->near_copies < conf->raid_disks)) { | 1057 | && (conf->geo.near_copies < conf->geo.raid_disks |
1058 | || conf->prev.near_copies < conf->prev.raid_disks))) { | ||
1003 | struct bio_pair *bp; | 1059 | struct bio_pair *bp; |
1004 | /* Sanity check -- queue functions should prevent this happening */ | 1060 | /* Sanity check -- queue functions should prevent this happening */ |
1005 | if (bio->bi_vcnt != 1 || | 1061 | if (bio->bi_vcnt != 1 || |
@@ -1051,10 +1107,41 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1051 | */ | 1107 | */ |
1052 | wait_barrier(conf); | 1108 | wait_barrier(conf); |
1053 | 1109 | ||
1110 | sectors = bio->bi_size >> 9; | ||
1111 | while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | ||
1112 | bio->bi_sector < conf->reshape_progress && | ||
1113 | bio->bi_sector + sectors > conf->reshape_progress) { | ||
1114 | /* IO spans the reshape position. Need to wait for | ||
1115 | * reshape to pass | ||
1116 | */ | ||
1117 | allow_barrier(conf); | ||
1118 | wait_event(conf->wait_barrier, | ||
1119 | conf->reshape_progress <= bio->bi_sector || | ||
1120 | conf->reshape_progress >= bio->bi_sector + sectors); | ||
1121 | wait_barrier(conf); | ||
1122 | } | ||
1123 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | ||
1124 | bio_data_dir(bio) == WRITE && | ||
1125 | (mddev->reshape_backwards | ||
1126 | ? (bio->bi_sector < conf->reshape_safe && | ||
1127 | bio->bi_sector + sectors > conf->reshape_progress) | ||
1128 | : (bio->bi_sector + sectors > conf->reshape_safe && | ||
1129 | bio->bi_sector < conf->reshape_progress))) { | ||
1130 | /* Need to update reshape_position in metadata */ | ||
1131 | mddev->reshape_position = conf->reshape_progress; | ||
1132 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
1133 | set_bit(MD_CHANGE_PENDING, &mddev->flags); | ||
1134 | md_wakeup_thread(mddev->thread); | ||
1135 | wait_event(mddev->sb_wait, | ||
1136 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | ||
1137 | |||
1138 | conf->reshape_safe = mddev->reshape_position; | ||
1139 | } | ||
1140 | |||
1054 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | 1141 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); |
1055 | 1142 | ||
1056 | r10_bio->master_bio = bio; | 1143 | r10_bio->master_bio = bio; |
1057 | r10_bio->sectors = bio->bi_size >> 9; | 1144 | r10_bio->sectors = sectors; |
1058 | 1145 | ||
1059 | r10_bio->mddev = mddev; | 1146 | r10_bio->mddev = mddev; |
1060 | r10_bio->sector = bio->bi_sector; | 1147 | r10_bio->sector = bio->bi_sector; |
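The new code added to make_request() above introduces two reshape-related gates before a request is issued: an I/O that straddles conf->reshape_progress blocks until the copy moves past it, and a write into the window between reshape_safe and reshape_progress forces reshape_position out to the metadata first. A compact restatement of those two conditions as a userspace sketch; the function and parameter names are assumptions, not kernel symbols.

#include <stdbool.h>
#include <stdint.h>

/* True when [start, start + sectors) contains the current reshape point,
 * i.e. part of the request is in the old layout and part in the new one. */
static bool spans_reshape(uint64_t start, uint64_t sectors, uint64_t progress)
{
    return start < progress && start + sectors > progress;
}

/* True when a write lands in the already-copied but not-yet-recorded window,
 * so reshape_position must reach the superblock before the write proceeds.
 * 'safe' is the last checkpoint written to the metadata (reshape_safe). */
static bool write_needs_metadata_update(uint64_t start, uint64_t sectors,
                                        uint64_t safe, uint64_t progress,
                                        bool backwards)
{
    if (backwards)
        return start < safe && start + sectors > progress;
    return start + sectors > safe && start < progress;
}

int main(void)
{
    /* an 8-sector write at 100 while the reshape point sits at 104 */
    return !(spans_reshape(100, 8, 104) &&
             write_needs_metadata_update(100, 8, 96, 104, false));
}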
@@ -1093,7 +1180,7 @@ read_again: | |||
1093 | r10_bio->devs[slot].rdev = rdev; | 1180 | r10_bio->devs[slot].rdev = rdev; |
1094 | 1181 | ||
1095 | read_bio->bi_sector = r10_bio->devs[slot].addr + | 1182 | read_bio->bi_sector = r10_bio->devs[slot].addr + |
1096 | rdev->data_offset; | 1183 | choose_data_offset(r10_bio, rdev); |
1097 | read_bio->bi_bdev = rdev->bdev; | 1184 | read_bio->bi_bdev = rdev->bdev; |
1098 | read_bio->bi_end_io = raid10_end_read_request; | 1185 | read_bio->bi_end_io = raid10_end_read_request; |
1099 | read_bio->bi_rw = READ | do_sync; | 1186 | read_bio->bi_rw = READ | do_sync; |
@@ -1297,7 +1384,8 @@ retry_write: | |||
1297 | r10_bio->devs[i].bio = mbio; | 1384 | r10_bio->devs[i].bio = mbio; |
1298 | 1385 | ||
1299 | mbio->bi_sector = (r10_bio->devs[i].addr+ | 1386 | mbio->bi_sector = (r10_bio->devs[i].addr+ |
1300 | conf->mirrors[d].rdev->data_offset); | 1387 | choose_data_offset(r10_bio, |
1388 | conf->mirrors[d].rdev)); | ||
1301 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; | 1389 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; |
1302 | mbio->bi_end_io = raid10_end_write_request; | 1390 | mbio->bi_end_io = raid10_end_write_request; |
1303 | mbio->bi_rw = WRITE | do_sync | do_fua; | 1391 | mbio->bi_rw = WRITE | do_sync | do_fua; |
@@ -1321,8 +1409,10 @@ retry_write: | |||
1321 | * so it cannot disappear, so the replacement cannot | 1409 | * so it cannot disappear, so the replacement cannot |
1322 | * become NULL here | 1410 | * become NULL here |
1323 | */ | 1411 | */ |
1324 | mbio->bi_sector = (r10_bio->devs[i].addr+ | 1412 | mbio->bi_sector = (r10_bio->devs[i].addr + |
1325 | conf->mirrors[d].replacement->data_offset); | 1413 | choose_data_offset( |
1414 | r10_bio, | ||
1415 | conf->mirrors[d].replacement)); | ||
1326 | mbio->bi_bdev = conf->mirrors[d].replacement->bdev; | 1416 | mbio->bi_bdev = conf->mirrors[d].replacement->bdev; |
1327 | mbio->bi_end_io = raid10_end_write_request; | 1417 | mbio->bi_end_io = raid10_end_write_request; |
1328 | mbio->bi_rw = WRITE | do_sync | do_fua; | 1418 | mbio->bi_rw = WRITE | do_sync | do_fua; |
@@ -1368,19 +1458,19 @@ static void status(struct seq_file *seq, struct mddev *mddev) | |||
1368 | struct r10conf *conf = mddev->private; | 1458 | struct r10conf *conf = mddev->private; |
1369 | int i; | 1459 | int i; |
1370 | 1460 | ||
1371 | if (conf->near_copies < conf->raid_disks) | 1461 | if (conf->geo.near_copies < conf->geo.raid_disks) |
1372 | seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); | 1462 | seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); |
1373 | if (conf->near_copies > 1) | 1463 | if (conf->geo.near_copies > 1) |
1374 | seq_printf(seq, " %d near-copies", conf->near_copies); | 1464 | seq_printf(seq, " %d near-copies", conf->geo.near_copies); |
1375 | if (conf->far_copies > 1) { | 1465 | if (conf->geo.far_copies > 1) { |
1376 | if (conf->far_offset) | 1466 | if (conf->geo.far_offset) |
1377 | seq_printf(seq, " %d offset-copies", conf->far_copies); | 1467 | seq_printf(seq, " %d offset-copies", conf->geo.far_copies); |
1378 | else | 1468 | else |
1379 | seq_printf(seq, " %d far-copies", conf->far_copies); | 1469 | seq_printf(seq, " %d far-copies", conf->geo.far_copies); |
1380 | } | 1470 | } |
1381 | seq_printf(seq, " [%d/%d] [", conf->raid_disks, | 1471 | seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, |
1382 | conf->raid_disks - mddev->degraded); | 1472 | conf->geo.raid_disks - mddev->degraded); |
1383 | for (i = 0; i < conf->raid_disks; i++) | 1473 | for (i = 0; i < conf->geo.raid_disks; i++) |
1384 | seq_printf(seq, "%s", | 1474 | seq_printf(seq, "%s", |
1385 | conf->mirrors[i].rdev && | 1475 | conf->mirrors[i].rdev && |
1386 | test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); | 1476 | test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); |
@@ -1392,7 +1482,7 @@ static void status(struct seq_file *seq, struct mddev *mddev) | |||
1392 | * Don't consider the device numbered 'ignore' | 1482 | * Don't consider the device numbered 'ignore' |
1393 | * as we might be about to remove it. | 1483 | * as we might be about to remove it. |
1394 | */ | 1484 | */ |
1395 | static int enough(struct r10conf *conf, int ignore) | 1485 | static int _enough(struct r10conf *conf, struct geom *geo, int ignore) |
1396 | { | 1486 | { |
1397 | int first = 0; | 1487 | int first = 0; |
1398 | 1488 | ||
@@ -1403,7 +1493,7 @@ static int enough(struct r10conf *conf, int ignore) | |||
1403 | if (conf->mirrors[first].rdev && | 1493 | if (conf->mirrors[first].rdev && |
1404 | first != ignore) | 1494 | first != ignore) |
1405 | cnt++; | 1495 | cnt++; |
1406 | first = (first+1) % conf->raid_disks; | 1496 | first = (first+1) % geo->raid_disks; |
1407 | } | 1497 | } |
1408 | if (cnt == 0) | 1498 | if (cnt == 0) |
1409 | return 0; | 1499 | return 0; |
@@ -1411,6 +1501,12 @@ static int enough(struct r10conf *conf, int ignore) | |||
1411 | return 1; | 1501 | return 1; |
1412 | } | 1502 | } |
1413 | 1503 | ||
1504 | static int enough(struct r10conf *conf, int ignore) | ||
1505 | { | ||
1506 | return _enough(conf, &conf->geo, ignore) && | ||
1507 | _enough(conf, &conf->prev, ignore); | ||
1508 | } | ||
1509 | |||
1414 | static void error(struct mddev *mddev, struct md_rdev *rdev) | 1510 | static void error(struct mddev *mddev, struct md_rdev *rdev) |
1415 | { | 1511 | { |
1416 | char b[BDEVNAME_SIZE]; | 1512 | char b[BDEVNAME_SIZE]; |
@@ -1445,7 +1541,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) | |||
1445 | "md/raid10:%s: Disk failure on %s, disabling device.\n" | 1541 | "md/raid10:%s: Disk failure on %s, disabling device.\n" |
1446 | "md/raid10:%s: Operation continuing on %d devices.\n", | 1542 | "md/raid10:%s: Operation continuing on %d devices.\n", |
1447 | mdname(mddev), bdevname(rdev->bdev, b), | 1543 | mdname(mddev), bdevname(rdev->bdev, b), |
1448 | mdname(mddev), conf->raid_disks - mddev->degraded); | 1544 | mdname(mddev), conf->geo.raid_disks - mddev->degraded); |
1449 | } | 1545 | } |
1450 | 1546 | ||
1451 | static void print_conf(struct r10conf *conf) | 1547 | static void print_conf(struct r10conf *conf) |
@@ -1458,10 +1554,10 @@ static void print_conf(struct r10conf *conf) | |||
1458 | printk(KERN_DEBUG "(!conf)\n"); | 1554 | printk(KERN_DEBUG "(!conf)\n"); |
1459 | return; | 1555 | return; |
1460 | } | 1556 | } |
1461 | printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, | 1557 | printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, |
1462 | conf->raid_disks); | 1558 | conf->geo.raid_disks); |
1463 | 1559 | ||
1464 | for (i = 0; i < conf->raid_disks; i++) { | 1560 | for (i = 0; i < conf->geo.raid_disks; i++) { |
1465 | char b[BDEVNAME_SIZE]; | 1561 | char b[BDEVNAME_SIZE]; |
1466 | tmp = conf->mirrors + i; | 1562 | tmp = conf->mirrors + i; |
1467 | if (tmp->rdev) | 1563 | if (tmp->rdev) |
@@ -1493,7 +1589,7 @@ static int raid10_spare_active(struct mddev *mddev) | |||
1493 | * Find all non-in_sync disks within the RAID10 configuration | 1589 | * Find all non-in_sync disks within the RAID10 configuration |
1494 | * and mark them in_sync | 1590 | * and mark them in_sync |
1495 | */ | 1591 | */ |
1496 | for (i = 0; i < conf->raid_disks; i++) { | 1592 | for (i = 0; i < conf->geo.raid_disks; i++) { |
1497 | tmp = conf->mirrors + i; | 1593 | tmp = conf->mirrors + i; |
1498 | if (tmp->replacement | 1594 | if (tmp->replacement |
1499 | && tmp->replacement->recovery_offset == MaxSector | 1595 | && tmp->replacement->recovery_offset == MaxSector |
@@ -1535,7 +1631,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1535 | int err = -EEXIST; | 1631 | int err = -EEXIST; |
1536 | int mirror; | 1632 | int mirror; |
1537 | int first = 0; | 1633 | int first = 0; |
1538 | int last = conf->raid_disks - 1; | 1634 | int last = conf->geo.raid_disks - 1; |
1539 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 1635 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
1540 | 1636 | ||
1541 | if (mddev->recovery_cp < MaxSector) | 1637 | if (mddev->recovery_cp < MaxSector) |
@@ -1543,7 +1639,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1543 | * very different from resync | 1639 | * very different from resync |
1544 | */ | 1640 | */ |
1545 | return -EBUSY; | 1641 | return -EBUSY; |
1546 | if (rdev->saved_raid_disk < 0 && !enough(conf, -1)) | 1642 | if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1)) |
1547 | return -EINVAL; | 1643 | return -EINVAL; |
1548 | 1644 | ||
1549 | if (rdev->raid_disk >= 0) | 1645 | if (rdev->raid_disk >= 0) |
@@ -1635,6 +1731,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1635 | if (!test_bit(Faulty, &rdev->flags) && | 1731 | if (!test_bit(Faulty, &rdev->flags) && |
1636 | mddev->recovery_disabled != p->recovery_disabled && | 1732 | mddev->recovery_disabled != p->recovery_disabled && |
1637 | (!p->replacement || p->replacement == rdev) && | 1733 | (!p->replacement || p->replacement == rdev) && |
1734 | number < conf->geo.raid_disks && | ||
1638 | enough(conf, -1)) { | 1735 | enough(conf, -1)) { |
1639 | err = -EBUSY; | 1736 | err = -EBUSY; |
1640 | goto abort; | 1737 | goto abort; |
@@ -1676,7 +1773,11 @@ static void end_sync_read(struct bio *bio, int error) | |||
1676 | struct r10conf *conf = r10_bio->mddev->private; | 1773 | struct r10conf *conf = r10_bio->mddev->private; |
1677 | int d; | 1774 | int d; |
1678 | 1775 | ||
1679 | d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); | 1776 | if (bio == r10_bio->master_bio) { |
1777 | /* this is a reshape read */ | ||
1778 | d = r10_bio->read_slot; /* really the read dev */ | ||
1779 | } else | ||
1780 | d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); | ||
1680 | 1781 | ||
1681 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | 1782 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) |
1682 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 1783 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
@@ -2218,7 +2319,9 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2218 | " (%d sectors at %llu on %s)\n", | 2319 | " (%d sectors at %llu on %s)\n", |
2219 | mdname(mddev), s, | 2320 | mdname(mddev), s, |
2220 | (unsigned long long)( | 2321 | (unsigned long long)( |
2221 | sect + rdev->data_offset), | 2322 | sect + |
2323 | choose_data_offset(r10_bio, | ||
2324 | rdev)), | ||
2222 | bdevname(rdev->bdev, b)); | 2325 | bdevname(rdev->bdev, b)); |
2223 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | 2326 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " |
2224 | "drive\n", | 2327 | "drive\n", |
@@ -2256,7 +2359,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2256 | " (%d sectors at %llu on %s)\n", | 2359 | " (%d sectors at %llu on %s)\n", |
2257 | mdname(mddev), s, | 2360 | mdname(mddev), s, |
2258 | (unsigned long long)( | 2361 | (unsigned long long)( |
2259 | sect + rdev->data_offset), | 2362 | sect + |
2363 | choose_data_offset(r10_bio, rdev)), | ||
2260 | bdevname(rdev->bdev, b)); | 2364 | bdevname(rdev->bdev, b)); |
2261 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | 2365 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " |
2262 | "drive\n", | 2366 | "drive\n", |
@@ -2269,7 +2373,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2269 | " (%d sectors at %llu on %s)\n", | 2373 | " (%d sectors at %llu on %s)\n", |
2270 | mdname(mddev), s, | 2374 | mdname(mddev), s, |
2271 | (unsigned long long)( | 2375 | (unsigned long long)( |
2272 | sect + rdev->data_offset), | 2376 | sect + |
2377 | choose_data_offset(r10_bio, rdev)), | ||
2273 | bdevname(rdev->bdev, b)); | 2378 | bdevname(rdev->bdev, b)); |
2274 | atomic_add(s, &rdev->corrected_errors); | 2379 | atomic_add(s, &rdev->corrected_errors); |
2275 | } | 2380 | } |
@@ -2343,7 +2448,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) | |||
2343 | wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 2448 | wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
2344 | md_trim_bio(wbio, sector - bio->bi_sector, sectors); | 2449 | md_trim_bio(wbio, sector - bio->bi_sector, sectors); |
2345 | wbio->bi_sector = (r10_bio->devs[i].addr+ | 2450 | wbio->bi_sector = (r10_bio->devs[i].addr+ |
2346 | rdev->data_offset+ | 2451 | choose_data_offset(r10_bio, rdev) + |
2347 | (sector - r10_bio->sector)); | 2452 | (sector - r10_bio->sector)); |
2348 | wbio->bi_bdev = rdev->bdev; | 2453 | wbio->bi_bdev = rdev->bdev; |
2349 | if (submit_bio_wait(WRITE, wbio) == 0) | 2454 | if (submit_bio_wait(WRITE, wbio) == 0) |
@@ -2420,7 +2525,7 @@ read_more: | |||
2420 | r10_bio->devs[slot].bio = bio; | 2525 | r10_bio->devs[slot].bio = bio; |
2421 | r10_bio->devs[slot].rdev = rdev; | 2526 | r10_bio->devs[slot].rdev = rdev; |
2422 | bio->bi_sector = r10_bio->devs[slot].addr | 2527 | bio->bi_sector = r10_bio->devs[slot].addr |
2423 | + rdev->data_offset; | 2528 | + choose_data_offset(r10_bio, rdev); |
2424 | bio->bi_bdev = rdev->bdev; | 2529 | bio->bi_bdev = rdev->bdev; |
2425 | bio->bi_rw = READ | do_sync; | 2530 | bio->bi_rw = READ | do_sync; |
2426 | bio->bi_private = r10_bio; | 2531 | bio->bi_private = r10_bio; |
@@ -2480,7 +2585,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
2480 | rdev_clear_badblocks( | 2585 | rdev_clear_badblocks( |
2481 | rdev, | 2586 | rdev, |
2482 | r10_bio->devs[m].addr, | 2587 | r10_bio->devs[m].addr, |
2483 | r10_bio->sectors); | 2588 | r10_bio->sectors, 0); |
2484 | } else { | 2589 | } else { |
2485 | if (!rdev_set_badblocks( | 2590 | if (!rdev_set_badblocks( |
2486 | rdev, | 2591 | rdev, |
@@ -2496,7 +2601,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
2496 | rdev_clear_badblocks( | 2601 | rdev_clear_badblocks( |
2497 | rdev, | 2602 | rdev, |
2498 | r10_bio->devs[m].addr, | 2603 | r10_bio->devs[m].addr, |
2499 | r10_bio->sectors); | 2604 | r10_bio->sectors, 0); |
2500 | } else { | 2605 | } else { |
2501 | if (!rdev_set_badblocks( | 2606 | if (!rdev_set_badblocks( |
2502 | rdev, | 2607 | rdev, |
@@ -2515,7 +2620,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
2515 | rdev_clear_badblocks( | 2620 | rdev_clear_badblocks( |
2516 | rdev, | 2621 | rdev, |
2517 | r10_bio->devs[m].addr, | 2622 | r10_bio->devs[m].addr, |
2518 | r10_bio->sectors); | 2623 | r10_bio->sectors, 0); |
2519 | rdev_dec_pending(rdev, conf->mddev); | 2624 | rdev_dec_pending(rdev, conf->mddev); |
2520 | } else if (bio != NULL && | 2625 | } else if (bio != NULL && |
2521 | !test_bit(BIO_UPTODATE, &bio->bi_flags)) { | 2626 | !test_bit(BIO_UPTODATE, &bio->bi_flags)) { |
@@ -2532,7 +2637,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
2532 | rdev_clear_badblocks( | 2637 | rdev_clear_badblocks( |
2533 | rdev, | 2638 | rdev, |
2534 | r10_bio->devs[m].addr, | 2639 | r10_bio->devs[m].addr, |
2535 | r10_bio->sectors); | 2640 | r10_bio->sectors, 0); |
2536 | rdev_dec_pending(rdev, conf->mddev); | 2641 | rdev_dec_pending(rdev, conf->mddev); |
2537 | } | 2642 | } |
2538 | } | 2643 | } |
@@ -2573,6 +2678,8 @@ static void raid10d(struct mddev *mddev) | |||
2573 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || | 2678 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
2574 | test_bit(R10BIO_WriteError, &r10_bio->state)) | 2679 | test_bit(R10BIO_WriteError, &r10_bio->state)) |
2575 | handle_write_completed(conf, r10_bio); | 2680 | handle_write_completed(conf, r10_bio); |
2681 | else if (test_bit(R10BIO_IsReshape, &r10_bio->state)) | ||
2682 | reshape_request_write(mddev, r10_bio); | ||
2576 | else if (test_bit(R10BIO_IsSync, &r10_bio->state)) | 2683 | else if (test_bit(R10BIO_IsSync, &r10_bio->state)) |
2577 | sync_request_write(mddev, r10_bio); | 2684 | sync_request_write(mddev, r10_bio); |
2578 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) | 2685 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) |
@@ -2603,7 +2710,7 @@ static int init_resync(struct r10conf *conf) | |||
2603 | buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; | 2710 | buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; |
2604 | BUG_ON(conf->r10buf_pool); | 2711 | BUG_ON(conf->r10buf_pool); |
2605 | conf->have_replacement = 0; | 2712 | conf->have_replacement = 0; |
2606 | for (i = 0; i < conf->raid_disks; i++) | 2713 | for (i = 0; i < conf->geo.raid_disks; i++) |
2607 | if (conf->mirrors[i].replacement) | 2714 | if (conf->mirrors[i].replacement) |
2608 | conf->have_replacement = 1; | 2715 | conf->have_replacement = 1; |
2609 | conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); | 2716 | conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); |
@@ -2657,6 +2764,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2657 | sector_t sync_blocks; | 2764 | sector_t sync_blocks; |
2658 | sector_t sectors_skipped = 0; | 2765 | sector_t sectors_skipped = 0; |
2659 | int chunks_skipped = 0; | 2766 | int chunks_skipped = 0; |
2767 | sector_t chunk_mask = conf->geo.chunk_mask; | ||
2660 | 2768 | ||
2661 | if (!conf->r10buf_pool) | 2769 | if (!conf->r10buf_pool) |
2662 | if (init_resync(conf)) | 2770 | if (init_resync(conf)) |
@@ -2664,7 +2772,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2664 | 2772 | ||
2665 | skipped: | 2773 | skipped: |
2666 | max_sector = mddev->dev_sectors; | 2774 | max_sector = mddev->dev_sectors; |
2667 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 2775 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || |
2776 | test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | ||
2668 | max_sector = mddev->resync_max_sectors; | 2777 | max_sector = mddev->resync_max_sectors; |
2669 | if (sector_nr >= max_sector) { | 2778 | if (sector_nr >= max_sector) { |
2670 | /* If we aborted, we need to abort the | 2779 | /* If we aborted, we need to abort the |
@@ -2676,11 +2785,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2676 | * we need to convert that to several | 2785 | * we need to convert that to several |
2677 | * virtual addresses. | 2786 | * virtual addresses. |
2678 | */ | 2787 | */ |
2788 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { | ||
2789 | end_reshape(conf); | ||
2790 | return 0; | ||
2791 | } | ||
2792 | |||
2679 | if (mddev->curr_resync < max_sector) { /* aborted */ | 2793 | if (mddev->curr_resync < max_sector) { /* aborted */ |
2680 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 2794 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) |
2681 | bitmap_end_sync(mddev->bitmap, mddev->curr_resync, | 2795 | bitmap_end_sync(mddev->bitmap, mddev->curr_resync, |
2682 | &sync_blocks, 1); | 2796 | &sync_blocks, 1); |
2683 | else for (i=0; i<conf->raid_disks; i++) { | 2797 | else for (i = 0; i < conf->geo.raid_disks; i++) { |
2684 | sector_t sect = | 2798 | sector_t sect = |
2685 | raid10_find_virt(conf, mddev->curr_resync, i); | 2799 | raid10_find_virt(conf, mddev->curr_resync, i); |
2686 | bitmap_end_sync(mddev->bitmap, sect, | 2800 | bitmap_end_sync(mddev->bitmap, sect, |
@@ -2694,7 +2808,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2694 | /* Completed a full sync so the replacements | 2808 | /* Completed a full sync so the replacements |
2695 | * are now fully recovered. | 2809 | * are now fully recovered. |
2696 | */ | 2810 | */ |
2697 | for (i = 0; i < conf->raid_disks; i++) | 2811 | for (i = 0; i < conf->geo.raid_disks; i++) |
2698 | if (conf->mirrors[i].replacement) | 2812 | if (conf->mirrors[i].replacement) |
2699 | conf->mirrors[i].replacement | 2813 | conf->mirrors[i].replacement |
2700 | ->recovery_offset | 2814 | ->recovery_offset |
@@ -2707,7 +2821,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2707 | *skipped = 1; | 2821 | *skipped = 1; |
2708 | return sectors_skipped; | 2822 | return sectors_skipped; |
2709 | } | 2823 | } |
2710 | if (chunks_skipped >= conf->raid_disks) { | 2824 | |
2825 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | ||
2826 | return reshape_request(mddev, sector_nr, skipped); | ||
2827 | |||
2828 | if (chunks_skipped >= conf->geo.raid_disks) { | ||
2711 | /* if there has been nothing to do on any drive, | 2829 | /* if there has been nothing to do on any drive, |
2712 | * then there is nothing to do at all.. | 2830 | * then there is nothing to do at all.. |
2713 | */ | 2831 | */ |
@@ -2721,9 +2839,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2721 | /* make sure whole request will fit in a chunk - if chunks | 2839 | /* make sure whole request will fit in a chunk - if chunks |
2722 | * are meaningful | 2840 | * are meaningful |
2723 | */ | 2841 | */ |
2724 | if (conf->near_copies < conf->raid_disks && | 2842 | if (conf->geo.near_copies < conf->geo.raid_disks && |
2725 | max_sector > (sector_nr | conf->chunk_mask)) | 2843 | max_sector > (sector_nr | chunk_mask)) |
2726 | max_sector = (sector_nr | conf->chunk_mask) + 1; | 2844 | max_sector = (sector_nr | chunk_mask) + 1; |
2727 | /* | 2845 | /* |
2728 | * If there is non-resync activity waiting for us then | 2846 | * If there is non-resync activity waiting for us then |
2729 | * put in a delay to throttle resync. | 2847 | * put in a delay to throttle resync. |
@@ -2752,7 +2870,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2752 | int j; | 2870 | int j; |
2753 | r10_bio = NULL; | 2871 | r10_bio = NULL; |
2754 | 2872 | ||
2755 | for (i=0 ; i<conf->raid_disks; i++) { | 2873 | for (i = 0 ; i < conf->geo.raid_disks; i++) { |
2756 | int still_degraded; | 2874 | int still_degraded; |
2757 | struct r10bio *rb2; | 2875 | struct r10bio *rb2; |
2758 | sector_t sect; | 2876 | sector_t sect; |
@@ -2806,7 +2924,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2806 | /* Need to check if the array will still be | 2924 | /* Need to check if the array will still be |
2807 | * degraded | 2925 | * degraded |
2808 | */ | 2926 | */ |
2809 | for (j=0; j<conf->raid_disks; j++) | 2927 | for (j = 0; j < conf->geo.raid_disks; j++) |
2810 | if (conf->mirrors[j].rdev == NULL || | 2928 | if (conf->mirrors[j].rdev == NULL || |
2811 | test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { | 2929 | test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { |
2812 | still_degraded = 1; | 2930 | still_degraded = 1; |
@@ -2984,9 +3102,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2984 | r10_bio->sector = sector_nr; | 3102 | r10_bio->sector = sector_nr; |
2985 | set_bit(R10BIO_IsSync, &r10_bio->state); | 3103 | set_bit(R10BIO_IsSync, &r10_bio->state); |
2986 | raid10_find_phys(conf, r10_bio); | 3104 | raid10_find_phys(conf, r10_bio); |
2987 | r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; | 3105 | r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; |
2988 | 3106 | ||
2989 | for (i=0; i<conf->copies; i++) { | 3107 | for (i = 0; i < conf->copies; i++) { |
2990 | int d = r10_bio->devs[i].devnum; | 3108 | int d = r10_bio->devs[i].devnum; |
2991 | sector_t first_bad, sector; | 3109 | sector_t first_bad, sector; |
2992 | int bad_sectors; | 3110 | int bad_sectors; |
@@ -3152,16 +3270,17 @@ raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) | |||
3152 | struct r10conf *conf = mddev->private; | 3270 | struct r10conf *conf = mddev->private; |
3153 | 3271 | ||
3154 | if (!raid_disks) | 3272 | if (!raid_disks) |
3155 | raid_disks = conf->raid_disks; | 3273 | raid_disks = min(conf->geo.raid_disks, |
3274 | conf->prev.raid_disks); | ||
3156 | if (!sectors) | 3275 | if (!sectors) |
3157 | sectors = conf->dev_sectors; | 3276 | sectors = conf->dev_sectors; |
3158 | 3277 | ||
3159 | size = sectors >> conf->chunk_shift; | 3278 | size = sectors >> conf->geo.chunk_shift; |
3160 | sector_div(size, conf->far_copies); | 3279 | sector_div(size, conf->geo.far_copies); |
3161 | size = size * raid_disks; | 3280 | size = size * raid_disks; |
3162 | sector_div(size, conf->near_copies); | 3281 | sector_div(size, conf->geo.near_copies); |
3163 | 3282 | ||
3164 | return size << conf->chunk_shift; | 3283 | return size << conf->geo.chunk_shift; |
3165 | } | 3284 | } |
3166 | 3285 | ||
3167 | static void calc_sectors(struct r10conf *conf, sector_t size) | 3286 | static void calc_sectors(struct r10conf *conf, sector_t size) |
@@ -3171,10 +3290,10 @@ static void calc_sectors(struct r10conf *conf, sector_t size) | |||
3171 | * conf->stride | 3290 | * conf->stride |
3172 | */ | 3291 | */ |
3173 | 3292 | ||
3174 | size = size >> conf->chunk_shift; | 3293 | size = size >> conf->geo.chunk_shift; |
3175 | sector_div(size, conf->far_copies); | 3294 | sector_div(size, conf->geo.far_copies); |
3176 | size = size * conf->raid_disks; | 3295 | size = size * conf->geo.raid_disks; |
3177 | sector_div(size, conf->near_copies); | 3296 | sector_div(size, conf->geo.near_copies); |
3178 | /* 'size' is now the number of chunks in the array */ | 3297 | /* 'size' is now the number of chunks in the array */ |
3179 | /* calculate "used chunks per device" */ | 3298 | /* calculate "used chunks per device" */ |
3180 | size = size * conf->copies; | 3299 | size = size * conf->copies; |
@@ -3182,38 +3301,76 @@ static void calc_sectors(struct r10conf *conf, sector_t size) | |||
3182 | /* We need to round up when dividing by raid_disks to | 3301 | /* We need to round up when dividing by raid_disks to |
3183 | * get the stride size. | 3302 | * get the stride size. |
3184 | */ | 3303 | */ |
3185 | size = DIV_ROUND_UP_SECTOR_T(size, conf->raid_disks); | 3304 | size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks); |
3186 | 3305 | ||
3187 | conf->dev_sectors = size << conf->chunk_shift; | 3306 | conf->dev_sectors = size << conf->geo.chunk_shift; |
3188 | 3307 | ||
3189 | if (conf->far_offset) | 3308 | if (conf->geo.far_offset) |
3190 | conf->stride = 1 << conf->chunk_shift; | 3309 | conf->geo.stride = 1 << conf->geo.chunk_shift; |
3191 | else { | 3310 | else { |
3192 | sector_div(size, conf->far_copies); | 3311 | sector_div(size, conf->geo.far_copies); |
3193 | conf->stride = size << conf->chunk_shift; | 3312 | conf->geo.stride = size << conf->geo.chunk_shift; |
3194 | } | 3313 | } |
3195 | } | 3314 | } |
3196 | 3315 | ||
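raid10_size() and calc_sectors() above now read the geometry from conf->geo, but the capacity arithmetic itself is unchanged: chunks per device, divided by far copies, multiplied by disks, divided by near copies. A standalone sketch with assumed example numbers (4 x 1 TiB devices, near=2, far=1, 512 KiB chunks) to make the ordering concrete:

#include <stdio.h>
#include <stdint.h>

static uint64_t raid10_capacity(uint64_t dev_sectors, int raid_disks,
                                int near_copies, int far_copies,
                                int chunk_shift)
{
    uint64_t size = dev_sectors >> chunk_shift;  /* chunks per device */
    size /= far_copies;     /* each far copy consumes a whole device copy */
    size *= raid_disks;
    size /= near_copies;    /* near copies consume adjacent chunks */
    return size << chunk_shift;
}

int main(void)
{
    /* 4 x 1 TiB devices, near=2, far=1, 512 KiB chunks (1024 sectors):
     * prints 4294967296 sectors, i.e. 2 TiB of usable space. */
    uint64_t per_dev = 2ULL * 1024 * 1024 * 1024;   /* sectors per device */
    printf("array sectors: %llu\n",
           (unsigned long long)raid10_capacity(per_dev, 4, 2, 1, 10));
    return 0;
}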
3316 | enum geo_type {geo_new, geo_old, geo_start}; | ||
3317 | static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) | ||
3318 | { | ||
3319 | int nc, fc, fo; | ||
3320 | int layout, chunk, disks; | ||
3321 | switch (new) { | ||
3322 | case geo_old: | ||
3323 | layout = mddev->layout; | ||
3324 | chunk = mddev->chunk_sectors; | ||
3325 | disks = mddev->raid_disks - mddev->delta_disks; | ||
3326 | break; | ||
3327 | case geo_new: | ||
3328 | layout = mddev->new_layout; | ||
3329 | chunk = mddev->new_chunk_sectors; | ||
3330 | disks = mddev->raid_disks; | ||
3331 | break; | ||
3332 | default: /* avoid 'may be unused' warnings */ | ||
3333 | case geo_start: /* new when starting reshape - raid_disks not | ||
3334 | * updated yet. */ | ||
3335 | layout = mddev->new_layout; | ||
3336 | chunk = mddev->new_chunk_sectors; | ||
3337 | disks = mddev->raid_disks + mddev->delta_disks; | ||
3338 | break; | ||
3339 | } | ||
3340 | if (layout >> 17) | ||
3341 | return -1; | ||
3342 | if (chunk < (PAGE_SIZE >> 9) || | ||
3343 | !is_power_of_2(chunk)) | ||
3344 | return -2; | ||
3345 | nc = layout & 255; | ||
3346 | fc = (layout >> 8) & 255; | ||
3347 | fo = layout & (1<<16); | ||
3348 | geo->raid_disks = disks; | ||
3349 | geo->near_copies = nc; | ||
3350 | geo->far_copies = fc; | ||
3351 | geo->far_offset = fo; | ||
3352 | geo->chunk_mask = chunk - 1; | ||
3353 | geo->chunk_shift = ffz(~chunk); | ||
3354 | return nc*fc; | ||
3355 | } | ||
3356 | |||
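setup_geo() centralises the decoding of the raid10 layout word: near copies in bits 0-7, far copies in bits 8-15, the 'offset' flag in bit 16, with the product of the two copy counts returned. Below is a standalone decoder built on the same bit slicing; the sample values in main() are assumed to correspond to mdadm's n2/f2/o2 layouts and are given for illustration only.

#include <stdio.h>

struct layout { int near, far, offset; };

static struct layout decode_layout(int layout)
{
    struct layout l = {
        .near   = layout & 255,            /* bits 0-7  */
        .far    = (layout >> 8) & 255,     /* bits 8-15 */
        .offset = !!(layout & (1 << 16)),  /* bit 16    */
    };
    return l;
}

int main(void)
{
    int samples[] = { 0x102, 0x201, 0x10201 };  /* assumed n2, f2, o2 */
    for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        struct layout l = decode_layout(samples[i]);
        printf("0x%x -> near=%d far=%d offset=%d copies=%d\n",
               samples[i], l.near, l.far, l.offset, l.near * l.far);
    }
    return 0;
}

As in setup_geo(), the total number of copies is near * far; that product is what the reshape code insists must not change.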
3197 | static struct r10conf *setup_conf(struct mddev *mddev) | 3357 | static struct r10conf *setup_conf(struct mddev *mddev) |
3198 | { | 3358 | { |
3199 | struct r10conf *conf = NULL; | 3359 | struct r10conf *conf = NULL; |
3200 | int nc, fc, fo; | ||
3201 | int err = -EINVAL; | 3360 | int err = -EINVAL; |
3361 | struct geom geo; | ||
3362 | int copies; | ||
3363 | |||
3364 | copies = setup_geo(&geo, mddev, geo_new); | ||
3202 | 3365 | ||
3203 | if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || | 3366 | if (copies == -2) { |
3204 | !is_power_of_2(mddev->new_chunk_sectors)) { | ||
3205 | printk(KERN_ERR "md/raid10:%s: chunk size must be " | 3367 | printk(KERN_ERR "md/raid10:%s: chunk size must be " |
3206 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", | 3368 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", |
3207 | mdname(mddev), PAGE_SIZE); | 3369 | mdname(mddev), PAGE_SIZE); |
3208 | goto out; | 3370 | goto out; |
3209 | } | 3371 | } |
3210 | 3372 | ||
3211 | nc = mddev->new_layout & 255; | 3373 | if (copies < 2 || copies > mddev->raid_disks) { |
3212 | fc = (mddev->new_layout >> 8) & 255; | ||
3213 | fo = mddev->new_layout & (1<<16); | ||
3214 | |||
3215 | if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || | ||
3216 | (mddev->new_layout >> 17)) { | ||
3217 | printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", | 3374 | printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", |
3218 | mdname(mddev), mddev->new_layout); | 3375 | mdname(mddev), mddev->new_layout); |
3219 | goto out; | 3376 | goto out; |
@@ -3224,7 +3381,9 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
3224 | if (!conf) | 3381 | if (!conf) |
3225 | goto out; | 3382 | goto out; |
3226 | 3383 | ||
3227 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, | 3384 | /* FIXME calc properly */ |
3385 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks + | ||
3386 | max(0,mddev->delta_disks)), | ||
3228 | GFP_KERNEL); | 3387 | GFP_KERNEL); |
3229 | if (!conf->mirrors) | 3388 | if (!conf->mirrors) |
3230 | goto out; | 3389 | goto out; |
@@ -3233,22 +3392,29 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
3233 | if (!conf->tmppage) | 3392 | if (!conf->tmppage) |
3234 | goto out; | 3393 | goto out; |
3235 | 3394 | ||
3236 | 3395 | conf->geo = geo; | |
3237 | conf->raid_disks = mddev->raid_disks; | 3396 | conf->copies = copies; |
3238 | conf->near_copies = nc; | ||
3239 | conf->far_copies = fc; | ||
3240 | conf->copies = nc*fc; | ||
3241 | conf->far_offset = fo; | ||
3242 | conf->chunk_mask = mddev->new_chunk_sectors - 1; | ||
3243 | conf->chunk_shift = ffz(~mddev->new_chunk_sectors); | ||
3244 | |||
3245 | conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, | 3397 | conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, |
3246 | r10bio_pool_free, conf); | 3398 | r10bio_pool_free, conf); |
3247 | if (!conf->r10bio_pool) | 3399 | if (!conf->r10bio_pool) |
3248 | goto out; | 3400 | goto out; |
3249 | 3401 | ||
3250 | calc_sectors(conf, mddev->dev_sectors); | 3402 | calc_sectors(conf, mddev->dev_sectors); |
3251 | 3403 | if (mddev->reshape_position == MaxSector) { | |
3404 | conf->prev = conf->geo; | ||
3405 | conf->reshape_progress = MaxSector; | ||
3406 | } else { | ||
3407 | if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { | ||
3408 | err = -EINVAL; | ||
3409 | goto out; | ||
3410 | } | ||
3411 | conf->reshape_progress = mddev->reshape_position; | ||
3412 | if (conf->prev.far_offset) | ||
3413 | conf->prev.stride = 1 << conf->prev.chunk_shift; | ||
3414 | else | ||
3415 | /* far_copies must be 1 */ | ||
3416 | conf->prev.stride = conf->dev_sectors; | ||
3417 | } | ||
3252 | spin_lock_init(&conf->device_lock); | 3418 | spin_lock_init(&conf->device_lock); |
3253 | INIT_LIST_HEAD(&conf->retry_list); | 3419 | INIT_LIST_HEAD(&conf->retry_list); |
3254 | 3420 | ||
@@ -3263,8 +3429,9 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
3263 | return conf; | 3429 | return conf; |
3264 | 3430 | ||
3265 | out: | 3431 | out: |
3266 | printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", | 3432 | if (err == -ENOMEM) |
3267 | mdname(mddev)); | 3433 | printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", |
3434 | mdname(mddev)); | ||
3268 | if (conf) { | 3435 | if (conf) { |
3269 | if (conf->r10bio_pool) | 3436 | if (conf->r10bio_pool) |
3270 | mempool_destroy(conf->r10bio_pool); | 3437 | mempool_destroy(conf->r10bio_pool); |
@@ -3282,12 +3449,8 @@ static int run(struct mddev *mddev) | |||
3282 | struct mirror_info *disk; | 3449 | struct mirror_info *disk; |
3283 | struct md_rdev *rdev; | 3450 | struct md_rdev *rdev; |
3284 | sector_t size; | 3451 | sector_t size; |
3285 | 3452 | sector_t min_offset_diff = 0; | |
3286 | /* | 3453 | int first = 1; |
3287 | * copy the already verified devices into our private RAID10 | ||
3288 | * bookkeeping area. [whatever we allocate in run(), | ||
3289 | * should be freed in stop()] | ||
3290 | */ | ||
3291 | 3454 | ||
3292 | if (mddev->private == NULL) { | 3455 | if (mddev->private == NULL) { |
3293 | conf = setup_conf(mddev); | 3456 | conf = setup_conf(mddev); |
@@ -3304,17 +3467,20 @@ static int run(struct mddev *mddev) | |||
3304 | 3467 | ||
3305 | chunk_size = mddev->chunk_sectors << 9; | 3468 | chunk_size = mddev->chunk_sectors << 9; |
3306 | blk_queue_io_min(mddev->queue, chunk_size); | 3469 | blk_queue_io_min(mddev->queue, chunk_size); |
3307 | if (conf->raid_disks % conf->near_copies) | 3470 | if (conf->geo.raid_disks % conf->geo.near_copies) |
3308 | blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); | 3471 | blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); |
3309 | else | 3472 | else |
3310 | blk_queue_io_opt(mddev->queue, chunk_size * | 3473 | blk_queue_io_opt(mddev->queue, chunk_size * |
3311 | (conf->raid_disks / conf->near_copies)); | 3474 | (conf->geo.raid_disks / conf->geo.near_copies)); |
3312 | 3475 | ||
3313 | rdev_for_each(rdev, mddev) { | 3476 | rdev_for_each(rdev, mddev) { |
3477 | long long diff; | ||
3314 | 3478 | ||
3315 | disk_idx = rdev->raid_disk; | 3479 | disk_idx = rdev->raid_disk; |
3316 | if (disk_idx >= conf->raid_disks | 3480 | if (disk_idx < 0) |
3317 | || disk_idx < 0) | 3481 | continue; |
3482 | if (disk_idx >= conf->geo.raid_disks && | ||
3483 | disk_idx >= conf->prev.raid_disks) | ||
3318 | continue; | 3484 | continue; |
3319 | disk = conf->mirrors + disk_idx; | 3485 | disk = conf->mirrors + disk_idx; |
3320 | 3486 | ||
@@ -3327,12 +3493,20 @@ static int run(struct mddev *mddev) | |||
3327 | goto out_free_conf; | 3493 | goto out_free_conf; |
3328 | disk->rdev = rdev; | 3494 | disk->rdev = rdev; |
3329 | } | 3495 | } |
3496 | diff = (rdev->new_data_offset - rdev->data_offset); | ||
3497 | if (!mddev->reshape_backwards) | ||
3498 | diff = -diff; | ||
3499 | if (diff < 0) | ||
3500 | diff = 0; | ||
3501 | if (first || diff < min_offset_diff) | ||
3502 | min_offset_diff = diff; | ||
3330 | 3503 | ||
3331 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 3504 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
3332 | rdev->data_offset << 9); | 3505 | rdev->data_offset << 9); |
3333 | 3506 | ||
3334 | disk->head_position = 0; | 3507 | disk->head_position = 0; |
3335 | } | 3508 | } |
3509 | |||
3336 | /* need to check that every block has at least one working mirror */ | 3510 | /* need to check that every block has at least one working mirror */ |
3337 | if (!enough(conf, -1)) { | 3511 | if (!enough(conf, -1)) { |
3338 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", | 3512 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", |
@@ -3340,8 +3514,21 @@ static int run(struct mddev *mddev) | |||
3340 | goto out_free_conf; | 3514 | goto out_free_conf; |
3341 | } | 3515 | } |
3342 | 3516 | ||
3517 | if (conf->reshape_progress != MaxSector) { | ||
3518 | /* must ensure that shape change is supported */ | ||
3519 | if (conf->geo.far_copies != 1 && | ||
3520 | conf->geo.far_offset == 0) | ||
3521 | goto out_free_conf; | ||
3522 | if (conf->prev.far_copies != 1 && | ||
3523 | conf->geo.far_offset == 0) | ||
3524 | goto out_free_conf; | ||
3525 | } | ||
3526 | |||
3343 | mddev->degraded = 0; | 3527 | mddev->degraded = 0; |
3344 | for (i = 0; i < conf->raid_disks; i++) { | 3528 | for (i = 0; |
3529 | i < conf->geo.raid_disks | ||
3530 | || i < conf->prev.raid_disks; | ||
3531 | i++) { | ||
3345 | 3532 | ||
3346 | disk = conf->mirrors + i; | 3533 | disk = conf->mirrors + i; |
3347 | 3534 | ||
@@ -3368,8 +3555,8 @@ static int run(struct mddev *mddev) | |||
3368 | mdname(mddev)); | 3555 | mdname(mddev)); |
3369 | printk(KERN_INFO | 3556 | printk(KERN_INFO |
3370 | "md/raid10:%s: active with %d out of %d devices\n", | 3557 | "md/raid10:%s: active with %d out of %d devices\n", |
3371 | mdname(mddev), conf->raid_disks - mddev->degraded, | 3558 | mdname(mddev), conf->geo.raid_disks - mddev->degraded, |
3372 | conf->raid_disks); | 3559 | conf->geo.raid_disks); |
3373 | /* | 3560 | /* |
3374 | * Ok, everything is just fine now | 3561 | * Ok, everything is just fine now |
3375 | */ | 3562 | */ |
@@ -3386,11 +3573,11 @@ static int run(struct mddev *mddev) | |||
3386 | * maybe... | 3573 | * maybe... |
3387 | */ | 3574 | */ |
3388 | { | 3575 | { |
3389 | int stripe = conf->raid_disks * | 3576 | int stripe = conf->geo.raid_disks * |
3390 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); | 3577 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); |
3391 | stripe /= conf->near_copies; | 3578 | stripe /= conf->geo.near_copies; |
3392 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) | 3579 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) |
3393 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | 3580 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; |
3394 | } | 3581 | } |
3395 | 3582 | ||
3396 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); | 3583 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); |
@@ -3398,6 +3585,30 @@ static int run(struct mddev *mddev) | |||
3398 | if (md_integrity_register(mddev)) | 3585 | if (md_integrity_register(mddev)) |
3399 | goto out_free_conf; | 3586 | goto out_free_conf; |
3400 | 3587 | ||
3588 | if (conf->reshape_progress != MaxSector) { | ||
3589 | unsigned long before_length, after_length; | ||
3590 | |||
3591 | before_length = ((1 << conf->prev.chunk_shift) * | ||
3592 | conf->prev.far_copies); | ||
3593 | after_length = ((1 << conf->geo.chunk_shift) * | ||
3594 | conf->geo.far_copies); | ||
3595 | |||
3596 | if (max(before_length, after_length) > min_offset_diff) { | ||
3597 | /* This cannot work */ | ||
3598 | printk("md/raid10: offset difference not enough to continue reshape\n"); | ||
3599 | goto out_free_conf; | ||
3600 | } | ||
3601 | conf->offset_diff = min_offset_diff; | ||
3602 | |||
3603 | conf->reshape_safe = conf->reshape_progress; | ||
3604 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
3605 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | ||
3606 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | ||
3607 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
3608 | mddev->sync_thread = md_register_thread(md_do_sync, mddev, | ||
3609 | "reshape"); | ||
3610 | } | ||
3611 | |||
3401 | return 0; | 3612 | return 0; |
3402 | 3613 | ||
3403 | out_free_conf: | 3614 | out_free_conf: |
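When run() finds a reshape in progress it recomputes min_offset_diff (the smallest old/new data_offset gap across the members) and refuses to continue unless that gap covers the larger of the old and new per-device copy spans, so the copy can never overwrite data it has not read yet; raid10_start_reshape() further below applies the same rule when a reshape is first requested. A hedged standalone restatement of that check, with illustrative numbers:

#include <stdio.h>
#include <stdint.h>

static uint64_t copy_span(int chunk_shift, int far_copies)
{
    return (1ULL << chunk_shift) * far_copies;   /* sectors touched per step */
}

/* 'min_offset_diff' is the smallest (new_data_offset - data_offset) gap,
 * taken in the direction of the reshape and clamped to 0 when negative. */
static int reshape_offset_ok(uint64_t min_offset_diff,
                             int old_shift, int old_far,
                             int new_shift, int new_far)
{
    uint64_t before = copy_span(old_shift, old_far);
    uint64_t after  = copy_span(new_shift, new_far);
    return (before > after ? before : after) <= min_offset_diff;
}

int main(void)
{
    /* 512 KiB chunks before and after, far_copies == 1:
     * a 1024-sector (512 KiB) data_offset gap is just enough. */
    printf("ok=%d\n", reshape_offset_ok(1024, 10, 1, 10, 1));
    printf("ok=%d\n", reshape_offset_ok(512, 10, 1, 10, 1));
    return 0;
}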
@@ -3460,14 +3671,23 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) | |||
3460 | struct r10conf *conf = mddev->private; | 3671 | struct r10conf *conf = mddev->private; |
3461 | sector_t oldsize, size; | 3672 | sector_t oldsize, size; |
3462 | 3673 | ||
3463 | if (conf->far_copies > 1 && !conf->far_offset) | 3674 | if (mddev->reshape_position != MaxSector) |
3675 | return -EBUSY; | ||
3676 | |||
3677 | if (conf->geo.far_copies > 1 && !conf->geo.far_offset) | ||
3464 | return -EINVAL; | 3678 | return -EINVAL; |
3465 | 3679 | ||
3466 | oldsize = raid10_size(mddev, 0, 0); | 3680 | oldsize = raid10_size(mddev, 0, 0); |
3467 | size = raid10_size(mddev, sectors, 0); | 3681 | size = raid10_size(mddev, sectors, 0); |
3468 | md_set_array_sectors(mddev, size); | 3682 | if (mddev->external_size && |
3469 | if (mddev->array_sectors > size) | 3683 | mddev->array_sectors > size) |
3470 | return -EINVAL; | 3684 | return -EINVAL; |
3685 | if (mddev->bitmap) { | ||
3686 | int ret = bitmap_resize(mddev->bitmap, size, 0, 0); | ||
3687 | if (ret) | ||
3688 | return ret; | ||
3689 | } | ||
3690 | md_set_array_sectors(mddev, size); | ||
3471 | set_capacity(mddev->gendisk, mddev->array_sectors); | 3691 | set_capacity(mddev->gendisk, mddev->array_sectors); |
3472 | revalidate_disk(mddev->gendisk); | 3692 | revalidate_disk(mddev->gendisk); |
3473 | if (sectors > mddev->dev_sectors && | 3693 | if (sectors > mddev->dev_sectors && |
@@ -3534,6 +3754,758 @@ static void *raid10_takeover(struct mddev *mddev) | |||
3534 | return ERR_PTR(-EINVAL); | 3754 | return ERR_PTR(-EINVAL); |
3535 | } | 3755 | } |
3536 | 3756 | ||
3757 | static int raid10_check_reshape(struct mddev *mddev) | ||
3758 | { | ||
3759 | /* Called when there is a request to change | ||
3760 | * - layout (to ->new_layout) | ||
3761 | * - chunk size (to ->new_chunk_sectors) | ||
3762 | * - raid_disks (by delta_disks) | ||
3763 | * or when trying to restart a reshape that was ongoing. | ||
3764 | * | ||
3765 | * We need to validate the request and possibly allocate | ||
3766 | * space if that might be an issue later. | ||
3767 | * | ||
3768 | * Currently we reject any reshape of a 'far' mode array, | ||
3769 | * allow chunk size to change if new is generally acceptable, | ||
3770 | * allow raid_disks to increase, and allow | ||
3771 | * a switch between 'near' mode and 'offset' mode. | ||
3772 | */ | ||
3773 | struct r10conf *conf = mddev->private; | ||
3774 | struct geom geo; | ||
3775 | |||
3776 | if (conf->geo.far_copies != 1 && !conf->geo.far_offset) | ||
3777 | return -EINVAL; | ||
3778 | |||
3779 | if (setup_geo(&geo, mddev, geo_start) != conf->copies) | ||
3780 | /* mustn't change number of copies */ | ||
3781 | return -EINVAL; | ||
3782 | if (geo.far_copies > 1 && !geo.far_offset) | ||
3783 | /* Cannot switch to 'far' mode */ | ||
3784 | return -EINVAL; | ||
3785 | |||
3786 | if (mddev->array_sectors & geo.chunk_mask) | ||
3787 | /* not factor of array size */ | ||
3788 | return -EINVAL; | ||
3789 | |||
3790 | if (!enough(conf, -1)) | ||
3791 | return -EINVAL; | ||
3792 | |||
3793 | kfree(conf->mirrors_new); | ||
3794 | conf->mirrors_new = NULL; | ||
3795 | if (mddev->delta_disks > 0) { | ||
3796 | /* allocate new 'mirrors' list */ | ||
3797 | conf->mirrors_new = kzalloc( | ||
3798 | sizeof(struct mirror_info) | ||
3799 | *(mddev->raid_disks + | ||
3800 | mddev->delta_disks), | ||
3801 | GFP_KERNEL); | ||
3802 | if (!conf->mirrors_new) | ||
3803 | return -ENOMEM; | ||
3804 | } | ||
3805 | return 0; | ||
3806 | } | ||
3807 | |||
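raid10_check_reshape() above encodes the policy spelled out in its comment: plain 'far' arrays cannot be reshaped, the total number of copies must stay the same, a switch to plain 'far' is rejected, and the new chunk size must divide the array size. A standalone restatement of those acceptance rules follows; the names are local assumptions rather than kernel symbols, and the device-availability check (enough()) is omitted.

#include <stdint.h>

struct g { int near, far, far_offset; uint64_t chunk_sectors; };

static int copies(const struct g *x) { return x->near * x->far; }

static int reshape_allowed(const struct g *cur, const struct g *want,
                           uint64_t array_sectors)
{
    if (cur->far > 1 && !cur->far_offset)
        return 0;   /* plain 'far' arrays cannot reshape */
    if (copies(want) != copies(cur))
        return 0;   /* number of copies must stay the same */
    if (want->far > 1 && !want->far_offset)
        return 0;   /* cannot switch to plain 'far' */
    if (array_sectors & (want->chunk_sectors - 1))
        return 0;   /* new chunk size must divide the array size */
    return 1;
}

int main(void)
{
    struct g near2   = { .near = 2, .far = 1, .far_offset = 0, .chunk_sectors = 1024 };
    struct g offset2 = { .near = 1, .far = 2, .far_offset = 1, .chunk_sectors = 1024 };
    /* near-2 -> offset-2 keeps two copies, so it passes these checks */
    return !reshape_allowed(&near2, &offset2, 1 << 20);
}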
3808 | /* | ||
3809 | * Need to check if array has failed when deciding whether to: | ||
3810 | * - start an array | ||
3811 | * - remove non-faulty devices | ||
3812 | * - add a spare | ||
3813 | * - allow a reshape | ||
3814 | * This determination is simple when no reshape is happening. | ||
3815 | * However if there is a reshape, we need to carefully check | ||
3816 | * both the before and after sections. | ||
3817 | * This is because some failed devices may only affect one | ||
3818 | * of the two sections, and some non-in_sync devices may | ||
3819 | * be insync in the section most affected by failed devices. | ||
3820 | */ | ||
3821 | static int calc_degraded(struct r10conf *conf) | ||
3822 | { | ||
3823 | int degraded, degraded2; | ||
3824 | int i; | ||
3825 | |||
3826 | rcu_read_lock(); | ||
3827 | degraded = 0; | ||
3828 | /* 'prev' section first */ | ||
3829 | for (i = 0; i < conf->prev.raid_disks; i++) { | ||
3830 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); | ||
3831 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
3832 | degraded++; | ||
3833 | else if (!test_bit(In_sync, &rdev->flags)) | ||
3834 | /* When we can reduce the number of devices in | ||
3835 | * an array, this might not contribute to | ||
3836 | * 'degraded'. It does now. | ||
3837 | */ | ||
3838 | degraded++; | ||
3839 | } | ||
3840 | rcu_read_unlock(); | ||
3841 | if (conf->geo.raid_disks == conf->prev.raid_disks) | ||
3842 | return degraded; | ||
3843 | rcu_read_lock(); | ||
3844 | degraded2 = 0; | ||
3845 | for (i = 0; i < conf->geo.raid_disks; i++) { | ||
3846 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); | ||
3847 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
3848 | degraded2++; | ||
3849 | else if (!test_bit(In_sync, &rdev->flags)) { | ||
3850 | /* If reshape is increasing the number of devices, | ||
3851 | * this section has already been recovered, so | ||
3852 | * it doesn't contribute to degraded. | ||
3853 | * else it does. | ||
3854 | */ | ||
3855 | if (conf->geo.raid_disks <= conf->prev.raid_disks) | ||
3856 | degraded2++; | ||
3857 | } | ||
3858 | } | ||
3859 | rcu_read_unlock(); | ||
3860 | if (degraded2 > degraded) | ||
3861 | return degraded2; | ||
3862 | return degraded; | ||
3863 | } | ||
3864 | |||
3865 | static int raid10_start_reshape(struct mddev *mddev) | ||
3866 | { | ||
3867 | /* A 'reshape' has been requested. This commits | ||
3868 | * the various 'new' fields and sets MD_RECOVERY_RESHAPE. | ||
3869 | * This also checks if there are enough spares and adds them | ||
3870 | * to the array. | ||
3871 | * We currently require enough spares to make the final | ||
3872 | * array non-degraded. We also require that the difference | ||
3873 | * between old and new data_offset - on each device - is | ||
3874 | * enough that we never risk over-writing. | ||
3875 | */ | ||
3876 | |||
3877 | unsigned long before_length, after_length; | ||
3878 | sector_t min_offset_diff = 0; | ||
3879 | int first = 1; | ||
3880 | struct geom new; | ||
3881 | struct r10conf *conf = mddev->private; | ||
3882 | struct md_rdev *rdev; | ||
3883 | int spares = 0; | ||
3884 | int ret; | ||
3885 | |||
3886 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | ||
3887 | return -EBUSY; | ||
3888 | |||
3889 | if (setup_geo(&new, mddev, geo_start) != conf->copies) | ||
3890 | return -EINVAL; | ||
3891 | |||
3892 | before_length = ((1 << conf->prev.chunk_shift) * | ||
3893 | conf->prev.far_copies); | ||
3894 | after_length = ((1 << conf->geo.chunk_shift) * | ||
3895 | conf->geo.far_copies); | ||
3896 | |||
3897 | rdev_for_each(rdev, mddev) { | ||
3898 | if (!test_bit(In_sync, &rdev->flags) | ||
3899 | && !test_bit(Faulty, &rdev->flags)) | ||
3900 | spares++; | ||
3901 | if (rdev->raid_disk >= 0) { | ||
3902 | long long diff = (rdev->new_data_offset | ||
3903 | - rdev->data_offset); | ||
3904 | if (!mddev->reshape_backwards) | ||
3905 | diff = -diff; | ||
3906 | if (diff < 0) | ||
3907 | diff = 0; | ||
3908 | if (first || diff < min_offset_diff) | ||
3909 | min_offset_diff = diff; | ||
3910 | } | ||
3911 | } | ||
3912 | |||
3913 | if (max(before_length, after_length) > min_offset_diff) | ||
3914 | return -EINVAL; | ||
3915 | |||
3916 | if (spares < mddev->delta_disks) | ||
3917 | return -EINVAL; | ||
3918 | |||
3919 | conf->offset_diff = min_offset_diff; | ||
3920 | spin_lock_irq(&conf->device_lock); | ||
3921 | if (conf->mirrors_new) { | ||
3922 | memcpy(conf->mirrors_new, conf->mirrors, | ||
3923 | sizeof(struct mirror_info)*conf->prev.raid_disks); | ||
3924 | smp_mb(); | ||
3925 | kfree(conf->mirrors_old); /* FIXME and elsewhere */ | ||
3926 | conf->mirrors_old = conf->mirrors; | ||
3927 | conf->mirrors = conf->mirrors_new; | ||
3928 | conf->mirrors_new = NULL; | ||
3929 | } | ||
3930 | setup_geo(&conf->geo, mddev, geo_start); | ||
3931 | smp_mb(); | ||
3932 | if (mddev->reshape_backwards) { | ||
3933 | sector_t size = raid10_size(mddev, 0, 0); | ||
3934 | if (size < mddev->array_sectors) { | ||
3935 | spin_unlock_irq(&conf->device_lock); | ||
3936 | printk(KERN_ERR "md/raid10:%s: array size must be reduced before number of disks\n", | ||
3937 | mdname(mddev)); | ||
3938 | return -EINVAL; | ||
3939 | } | ||
3940 | mddev->resync_max_sectors = size; | ||
3941 | conf->reshape_progress = size; | ||
3942 | } else | ||
3943 | conf->reshape_progress = 0; | ||
3944 | spin_unlock_irq(&conf->device_lock); | ||
3945 | |||
3946 | if (mddev->delta_disks && mddev->bitmap) { | ||
3947 | ret = bitmap_resize(mddev->bitmap, | ||
3948 | raid10_size(mddev, 0, | ||
3949 | conf->geo.raid_disks), | ||
3950 | 0, 0); | ||
3951 | if (ret) | ||
3952 | goto abort; | ||
3953 | } | ||
3954 | if (mddev->delta_disks > 0) { | ||
3955 | rdev_for_each(rdev, mddev) | ||
3956 | if (rdev->raid_disk < 0 && | ||
3957 | !test_bit(Faulty, &rdev->flags)) { | ||
3958 | if (raid10_add_disk(mddev, rdev) == 0) { | ||
3959 | if (rdev->raid_disk >= | ||
3960 | conf->prev.raid_disks) | ||
3961 | set_bit(In_sync, &rdev->flags); | ||
3962 | else | ||
3963 | rdev->recovery_offset = 0; | ||
3964 | |||
3965 | if (sysfs_link_rdev(mddev, rdev)) | ||
3966 | /* Failure here is OK */; | ||
3967 | } | ||
3968 | } else if (rdev->raid_disk >= conf->prev.raid_disks | ||
3969 | && !test_bit(Faulty, &rdev->flags)) { | ||
3970 | /* This is a spare that was manually added */ | ||
3971 | set_bit(In_sync, &rdev->flags); | ||
3972 | } | ||
3973 | } | ||
3974 | /* When a reshape changes the number of devices, | ||
3975 | * ->degraded is measured against the larger of the | ||
3976 | * pre and post numbers. | ||
3977 | */ | ||
3978 | spin_lock_irq(&conf->device_lock); | ||
3979 | mddev->degraded = calc_degraded(conf); | ||
3980 | spin_unlock_irq(&conf->device_lock); | ||
3981 | mddev->raid_disks = conf->geo.raid_disks; | ||
3982 | mddev->reshape_position = conf->reshape_progress; | ||
3983 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
3984 | |||
3985 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
3986 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | ||
3987 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | ||
3988 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
3989 | |||
3990 | mddev->sync_thread = md_register_thread(md_do_sync, mddev, | ||
3991 | "reshape"); | ||
3992 | if (!mddev->sync_thread) { | ||
3993 | ret = -EAGAIN; | ||
3994 | goto abort; | ||
3995 | } | ||
3996 | conf->reshape_checkpoint = jiffies; | ||
3997 | md_wakeup_thread(mddev->sync_thread); | ||
3998 | md_new_event(mddev); | ||
3999 | return 0; | ||
4000 | |||
4001 | abort: | ||
4002 | mddev->recovery = 0; | ||
4003 | spin_lock_irq(&conf->device_lock); | ||
4004 | conf->geo = conf->prev; | ||
4005 | mddev->raid_disks = conf->geo.raid_disks; | ||
4006 | rdev_for_each(rdev, mddev) | ||
4007 | rdev->new_data_offset = rdev->data_offset; | ||
4008 | smp_wmb(); | ||
4009 | conf->reshape_progress = MaxSector; | ||
4010 | mddev->reshape_position = MaxSector; | ||
4011 | spin_unlock_irq(&conf->device_lock); | ||
4012 | return ret; | ||
4013 | } | ||
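
The -EINVAL check on min_offset_diff above can be read as: every member must move its data_offset by at least one full chunk times far_copies, whichever of the old or new layout needs more. A small stand-alone sketch with invented numbers (512 KiB chunks and far_copies = 2 in both layouts, 2048 sectors of per-device slack), not values taken from the patch:

#include <stdio.h>

int main(void)
{
	/* 512 KiB chunks: chunk_shift = 10, i.e. 1024 sectors */
	unsigned long before_length = (1UL << 10) * 2;  /* old far set  */
	unsigned long after_length  = (1UL << 10) * 2;  /* new far set  */
	unsigned long min_offset_diff = 2048;           /* sectors, per member */

	/* Same test as raid10_start_reshape(): the reshape is refused
	 * unless the smallest per-device offset change covers the larger
	 * of the two lengths, so a chunk being rewritten never overlaps
	 * its source.
	 */
	if ((before_length > after_length ? before_length : after_length)
	    > min_offset_diff)
		printf("reshape rejected (-EINVAL)\n");
	else
		printf("reshape allowed: %lu sectors of slack per device\n",
		       min_offset_diff);
	return 0;
}
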
4014 | |||
4015 | /* Calculate the last device-address that could contain | ||
4016 | * any block from the chunk that includes the array-address 's' | ||
4017 | * and report the next address. | ||
4018 | * i.e. the address returned will be chunk-aligned and after | ||
4019 | * any data that is in the chunk containing 's'. | ||
4020 | */ | ||
4021 | static sector_t last_dev_address(sector_t s, struct geom *geo) | ||
4022 | { | ||
4023 | s = (s | geo->chunk_mask) + 1; | ||
4024 | s >>= geo->chunk_shift; | ||
4025 | s *= geo->near_copies; | ||
4026 | s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks); | ||
4027 | s *= geo->far_copies; | ||
4028 | s <<= geo->chunk_shift; | ||
4029 | return s; | ||
4030 | } | ||
4031 | |||
4032 | /* Calculate the first device-address that could contain | ||
4033 | * any block from the chunk that includes the array-address 's'. | ||
4034 | * This too will be the start of a chunk | ||
4035 | */ | ||
4036 | static sector_t first_dev_address(sector_t s, struct geom *geo) | ||
4037 | { | ||
4038 | s >>= geo->chunk_shift; | ||
4039 | s *= geo->near_copies; | ||
4040 | sector_div(s, geo->raid_disks); | ||
4041 | s *= geo->far_copies; | ||
4042 | s <<= geo->chunk_shift; | ||
4043 | return s; | ||
4044 | } | ||
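
The two address helpers are pure arithmetic, so they are easy to check in user space. The sketch below re-implements them with plain division in place of DIV_ROUND_UP_SECTOR_T() and sector_div(), using an invented geometry of 4 disks, near_copies = 2, far_copies = 1 and 64 KiB chunks (chunk_shift = 7):

#include <stdio.h>

typedef unsigned long long sector_t;

struct geom {
	int raid_disks;
	int near_copies;
	int far_copies;
	int chunk_shift;
	sector_t chunk_mask;
};

static sector_t last_dev_address(sector_t s, struct geom *geo)
{
	s = (s | geo->chunk_mask) + 1;          /* round up to next chunk   */
	s >>= geo->chunk_shift;                 /* array chunk number       */
	s *= geo->near_copies;                  /* device chunks consumed   */
	s = (s + geo->raid_disks - 1) / geo->raid_disks; /* per-device rows */
	s *= geo->far_copies;
	s <<= geo->chunk_shift;
	return s;
}

static sector_t first_dev_address(sector_t s, struct geom *geo)
{
	s >>= geo->chunk_shift;
	s *= geo->near_copies;
	s /= geo->raid_disks;                   /* round down               */
	s *= geo->far_copies;
	s <<= geo->chunk_shift;
	return s;
}

int main(void)
{
	/* invented geometry: 4 disks, near=2, far=1, 64 KiB chunks */
	struct geom g = { 4, 2, 1, 7, 127 };
	sector_t s = 1000;

	printf("array sector %llu -> device range [%llu, %llu)\n",
	       s, first_dev_address(s, &g), last_dev_address(s, &g));
	/* prints: array sector 1000 -> device range [384, 512) */
	return 0;
}
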
4045 | |||
4046 | static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, | ||
4047 | int *skipped) | ||
4048 | { | ||
4049 | /* We simply copy at most one chunk (smallest of old and new) | ||
4050 | * at a time, possibly less if that exceeds RESYNC_PAGES, | ||
4051 | * or we hit a bad block or something. | ||
4052 | * This might mean we pause for normal IO in the middle of | ||
4053 | * a chunk, but that is not a problem was mddev->reshape_position | ||
4054 | * can record any location. | ||
4055 | * | ||
4056 | * If we will want to write to a location that isn't | ||
4057 | * yet recorded as 'safe' (i.e. in metadata on disk) then | ||
4058 | * we need to flush all reshape requests and update the metadata. | ||
4059 | * | ||
4060 | * When reshaping forwards (e.g. to more devices), we interpret | ||
4061 | * 'safe' as the earliest block which might not have been copied | ||
4062 | * down yet. We divide this by previous stripe size and multiply | ||
4063 | * by previous stripe length to get lowest device offset that we | ||
4064 | * cannot write to yet. | ||
4065 | * We interpret 'sector_nr' as an address that we want to write to. | ||
4066 | * From this we use last_dev_address() to find where we might | ||
4067 | * write to, and first_dev_address() on the 'safe' position. | ||
4068 | * If this 'next' write position is after the 'safe' position, | ||
4069 | * we must update the metadata to increase the 'safe' position. | ||
4070 | * | ||
4071 | * When reshaping backwards, we round in the opposite direction | ||
4072 | * and perform the reverse test: next write position must not be | ||
4073 | * less than current safe position. | ||
4074 | * | ||
4075 | * In all this the minimum difference in data offsets | ||
4076 | * (conf->offset_diff - always positive) allows a bit of slack, | ||
4077 | * so next can be after 'safe', but not by more than offset_diff | ||
4078 | * | ||
4079 | * We need to prepare all the bios here before we start any IO | ||
4080 | * to ensure the size we choose is acceptable to all devices. | ||
4081 | * That means one for each copy for write-out and an extra one for | ||
4082 | * read-in. | ||
4083 | * We store the read-in bio in ->master_bio and the others in | ||
4084 | * ->devs[x].bio and ->devs[x].repl_bio. | ||
4085 | */ | ||
4086 | struct r10conf *conf = mddev->private; | ||
4087 | struct r10bio *r10_bio; | ||
4088 | sector_t next, safe, last; | ||
4089 | int max_sectors; | ||
4090 | int nr_sectors; | ||
4091 | int s; | ||
4092 | struct md_rdev *rdev; | ||
4093 | int need_flush = 0; | ||
4094 | struct bio *blist; | ||
4095 | struct bio *bio, *read_bio; | ||
4096 | int sectors_done = 0; | ||
4097 | |||
4098 | if (sector_nr == 0) { | ||
4099 | /* If restarting in the middle, skip the initial sectors */ | ||
4100 | if (mddev->reshape_backwards && | ||
4101 | conf->reshape_progress < raid10_size(mddev, 0, 0)) { | ||
4102 | sector_nr = (raid10_size(mddev, 0, 0) | ||
4103 | - conf->reshape_progress); | ||
4104 | } else if (!mddev->reshape_backwards && | ||
4105 | conf->reshape_progress > 0) | ||
4106 | sector_nr = conf->reshape_progress; | ||
4107 | if (sector_nr) { | ||
4108 | mddev->curr_resync_completed = sector_nr; | ||
4109 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | ||
4110 | *skipped = 1; | ||
4111 | return sector_nr; | ||
4112 | } | ||
4113 | } | ||
4114 | |||
4115 | /* We don't use sector_nr to track where we are up to | ||
4116 | * as that doesn't work well for ->reshape_backwards. | ||
4117 | * So just use ->reshape_progress. | ||
4118 | */ | ||
4119 | if (mddev->reshape_backwards) { | ||
4120 | /* 'next' is the earliest device address that we might | ||
4121 | * write to for this chunk in the new layout | ||
4122 | */ | ||
4123 | next = first_dev_address(conf->reshape_progress - 1, | ||
4124 | &conf->geo); | ||
4125 | |||
4126 | /* 'safe' is the last device address that we might read from | ||
4127 | * in the old layout after a restart | ||
4128 | */ | ||
4129 | safe = last_dev_address(conf->reshape_safe - 1, | ||
4130 | &conf->prev); | ||
4131 | |||
4132 | if (next + conf->offset_diff < safe) | ||
4133 | need_flush = 1; | ||
4134 | |||
4135 | last = conf->reshape_progress - 1; | ||
4136 | sector_nr = last & ~(sector_t)(conf->geo.chunk_mask | ||
4137 | & conf->prev.chunk_mask); | ||
4138 | if (sector_nr + RESYNC_BLOCK_SIZE/512 < last) | ||
4139 | sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512; | ||
4140 | } else { | ||
4141 | /* 'next' is after the last device address that we | ||
4142 | * might write to for this chunk in the new layout | ||
4143 | */ | ||
4144 | next = last_dev_address(conf->reshape_progress, &conf->geo); | ||
4145 | |||
4146 | /* 'safe' is the earliest device address that we might | ||
4147 | * read from in the old layout after a restart | ||
4148 | */ | ||
4149 | safe = first_dev_address(conf->reshape_safe, &conf->prev); | ||
4150 | |||
4151 | /* Need to update metadata if 'next' might be beyond 'safe' | ||
4152 | * as that would possibly corrupt data | ||
4153 | */ | ||
4154 | if (next > safe + conf->offset_diff) | ||
4155 | need_flush = 1; | ||
4156 | |||
4157 | sector_nr = conf->reshape_progress; | ||
4158 | last = sector_nr | (conf->geo.chunk_mask | ||
4159 | & conf->prev.chunk_mask); | ||
4160 | |||
4161 | if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last) | ||
4162 | last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1; | ||
4163 | } | ||
4164 | |||
4165 | if (need_flush || | ||
4166 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { | ||
4167 | /* Need to update reshape_position in metadata */ | ||
4168 | wait_barrier(conf); | ||
4169 | mddev->reshape_position = conf->reshape_progress; | ||
4170 | if (mddev->reshape_backwards) | ||
4171 | mddev->curr_resync_completed = raid10_size(mddev, 0, 0) | ||
4172 | - conf->reshape_progress; | ||
4173 | else | ||
4174 | mddev->curr_resync_completed = conf->reshape_progress; | ||
4175 | conf->reshape_checkpoint = jiffies; | ||
4176 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
4177 | md_wakeup_thread(mddev->thread); | ||
4178 | wait_event(mddev->sb_wait, mddev->flags == 0 || | ||
4179 | kthread_should_stop()); | ||
4180 | conf->reshape_safe = mddev->reshape_position; | ||
4181 | allow_barrier(conf); | ||
4182 | } | ||
4183 | |||
4184 | read_more: | ||
4185 | /* Now schedule reads for blocks from sector_nr to last */ | ||
4186 | r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); | ||
4187 | raise_barrier(conf, sectors_done != 0); | ||
4188 | atomic_set(&r10_bio->remaining, 0); | ||
4189 | r10_bio->mddev = mddev; | ||
4190 | r10_bio->sector = sector_nr; | ||
4191 | set_bit(R10BIO_IsReshape, &r10_bio->state); | ||
4192 | r10_bio->sectors = last - sector_nr + 1; | ||
4193 | rdev = read_balance(conf, r10_bio, &max_sectors); | ||
4194 | BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state)); | ||
4195 | |||
4196 | if (!rdev) { | ||
4197 | /* Cannot read from here, so need to record bad blocks | ||
4198 | * on all the target devices. | ||
4199 | */ | ||
4200 | // FIXME | ||
4201 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
4202 | return sectors_done; | ||
4203 | } | ||
4204 | |||
4205 | read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); | ||
4206 | |||
4207 | read_bio->bi_bdev = rdev->bdev; | ||
4208 | read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr | ||
4209 | + rdev->data_offset); | ||
4210 | read_bio->bi_private = r10_bio; | ||
4211 | read_bio->bi_end_io = end_sync_read; | ||
4212 | read_bio->bi_rw = READ; | ||
4213 | read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
4214 | read_bio->bi_flags |= 1 << BIO_UPTODATE; | ||
4215 | read_bio->bi_vcnt = 0; | ||
4216 | read_bio->bi_idx = 0; | ||
4217 | read_bio->bi_size = 0; | ||
4218 | r10_bio->master_bio = read_bio; | ||
4219 | r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; | ||
4220 | |||
4221 | /* Now find the locations in the new layout */ | ||
4222 | __raid10_find_phys(&conf->geo, r10_bio); | ||
4223 | |||
4224 | blist = read_bio; | ||
4225 | read_bio->bi_next = NULL; | ||
4226 | |||
4227 | for (s = 0; s < conf->copies*2; s++) { | ||
4228 | struct bio *b; | ||
4229 | int d = r10_bio->devs[s/2].devnum; | ||
4230 | struct md_rdev *rdev2; | ||
4231 | if (s&1) { | ||
4232 | rdev2 = conf->mirrors[d].replacement; | ||
4233 | b = r10_bio->devs[s/2].repl_bio; | ||
4234 | } else { | ||
4235 | rdev2 = conf->mirrors[d].rdev; | ||
4236 | b = r10_bio->devs[s/2].bio; | ||
4237 | } | ||
4238 | if (!rdev2 || test_bit(Faulty, &rdev2->flags)) | ||
4239 | continue; | ||
4240 | b->bi_bdev = rdev2->bdev; | ||
4241 | b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; | ||
4242 | b->bi_private = r10_bio; | ||
4243 | b->bi_end_io = end_reshape_write; | ||
4244 | b->bi_rw = WRITE; | ||
4245 | b->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
4246 | b->bi_flags |= 1 << BIO_UPTODATE; | ||
4247 | b->bi_next = blist; | ||
4248 | b->bi_vcnt = 0; | ||
4249 | b->bi_idx = 0; | ||
4250 | b->bi_size = 0; | ||
4251 | blist = b; | ||
4252 | } | ||
4253 | |||
4254 | /* Now add as many pages as possible to all of these bios. */ | ||
4255 | |||
4256 | nr_sectors = 0; | ||
4257 | for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { | ||
4258 | struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page; | ||
4259 | int len = (max_sectors - s) << 9; | ||
4260 | if (len > PAGE_SIZE) | ||
4261 | len = PAGE_SIZE; | ||
4262 | for (bio = blist; bio ; bio = bio->bi_next) { | ||
4263 | struct bio *bio2; | ||
4264 | if (bio_add_page(bio, page, len, 0)) | ||
4265 | continue; | ||
4266 | |||
4267 | /* Didn't fit, must stop */ | ||
4268 | for (bio2 = blist; | ||
4269 | bio2 && bio2 != bio; | ||
4270 | bio2 = bio2->bi_next) { | ||
4271 | /* Remove last page from this bio */ | ||
4272 | bio2->bi_vcnt--; | ||
4273 | bio2->bi_size -= len; | ||
4274 | bio2->bi_flags &= ~(1<<BIO_SEG_VALID); | ||
4275 | } | ||
4276 | goto bio_full; | ||
4277 | } | ||
4278 | sector_nr += len >> 9; | ||
4279 | nr_sectors += len >> 9; | ||
4280 | } | ||
4281 | bio_full: | ||
4282 | r10_bio->sectors = nr_sectors; | ||
4283 | |||
4284 | /* Now submit the read */ | ||
4285 | md_sync_acct(read_bio->bi_bdev, r10_bio->sectors); | ||
4286 | atomic_inc(&r10_bio->remaining); | ||
4287 | read_bio->bi_next = NULL; | ||
4288 | generic_make_request(read_bio); | ||
4289 | sector_nr += nr_sectors; | ||
4290 | sectors_done += nr_sectors; | ||
4291 | if (sector_nr <= last) | ||
4292 | goto read_more; | ||
4293 | |||
4294 | /* Now that we have done the whole section we can | ||
4295 | * update reshape_progress | ||
4296 | */ | ||
4297 | if (mddev->reshape_backwards) | ||
4298 | conf->reshape_progress -= sectors_done; | ||
4299 | else | ||
4300 | conf->reshape_progress += sectors_done; | ||
4301 | |||
4302 | return sectors_done; | ||
4303 | } | ||
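
For the forward direction, the window copied per call is simply "from reshape_progress to the end of the smaller of the old and new chunks, capped at one resync block". A stand-alone sketch with invented sizes (256-sector old chunks, 128-sector new chunks, a 64 KiB / 128-sector resync block and reshape_progress at sector 3000):

#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	sector_t prev_chunk_mask = 255;
	sector_t geo_chunk_mask  = 127;
	sector_t resync_sectors  = 128;
	sector_t reshape_progress = 3000;

	/* Forward reshape: copy from reshape_progress to the end of the
	 * smaller of the two chunks, but never more than one resync block.
	 */
	sector_t sector_nr = reshape_progress;
	sector_t last = sector_nr | (geo_chunk_mask & prev_chunk_mask);

	if (sector_nr + resync_sectors <= last)
		last = sector_nr + resync_sectors - 1;

	printf("copy window: [%llu, %llu] (%llu sectors)\n",
	       sector_nr, last, last - sector_nr + 1);
	/* prints: copy window: [3000, 3071] (72 sectors) */
	return 0;
}
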
4304 | |||
4305 | static void end_reshape_request(struct r10bio *r10_bio); | ||
4306 | static int handle_reshape_read_error(struct mddev *mddev, | ||
4307 | struct r10bio *r10_bio); | ||
4308 | static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) | ||
4309 | { | ||
4310 | /* Reshape read completed. Hopefully we have a block | ||
4311 | * to write out. | ||
4312 | * If we got a read error then we do sync 1-page reads from | ||
4313 | * elsewhere until we find the data - or give up. | ||
4314 | */ | ||
4315 | struct r10conf *conf = mddev->private; | ||
4316 | int s; | ||
4317 | |||
4318 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) | ||
4319 | if (handle_reshape_read_error(mddev, r10_bio) < 0) { | ||
4320 | /* Reshape has been aborted */ | ||
4321 | md_done_sync(mddev, r10_bio->sectors, 0); | ||
4322 | return; | ||
4323 | } | ||
4324 | |||
4325 | /* We definitely have the data in the pages, schedule the | ||
4326 | * writes. | ||
4327 | */ | ||
4328 | atomic_set(&r10_bio->remaining, 1); | ||
4329 | for (s = 0; s < conf->copies*2; s++) { | ||
4330 | struct bio *b; | ||
4331 | int d = r10_bio->devs[s/2].devnum; | ||
4332 | struct md_rdev *rdev; | ||
4333 | if (s&1) { | ||
4334 | rdev = conf->mirrors[d].replacement; | ||
4335 | b = r10_bio->devs[s/2].repl_bio; | ||
4336 | } else { | ||
4337 | rdev = conf->mirrors[d].rdev; | ||
4338 | b = r10_bio->devs[s/2].bio; | ||
4339 | } | ||
4340 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
4341 | continue; | ||
4342 | atomic_inc(&rdev->nr_pending); | ||
4343 | md_sync_acct(b->bi_bdev, r10_bio->sectors); | ||
4344 | atomic_inc(&r10_bio->remaining); | ||
4345 | b->bi_next = NULL; | ||
4346 | generic_make_request(b); | ||
4347 | } | ||
4348 | end_reshape_request(r10_bio); | ||
4349 | } | ||
4350 | |||
4351 | static void end_reshape(struct r10conf *conf) | ||
4352 | { | ||
4353 | if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) | ||
4354 | return; | ||
4355 | |||
4356 | spin_lock_irq(&conf->device_lock); | ||
4357 | conf->prev = conf->geo; | ||
4358 | md_finish_reshape(conf->mddev); | ||
4359 | smp_wmb(); | ||
4360 | conf->reshape_progress = MaxSector; | ||
4361 | spin_unlock_irq(&conf->device_lock); | ||
4362 | |||
4363 | /* read-ahead size must cover two whole stripes, which is | ||
4364 | * 2 * (datadisks) * chunksize, where 'datadisks' is raid_disks / near_copies | ||
4365 | */ | ||
4366 | if (conf->mddev->queue) { | ||
4367 | int stripe = conf->geo.raid_disks * | ||
4368 | ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE); | ||
4369 | stripe /= conf->geo.near_copies; | ||
4370 | if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | ||
4371 | conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | ||
4372 | } | ||
4373 | conf->fullsync = 0; | ||
4374 | } | ||
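
The read-ahead computation above works out, for example, as follows (invented values: 6 devices, near_copies = 2, 512 KiB chunks, 4 KiB pages -- not taken from the patch):

#include <stdio.h>

int main(void)
{
	int raid_disks = 6, near_copies = 2;
	unsigned int chunk_sectors = 1024;          /* 512 KiB */
	unsigned int page_size = 4096;

	int stripe = raid_disks * ((chunk_sectors << 9) / page_size);
	stripe /= near_copies;

	/* end_reshape() only ever raises ra_pages, never lowers it */
	unsigned long ra_pages = 256;
	if (ra_pages < 2UL * stripe)
		ra_pages = 2UL * stripe;

	printf("stripe = %d pages, ra_pages = %lu (%lu KiB)\n",
	       stripe, ra_pages, ra_pages * page_size / 1024);
	/* prints: stripe = 384 pages, ra_pages = 768 (3072 KiB) */
	return 0;
}
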
4375 | |||
4376 | |||
4377 | static int handle_reshape_read_error(struct mddev *mddev, | ||
4378 | struct r10bio *r10_bio) | ||
4379 | { | ||
4380 | /* Use sync reads to get the blocks from somewhere else */ | ||
4381 | int sectors = r10_bio->sectors; | ||
4382 | struct r10bio r10b; | ||
4383 | struct r10conf *conf = mddev->private; | ||
4384 | int slot = 0; | ||
4385 | int idx = 0; | ||
4386 | struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec; | ||
4387 | |||
4388 | r10b.sector = r10_bio->sector; | ||
4389 | __raid10_find_phys(&conf->prev, &r10b); | ||
4390 | |||
4391 | while (sectors) { | ||
4392 | int s = sectors; | ||
4393 | int success = 0; | ||
4394 | int first_slot = slot; | ||
4395 | |||
4396 | if (s > (PAGE_SIZE >> 9)) | ||
4397 | s = PAGE_SIZE >> 9; | ||
4398 | |||
4399 | while (!success) { | ||
4400 | int d = r10b.devs[slot].devnum; | ||
4401 | struct md_rdev *rdev = conf->mirrors[d].rdev; | ||
4402 | sector_t addr; | ||
4403 | if (rdev == NULL || | ||
4404 | test_bit(Faulty, &rdev->flags) || | ||
4405 | !test_bit(In_sync, &rdev->flags)) | ||
4406 | goto failed; | ||
4407 | |||
4408 | addr = r10b.devs[slot].addr + idx * PAGE_SIZE; | ||
4409 | success = sync_page_io(rdev, | ||
4410 | addr, | ||
4411 | s << 9, | ||
4412 | bvec[idx].bv_page, | ||
4413 | READ, false); | ||
4414 | if (success) | ||
4415 | break; | ||
4416 | failed: | ||
4417 | slot++; | ||
4418 | if (slot >= conf->copies) | ||
4419 | slot = 0; | ||
4420 | if (slot == first_slot) | ||
4421 | break; | ||
4422 | } | ||
4423 | if (!success) { | ||
4424 | /* couldn't read this block, must give up */ | ||
4425 | set_bit(MD_RECOVERY_INTR, | ||
4426 | &mddev->recovery); | ||
4427 | return -EIO; | ||
4428 | } | ||
4429 | sectors -= s; | ||
4430 | idx++; | ||
4431 | } | ||
4432 | return 0; | ||
4433 | } | ||
4434 | |||
4435 | static void end_reshape_write(struct bio *bio, int error) | ||
4436 | { | ||
4437 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
4438 | struct r10bio *r10_bio = bio->bi_private; | ||
4439 | struct mddev *mddev = r10_bio->mddev; | ||
4440 | struct r10conf *conf = mddev->private; | ||
4441 | int d; | ||
4442 | int slot; | ||
4443 | int repl; | ||
4444 | struct md_rdev *rdev = NULL; | ||
4445 | |||
4446 | d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); | ||
4447 | if (repl) | ||
4448 | rdev = conf->mirrors[d].replacement; | ||
4449 | if (!rdev) { | ||
4450 | smp_mb(); | ||
4451 | rdev = conf->mirrors[d].rdev; | ||
4452 | } | ||
4453 | |||
4454 | if (!uptodate) { | ||
4455 | /* FIXME should record badblock */ | ||
4456 | md_error(mddev, rdev); | ||
4457 | } | ||
4458 | |||
4459 | rdev_dec_pending(rdev, mddev); | ||
4460 | end_reshape_request(r10_bio); | ||
4461 | } | ||
4462 | |||
4463 | static void end_reshape_request(struct r10bio *r10_bio) | ||
4464 | { | ||
4465 | if (!atomic_dec_and_test(&r10_bio->remaining)) | ||
4466 | return; | ||
4467 | md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); | ||
4468 | bio_put(r10_bio->master_bio); | ||
4469 | put_buf(r10_bio); | ||
4470 | } | ||
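
The ->remaining handling in reshape_request_write() and end_reshape_request() is the usual "bias plus per-request reference" pattern: the submitter holds one reference while it queues writes, each submitted write holds one, and whoever drops the count to zero finishes the r10_bio exactly once. A minimal sequential model (plain int instead of atomic_t, no kernel types):

#include <stdio.h>

static int remaining;
static int completions;

static void end_reshape_request_model(void)
{
	if (--remaining)        /* models atomic_dec_and_test() */
		return;
	completions++;          /* models md_done_sync() + put_buf() */
}

int main(void)
{
	int writes = 3, i;

	remaining = 1;                       /* submitter's reference        */
	for (i = 0; i < writes; i++)
		remaining++;                 /* one per generic_make_request */

	end_reshape_request_model();         /* submitter drops its ref      */
	for (i = 0; i < writes; i++)
		end_reshape_request_model(); /* each write completes         */

	printf("completions = %d\n", completions);   /* prints 1 */
	return 0;
}
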
4471 | |||
4472 | static void raid10_finish_reshape(struct mddev *mddev) | ||
4473 | { | ||
4474 | struct r10conf *conf = mddev->private; | ||
4475 | |||
4476 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
4477 | return; | ||
4478 | |||
4479 | if (mddev->delta_disks > 0) { | ||
4480 | sector_t size = raid10_size(mddev, 0, 0); | ||
4481 | md_set_array_sectors(mddev, size); | ||
4482 | if (mddev->recovery_cp > mddev->resync_max_sectors) { | ||
4483 | mddev->recovery_cp = mddev->resync_max_sectors; | ||
4484 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
4485 | } | ||
4486 | mddev->resync_max_sectors = size; | ||
4487 | set_capacity(mddev->gendisk, mddev->array_sectors); | ||
4488 | revalidate_disk(mddev->gendisk); | ||
4489 | } else { | ||
4490 | int d; | ||
4491 | for (d = conf->geo.raid_disks ; | ||
4492 | d < conf->geo.raid_disks - mddev->delta_disks; | ||
4493 | d++) { | ||
4494 | struct md_rdev *rdev = conf->mirrors[d].rdev; | ||
4495 | if (rdev) | ||
4496 | clear_bit(In_sync, &rdev->flags); | ||
4497 | rdev = conf->mirrors[d].replacement; | ||
4498 | if (rdev) | ||
4499 | clear_bit(In_sync, &rdev->flags); | ||
4500 | } | ||
4501 | } | ||
4502 | mddev->layout = mddev->new_layout; | ||
4503 | mddev->chunk_sectors = 1 << conf->geo.chunk_shift; | ||
4504 | mddev->reshape_position = MaxSector; | ||
4505 | mddev->delta_disks = 0; | ||
4506 | mddev->reshape_backwards = 0; | ||
4507 | } | ||
4508 | |||
3537 | static struct md_personality raid10_personality = | 4509 | static struct md_personality raid10_personality = |
3538 | { | 4510 | { |
3539 | .name = "raid10", | 4511 | .name = "raid10", |
@@ -3552,6 +4524,9 @@ static struct md_personality raid10_personality = | |||
3552 | .size = raid10_size, | 4524 | .size = raid10_size, |
3553 | .resize = raid10_resize, | 4525 | .resize = raid10_resize, |
3554 | .takeover = raid10_takeover, | 4526 | .takeover = raid10_takeover, |
4527 | .check_reshape = raid10_check_reshape, | ||
4528 | .start_reshape = raid10_start_reshape, | ||
4529 | .finish_reshape = raid10_finish_reshape, | ||
3555 | }; | 4530 | }; |
3556 | 4531 | ||
3557 | static int __init raid_init(void) | 4532 | static int __init raid_init(void) |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 7c615613c381..135b1b0a1554 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
@@ -14,32 +14,38 @@ struct mirror_info { | |||
14 | struct r10conf { | 14 | struct r10conf { |
15 | struct mddev *mddev; | 15 | struct mddev *mddev; |
16 | struct mirror_info *mirrors; | 16 | struct mirror_info *mirrors; |
17 | int raid_disks; | 17 | struct mirror_info *mirrors_new, *mirrors_old; |
18 | spinlock_t device_lock; | 18 | spinlock_t device_lock; |
19 | 19 | ||
20 | /* geometry */ | 20 | /* geometry */ |
21 | int near_copies; /* number of copies laid out | 21 | struct geom { |
22 | int raid_disks; | ||
23 | int near_copies; /* number of copies laid out | ||
22 | * raid0 style */ | 24 | * raid0 style */ |
23 | int far_copies; /* number of copies laid out | 25 | int far_copies; /* number of copies laid out |
24 | * at large strides across drives | 26 | * at large strides across drives |
25 | */ | 27 | */ |
26 | int far_offset; /* far_copies are offset by 1 | 28 | int far_offset; /* far_copies are offset by 1 |
27 | * stripe instead of many | 29 | * stripe instead of many |
28 | */ | 30 | */ |
29 | int copies; /* near_copies * far_copies. | 31 | sector_t stride; /* distance between far copies. |
30 | * must be <= raid_disks | ||
31 | */ | ||
32 | sector_t stride; /* distance between far copies. | ||
33 | * This is size / far_copies unless | 32 | * This is size / far_copies unless |
34 | * far_offset, in which case it is | 33 | * far_offset, in which case it is |
35 | * 1 stripe. | 34 | * 1 stripe. |
36 | */ | 35 | */ |
36 | int chunk_shift; /* shift from chunks to sectors */ | ||
37 | sector_t chunk_mask; | ||
38 | } prev, geo; | ||
39 | int copies; /* near_copies * far_copies. | ||
40 | * must be <= raid_disks | ||
41 | */ | ||
37 | 42 | ||
38 | sector_t dev_sectors; /* temp copy of | 43 | sector_t dev_sectors; /* temp copy of |
39 | * mddev->dev_sectors */ | 44 | * mddev->dev_sectors */ |
40 | 45 | sector_t reshape_progress; | |
41 | int chunk_shift; /* shift from chunks to sectors */ | 46 | sector_t reshape_safe; |
42 | sector_t chunk_mask; | 47 | unsigned long reshape_checkpoint; |
48 | sector_t offset_diff; | ||
43 | 49 | ||
44 | struct list_head retry_list; | 50 | struct list_head retry_list; |
45 | /* queue pending writes and submit them on unplug */ | 51 | /* queue pending writes and submit them on unplug */ |
@@ -136,6 +142,7 @@ enum r10bio_state { | |||
136 | R10BIO_Uptodate, | 142 | R10BIO_Uptodate, |
137 | R10BIO_IsSync, | 143 | R10BIO_IsSync, |
138 | R10BIO_IsRecover, | 144 | R10BIO_IsRecover, |
145 | R10BIO_IsReshape, | ||
139 | R10BIO_Degraded, | 146 | R10BIO_Degraded, |
140 | /* Set ReadError on bios that experience a read error | 147 | /* Set ReadError on bios that experience a read error |
141 | * so that raid10d knows what to do with them. | 148 | * so that raid10d knows what to do with them. |
@@ -146,5 +153,10 @@ enum r10bio_state { | |||
146 | */ | 153 | */ |
147 | R10BIO_MadeGood, | 154 | R10BIO_MadeGood, |
148 | R10BIO_WriteError, | 155 | R10BIO_WriteError, |
156 | /* During a reshape we might be performing IO on the | ||
157 | * 'previous' part of the array, in which case this | ||
158 | * flag is set | ||
159 | */ | ||
160 | R10BIO_Previous, | ||
149 | }; | 161 | }; |
150 | #endif | 162 | #endif |
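
The prev/geo pair introduced in the header above is meant to be selected per request: IO aimed at the part of the array that has not been reshaped yet is flagged R10BIO_Previous and mapped with the old geometry, everything else with the new one. The sketch below models that selection with cut-down stand-in types; the helper itself is hypothetical and not part of this patch.

#include <stdio.h>

struct geom { int raid_disks; };

struct r10conf_model {
	struct geom prev;   /* layout before the reshape      */
	struct geom geo;    /* layout being reshaped towards  */
};

struct r10bio_model {
	int previous;       /* models test_bit(R10BIO_Previous, ...) */
};

static struct geom *choose_geometry(struct r10conf_model *conf,
				    struct r10bio_model *r10_bio)
{
	/* IO aimed at the not-yet-reshaped region must be mapped with the
	 * old geometry; everything else uses the new one.
	 */
	return r10_bio->previous ? &conf->prev : &conf->geo;
}

int main(void)
{
	struct r10conf_model conf = { { 4 }, { 6 } };
	struct r10bio_model old_region = { 1 }, new_region = { 0 };

	printf("old region mapped over %d disks, new region over %d\n",
	       choose_geometry(&conf, &old_region)->raid_disks,
	       choose_geometry(&conf, &new_region)->raid_disks);
	return 0;
}
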
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f351422938e0..d26767246d26 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -488,6 +488,27 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
488 | return sh; | 488 | return sh; |
489 | } | 489 | } |
490 | 490 | ||
491 | /* Determine if 'data_offset' or 'new_data_offset' should be used | ||
492 | * in this stripe_head. | ||
493 | */ | ||
494 | static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) | ||
495 | { | ||
496 | sector_t progress = conf->reshape_progress; | ||
497 | /* Need a memory barrier to make sure we see the value | ||
498 | * of conf->generation, or ->data_offset that was set before | ||
499 | * reshape_progress was updated. | ||
500 | */ | ||
501 | smp_rmb(); | ||
502 | if (progress == MaxSector) | ||
503 | return 0; | ||
504 | if (sh->generation == conf->generation - 1) | ||
505 | return 0; | ||
506 | /* We are in a reshape, and this is a new-generation stripe, | ||
507 | * so use new_data_offset. | ||
508 | */ | ||
509 | return 1; | ||
510 | } | ||
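
The rule encoded by use_new_offset() can be exercised on its own: no reshape running means old offsets, a stripe whose generation is one behind the current one pre-dates the reshape and also keeps the old offset, and anything newer uses new_data_offset. A stand-alone model with plain parameters in place of r5conf and stripe_head:

#include <stdio.h>

typedef unsigned long long sector_t;
#define MaxSector (~(sector_t)0)

static int use_new_offset_model(sector_t reshape_progress,
				int conf_generation, int sh_generation)
{
	if (reshape_progress == MaxSector)
		return 0;              /* no reshape running at all     */
	if (sh_generation == conf_generation - 1)
		return 0;              /* stripe pre-dates this reshape */
	return 1;                      /* new-generation stripe         */
}

int main(void)
{
	printf("%d\n", use_new_offset_model(MaxSector, 5, 5)); /* 0 */
	printf("%d\n", use_new_offset_model(4096, 5, 4));      /* 0 */
	printf("%d\n", use_new_offset_model(4096, 5, 5));      /* 1 */
	return 0;
}
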
511 | |||
491 | static void | 512 | static void |
492 | raid5_end_read_request(struct bio *bi, int error); | 513 | raid5_end_read_request(struct bio *bi, int error); |
493 | static void | 514 | static void |
@@ -518,6 +539,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
518 | replace_only = 1; | 539 | replace_only = 1; |
519 | } else | 540 | } else |
520 | continue; | 541 | continue; |
542 | if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) | ||
543 | rw |= REQ_SYNC; | ||
521 | 544 | ||
522 | bi = &sh->dev[i].req; | 545 | bi = &sh->dev[i].req; |
523 | rbi = &sh->dev[i].rreq; /* For writing to replacement */ | 546 | rbi = &sh->dev[i].rreq; /* For writing to replacement */ |
@@ -603,7 +626,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
603 | __func__, (unsigned long long)sh->sector, | 626 | __func__, (unsigned long long)sh->sector, |
604 | bi->bi_rw, i); | 627 | bi->bi_rw, i); |
605 | atomic_inc(&sh->count); | 628 | atomic_inc(&sh->count); |
606 | bi->bi_sector = sh->sector + rdev->data_offset; | 629 | if (use_new_offset(conf, sh)) |
630 | bi->bi_sector = (sh->sector | ||
631 | + rdev->new_data_offset); | ||
632 | else | ||
633 | bi->bi_sector = (sh->sector | ||
634 | + rdev->data_offset); | ||
607 | bi->bi_flags = 1 << BIO_UPTODATE; | 635 | bi->bi_flags = 1 << BIO_UPTODATE; |
608 | bi->bi_idx = 0; | 636 | bi->bi_idx = 0; |
609 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 637 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
@@ -627,7 +655,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
627 | __func__, (unsigned long long)sh->sector, | 655 | __func__, (unsigned long long)sh->sector, |
628 | rbi->bi_rw, i); | 656 | rbi->bi_rw, i); |
629 | atomic_inc(&sh->count); | 657 | atomic_inc(&sh->count); |
630 | rbi->bi_sector = sh->sector + rrdev->data_offset; | 658 | if (use_new_offset(conf, sh)) |
659 | rbi->bi_sector = (sh->sector | ||
660 | + rrdev->new_data_offset); | ||
661 | else | ||
662 | rbi->bi_sector = (sh->sector | ||
663 | + rrdev->data_offset); | ||
631 | rbi->bi_flags = 1 << BIO_UPTODATE; | 664 | rbi->bi_flags = 1 << BIO_UPTODATE; |
632 | rbi->bi_idx = 0; | 665 | rbi->bi_idx = 0; |
633 | rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 666 | rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
@@ -1114,6 +1147,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
1114 | dev->sector + STRIPE_SECTORS) { | 1147 | dev->sector + STRIPE_SECTORS) { |
1115 | if (wbi->bi_rw & REQ_FUA) | 1148 | if (wbi->bi_rw & REQ_FUA) |
1116 | set_bit(R5_WantFUA, &dev->flags); | 1149 | set_bit(R5_WantFUA, &dev->flags); |
1150 | if (wbi->bi_rw & REQ_SYNC) | ||
1151 | set_bit(R5_SyncIO, &dev->flags); | ||
1117 | tx = async_copy_data(1, wbi, dev->page, | 1152 | tx = async_copy_data(1, wbi, dev->page, |
1118 | dev->sector, tx); | 1153 | dev->sector, tx); |
1119 | wbi = r5_next_bio(wbi, dev->sector); | 1154 | wbi = r5_next_bio(wbi, dev->sector); |
@@ -1131,13 +1166,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref) | |||
1131 | int pd_idx = sh->pd_idx; | 1166 | int pd_idx = sh->pd_idx; |
1132 | int qd_idx = sh->qd_idx; | 1167 | int qd_idx = sh->qd_idx; |
1133 | int i; | 1168 | int i; |
1134 | bool fua = false; | 1169 | bool fua = false, sync = false; |
1135 | 1170 | ||
1136 | pr_debug("%s: stripe %llu\n", __func__, | 1171 | pr_debug("%s: stripe %llu\n", __func__, |
1137 | (unsigned long long)sh->sector); | 1172 | (unsigned long long)sh->sector); |
1138 | 1173 | ||
1139 | for (i = disks; i--; ) | 1174 | for (i = disks; i--; ) { |
1140 | fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); | 1175 | fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); |
1176 | sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); | ||
1177 | } | ||
1141 | 1178 | ||
1142 | for (i = disks; i--; ) { | 1179 | for (i = disks; i--; ) { |
1143 | struct r5dev *dev = &sh->dev[i]; | 1180 | struct r5dev *dev = &sh->dev[i]; |
@@ -1146,6 +1183,8 @@ static void ops_complete_reconstruct(void *stripe_head_ref) | |||
1146 | set_bit(R5_UPTODATE, &dev->flags); | 1183 | set_bit(R5_UPTODATE, &dev->flags); |
1147 | if (fua) | 1184 | if (fua) |
1148 | set_bit(R5_WantFUA, &dev->flags); | 1185 | set_bit(R5_WantFUA, &dev->flags); |
1186 | if (sync) | ||
1187 | set_bit(R5_SyncIO, &dev->flags); | ||
1149 | } | 1188 | } |
1150 | } | 1189 | } |
1151 | 1190 | ||
@@ -1648,7 +1687,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1648 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | 1687 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); |
1649 | char b[BDEVNAME_SIZE]; | 1688 | char b[BDEVNAME_SIZE]; |
1650 | struct md_rdev *rdev = NULL; | 1689 | struct md_rdev *rdev = NULL; |
1651 | 1690 | sector_t s; | |
1652 | 1691 | ||
1653 | for (i=0 ; i<disks; i++) | 1692 | for (i=0 ; i<disks; i++) |
1654 | if (bi == &sh->dev[i].req) | 1693 | if (bi == &sh->dev[i].req) |
@@ -1671,6 +1710,10 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1671 | if (!rdev) | 1710 | if (!rdev) |
1672 | rdev = conf->disks[i].rdev; | 1711 | rdev = conf->disks[i].rdev; |
1673 | 1712 | ||
1713 | if (use_new_offset(conf, sh)) | ||
1714 | s = sh->sector + rdev->new_data_offset; | ||
1715 | else | ||
1716 | s = sh->sector + rdev->data_offset; | ||
1674 | if (uptodate) { | 1717 | if (uptodate) { |
1675 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | 1718 | set_bit(R5_UPTODATE, &sh->dev[i].flags); |
1676 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | 1719 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { |
@@ -1683,8 +1726,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1683 | "md/raid:%s: read error corrected" | 1726 | "md/raid:%s: read error corrected" |
1684 | " (%lu sectors at %llu on %s)\n", | 1727 | " (%lu sectors at %llu on %s)\n", |
1685 | mdname(conf->mddev), STRIPE_SECTORS, | 1728 | mdname(conf->mddev), STRIPE_SECTORS, |
1686 | (unsigned long long)(sh->sector | 1729 | (unsigned long long)s, |
1687 | + rdev->data_offset), | ||
1688 | bdevname(rdev->bdev, b)); | 1730 | bdevname(rdev->bdev, b)); |
1689 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); | 1731 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); |
1690 | clear_bit(R5_ReadError, &sh->dev[i].flags); | 1732 | clear_bit(R5_ReadError, &sh->dev[i].flags); |
@@ -1704,8 +1746,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1704 | "md/raid:%s: read error on replacement device " | 1746 | "md/raid:%s: read error on replacement device " |
1705 | "(sector %llu on %s).\n", | 1747 | "(sector %llu on %s).\n", |
1706 | mdname(conf->mddev), | 1748 | mdname(conf->mddev), |
1707 | (unsigned long long)(sh->sector | 1749 | (unsigned long long)s, |
1708 | + rdev->data_offset), | ||
1709 | bdn); | 1750 | bdn); |
1710 | else if (conf->mddev->degraded >= conf->max_degraded) | 1751 | else if (conf->mddev->degraded >= conf->max_degraded) |
1711 | printk_ratelimited( | 1752 | printk_ratelimited( |
@@ -1713,8 +1754,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1713 | "md/raid:%s: read error not correctable " | 1754 | "md/raid:%s: read error not correctable " |
1714 | "(sector %llu on %s).\n", | 1755 | "(sector %llu on %s).\n", |
1715 | mdname(conf->mddev), | 1756 | mdname(conf->mddev), |
1716 | (unsigned long long)(sh->sector | 1757 | (unsigned long long)s, |
1717 | + rdev->data_offset), | ||
1718 | bdn); | 1758 | bdn); |
1719 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) | 1759 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) |
1720 | /* Oh, no!!! */ | 1760 | /* Oh, no!!! */ |
@@ -1723,8 +1763,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1723 | "md/raid:%s: read error NOT corrected!! " | 1763 | "md/raid:%s: read error NOT corrected!! " |
1724 | "(sector %llu on %s).\n", | 1764 | "(sector %llu on %s).\n", |
1725 | mdname(conf->mddev), | 1765 | mdname(conf->mddev), |
1726 | (unsigned long long)(sh->sector | 1766 | (unsigned long long)s, |
1727 | + rdev->data_offset), | ||
1728 | bdn); | 1767 | bdn); |
1729 | else if (atomic_read(&rdev->read_errors) | 1768 | else if (atomic_read(&rdev->read_errors) |
1730 | > conf->max_nr_stripes) | 1769 | > conf->max_nr_stripes) |
@@ -3561,7 +3600,7 @@ finish: | |||
3561 | if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { | 3600 | if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { |
3562 | rdev = conf->disks[i].rdev; | 3601 | rdev = conf->disks[i].rdev; |
3563 | rdev_clear_badblocks(rdev, sh->sector, | 3602 | rdev_clear_badblocks(rdev, sh->sector, |
3564 | STRIPE_SECTORS); | 3603 | STRIPE_SECTORS, 0); |
3565 | rdev_dec_pending(rdev, conf->mddev); | 3604 | rdev_dec_pending(rdev, conf->mddev); |
3566 | } | 3605 | } |
3567 | if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { | 3606 | if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { |
@@ -3570,7 +3609,7 @@ finish: | |||
3570 | /* rdev have been moved down */ | 3609 | /* rdev have been moved down */ |
3571 | rdev = conf->disks[i].rdev; | 3610 | rdev = conf->disks[i].rdev; |
3572 | rdev_clear_badblocks(rdev, sh->sector, | 3611 | rdev_clear_badblocks(rdev, sh->sector, |
3573 | STRIPE_SECTORS); | 3612 | STRIPE_SECTORS, 0); |
3574 | rdev_dec_pending(rdev, conf->mddev); | 3613 | rdev_dec_pending(rdev, conf->mddev); |
3575 | } | 3614 | } |
3576 | } | 3615 | } |
@@ -3842,6 +3881,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) | |||
3842 | raid_bio->bi_next = (void*)rdev; | 3881 | raid_bio->bi_next = (void*)rdev; |
3843 | align_bi->bi_bdev = rdev->bdev; | 3882 | align_bi->bi_bdev = rdev->bdev; |
3844 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); | 3883 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); |
3884 | /* No reshape active, so we can trust rdev->data_offset */ | ||
3845 | align_bi->bi_sector += rdev->data_offset; | 3885 | align_bi->bi_sector += rdev->data_offset; |
3846 | 3886 | ||
3847 | if (!bio_fits_rdev(align_bi) || | 3887 | if (!bio_fits_rdev(align_bi) || |
@@ -3953,12 +3993,10 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
3953 | plugged = mddev_check_plugged(mddev); | 3993 | plugged = mddev_check_plugged(mddev); |
3954 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { | 3994 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { |
3955 | DEFINE_WAIT(w); | 3995 | DEFINE_WAIT(w); |
3956 | int disks, data_disks; | ||
3957 | int previous; | 3996 | int previous; |
3958 | 3997 | ||
3959 | retry: | 3998 | retry: |
3960 | previous = 0; | 3999 | previous = 0; |
3961 | disks = conf->raid_disks; | ||
3962 | prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); | 4000 | prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); |
3963 | if (unlikely(conf->reshape_progress != MaxSector)) { | 4001 | if (unlikely(conf->reshape_progress != MaxSector)) { |
3964 | /* spinlock is needed as reshape_progress may be | 4002 | /* spinlock is needed as reshape_progress may be |
@@ -3970,13 +4008,12 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
3970 | * to check again. | 4008 | * to check again. |
3971 | */ | 4009 | */ |
3972 | spin_lock_irq(&conf->device_lock); | 4010 | spin_lock_irq(&conf->device_lock); |
3973 | if (mddev->delta_disks < 0 | 4011 | if (mddev->reshape_backwards |
3974 | ? logical_sector < conf->reshape_progress | 4012 | ? logical_sector < conf->reshape_progress |
3975 | : logical_sector >= conf->reshape_progress) { | 4013 | : logical_sector >= conf->reshape_progress) { |
3976 | disks = conf->previous_raid_disks; | ||
3977 | previous = 1; | 4014 | previous = 1; |
3978 | } else { | 4015 | } else { |
3979 | if (mddev->delta_disks < 0 | 4016 | if (mddev->reshape_backwards |
3980 | ? logical_sector < conf->reshape_safe | 4017 | ? logical_sector < conf->reshape_safe |
3981 | : logical_sector >= conf->reshape_safe) { | 4018 | : logical_sector >= conf->reshape_safe) { |
3982 | spin_unlock_irq(&conf->device_lock); | 4019 | spin_unlock_irq(&conf->device_lock); |
@@ -3986,7 +4023,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
3986 | } | 4023 | } |
3987 | spin_unlock_irq(&conf->device_lock); | 4024 | spin_unlock_irq(&conf->device_lock); |
3988 | } | 4025 | } |
3989 | data_disks = disks - conf->max_degraded; | ||
3990 | 4026 | ||
3991 | new_sector = raid5_compute_sector(conf, logical_sector, | 4027 | new_sector = raid5_compute_sector(conf, logical_sector, |
3992 | previous, | 4028 | previous, |
@@ -4009,7 +4045,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
4009 | */ | 4045 | */ |
4010 | int must_retry = 0; | 4046 | int must_retry = 0; |
4011 | spin_lock_irq(&conf->device_lock); | 4047 | spin_lock_irq(&conf->device_lock); |
4012 | if (mddev->delta_disks < 0 | 4048 | if (mddev->reshape_backwards |
4013 | ? logical_sector >= conf->reshape_progress | 4049 | ? logical_sector >= conf->reshape_progress |
4014 | : logical_sector < conf->reshape_progress) | 4050 | : logical_sector < conf->reshape_progress) |
4015 | /* mismatch, need to try again */ | 4051 | /* mismatch, need to try again */ |
@@ -4108,11 +4144,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4108 | 4144 | ||
4109 | if (sector_nr == 0) { | 4145 | if (sector_nr == 0) { |
4110 | /* If restarting in the middle, skip the initial sectors */ | 4146 | /* If restarting in the middle, skip the initial sectors */ |
4111 | if (mddev->delta_disks < 0 && | 4147 | if (mddev->reshape_backwards && |
4112 | conf->reshape_progress < raid5_size(mddev, 0, 0)) { | 4148 | conf->reshape_progress < raid5_size(mddev, 0, 0)) { |
4113 | sector_nr = raid5_size(mddev, 0, 0) | 4149 | sector_nr = raid5_size(mddev, 0, 0) |
4114 | - conf->reshape_progress; | 4150 | - conf->reshape_progress; |
4115 | } else if (mddev->delta_disks >= 0 && | 4151 | } else if (!mddev->reshape_backwards && |
4116 | conf->reshape_progress > 0) | 4152 | conf->reshape_progress > 0) |
4117 | sector_nr = conf->reshape_progress; | 4153 | sector_nr = conf->reshape_progress; |
4118 | sector_div(sector_nr, new_data_disks); | 4154 | sector_div(sector_nr, new_data_disks); |
@@ -4133,13 +4169,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4133 | else | 4169 | else |
4134 | reshape_sectors = mddev->chunk_sectors; | 4170 | reshape_sectors = mddev->chunk_sectors; |
4135 | 4171 | ||
4136 | /* we update the metadata when there is more than 3Meg | 4172 | /* We update the metadata at least every 10 seconds, or when |
4137 | * in the block range (that is rather arbitrary, should | 4173 | * the data about to be copied would over-write the source of |
4138 | * probably be time based) or when the data about to be | 4174 | * the data at the front of the range. i.e. one new_stripe |
4139 | * copied would over-write the source of the data at | 4175 | * along from reshape_progress new_maps to after where |
4140 | * the front of the range. | 4176 | * reshape_safe old_maps to |
4141 | * i.e. one new_stripe along from reshape_progress new_maps | ||
4142 | * to after where reshape_safe old_maps to | ||
4143 | */ | 4177 | */ |
4144 | writepos = conf->reshape_progress; | 4178 | writepos = conf->reshape_progress; |
4145 | sector_div(writepos, new_data_disks); | 4179 | sector_div(writepos, new_data_disks); |
@@ -4147,7 +4181,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4147 | sector_div(readpos, data_disks); | 4181 | sector_div(readpos, data_disks); |
4148 | safepos = conf->reshape_safe; | 4182 | safepos = conf->reshape_safe; |
4149 | sector_div(safepos, data_disks); | 4183 | sector_div(safepos, data_disks); |
4150 | if (mddev->delta_disks < 0) { | 4184 | if (mddev->reshape_backwards) { |
4151 | writepos -= min_t(sector_t, reshape_sectors, writepos); | 4185 | writepos -= min_t(sector_t, reshape_sectors, writepos); |
4152 | readpos += reshape_sectors; | 4186 | readpos += reshape_sectors; |
4153 | safepos += reshape_sectors; | 4187 | safepos += reshape_sectors; |
@@ -4157,11 +4191,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4157 | safepos -= min_t(sector_t, reshape_sectors, safepos); | 4191 | safepos -= min_t(sector_t, reshape_sectors, safepos); |
4158 | } | 4192 | } |
4159 | 4193 | ||
4194 | /* Having calculated the 'writepos' possibly use it | ||
4195 | * to set 'stripe_addr' which is where we will write to. | ||
4196 | */ | ||
4197 | if (mddev->reshape_backwards) { | ||
4198 | BUG_ON(conf->reshape_progress == 0); | ||
4199 | stripe_addr = writepos; | ||
4200 | BUG_ON((mddev->dev_sectors & | ||
4201 | ~((sector_t)reshape_sectors - 1)) | ||
4202 | - reshape_sectors - stripe_addr | ||
4203 | != sector_nr); | ||
4204 | } else { | ||
4205 | BUG_ON(writepos != sector_nr + reshape_sectors); | ||
4206 | stripe_addr = sector_nr; | ||
4207 | } | ||
4208 | |||
4160 | /* 'writepos' is the most advanced device address we might write. | 4209 | /* 'writepos' is the most advanced device address we might write. |
4161 | * 'readpos' is the least advanced device address we might read. | 4210 | * 'readpos' is the least advanced device address we might read. |
4162 | * 'safepos' is the least address recorded in the metadata as having | 4211 | * 'safepos' is the least address recorded in the metadata as having |
4163 | * been reshaped. | 4212 | * been reshaped. |
4164 | * If 'readpos' is behind 'writepos', then there is no way that we can | 4213 | * If there is a min_offset_diff, these are adjusted either by |
4214 | * increasing the safepos/readpos if diff is negative, or | ||
4215 | * increasing writepos if diff is positive. | ||
4216 | * If 'readpos' is then behind 'writepos', there is no way that we can | ||
4165 | * ensure safety in the face of a crash - that must be done by userspace | 4217 | * ensure safety in the face of a crash - that must be done by userspace |
4166 | * making a backup of the data. So in that case there is no particular | 4218 | * making a backup of the data. So in that case there is no particular |
4167 | * rush to update metadata. | 4219 | * rush to update metadata. |
@@ -4174,7 +4226,13 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4174 | * Maybe that number should be configurable, but I'm not sure it is | 4226 | * Maybe that number should be configurable, but I'm not sure it is |
4175 | * worth it.... maybe it could be a multiple of safemode_delay??? | 4227 | * worth it.... maybe it could be a multiple of safemode_delay??? |
4176 | */ | 4228 | */ |
4177 | if ((mddev->delta_disks < 0 | 4229 | if (conf->min_offset_diff < 0) { |
4230 | safepos += -conf->min_offset_diff; | ||
4231 | readpos += -conf->min_offset_diff; | ||
4232 | } else | ||
4233 | writepos += conf->min_offset_diff; | ||
4234 | |||
4235 | if ((mddev->reshape_backwards | ||
4178 | ? (safepos > writepos && readpos < writepos) | 4236 | ? (safepos > writepos && readpos < writepos) |
4179 | : (safepos < writepos && readpos > writepos)) || | 4237 | : (safepos < writepos && readpos > writepos)) || |
4180 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { | 4238 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { |
@@ -4195,17 +4253,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4195 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 4253 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
4196 | } | 4254 | } |
4197 | 4255 | ||
4198 | if (mddev->delta_disks < 0) { | ||
4199 | BUG_ON(conf->reshape_progress == 0); | ||
4200 | stripe_addr = writepos; | ||
4201 | BUG_ON((mddev->dev_sectors & | ||
4202 | ~((sector_t)reshape_sectors - 1)) | ||
4203 | - reshape_sectors - stripe_addr | ||
4204 | != sector_nr); | ||
4205 | } else { | ||
4206 | BUG_ON(writepos != sector_nr + reshape_sectors); | ||
4207 | stripe_addr = sector_nr; | ||
4208 | } | ||
4209 | INIT_LIST_HEAD(&stripes); | 4256 | INIT_LIST_HEAD(&stripes); |
4210 | for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { | 4257 | for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { |
4211 | int j; | 4258 | int j; |
@@ -4239,7 +4286,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4239 | list_add(&sh->lru, &stripes); | 4286 | list_add(&sh->lru, &stripes); |
4240 | } | 4287 | } |
4241 | spin_lock_irq(&conf->device_lock); | 4288 | spin_lock_irq(&conf->device_lock); |
4242 | if (mddev->delta_disks < 0) | 4289 | if (mddev->reshape_backwards) |
4243 | conf->reshape_progress -= reshape_sectors * new_data_disks; | 4290 | conf->reshape_progress -= reshape_sectors * new_data_disks; |
4244 | else | 4291 | else |
4245 | conf->reshape_progress += reshape_sectors * new_data_disks; | 4292 | conf->reshape_progress += reshape_sectors * new_data_disks; |
@@ -4952,16 +4999,42 @@ static int run(struct mddev *mddev) | |||
4952 | struct md_rdev *rdev; | 4999 | struct md_rdev *rdev; |
4953 | sector_t reshape_offset = 0; | 5000 | sector_t reshape_offset = 0; |
4954 | int i; | 5001 | int i; |
5002 | long long min_offset_diff = 0; | ||
5003 | int first = 1; | ||
4955 | 5004 | ||
4956 | if (mddev->recovery_cp != MaxSector) | 5005 | if (mddev->recovery_cp != MaxSector) |
4957 | printk(KERN_NOTICE "md/raid:%s: not clean" | 5006 | printk(KERN_NOTICE "md/raid:%s: not clean" |
4958 | " -- starting background reconstruction\n", | 5007 | " -- starting background reconstruction\n", |
4959 | mdname(mddev)); | 5008 | mdname(mddev)); |
5009 | |||
5010 | rdev_for_each(rdev, mddev) { | ||
5011 | long long diff; | ||
5012 | if (rdev->raid_disk < 0) | ||
5013 | continue; | ||
5014 | diff = (rdev->new_data_offset - rdev->data_offset); | ||
5015 | if (first) { | ||
5016 | min_offset_diff = diff; | ||
5017 | first = 0; | ||
5018 | } else if (mddev->reshape_backwards && | ||
5019 | diff < min_offset_diff) | ||
5020 | min_offset_diff = diff; | ||
5021 | else if (!mddev->reshape_backwards && | ||
5022 | diff > min_offset_diff) | ||
5023 | min_offset_diff = diff; | ||
5024 | } | ||
5025 | |||
4960 | if (mddev->reshape_position != MaxSector) { | 5026 | if (mddev->reshape_position != MaxSector) { |
4961 | /* Check that we can continue the reshape. | 5027 | /* Check that we can continue the reshape. |
4962 | * Currently only disks can change, it must | 5028 | * Difficulties arise if the stripe we would write to |
4963 | * increase, and we must be past the point where | 5029 | * next is at or after the stripe we would read from next. |
4964 | * a stripe over-writes itself | 5030 | * For a reshape that changes the number of devices, this |
5031 | * is only possible for a very short time, and mdadm makes | ||
5032 | * sure that time appears to have past before assembling | ||
5033 | * the array. So we fail if that time hasn't passed. | ||
5034 | * For a reshape that keeps the number of devices the same | ||
5035 | * mdadm must be monitoring the reshape can keeping the | ||
5036 | * critical areas read-only and backed up. It will start | ||
5037 | * the array in read-only mode, so we check for that. | ||
4965 | */ | 5038 | */ |
4966 | sector_t here_new, here_old; | 5039 | sector_t here_new, here_old; |
4967 | int old_disks; | 5040 | int old_disks; |
@@ -4993,26 +5066,34 @@ static int run(struct mddev *mddev) | |||
4993 | /* here_old is the first stripe that we might need to read | 5066 | /* here_old is the first stripe that we might need to read |
4994 | * from */ | 5067 | * from */ |
4995 | if (mddev->delta_disks == 0) { | 5068 | if (mddev->delta_disks == 0) { |
5069 | if ((here_new * mddev->new_chunk_sectors != | ||
5070 | here_old * mddev->chunk_sectors)) { | ||
5071 | printk(KERN_ERR "md/raid:%s: reshape position is" | ||
5072 | " confused - aborting\n", mdname(mddev)); | ||
5073 | return -EINVAL; | ||
5074 | } | ||
4996 | /* We cannot be sure it is safe to start an in-place | 5075 | /* We cannot be sure it is safe to start an in-place |
4997 | * reshape. It is only safe if user-space if monitoring | 5076 | * reshape. It is only safe if user-space is monitoring |
4998 | * and taking constant backups. | 5077 | * and taking constant backups. |
4999 | * mdadm always starts a situation like this in | 5078 | * mdadm always starts a situation like this in |
5000 | * readonly mode so it can take control before | 5079 | * readonly mode so it can take control before |
5001 | * allowing any writes. So just check for that. | 5080 | * allowing any writes. So just check for that. |
5002 | */ | 5081 | */ |
5003 | if ((here_new * mddev->new_chunk_sectors != | 5082 | if (abs(min_offset_diff) >= mddev->chunk_sectors && |
5004 | here_old * mddev->chunk_sectors) || | 5083 | abs(min_offset_diff) >= mddev->new_chunk_sectors) |
5005 | mddev->ro == 0) { | 5084 | /* not really in-place - so OK */; |
5006 | printk(KERN_ERR "md/raid:%s: in-place reshape must be started" | 5085 | else if (mddev->ro == 0) { |
5007 | " in read-only mode - aborting\n", | 5086 | printk(KERN_ERR "md/raid:%s: in-place reshape " |
5087 | "must be started in read-only mode " | ||
5088 | "- aborting\n", | ||
5008 | mdname(mddev)); | 5089 | mdname(mddev)); |
5009 | return -EINVAL; | 5090 | return -EINVAL; |
5010 | } | 5091 | } |
5011 | } else if (mddev->delta_disks < 0 | 5092 | } else if (mddev->reshape_backwards |
5012 | ? (here_new * mddev->new_chunk_sectors <= | 5093 | ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= |
5013 | here_old * mddev->chunk_sectors) | 5094 | here_old * mddev->chunk_sectors) |
5014 | : (here_new * mddev->new_chunk_sectors >= | 5095 | : (here_new * mddev->new_chunk_sectors >= |
5015 | here_old * mddev->chunk_sectors)) { | 5096 | here_old * mddev->chunk_sectors + (-min_offset_diff))) { |
5016 | /* Reading from the same stripe as writing to - bad */ | 5097 | /* Reading from the same stripe as writing to - bad */ |
5017 | printk(KERN_ERR "md/raid:%s: reshape_position too early for " | 5098 | printk(KERN_ERR "md/raid:%s: reshape_position too early for " |
5018 | "auto-recovery - aborting.\n", | 5099 | "auto-recovery - aborting.\n", |
@@ -5037,6 +5118,7 @@ static int run(struct mddev *mddev) | |||
5037 | if (IS_ERR(conf)) | 5118 | if (IS_ERR(conf)) |
5038 | return PTR_ERR(conf); | 5119 | return PTR_ERR(conf); |
5039 | 5120 | ||
5121 | conf->min_offset_diff = min_offset_diff; | ||
5040 | mddev->thread = conf->thread; | 5122 | mddev->thread = conf->thread; |
5041 | conf->thread = NULL; | 5123 | conf->thread = NULL; |
5042 | mddev->private = conf; | 5124 | mddev->private = conf; |
@@ -5182,9 +5264,12 @@ static int run(struct mddev *mddev) | |||
5182 | blk_queue_io_opt(mddev->queue, chunk_size * | 5264 | blk_queue_io_opt(mddev->queue, chunk_size * |
5183 | (conf->raid_disks - conf->max_degraded)); | 5265 | (conf->raid_disks - conf->max_degraded)); |
5184 | 5266 | ||
5185 | rdev_for_each(rdev, mddev) | 5267 | rdev_for_each(rdev, mddev) { |
5186 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 5268 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
5187 | rdev->data_offset << 9); | 5269 | rdev->data_offset << 9); |
5270 | disk_stack_limits(mddev->gendisk, rdev->bdev, | ||
5271 | rdev->new_data_offset << 9); | ||
5272 | } | ||
5188 | } | 5273 | } |
5189 | 5274 | ||
5190 | return 0; | 5275 | return 0; |
@@ -5418,12 +5503,18 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) | |||
5418 | * any io in the removed space completes, but it hardly seems | 5503 | * any io in the removed space completes, but it hardly seems |
5419 | * worth it. | 5504 | * worth it. |
5420 | */ | 5505 | */ |
5506 | sector_t newsize; | ||
5421 | sectors &= ~((sector_t)mddev->chunk_sectors - 1); | 5507 | sectors &= ~((sector_t)mddev->chunk_sectors - 1); |
5422 | md_set_array_sectors(mddev, raid5_size(mddev, sectors, | 5508 | newsize = raid5_size(mddev, sectors, mddev->raid_disks); |
5423 | mddev->raid_disks)); | 5509 | if (mddev->external_size && |
5424 | if (mddev->array_sectors > | 5510 | mddev->array_sectors > newsize) |
5425 | raid5_size(mddev, sectors, mddev->raid_disks)) | ||
5426 | return -EINVAL; | 5511 | return -EINVAL; |
5512 | if (mddev->bitmap) { | ||
5513 | int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); | ||
5514 | if (ret) | ||
5515 | return ret; | ||
5516 | } | ||
5517 | md_set_array_sectors(mddev, newsize); | ||
5427 | set_capacity(mddev->gendisk, mddev->array_sectors); | 5518 | set_capacity(mddev->gendisk, mddev->array_sectors); |
5428 | revalidate_disk(mddev->gendisk); | 5519 | revalidate_disk(mddev->gendisk); |
5429 | if (sectors > mddev->dev_sectors && | 5520 | if (sectors > mddev->dev_sectors && |
@@ -5468,9 +5559,6 @@ static int check_reshape(struct mddev *mddev) | |||
5468 | mddev->new_layout == mddev->layout && | 5559 | mddev->new_layout == mddev->layout && |
5469 | mddev->new_chunk_sectors == mddev->chunk_sectors) | 5560 | mddev->new_chunk_sectors == mddev->chunk_sectors) |
5470 | return 0; /* nothing to do */ | 5561 | return 0; /* nothing to do */ |
5471 | if (mddev->bitmap) | ||
5472 | /* Cannot grow a bitmap yet */ | ||
5473 | return -EBUSY; | ||
5474 | if (has_failed(conf)) | 5562 | if (has_failed(conf)) |
5475 | return -EINVAL; | 5563 | return -EINVAL; |
5476 | if (mddev->delta_disks < 0) { | 5564 | if (mddev->delta_disks < 0) { |
@@ -5505,10 +5593,14 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
5505 | if (!check_stripe_cache(mddev)) | 5593 | if (!check_stripe_cache(mddev)) |
5506 | return -ENOSPC; | 5594 | return -ENOSPC; |
5507 | 5595 | ||
5508 | rdev_for_each(rdev, mddev) | 5596 | if (has_failed(conf)) |
5597 | return -EINVAL; | ||
5598 | |||
5599 | rdev_for_each(rdev, mddev) { | ||
5509 | if (!test_bit(In_sync, &rdev->flags) | 5600 | if (!test_bit(In_sync, &rdev->flags) |
5510 | && !test_bit(Faulty, &rdev->flags)) | 5601 | && !test_bit(Faulty, &rdev->flags)) |
5511 | spares++; | 5602 | spares++; |
5603 | } | ||
5512 | 5604 | ||
5513 | if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) | 5605 | if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) |
5514 | /* Not enough devices even to make a degraded array | 5606 | /* Not enough devices even to make a degraded array |
@@ -5535,12 +5627,16 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
5535 | conf->chunk_sectors = mddev->new_chunk_sectors; | 5627 | conf->chunk_sectors = mddev->new_chunk_sectors; |
5536 | conf->prev_algo = conf->algorithm; | 5628 | conf->prev_algo = conf->algorithm; |
5537 | conf->algorithm = mddev->new_layout; | 5629 | conf->algorithm = mddev->new_layout; |
5538 | if (mddev->delta_disks < 0) | 5630 | conf->generation++; |
5631 | /* Code that selects data_offset needs to see the generation update | ||
5632 | * if reshape_progress has been set - so a memory barrier is needed. | ||
5633 | */ | ||
5634 | smp_mb(); | ||
5635 | if (mddev->reshape_backwards) | ||
5539 | conf->reshape_progress = raid5_size(mddev, 0, 0); | 5636 | conf->reshape_progress = raid5_size(mddev, 0, 0); |
5540 | else | 5637 | else |
5541 | conf->reshape_progress = 0; | 5638 | conf->reshape_progress = 0; |
5542 | conf->reshape_safe = conf->reshape_progress; | 5639 | conf->reshape_safe = conf->reshape_progress; |
5543 | conf->generation++; | ||
5544 | spin_unlock_irq(&conf->device_lock); | 5640 | spin_unlock_irq(&conf->device_lock); |
5545 | 5641 | ||
5546 | /* Add some new drives, as many as will fit. | 5642 | /* Add some new drives, as many as will fit. |
@@ -5592,6 +5688,9 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
5592 | mddev->recovery = 0; | 5688 | mddev->recovery = 0; |
5593 | spin_lock_irq(&conf->device_lock); | 5689 | spin_lock_irq(&conf->device_lock); |
5594 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; | 5690 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; |
5691 | rdev_for_each(rdev, mddev) | ||
5692 | rdev->new_data_offset = rdev->data_offset; | ||
5693 | smp_wmb(); | ||
5595 | conf->reshape_progress = MaxSector; | 5694 | conf->reshape_progress = MaxSector; |
5596 | mddev->reshape_position = MaxSector; | 5695 | mddev->reshape_position = MaxSector; |
5597 | spin_unlock_irq(&conf->device_lock); | 5696 | spin_unlock_irq(&conf->device_lock); |
@@ -5610,9 +5709,13 @@ static void end_reshape(struct r5conf *conf) | |||
5610 | { | 5709 | { |
5611 | 5710 | ||
5612 | if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { | 5711 | if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { |
5712 | struct md_rdev *rdev; | ||
5613 | 5713 | ||
5614 | spin_lock_irq(&conf->device_lock); | 5714 | spin_lock_irq(&conf->device_lock); |
5615 | conf->previous_raid_disks = conf->raid_disks; | 5715 | conf->previous_raid_disks = conf->raid_disks; |
5716 | rdev_for_each(rdev, conf->mddev) | ||
5717 | rdev->data_offset = rdev->new_data_offset; | ||
5718 | smp_wmb(); | ||
5616 | conf->reshape_progress = MaxSector; | 5719 | conf->reshape_progress = MaxSector; |
5617 | spin_unlock_irq(&conf->device_lock); | 5720 | spin_unlock_irq(&conf->device_lock); |
5618 | wake_up(&conf->wait_for_overlap); | 5721 | wake_up(&conf->wait_for_overlap); |
@@ -5652,17 +5755,18 @@ static void raid5_finish_reshape(struct mddev *mddev) | |||
5652 | d < conf->raid_disks - mddev->delta_disks; | 5755 | d < conf->raid_disks - mddev->delta_disks; |
5653 | d++) { | 5756 | d++) { |
5654 | struct md_rdev *rdev = conf->disks[d].rdev; | 5757 | struct md_rdev *rdev = conf->disks[d].rdev; |
5655 | if (rdev && | 5758 | if (rdev) |
5656 | raid5_remove_disk(mddev, rdev) == 0) { | 5759 | clear_bit(In_sync, &rdev->flags); |
5657 | sysfs_unlink_rdev(mddev, rdev); | 5760 | rdev = conf->disks[d].replacement; |
5658 | rdev->raid_disk = -1; | 5761 | if (rdev) |
5659 | } | 5762 | clear_bit(In_sync, &rdev->flags); |
5660 | } | 5763 | } |
5661 | } | 5764 | } |
5662 | mddev->layout = conf->algorithm; | 5765 | mddev->layout = conf->algorithm; |
5663 | mddev->chunk_sectors = conf->chunk_sectors; | 5766 | mddev->chunk_sectors = conf->chunk_sectors; |
5664 | mddev->reshape_position = MaxSector; | 5767 | mddev->reshape_position = MaxSector; |
5665 | mddev->delta_disks = 0; | 5768 | mddev->delta_disks = 0; |
5769 | mddev->reshape_backwards = 0; | ||
5666 | } | 5770 | } |
5667 | } | 5771 | } |
5668 | 5772 | ||
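
Taken together, the raid5.c hunks above give each device's data offset a simple lifecycle during an offset-changing reshape. Condensed here for reference, using the names from the hunks (a summary sketch, not a literal excerpt of any single function):

	/* raid5_start_reshape() error path: drop the tentative new offsets */
	rdev_for_each(rdev, mddev)
		rdev->new_data_offset = rdev->data_offset;
	smp_wmb();
	conf->reshape_progress = MaxSector;

	/* end_reshape() success path: commit the new offsets, then publish
	 * completion - the smp_wmb() orders the two stores */
	rdev_for_each(rdev, conf->mddev)
		rdev->data_offset = rdev->new_data_offset;
	smp_wmb();
	conf->reshape_progress = MaxSector;

Until one of these runs, min_offset_diff (stored in the new r5conf field below) records how far apart the two offsets are, which the run() check earlier in this section uses to decide whether a restarted reshape is far enough along to be safe.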
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 8d8e13934a48..2164021f3b5f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -285,6 +285,7 @@ enum r5dev_flags { | |||
285 | */ | 285 | */ |
286 | R5_Wantdrain, /* dev->towrite needs to be drained */ | 286 | R5_Wantdrain, /* dev->towrite needs to be drained */ |
287 | R5_WantFUA, /* Write should be FUA */ | 287 | R5_WantFUA, /* Write should be FUA */ |
288 | R5_SyncIO, /* The IO is sync */ | ||
288 | R5_WriteError, /* got a write error - need to record it */ | 289 | R5_WriteError, /* got a write error - need to record it */ |
289 | R5_MadeGood, /* A bad block has been fixed by writing to it */ | 290 | R5_MadeGood, /* A bad block has been fixed by writing to it */ |
290 | R5_ReadRepl, /* Will/did read from replacement rather than orig */ | 291 | R5_ReadRepl, /* Will/did read from replacement rather than orig */ |
@@ -385,6 +386,12 @@ struct r5conf { | |||
385 | short generation; /* increments with every reshape */ | 386 | short generation; /* increments with every reshape */ |
386 | unsigned long reshape_checkpoint; /* Time we last updated | 387 | unsigned long reshape_checkpoint; /* Time we last updated |
387 | * metadata */ | 388 | * metadata */ |
389 | long long min_offset_diff; /* minimum difference between | ||
390 | * data_offset and | ||
391 | * new_data_offset across all | ||
392 | * devices. May be negative, | ||
393 | * but is closest to zero. | ||
394 | */ | ||
388 | 395 | ||
389 | struct list_head handle_list; /* stripes needing handling */ | 396 | struct list_head handle_list; /* stripes needing handling */ |
390 | struct list_head hold_list; /* preread ready stripes */ | 397 | struct list_head hold_list; /* preread ready stripes */ |
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 8c0a3adc5df5..ee753536ab70 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h | |||
@@ -233,7 +233,10 @@ struct mdp_superblock_1 { | |||
233 | __le32 delta_disks; /* change in number of raid_disks */ | 233 | __le32 delta_disks; /* change in number of raid_disks */ |
234 | __le32 new_layout; /* new layout */ | 234 | __le32 new_layout; /* new layout */ |
235 | __le32 new_chunk; /* new chunk size (512byte sectors) */ | 235 | __le32 new_chunk; /* new chunk size (512byte sectors) */ |
236 | __u8 pad1[128-124]; /* set to 0 when written */ | 236 | __le32 new_offset; /* signed number to add to data_offset in new |
237 | * layout. 0 == no-change. This can be | ||
238 | * different on each device in the array. | ||
239 | */ | ||
237 | 240 | ||
238 | /* constant this-device information - 64 bytes */ | 241 | /* constant this-device information - 64 bytes */ |
239 | __le64 data_offset; /* sector start of data, often 0 */ | 242 | __le64 data_offset; /* sector start of data, often 0 */ |
@@ -281,10 +284,18 @@ struct mdp_superblock_1 { | |||
281 | * active device with same 'role'. | 284 | * active device with same 'role'. |
282 | * 'recovery_offset' is also set. | 285 | * 'recovery_offset' is also set. |
283 | */ | 286 | */ |
287 | #define MD_FEATURE_RESHAPE_BACKWARDS 32 /* Reshape doesn't change number | ||
288 | * of devices, but is going | ||
289 | * backwards anyway. | ||
290 | */ | ||
291 | #define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */ | ||
284 | #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ | 292 | #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |
285 | |MD_FEATURE_RECOVERY_OFFSET \ | 293 | |MD_FEATURE_RECOVERY_OFFSET \ |
286 | |MD_FEATURE_RESHAPE_ACTIVE \ | 294 | |MD_FEATURE_RESHAPE_ACTIVE \ |
287 | |MD_FEATURE_BAD_BLOCKS \ | 295 | |MD_FEATURE_BAD_BLOCKS \ |
288 | |MD_FEATURE_REPLACEMENT) | 296 | |MD_FEATURE_REPLACEMENT \ |
297 | |MD_FEATURE_RESHAPE_BACKWARDS \ | ||
298 | |MD_FEATURE_NEW_OFFSET \ | ||
299 | ) | ||
289 | 300 | ||
290 | #endif | 301 | #endif |
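
The new_offset field added above is a signed 32-bit delta from data_offset, valid only when MD_FEATURE_NEW_OFFSET is set in feature_map; MD_FEATURE_RESHAPE_BACKWARDS marks a reshape that runs backwards even though the device count is unchanged. A minimal consumer sketch, under the assumption that sb points at a validated v1.x superblock (illustrative, not a quote of md.c):

	__u64 data_offset     = le64_to_cpu(sb->data_offset);
	__u64 new_data_offset = data_offset;

	if (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)
		new_data_offset += (__s32)le32_to_cpu(sb->new_offset);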
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 53272e9860a7..640c69ceec96 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h | |||
@@ -99,8 +99,20 @@ extern const struct raid6_calls raid6_altivec2; | |||
99 | extern const struct raid6_calls raid6_altivec4; | 99 | extern const struct raid6_calls raid6_altivec4; |
100 | extern const struct raid6_calls raid6_altivec8; | 100 | extern const struct raid6_calls raid6_altivec8; |
101 | 101 | ||
102 | struct raid6_recov_calls { | ||
103 | void (*data2)(int, size_t, int, int, void **); | ||
104 | void (*datap)(int, size_t, int, void **); | ||
105 | int (*valid)(void); | ||
106 | const char *name; | ||
107 | int priority; | ||
108 | }; | ||
109 | |||
110 | extern const struct raid6_recov_calls raid6_recov_intx1; | ||
111 | extern const struct raid6_recov_calls raid6_recov_ssse3; | ||
112 | |||
102 | /* Algorithm list */ | 113 | /* Algorithm list */ |
103 | extern const struct raid6_calls * const raid6_algos[]; | 114 | extern const struct raid6_calls * const raid6_algos[]; |
115 | extern const struct raid6_recov_calls *const raid6_recov_algos[]; | ||
104 | int raid6_select_algo(void); | 116 | int raid6_select_algo(void); |
105 | 117 | ||
106 | /* Return values from chk_syndrome */ | 118 | /* Return values from chk_syndrome */ |
@@ -111,14 +123,16 @@ int raid6_select_algo(void); | |||
111 | 123 | ||
112 | /* Galois field tables */ | 124 | /* Galois field tables */ |
113 | extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); | 125 | extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); |
126 | extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256))); | ||
114 | extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); | 127 | extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); |
115 | extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); | 128 | extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); |
116 | extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); | 129 | extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); |
117 | 130 | ||
118 | /* Recovery routines */ | 131 | /* Recovery routines */ |
119 | void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, | 132 | extern void (*raid6_2data_recov)(int disks, size_t bytes, int faila, int failb, |
120 | void **ptrs); | 133 | void **ptrs); |
121 | void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); | 134 | extern void (*raid6_datap_recov)(int disks, size_t bytes, int faila, |
135 | void **ptrs); | ||
122 | void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, | 136 | void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, |
123 | void **ptrs); | 137 | void **ptrs); |
124 | 138 | ||
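
With this change the recovery entry points keep their old names but become function pointers that raid6_select_algo() fills in at init time. A minimal caller sketch (illustrative only; ptrs[] is the usual layout of disks-2 data blocks followed by P and Q, each bytes long):

	/* Assumes raid6_select_algo() has already run, so the pointers are set.
	 * Failures involving a lost Q are handled by simply regenerating the
	 * syndromes, so only data/data and data/P failures are shown here. */
	static void raid6_repair(int disks, size_t bytes,
				 int faila, int failb, void **ptrs)
	{
		if (failb == disks - 2)		/* second failure is the P block */
			raid6_datap_recov(disks, bytes, faila, ptrs);
		else				/* two data blocks failed */
			raid6_2data_recov(disks, bytes, faila, failb, ptrs);
	}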
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 8a38102770f3..de06dfe165b8 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile | |||
@@ -1,6 +1,6 @@ | |||
1 | obj-$(CONFIG_RAID6_PQ) += raid6_pq.o | 1 | obj-$(CONFIG_RAID6_PQ) += raid6_pq.o |
2 | 2 | ||
3 | raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ | 3 | raid6_pq-y += algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \ |
4 | int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ | 4 | int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ |
5 | altivec8.o mmx.o sse1.o sse2.o | 5 | altivec8.o mmx.o sse1.o sse2.o |
6 | hostprogs-y += mktables | 6 | hostprogs-y += mktables |
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 8b02f60ffc86..589f5f50ad2e 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c | |||
@@ -17,11 +17,11 @@ | |||
17 | */ | 17 | */ |
18 | 18 | ||
19 | #include <linux/raid/pq.h> | 19 | #include <linux/raid/pq.h> |
20 | #include <linux/module.h> | ||
21 | #ifndef __KERNEL__ | 20 | #ifndef __KERNEL__ |
22 | #include <sys/mman.h> | 21 | #include <sys/mman.h> |
23 | #include <stdio.h> | 22 | #include <stdio.h> |
24 | #else | 23 | #else |
24 | #include <linux/module.h> | ||
25 | #include <linux/gfp.h> | 25 | #include <linux/gfp.h> |
26 | #if !RAID6_USE_EMPTY_ZERO_PAGE | 26 | #if !RAID6_USE_EMPTY_ZERO_PAGE |
27 | /* In .bss so it's zeroed */ | 27 | /* In .bss so it's zeroed */ |
@@ -34,10 +34,6 @@ struct raid6_calls raid6_call; | |||
34 | EXPORT_SYMBOL_GPL(raid6_call); | 34 | EXPORT_SYMBOL_GPL(raid6_call); |
35 | 35 | ||
36 | const struct raid6_calls * const raid6_algos[] = { | 36 | const struct raid6_calls * const raid6_algos[] = { |
37 | &raid6_intx1, | ||
38 | &raid6_intx2, | ||
39 | &raid6_intx4, | ||
40 | &raid6_intx8, | ||
41 | #if defined(__ia64__) | 37 | #if defined(__ia64__) |
42 | &raid6_intx16, | 38 | &raid6_intx16, |
43 | &raid6_intx32, | 39 | &raid6_intx32, |
@@ -61,6 +57,24 @@ const struct raid6_calls * const raid6_algos[] = { | |||
61 | &raid6_altivec4, | 57 | &raid6_altivec4, |
62 | &raid6_altivec8, | 58 | &raid6_altivec8, |
63 | #endif | 59 | #endif |
60 | &raid6_intx1, | ||
61 | &raid6_intx2, | ||
62 | &raid6_intx4, | ||
63 | &raid6_intx8, | ||
64 | NULL | ||
65 | }; | ||
66 | |||
67 | void (*raid6_2data_recov)(int, size_t, int, int, void **); | ||
68 | EXPORT_SYMBOL_GPL(raid6_2data_recov); | ||
69 | |||
70 | void (*raid6_datap_recov)(int, size_t, int, void **); | ||
71 | EXPORT_SYMBOL_GPL(raid6_datap_recov); | ||
72 | |||
73 | const struct raid6_recov_calls *const raid6_recov_algos[] = { | ||
74 | #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) | ||
75 | &raid6_recov_ssse3, | ||
76 | #endif | ||
77 | &raid6_recov_intx1, | ||
64 | NULL | 78 | NULL |
65 | }; | 79 | }; |
66 | 80 | ||
@@ -72,59 +86,55 @@ const struct raid6_calls * const raid6_algos[] = { | |||
72 | #define time_before(x, y) ((x) < (y)) | 86 | #define time_before(x, y) ((x) < (y)) |
73 | #endif | 87 | #endif |
74 | 88 | ||
75 | /* Try to pick the best algorithm */ | 89 | static inline const struct raid6_recov_calls *raid6_choose_recov(void) |
76 | /* This code uses the gfmul table as convenient data set to abuse */ | ||
77 | |||
78 | int __init raid6_select_algo(void) | ||
79 | { | 90 | { |
80 | const struct raid6_calls * const * algo; | 91 | const struct raid6_recov_calls *const *algo; |
81 | const struct raid6_calls * best; | 92 | const struct raid6_recov_calls *best; |
82 | char *syndromes; | ||
83 | void *dptrs[(65536/PAGE_SIZE)+2]; | ||
84 | int i, disks; | ||
85 | unsigned long perf, bestperf; | ||
86 | int bestprefer; | ||
87 | unsigned long j0, j1; | ||
88 | 93 | ||
89 | disks = (65536/PAGE_SIZE)+2; | 94 | for (best = NULL, algo = raid6_recov_algos; *algo; algo++) |
90 | for ( i = 0 ; i < disks-2 ; i++ ) { | 95 | if (!best || (*algo)->priority > best->priority) |
91 | dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; | 96 | if (!(*algo)->valid || (*algo)->valid()) |
92 | } | 97 | best = *algo; |
93 | 98 | ||
94 | /* Normal code - use a 2-page allocation to avoid D$ conflict */ | 99 | if (best) { |
95 | syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); | 100 | raid6_2data_recov = best->data2; |
101 | raid6_datap_recov = best->datap; | ||
96 | 102 | ||
97 | if ( !syndromes ) { | 103 | printk("raid6: using %s recovery algorithm\n", best->name); |
98 | printk("raid6: Yikes! No memory available.\n"); | 104 | } else |
99 | return -ENOMEM; | 105 | printk("raid6: Yikes! No recovery algorithm found!\n"); |
100 | } | ||
101 | 106 | ||
102 | dptrs[disks-2] = syndromes; | 107 | return best; |
103 | dptrs[disks-1] = syndromes + PAGE_SIZE; | 108 | } |
109 | |||
110 | static inline const struct raid6_calls *raid6_choose_gen( | ||
111 | void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) | ||
112 | { | ||
113 | unsigned long perf, bestperf, j0, j1; | ||
114 | const struct raid6_calls *const *algo; | ||
115 | const struct raid6_calls *best; | ||
104 | 116 | ||
105 | bestperf = 0; bestprefer = 0; best = NULL; | 117 | for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { |
118 | if (!best || (*algo)->prefer >= best->prefer) { | ||
119 | if ((*algo)->valid && !(*algo)->valid()) | ||
120 | continue; | ||
106 | 121 | ||
107 | for ( algo = raid6_algos ; *algo ; algo++ ) { | ||
108 | if ( !(*algo)->valid || (*algo)->valid() ) { | ||
109 | perf = 0; | 122 | perf = 0; |
110 | 123 | ||
111 | preempt_disable(); | 124 | preempt_disable(); |
112 | j0 = jiffies; | 125 | j0 = jiffies; |
113 | while ( (j1 = jiffies) == j0 ) | 126 | while ((j1 = jiffies) == j0) |
114 | cpu_relax(); | 127 | cpu_relax(); |
115 | while (time_before(jiffies, | 128 | while (time_before(jiffies, |
116 | j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { | 129 | j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { |
117 | (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); | 130 | (*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs); |
118 | perf++; | 131 | perf++; |
119 | } | 132 | } |
120 | preempt_enable(); | 133 | preempt_enable(); |
121 | 134 | ||
122 | if ( (*algo)->prefer > bestprefer || | 135 | if (perf > bestperf) { |
123 | ((*algo)->prefer == bestprefer && | ||
124 | perf > bestperf) ) { | ||
125 | best = *algo; | ||
126 | bestprefer = best->prefer; | ||
127 | bestperf = perf; | 136 | bestperf = perf; |
137 | best = *algo; | ||
128 | } | 138 | } |
129 | printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, | 139 | printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, |
130 | (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); | 140 | (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); |
@@ -139,9 +149,46 @@ int __init raid6_select_algo(void) | |||
139 | } else | 149 | } else |
140 | printk("raid6: Yikes! No algorithm found!\n"); | 150 | printk("raid6: Yikes! No algorithm found!\n"); |
141 | 151 | ||
152 | return best; | ||
153 | } | ||
154 | |||
155 | |||
156 | /* Try to pick the best algorithm */ | ||
157 | /* This code uses the gfmul table as convenient data set to abuse */ | ||
158 | |||
159 | int __init raid6_select_algo(void) | ||
160 | { | ||
161 | const int disks = (65536/PAGE_SIZE)+2; | ||
162 | |||
163 | const struct raid6_calls *gen_best; | ||
164 | const struct raid6_recov_calls *rec_best; | ||
165 | char *syndromes; | ||
166 | void *dptrs[(65536/PAGE_SIZE)+2]; | ||
167 | int i; | ||
168 | |||
169 | for (i = 0; i < disks-2; i++) | ||
170 | dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; | ||
171 | |||
172 | /* Normal code - use a 2-page allocation to avoid D$ conflict */ | ||
173 | syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); | ||
174 | |||
175 | if (!syndromes) { | ||
176 | printk("raid6: Yikes! No memory available.\n"); | ||
177 | return -ENOMEM; | ||
178 | } | ||
179 | |||
180 | dptrs[disks-2] = syndromes; | ||
181 | dptrs[disks-1] = syndromes + PAGE_SIZE; | ||
182 | |||
183 | /* select raid gen_syndrome function */ | ||
184 | gen_best = raid6_choose_gen(&dptrs, disks); | ||
185 | |||
186 | /* select raid recover functions */ | ||
187 | rec_best = raid6_choose_recov(); | ||
188 | |||
142 | free_pages((unsigned long)syndromes, 1); | 189 | free_pages((unsigned long)syndromes, 1); |
143 | 190 | ||
144 | return best ? 0 : -EINVAL; | 191 | return gen_best && rec_best ? 0 : -EINVAL; |
145 | } | 192 | } |
146 | 193 | ||
147 | static void raid6_exit(void) | 194 | static void raid6_exit(void) |
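
For reference, the MB/s figure printed by raid6_choose_gen() follows directly from the benchmark above: each gen_syndrome() call processes disks-2 = 65536/PAGE_SIZE data pages, i.e. 2^16 bytes, and the timed loop runs for 2^RAID6_TIME_JIFFIES_LG2 jiffies (a worked restatement of the existing arithmetic, not new behaviour):

	MB/s = perf * 2^16 * HZ / (2^RAID6_TIME_JIFFIES_LG2 * 2^20)
	     = (perf * HZ) >> (20 - 16 + RAID6_TIME_JIFFIES_LG2)

which is exactly the shift used in the printk.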
diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c index 8a3780902cec..39787db588b0 100644 --- a/lib/raid6/mktables.c +++ b/lib/raid6/mktables.c | |||
@@ -81,6 +81,31 @@ int main(int argc, char *argv[]) | |||
81 | printf("EXPORT_SYMBOL(raid6_gfmul);\n"); | 81 | printf("EXPORT_SYMBOL(raid6_gfmul);\n"); |
82 | printf("#endif\n"); | 82 | printf("#endif\n"); |
83 | 83 | ||
84 | /* Compute vector multiplication table */ | ||
85 | printf("\nconst u8 __attribute__((aligned(256)))\n" | ||
86 | "raid6_vgfmul[256][32] =\n" | ||
87 | "{\n"); | ||
88 | for (i = 0; i < 256; i++) { | ||
89 | printf("\t{\n"); | ||
90 | for (j = 0; j < 16; j += 8) { | ||
91 | printf("\t\t"); | ||
92 | for (k = 0; k < 8; k++) | ||
93 | printf("0x%02x,%c", gfmul(i, j + k), | ||
94 | (k == 7) ? '\n' : ' '); | ||
95 | } | ||
96 | for (j = 0; j < 16; j += 8) { | ||
97 | printf("\t\t"); | ||
98 | for (k = 0; k < 8; k++) | ||
99 | printf("0x%02x,%c", gfmul(i, (j + k) << 4), | ||
100 | (k == 7) ? '\n' : ' '); | ||
101 | } | ||
102 | printf("\t},\n"); | ||
103 | } | ||
104 | printf("};\n"); | ||
105 | printf("#ifdef __KERNEL__\n"); | ||
106 | printf("EXPORT_SYMBOL(raid6_vgfmul);\n"); | ||
107 | printf("#endif\n"); | ||
108 | |||
84 | /* Compute power-of-2 table (exponent) */ | 109 | /* Compute power-of-2 table (exponent) */ |
85 | v = 1; | 110 | v = 1; |
86 | printf("\nconst u8 __attribute__((aligned(256)))\n" | 111 | printf("\nconst u8 __attribute__((aligned(256)))\n" |
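
The raid6_vgfmul table emitted above stores, for each coefficient c, 16 products for the low nibble followed by 16 products for the pre-shifted high nibble. Because GF(256) multiplication distributes over XOR, a full byte multiply is two lookups and an XOR; a scalar sketch of the layout (illustration only, mirroring what PSHUFB does on 16 bytes at a time):

	static inline u8 gf_mul_vtbl(u8 c, u8 x)
	{
		/* entries 0..15:  c * x      for x = 0..15 (low nibble)
		 * entries 16..31: c * (x<<4) for x = 0..15 (high nibble) */
		return raid6_vgfmul[c][x & 0x0f] ^ raid6_vgfmul[c][16 + (x >> 4)];
	}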
diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index fe275d7b6b36..1805a5cc5daa 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c | |||
@@ -22,7 +22,7 @@ | |||
22 | #include <linux/raid/pq.h> | 22 | #include <linux/raid/pq.h> |
23 | 23 | ||
24 | /* Recover two failed data blocks. */ | 24 | /* Recover two failed data blocks. */ |
25 | void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, | 25 | void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, int failb, |
26 | void **ptrs) | 26 | void **ptrs) |
27 | { | 27 | { |
28 | u8 *p, *q, *dp, *dq; | 28 | u8 *p, *q, *dp, *dq; |
@@ -64,10 +64,9 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, | |||
64 | p++; q++; | 64 | p++; q++; |
65 | } | 65 | } |
66 | } | 66 | } |
67 | EXPORT_SYMBOL_GPL(raid6_2data_recov); | ||
68 | 67 | ||
69 | /* Recover failure of one data block plus the P block */ | 68 | /* Recover failure of one data block plus the P block */ |
70 | void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) | 69 | void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, void **ptrs) |
71 | { | 70 | { |
72 | u8 *p, *q, *dq; | 71 | u8 *p, *q, *dq; |
73 | const u8 *qmul; /* Q multiplier table */ | 72 | const u8 *qmul; /* Q multiplier table */ |
@@ -96,7 +95,15 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) | |||
96 | q++; dq++; | 95 | q++; dq++; |
97 | } | 96 | } |
98 | } | 97 | } |
99 | EXPORT_SYMBOL_GPL(raid6_datap_recov); | 98 | |
99 | |||
100 | const struct raid6_recov_calls raid6_recov_intx1 = { | ||
101 | .data2 = raid6_2data_recov_intx1, | ||
102 | .datap = raid6_datap_recov_intx1, | ||
103 | .valid = NULL, | ||
104 | .name = "intx1", | ||
105 | .priority = 0, | ||
106 | }; | ||
100 | 107 | ||
101 | #ifndef __KERNEL__ | 108 | #ifndef __KERNEL__ |
102 | /* Testing only */ | 109 | /* Testing only */ |
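
For context, the intx1 routines above implement the standard two-erasure solution over GF(2^8). With P' and Q' the syndromes recomputed with the failed blocks zeroed (via raid6_empty_zero_page, as in the code) and a < b the failed data indices, the per-byte algebra is:

	Db = (Q xor Q') / (g^a xor g^b)  xor  (P xor P') / (g^(b-a) xor 1)
	Da = (P xor P') xor Db

where "/" is GF(256) division, i.e. multiplication by the inverse. This is exactly what the qmul table (raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]) and the pbmul table (raid6_gfexi[failb-faila]) multiply by.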
diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c new file mode 100644 index 000000000000..37ae61930559 --- /dev/null +++ b/lib/raid6/recov_ssse3.c | |||
@@ -0,0 +1,335 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 Intel Corporation | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; version 2 | ||
7 | * of the License. | ||
8 | */ | ||
9 | |||
10 | #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) | ||
11 | |||
12 | #include <linux/raid/pq.h> | ||
13 | #include "x86.h" | ||
14 | |||
15 | static int raid6_has_ssse3(void) | ||
16 | { | ||
17 | return boot_cpu_has(X86_FEATURE_XMM) && | ||
18 | boot_cpu_has(X86_FEATURE_XMM2) && | ||
19 | boot_cpu_has(X86_FEATURE_SSSE3); | ||
20 | } | ||
21 | |||
22 | void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, int failb, | ||
23 | void **ptrs) | ||
24 | { | ||
25 | u8 *p, *q, *dp, *dq; | ||
26 | const u8 *pbmul; /* P multiplier table for B data */ | ||
27 | const u8 *qmul; /* Q multiplier table (for both) */ | ||
28 | static const u8 __aligned(16) x0f[16] = { | ||
29 | 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, | ||
30 | 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; | ||
31 | |||
32 | p = (u8 *)ptrs[disks-2]; | ||
33 | q = (u8 *)ptrs[disks-1]; | ||
34 | |||
35 | /* Compute syndrome with zero for the missing data pages | ||
36 | Use the dead data pages as temporary storage for | ||
37 | delta p and delta q */ | ||
38 | dp = (u8 *)ptrs[faila]; | ||
39 | ptrs[faila] = (void *)raid6_empty_zero_page; | ||
40 | ptrs[disks-2] = dp; | ||
41 | dq = (u8 *)ptrs[failb]; | ||
42 | ptrs[failb] = (void *)raid6_empty_zero_page; | ||
43 | ptrs[disks-1] = dq; | ||
44 | |||
45 | raid6_call.gen_syndrome(disks, bytes, ptrs); | ||
46 | |||
47 | /* Restore pointer table */ | ||
48 | ptrs[faila] = dp; | ||
49 | ptrs[failb] = dq; | ||
50 | ptrs[disks-2] = p; | ||
51 | ptrs[disks-1] = q; | ||
52 | |||
53 | /* Now, pick the proper data tables */ | ||
54 | pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; | ||
55 | qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ | ||
56 | raid6_gfexp[failb]]]; | ||
57 | |||
58 | kernel_fpu_begin(); | ||
59 | |||
60 | asm volatile("movdqa %0,%%xmm7" : : "m" (x0f[0])); | ||
61 | |||
62 | #ifdef CONFIG_X86_64 | ||
63 | asm volatile("movdqa %0,%%xmm6" : : "m" (qmul[0])); | ||
64 | asm volatile("movdqa %0,%%xmm14" : : "m" (pbmul[0])); | ||
65 | asm volatile("movdqa %0,%%xmm15" : : "m" (pbmul[16])); | ||
66 | #endif | ||
67 | |||
68 | /* Now do it... */ | ||
69 | while (bytes) { | ||
70 | #ifdef CONFIG_X86_64 | ||
71 | /* xmm6, xmm14, xmm15 */ | ||
72 | |||
73 | asm volatile("movdqa %0,%%xmm1" : : "m" (q[0])); | ||
74 | asm volatile("movdqa %0,%%xmm9" : : "m" (q[16])); | ||
75 | asm volatile("movdqa %0,%%xmm0" : : "m" (p[0])); | ||
76 | asm volatile("movdqa %0,%%xmm8" : : "m" (p[16])); | ||
77 | asm volatile("pxor %0,%%xmm1" : : "m" (dq[0])); | ||
78 | asm volatile("pxor %0,%%xmm9" : : "m" (dq[16])); | ||
79 | asm volatile("pxor %0,%%xmm0" : : "m" (dp[0])); | ||
80 | asm volatile("pxor %0,%%xmm8" : : "m" (dp[16])); | ||
81 | |||
82 | /* xmm0/8 = px */ | ||
83 | |||
84 | asm volatile("movdqa %xmm6,%xmm4"); | ||
85 | asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); | ||
86 | asm volatile("movdqa %xmm6,%xmm12"); | ||
87 | asm volatile("movdqa %xmm5,%xmm13"); | ||
88 | asm volatile("movdqa %xmm1,%xmm3"); | ||
89 | asm volatile("movdqa %xmm9,%xmm11"); | ||
90 | asm volatile("movdqa %xmm0,%xmm2"); /* xmm2/10 = px */ | ||
91 | asm volatile("movdqa %xmm8,%xmm10"); | ||
92 | asm volatile("psraw $4,%xmm1"); | ||
93 | asm volatile("psraw $4,%xmm9"); | ||
94 | asm volatile("pand %xmm7,%xmm3"); | ||
95 | asm volatile("pand %xmm7,%xmm11"); | ||
96 | asm volatile("pand %xmm7,%xmm1"); | ||
97 | asm volatile("pand %xmm7,%xmm9"); | ||
98 | asm volatile("pshufb %xmm3,%xmm4"); | ||
99 | asm volatile("pshufb %xmm11,%xmm12"); | ||
100 | asm volatile("pshufb %xmm1,%xmm5"); | ||
101 | asm volatile("pshufb %xmm9,%xmm13"); | ||
102 | asm volatile("pxor %xmm4,%xmm5"); | ||
103 | asm volatile("pxor %xmm12,%xmm13"); | ||
104 | |||
105 | /* xmm5/13 = qx */ | ||
106 | |||
107 | asm volatile("movdqa %xmm14,%xmm4"); | ||
108 | asm volatile("movdqa %xmm15,%xmm1"); | ||
109 | asm volatile("movdqa %xmm14,%xmm12"); | ||
110 | asm volatile("movdqa %xmm15,%xmm9"); | ||
111 | asm volatile("movdqa %xmm2,%xmm3"); | ||
112 | asm volatile("movdqa %xmm10,%xmm11"); | ||
113 | asm volatile("psraw $4,%xmm2"); | ||
114 | asm volatile("psraw $4,%xmm10"); | ||
115 | asm volatile("pand %xmm7,%xmm3"); | ||
116 | asm volatile("pand %xmm7,%xmm11"); | ||
117 | asm volatile("pand %xmm7,%xmm2"); | ||
118 | asm volatile("pand %xmm7,%xmm10"); | ||
119 | asm volatile("pshufb %xmm3,%xmm4"); | ||
120 | asm volatile("pshufb %xmm11,%xmm12"); | ||
121 | asm volatile("pshufb %xmm2,%xmm1"); | ||
122 | asm volatile("pshufb %xmm10,%xmm9"); | ||
123 | asm volatile("pxor %xmm4,%xmm1"); | ||
124 | asm volatile("pxor %xmm12,%xmm9"); | ||
125 | |||
126 | /* xmm1/9 = pbmul[px] */ | ||
127 | asm volatile("pxor %xmm5,%xmm1"); | ||
128 | asm volatile("pxor %xmm13,%xmm9"); | ||
129 | /* xmm1/9 = db = DQ */ | ||
130 | asm volatile("movdqa %%xmm1,%0" : "=m" (dq[0])); | ||
131 | asm volatile("movdqa %%xmm9,%0" : "=m" (dq[16])); | ||
132 | |||
133 | asm volatile("pxor %xmm1,%xmm0"); | ||
134 | asm volatile("pxor %xmm9,%xmm8"); | ||
135 | asm volatile("movdqa %%xmm0,%0" : "=m" (dp[0])); | ||
136 | asm volatile("movdqa %%xmm8,%0" : "=m" (dp[16])); | ||
137 | |||
138 | bytes -= 32; | ||
139 | p += 32; | ||
140 | q += 32; | ||
141 | dp += 32; | ||
142 | dq += 32; | ||
143 | #else | ||
144 | asm volatile("movdqa %0,%%xmm1" : : "m" (*q)); | ||
145 | asm volatile("movdqa %0,%%xmm0" : : "m" (*p)); | ||
146 | asm volatile("pxor %0,%%xmm1" : : "m" (*dq)); | ||
147 | asm volatile("pxor %0,%%xmm0" : : "m" (*dp)); | ||
148 | |||
149 | /* 1 = dq ^ q | ||
150 | * 0 = dp ^ p | ||
151 | */ | ||
152 | asm volatile("movdqa %0,%%xmm4" : : "m" (qmul[0])); | ||
153 | asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); | ||
154 | |||
155 | asm volatile("movdqa %xmm1,%xmm3"); | ||
156 | asm volatile("psraw $4,%xmm1"); | ||
157 | asm volatile("pand %xmm7,%xmm3"); | ||
158 | asm volatile("pand %xmm7,%xmm1"); | ||
159 | asm volatile("pshufb %xmm3,%xmm4"); | ||
160 | asm volatile("pshufb %xmm1,%xmm5"); | ||
161 | asm volatile("pxor %xmm4,%xmm5"); | ||
162 | |||
163 | asm volatile("movdqa %xmm0,%xmm2"); /* xmm2 = px */ | ||
164 | |||
165 | /* xmm5 = qx */ | ||
166 | |||
167 | asm volatile("movdqa %0,%%xmm4" : : "m" (pbmul[0])); | ||
168 | asm volatile("movdqa %0,%%xmm1" : : "m" (pbmul[16])); | ||
169 | asm volatile("movdqa %xmm2,%xmm3"); | ||
170 | asm volatile("psraw $4,%xmm2"); | ||
171 | asm volatile("pand %xmm7,%xmm3"); | ||
172 | asm volatile("pand %xmm7,%xmm2"); | ||
173 | asm volatile("pshufb %xmm3,%xmm4"); | ||
174 | asm volatile("pshufb %xmm2,%xmm1"); | ||
175 | asm volatile("pxor %xmm4,%xmm1"); | ||
176 | |||
177 | /* xmm1 = pbmul[px] */ | ||
178 | asm volatile("pxor %xmm5,%xmm1"); | ||
179 | /* xmm1 = db = DQ */ | ||
180 | asm volatile("movdqa %%xmm1,%0" : "=m" (*dq)); | ||
181 | |||
182 | asm volatile("pxor %xmm1,%xmm0"); | ||
183 | asm volatile("movdqa %%xmm0,%0" : "=m" (*dp)); | ||
184 | |||
185 | bytes -= 16; | ||
186 | p += 16; | ||
187 | q += 16; | ||
188 | dp += 16; | ||
189 | dq += 16; | ||
190 | #endif | ||
191 | } | ||
192 | |||
193 | kernel_fpu_end(); | ||
194 | } | ||
195 | |||
196 | |||
197 | void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, void **ptrs) | ||
198 | { | ||
199 | u8 *p, *q, *dq; | ||
200 | const u8 *qmul; /* Q multiplier table */ | ||
201 | static const u8 __aligned(16) x0f[16] = { | ||
202 | 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, | ||
203 | 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; | ||
204 | |||
205 | p = (u8 *)ptrs[disks-2]; | ||
206 | q = (u8 *)ptrs[disks-1]; | ||
207 | |||
208 | /* Compute syndrome with zero for the missing data page | ||
209 | Use the dead data page as temporary storage for delta q */ | ||
210 | dq = (u8 *)ptrs[faila]; | ||
211 | ptrs[faila] = (void *)raid6_empty_zero_page; | ||
212 | ptrs[disks-1] = dq; | ||
213 | |||
214 | raid6_call.gen_syndrome(disks, bytes, ptrs); | ||
215 | |||
216 | /* Restore pointer table */ | ||
217 | ptrs[faila] = dq; | ||
218 | ptrs[disks-1] = q; | ||
219 | |||
220 | /* Now, pick the proper data tables */ | ||
221 | qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; | ||
222 | |||
223 | kernel_fpu_begin(); | ||
224 | |||
225 | asm volatile("movdqa %0, %%xmm7" : : "m" (x0f[0])); | ||
226 | |||
227 | while (bytes) { | ||
228 | #ifdef CONFIG_X86_64 | ||
229 | asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); | ||
230 | asm volatile("movdqa %0, %%xmm4" : : "m" (dq[16])); | ||
231 | asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); | ||
232 | asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); | ||
233 | |||
234 | /* xmm3 = q[0] ^ dq[0] */ | ||
235 | |||
236 | asm volatile("pxor %0, %%xmm4" : : "m" (q[16])); | ||
237 | asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); | ||
238 | |||
239 | /* xmm4 = q[16] ^ dq[16] */ | ||
240 | |||
241 | asm volatile("movdqa %xmm3, %xmm6"); | ||
242 | asm volatile("movdqa %xmm4, %xmm8"); | ||
243 | |||
244 | /* xmm4 = xmm8 = q[16] ^ dq[16] */ | ||
245 | |||
246 | asm volatile("psraw $4, %xmm3"); | ||
247 | asm volatile("pand %xmm7, %xmm6"); | ||
248 | asm volatile("pand %xmm7, %xmm3"); | ||
249 | asm volatile("pshufb %xmm6, %xmm0"); | ||
250 | asm volatile("pshufb %xmm3, %xmm1"); | ||
251 | asm volatile("movdqa %0, %%xmm10" : : "m" (qmul[0])); | ||
252 | asm volatile("pxor %xmm0, %xmm1"); | ||
253 | asm volatile("movdqa %0, %%xmm11" : : "m" (qmul[16])); | ||
254 | |||
255 | /* xmm1 = qmul[q[0] ^ dq[0]] */ | ||
256 | |||
257 | asm volatile("psraw $4, %xmm4"); | ||
258 | asm volatile("pand %xmm7, %xmm8"); | ||
259 | asm volatile("pand %xmm7, %xmm4"); | ||
260 | asm volatile("pshufb %xmm8, %xmm10"); | ||
261 | asm volatile("pshufb %xmm4, %xmm11"); | ||
262 | asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); | ||
263 | asm volatile("pxor %xmm10, %xmm11"); | ||
264 | asm volatile("movdqa %0, %%xmm12" : : "m" (p[16])); | ||
265 | |||
266 | /* xmm11 = qmul[q[16] ^ dq[16]] */ | ||
267 | |||
268 | asm volatile("pxor %xmm1, %xmm2"); | ||
269 | |||
270 | /* xmm2 = p[0] ^ qmul[q[0] ^ dq[0]] */ | ||
271 | |||
272 | asm volatile("pxor %xmm11, %xmm12"); | ||
273 | |||
274 | /* xmm12 = p[16] ^ qmul[q[16] ^ dq[16]] */ | ||
275 | |||
276 | asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); | ||
277 | asm volatile("movdqa %%xmm11, %0" : "=m" (dq[16])); | ||
278 | |||
279 | asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); | ||
280 | asm volatile("movdqa %%xmm12, %0" : "=m" (p[16])); | ||
281 | |||
282 | bytes -= 32; | ||
283 | p += 32; | ||
284 | q += 32; | ||
285 | dq += 32; | ||
286 | |||
287 | #else | ||
288 | asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); | ||
289 | asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); | ||
290 | asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); | ||
291 | asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); | ||
292 | |||
293 | /* xmm3 = *q ^ *dq */ | ||
294 | |||
295 | asm volatile("movdqa %xmm3, %xmm6"); | ||
296 | asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); | ||
297 | asm volatile("psraw $4, %xmm3"); | ||
298 | asm volatile("pand %xmm7, %xmm6"); | ||
299 | asm volatile("pand %xmm7, %xmm3"); | ||
300 | asm volatile("pshufb %xmm6, %xmm0"); | ||
301 | asm volatile("pshufb %xmm3, %xmm1"); | ||
302 | asm volatile("pxor %xmm0, %xmm1"); | ||
303 | |||
304 | /* xmm1 = qmul[*q ^ *dq] */ | ||
305 | |||
306 | asm volatile("pxor %xmm1, %xmm2"); | ||
307 | |||
308 | /* xmm2 = *p ^ qmul[*q ^ *dq] */ | ||
309 | |||
310 | asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); | ||
311 | asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); | ||
312 | |||
313 | bytes -= 16; | ||
314 | p += 16; | ||
315 | q += 16; | ||
316 | dq += 16; | ||
317 | #endif | ||
318 | } | ||
319 | |||
320 | kernel_fpu_end(); | ||
321 | } | ||
322 | |||
323 | const struct raid6_recov_calls raid6_recov_ssse3 = { | ||
324 | .data2 = raid6_2data_recov_ssse3, | ||
325 | .datap = raid6_datap_recov_ssse3, | ||
326 | .valid = raid6_has_ssse3, | ||
327 | #ifdef CONFIG_X86_64 | ||
328 | .name = "ssse3x2", | ||
329 | #else | ||
330 | .name = "ssse3x1", | ||
331 | #endif | ||
332 | .priority = 1, | ||
333 | }; | ||
334 | |||
335 | #endif | ||
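
To make the vectorised loops easier to follow: per byte, the PSHUFB pairs above perform the same nibble-split multiply described for raid6_vgfmul, applied to the recomputed syndrome. A scalar view of one byte of the raid6_datap_recov_ssse3() inner loop (illustration only, with qmul pointing at the 32-byte raid6_vgfmul row chosen above):

	u8 t  = q[i] ^ dq[i];				/* Q ^ Q' = g^faila * D      */
	dq[i] = qmul[t & 0x0f] ^ qmul[16 + (t >> 4)];	/* D = (Q ^ Q') * g^-faila   */
	p[i] ^= dq[i];					/* P' ^ D gives the correct P */

The x86_64 path simply unrolls this to 32 bytes per iteration using the extra xmm8-xmm15 registers, while the 32-bit path handles 16 bytes.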
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index aa651697b6dc..c76151d94764 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile | |||
@@ -23,7 +23,7 @@ RANLIB = ranlib | |||
23 | all: raid6.a raid6test | 23 | all: raid6.a raid6test |
24 | 24 | ||
25 | raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ | 25 | raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ |
26 | altivec1.o altivec2.o altivec4.o altivec8.o recov.o algos.o \ | 26 | altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \ |
27 | tables.o | 27 | tables.o |
28 | rm -f $@ | 28 | rm -f $@ |
29 | $(AR) cq $@ $^ | 29 | $(AR) cq $@ $^ |
diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c index 7a930318b17d..5a485b7a7d3c 100644 --- a/lib/raid6/test/test.c +++ b/lib/raid6/test/test.c | |||
@@ -90,25 +90,35 @@ static int test_disks(int i, int j) | |||
90 | int main(int argc, char *argv[]) | 90 | int main(int argc, char *argv[]) |
91 | { | 91 | { |
92 | const struct raid6_calls *const *algo; | 92 | const struct raid6_calls *const *algo; |
93 | const struct raid6_recov_calls *const *ra; | ||
93 | int i, j; | 94 | int i, j; |
94 | int err = 0; | 95 | int err = 0; |
95 | 96 | ||
96 | makedata(); | 97 | makedata(); |
97 | 98 | ||
98 | for (algo = raid6_algos; *algo; algo++) { | 99 | for (ra = raid6_recov_algos; *ra; ra++) { |
99 | if (!(*algo)->valid || (*algo)->valid()) { | 100 | if ((*ra)->valid && !(*ra)->valid()) |
100 | raid6_call = **algo; | 101 | continue; |
102 | raid6_2data_recov = (*ra)->data2; | ||
103 | raid6_datap_recov = (*ra)->datap; | ||
101 | 104 | ||
102 | /* Nuke syndromes */ | 105 | printf("using recovery %s\n", (*ra)->name); |
103 | memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); | ||
104 | 106 | ||
105 | /* Generate assumed good syndrome */ | 107 | for (algo = raid6_algos; *algo; algo++) { |
106 | raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, | 108 | if (!(*algo)->valid || (*algo)->valid()) { |
107 | (void **)&dataptrs); | 109 | raid6_call = **algo; |
108 | 110 | ||
109 | for (i = 0; i < NDISKS-1; i++) | 111 | /* Nuke syndromes */ |
110 | for (j = i+1; j < NDISKS; j++) | 112 | memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); |
111 | err += test_disks(i, j); | 113 | |
114 | /* Generate assumed good syndrome */ | ||
115 | raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, | ||
116 | (void **)&dataptrs); | ||
117 | |||
118 | for (i = 0; i < NDISKS-1; i++) | ||
119 | for (j = i+1; j < NDISKS; j++) | ||
120 | err += test_disks(i, j); | ||
121 | } | ||
112 | } | 122 | } |
113 | printf("\n"); | 123 | printf("\n"); |
114 | } | 124 | } |
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h index cb2a8c91c886..d55d63232c55 100644 --- a/lib/raid6/x86.h +++ b/lib/raid6/x86.h | |||
@@ -35,24 +35,29 @@ static inline void kernel_fpu_end(void) | |||
35 | { | 35 | { |
36 | } | 36 | } |
37 | 37 | ||
38 | #define __aligned(x) __attribute__((aligned(x))) | ||
39 | |||
38 | #define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ | 40 | #define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ |
39 | #define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions | 41 | #define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions |
40 | * (fast save and restore) */ | 42 | * (fast save and restore) */ |
41 | #define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ | 43 | #define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ |
42 | #define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ | 44 | #define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ |
45 | #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ | ||
46 | #define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ | ||
47 | #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ | ||
43 | #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ | 48 | #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ |
44 | 49 | ||
45 | /* Should work well enough on modern CPUs for testing */ | 50 | /* Should work well enough on modern CPUs for testing */ |
46 | static inline int boot_cpu_has(int flag) | 51 | static inline int boot_cpu_has(int flag) |
47 | { | 52 | { |
48 | u32 eax = (flag >> 5) ? 0x80000001 : 1; | 53 | u32 eax = (flag & 0x20) ? 0x80000001 : 1; |
49 | u32 edx; | 54 | u32 ecx, edx; |
50 | 55 | ||
51 | asm volatile("cpuid" | 56 | asm volatile("cpuid" |
52 | : "+a" (eax), "=d" (edx) | 57 | : "+a" (eax), "=d" (edx), "=c" (ecx) |
53 | : : "ecx", "ebx"); | 58 | : : "ebx"); |
54 | 59 | ||
55 | return (edx >> (flag & 31)) & 1; | 60 | return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1; |
56 | } | 61 | } |
57 | 62 | ||
58 | #endif /* ndef __KERNEL__ */ | 63 | #endif /* ndef __KERNEL__ */ |
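
The widened boot_cpu_has() test helper relies on the flag encoding word*32 + bit: an odd word index (flag & 0x20) selects CPUID leaf 0x80000001 instead of leaf 1, and a word index with bit 2 set (flag & 0x80, here word 4) selects ECX instead of EDX. A worked example with the new SSSE3 flag, in the userspace test-harness context this header serves (illustration only; the kernel build uses its own cpufeature machinery):

	/* X86_FEATURE_SSSE3 = 4*32 + 9 = 0x89
	 *   0x89 & 0x20 == 0   -> query CPUID leaf 1
	 *   0x89 & 0x80 != 0   -> look at ECX, not EDX
	 *   0x89 & 31   == 9   -> SSSE3 is ECX bit 9
	 */
	if (boot_cpu_has(X86_FEATURE_SSSE3))
		printf("SSSE3 available, the ssse3 recovery routines can run\n");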