aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/include/asm/xor_32.h6
-rw-r--r--arch/x86/include/asm/xor_64.h8
-rw-r--r--arch/x86/include/asm/xor_avx.h214
-rw-r--r--crypto/xor.c13
-rw-r--r--drivers/md/bitmap.c1100
-rw-r--r--drivers/md/bitmap.h60
-rw-r--r--drivers/md/dm-raid.c22
-rw-r--r--drivers/md/md.c370
-rw-r--r--drivers/md/md.h12
-rw-r--r--drivers/md/raid1.c22
-rw-r--r--drivers/md/raid10.c1281
-rw-r--r--drivers/md/raid10.h34
-rw-r--r--drivers/md/raid5.c252
-rw-r--r--drivers/md/raid5.h7
-rw-r--r--include/linux/raid/md_p.h15
-rw-r--r--include/linux/raid/pq.h18
-rw-r--r--lib/raid6/Makefile2
-rw-r--r--lib/raid6/algos.c127
-rw-r--r--lib/raid6/mktables.c25
-rw-r--r--lib/raid6/recov.c15
-rw-r--r--lib/raid6/recov_ssse3.c335
-rw-r--r--lib/raid6/test/Makefile2
-rw-r--r--lib/raid6/test/test.c32
-rw-r--r--lib/raid6/x86.h15
25 files changed, 3124 insertions, 868 deletions
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index dc611a40a336..1f2521434554 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
115 115
116# does binutils support specific instructions? 116# does binutils support specific instructions?
117asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) 117asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
118avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
118 119
119KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) 120KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
120KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) 121KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
121 122
122LDFLAGS := -m elf_$(UTS_MACHINE) 123LDFLAGS := -m elf_$(UTS_MACHINE)
123 124
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a0f495..454570891bdc 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
861 .do_5 = xor_sse_5, 861 .do_5 = xor_sse_5,
862}; 862};
863 863
864/* Also try the AVX routines */
865#include "xor_avx.h"
866
864/* Also try the generic routines. */ 867/* Also try the generic routines. */
865#include <asm-generic/xor.h> 868#include <asm-generic/xor.h>
866 869
@@ -871,6 +874,7 @@ do { \
871 xor_speed(&xor_block_8regs_p); \ 874 xor_speed(&xor_block_8regs_p); \
872 xor_speed(&xor_block_32regs); \ 875 xor_speed(&xor_block_32regs); \
873 xor_speed(&xor_block_32regs_p); \ 876 xor_speed(&xor_block_32regs_p); \
877 AVX_XOR_SPEED; \
874 if (cpu_has_xmm) \ 878 if (cpu_has_xmm) \
875 xor_speed(&xor_block_pIII_sse); \ 879 xor_speed(&xor_block_pIII_sse); \
876 if (cpu_has_mmx) { \ 880 if (cpu_has_mmx) { \
@@ -883,6 +887,6 @@ do { \
883 We may also be able to load into the L1 only depending on how the cpu 887 We may also be able to load into the L1 only depending on how the cpu
884 deals with a load to a line that is being prefetched. */ 888 deals with a load to a line that is being prefetched. */
885#define XOR_SELECT_TEMPLATE(FASTEST) \ 889#define XOR_SELECT_TEMPLATE(FASTEST) \
886 (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) 890 AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
887 891
888#endif /* _ASM_X86_XOR_32_H */ 892#endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e261f6..b9b2323e90fe 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = {
347 .do_5 = xor_sse_5, 347 .do_5 = xor_sse_5,
348}; 348};
349 349
350
351/* Also try the AVX routines */
352#include "xor_avx.h"
353
350#undef XOR_TRY_TEMPLATES 354#undef XOR_TRY_TEMPLATES
351#define XOR_TRY_TEMPLATES \ 355#define XOR_TRY_TEMPLATES \
352do { \ 356do { \
357 AVX_XOR_SPEED; \
353 xor_speed(&xor_block_sse); \ 358 xor_speed(&xor_block_sse); \
354} while (0) 359} while (0)
355 360
356/* We force the use of the SSE xor block because it can write around L2. 361/* We force the use of the SSE xor block because it can write around L2.
357 We may also be able to load into the L1 only depending on how the cpu 362 We may also be able to load into the L1 only depending on how the cpu
358 deals with a load to a line that is being prefetched. */ 363 deals with a load to a line that is being prefetched. */
359#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) 364#define XOR_SELECT_TEMPLATE(FASTEST) \
365 AVX_SELECT(&xor_block_sse)
360 366
361#endif /* _ASM_X86_XOR_64_H */ 367#endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 000000000000..2510d35f480e
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,214 @@
1#ifndef _ASM_X86_XOR_AVX_H
2#define _ASM_X86_XOR_AVX_H
3
4/*
5 * Optimized RAID-5 checksumming functions for AVX
6 *
7 * Copyright (C) 2012 Intel Corporation
8 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
9 *
10 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; version 2
15 * of the License.
16 */
17
18#ifdef CONFIG_AS_AVX
19
20#include <linux/compiler.h>
21#include <asm/i387.h>
22
23#define ALIGN32 __aligned(32)
24
25#define YMM_SAVED_REGS 4
26
27#define YMMS_SAVE \
28do { \
29 preempt_disable(); \
30 cr0 = read_cr0(); \
31 clts(); \
32 asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
33 asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
34 asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
35 asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
36} while (0);
37
38#define YMMS_RESTORE \
39do { \
40 asm volatile("sfence" : : : "memory"); \
41 asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
42 asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
43 asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
44 asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
45 write_cr0(cr0); \
46 preempt_enable(); \
47} while (0);
48
49#define BLOCK4(i) \
50 BLOCK(32 * i, 0) \
51 BLOCK(32 * (i + 1), 1) \
52 BLOCK(32 * (i + 2), 2) \
53 BLOCK(32 * (i + 3), 3)
54
55#define BLOCK16() \
56 BLOCK4(0) \
57 BLOCK4(4) \
58 BLOCK4(8) \
59 BLOCK4(12)
60
61static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
62{
63 unsigned long cr0, lines = bytes >> 9;
64 char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
65
66 YMMS_SAVE
67
68 while (lines--) {
69#undef BLOCK
70#define BLOCK(i, reg) \
71do { \
72 asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
73 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
74 "m" (p0[i / sizeof(*p0)])); \
75 asm volatile("vmovdqa %%ymm" #reg ", %0" : \
76 "=m" (p0[i / sizeof(*p0)])); \
77} while (0);
78
79 BLOCK16()
80
81 p0 = (unsigned long *)((uintptr_t)p0 + 512);
82 p1 = (unsigned long *)((uintptr_t)p1 + 512);
83 }
84
85 YMMS_RESTORE
86}
87
88static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
89 unsigned long *p2)
90{
91 unsigned long cr0, lines = bytes >> 9;
92 char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
93
94 YMMS_SAVE
95
96 while (lines--) {
97#undef BLOCK
98#define BLOCK(i, reg) \
99do { \
100 asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
101 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
102 "m" (p1[i / sizeof(*p1)])); \
103 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
104 "m" (p0[i / sizeof(*p0)])); \
105 asm volatile("vmovdqa %%ymm" #reg ", %0" : \
106 "=m" (p0[i / sizeof(*p0)])); \
107} while (0);
108
109 BLOCK16()
110
111 p0 = (unsigned long *)((uintptr_t)p0 + 512);
112 p1 = (unsigned long *)((uintptr_t)p1 + 512);
113 p2 = (unsigned long *)((uintptr_t)p2 + 512);
114 }
115
116 YMMS_RESTORE
117}
118
119static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
120 unsigned long *p2, unsigned long *p3)
121{
122 unsigned long cr0, lines = bytes >> 9;
123 char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
124
125 YMMS_SAVE
126
127 while (lines--) {
128#undef BLOCK
129#define BLOCK(i, reg) \
130do { \
131 asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
132 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
133 "m" (p2[i / sizeof(*p2)])); \
134 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
135 "m" (p1[i / sizeof(*p1)])); \
136 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
137 "m" (p0[i / sizeof(*p0)])); \
138 asm volatile("vmovdqa %%ymm" #reg ", %0" : \
139 "=m" (p0[i / sizeof(*p0)])); \
140} while (0);
141
142 BLOCK16();
143
144 p0 = (unsigned long *)((uintptr_t)p0 + 512);
145 p1 = (unsigned long *)((uintptr_t)p1 + 512);
146 p2 = (unsigned long *)((uintptr_t)p2 + 512);
147 p3 = (unsigned long *)((uintptr_t)p3 + 512);
148 }
149
150 YMMS_RESTORE
151}
152
153static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
154 unsigned long *p2, unsigned long *p3, unsigned long *p4)
155{
156 unsigned long cr0, lines = bytes >> 9;
157 char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
158
159 YMMS_SAVE
160
161 while (lines--) {
162#undef BLOCK
163#define BLOCK(i, reg) \
164do { \
165 asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
166 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
167 "m" (p3[i / sizeof(*p3)])); \
168 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
169 "m" (p2[i / sizeof(*p2)])); \
170 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
171 "m" (p1[i / sizeof(*p1)])); \
172 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
173 "m" (p0[i / sizeof(*p0)])); \
174 asm volatile("vmovdqa %%ymm" #reg ", %0" : \
175 "=m" (p0[i / sizeof(*p0)])); \
176} while (0);
177
178 BLOCK16()
179
180 p0 = (unsigned long *)((uintptr_t)p0 + 512);
181 p1 = (unsigned long *)((uintptr_t)p1 + 512);
182 p2 = (unsigned long *)((uintptr_t)p2 + 512);
183 p3 = (unsigned long *)((uintptr_t)p3 + 512);
184 p4 = (unsigned long *)((uintptr_t)p4 + 512);
185 }
186
187 YMMS_RESTORE
188}
189
190static struct xor_block_template xor_block_avx = {
191 .name = "avx",
192 .do_2 = xor_avx_2,
193 .do_3 = xor_avx_3,
194 .do_4 = xor_avx_4,
195 .do_5 = xor_avx_5,
196};
197
198#define AVX_XOR_SPEED \
199do { \
200 if (cpu_has_avx) \
201 xor_speed(&xor_block_avx); \
202} while (0)
203
204#define AVX_SELECT(FASTEST) \
205 (cpu_has_avx ? &xor_block_avx : FASTEST)
206
207#else
208
209#define AVX_XOR_SPEED {}
210
211#define AVX_SELECT(FASTEST) (FASTEST)
212
213#endif
214#endif
diff --git a/crypto/xor.c b/crypto/xor.c
index 664b6dfa9e2c..65c7b416b4a3 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -21,6 +21,7 @@
21#include <linux/gfp.h> 21#include <linux/gfp.h>
22#include <linux/raid/xor.h> 22#include <linux/raid/xor.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/preempt.h>
24#include <asm/xor.h> 25#include <asm/xor.h>
25 26
26/* The xor routines to use. */ 27/* The xor routines to use. */
@@ -63,12 +64,14 @@ static void
63do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) 64do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
64{ 65{
65 int speed; 66 int speed;
66 unsigned long now; 67 unsigned long now, j;
67 int i, count, max; 68 int i, count, max;
68 69
69 tmpl->next = template_list; 70 tmpl->next = template_list;
70 template_list = tmpl; 71 template_list = tmpl;
71 72
73 preempt_disable();
74
72 /* 75 /*
73 * Count the number of XORs done during a whole jiffy, and use 76 * Count the number of XORs done during a whole jiffy, and use
74 * this to calculate the speed of checksumming. We use a 2-page 77 * this to calculate the speed of checksumming. We use a 2-page
@@ -76,9 +79,11 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
76 */ 79 */
77 max = 0; 80 max = 0;
78 for (i = 0; i < 5; i++) { 81 for (i = 0; i < 5; i++) {
79 now = jiffies; 82 j = jiffies;
80 count = 0; 83 count = 0;
81 while (jiffies == now) { 84 while ((now = jiffies) == j)
85 cpu_relax();
86 while (time_before(jiffies, now + 1)) {
82 mb(); /* prevent loop optimzation */ 87 mb(); /* prevent loop optimzation */
83 tmpl->do_2(BENCH_SIZE, b1, b2); 88 tmpl->do_2(BENCH_SIZE, b1, b2);
84 mb(); 89 mb();
@@ -89,6 +94,8 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
89 max = count; 94 max = count;
90 } 95 }
91 96
97 preempt_enable();
98
92 speed = max * (HZ * BENCH_SIZE / 1024); 99 speed = max * (HZ * BENCH_SIZE / 1024);
93 tmpl->speed = speed; 100 tmpl->speed = speed;
94 101
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 17e2b472e16d..15dbe03117e4 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -45,7 +45,7 @@ static inline char *bmname(struct bitmap *bitmap)
45 * if we find our page, we increment the page's refcount so that it stays 45 * if we find our page, we increment the page's refcount so that it stays
46 * allocated while we're using it 46 * allocated while we're using it
47 */ 47 */
48static int bitmap_checkpage(struct bitmap *bitmap, 48static int bitmap_checkpage(struct bitmap_counts *bitmap,
49 unsigned long page, int create) 49 unsigned long page, int create)
50__releases(bitmap->lock) 50__releases(bitmap->lock)
51__acquires(bitmap->lock) 51__acquires(bitmap->lock)
@@ -76,8 +76,7 @@ __acquires(bitmap->lock)
76 spin_lock_irq(&bitmap->lock); 76 spin_lock_irq(&bitmap->lock);
77 77
78 if (mappage == NULL) { 78 if (mappage == NULL) {
79 pr_debug("%s: bitmap map page allocation failed, hijacking\n", 79 pr_debug("md/bitmap: map page allocation failed, hijacking\n");
80 bmname(bitmap));
81 /* failed - set the hijacked flag so that we can use the 80 /* failed - set the hijacked flag so that we can use the
82 * pointer as a counter */ 81 * pointer as a counter */
83 if (!bitmap->bp[page].map) 82 if (!bitmap->bp[page].map)
@@ -100,7 +99,7 @@ __acquires(bitmap->lock)
100/* if page is completely empty, put it back on the free list, or dealloc it */ 99/* if page is completely empty, put it back on the free list, or dealloc it */
101/* if page was hijacked, unmark the flag so it might get alloced next time */ 100/* if page was hijacked, unmark the flag so it might get alloced next time */
102/* Note: lock should be held when calling this */ 101/* Note: lock should be held when calling this */
103static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) 102static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page)
104{ 103{
105 char *ptr; 104 char *ptr;
106 105
@@ -130,22 +129,14 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
130 */ 129 */
131 130
132/* IO operations when bitmap is stored near all superblocks */ 131/* IO operations when bitmap is stored near all superblocks */
133static struct page *read_sb_page(struct mddev *mddev, loff_t offset, 132static int read_sb_page(struct mddev *mddev, loff_t offset,
134 struct page *page, 133 struct page *page,
135 unsigned long index, int size) 134 unsigned long index, int size)
136{ 135{
137 /* choose a good rdev and read the page from there */ 136 /* choose a good rdev and read the page from there */
138 137
139 struct md_rdev *rdev; 138 struct md_rdev *rdev;
140 sector_t target; 139 sector_t target;
141 int did_alloc = 0;
142
143 if (!page) {
144 page = alloc_page(GFP_KERNEL);
145 if (!page)
146 return ERR_PTR(-ENOMEM);
147 did_alloc = 1;
148 }
149 140
150 rdev_for_each(rdev, mddev) { 141 rdev_for_each(rdev, mddev) {
151 if (! test_bit(In_sync, &rdev->flags) 142 if (! test_bit(In_sync, &rdev->flags)
@@ -158,15 +149,10 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset,
158 roundup(size, bdev_logical_block_size(rdev->bdev)), 149 roundup(size, bdev_logical_block_size(rdev->bdev)),
159 page, READ, true)) { 150 page, READ, true)) {
160 page->index = index; 151 page->index = index;
161 attach_page_buffers(page, NULL); /* so that free_buffer will 152 return 0;
162 * quietly no-op */
163 return page;
164 } 153 }
165 } 154 }
166 if (did_alloc) 155 return -EIO;
167 put_page(page);
168 return ERR_PTR(-EIO);
169
170} 156}
171 157
172static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) 158static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev)
@@ -208,6 +194,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
208 struct md_rdev *rdev = NULL; 194 struct md_rdev *rdev = NULL;
209 struct block_device *bdev; 195 struct block_device *bdev;
210 struct mddev *mddev = bitmap->mddev; 196 struct mddev *mddev = bitmap->mddev;
197 struct bitmap_storage *store = &bitmap->storage;
211 198
212 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { 199 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
213 int size = PAGE_SIZE; 200 int size = PAGE_SIZE;
@@ -215,9 +202,13 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
215 202
216 bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; 203 bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
217 204
218 if (page->index == bitmap->file_pages-1) 205 if (page->index == store->file_pages-1) {
219 size = roundup(bitmap->last_page_size, 206 int last_page_size = store->bytes & (PAGE_SIZE-1);
207 if (last_page_size == 0)
208 last_page_size = PAGE_SIZE;
209 size = roundup(last_page_size,
220 bdev_logical_block_size(bdev)); 210 bdev_logical_block_size(bdev));
211 }
221 /* Just make sure we aren't corrupting data or 212 /* Just make sure we aren't corrupting data or
222 * metadata 213 * metadata
223 */ 214 */
@@ -276,10 +267,10 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
276{ 267{
277 struct buffer_head *bh; 268 struct buffer_head *bh;
278 269
279 if (bitmap->file == NULL) { 270 if (bitmap->storage.file == NULL) {
280 switch (write_sb_page(bitmap, page, wait)) { 271 switch (write_sb_page(bitmap, page, wait)) {
281 case -EINVAL: 272 case -EINVAL:
282 bitmap->flags |= BITMAP_WRITE_ERROR; 273 set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
283 } 274 }
284 } else { 275 } else {
285 276
@@ -297,20 +288,16 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
297 wait_event(bitmap->write_wait, 288 wait_event(bitmap->write_wait,
298 atomic_read(&bitmap->pending_writes)==0); 289 atomic_read(&bitmap->pending_writes)==0);
299 } 290 }
300 if (bitmap->flags & BITMAP_WRITE_ERROR) 291 if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
301 bitmap_file_kick(bitmap); 292 bitmap_file_kick(bitmap);
302} 293}
303 294
304static void end_bitmap_write(struct buffer_head *bh, int uptodate) 295static void end_bitmap_write(struct buffer_head *bh, int uptodate)
305{ 296{
306 struct bitmap *bitmap = bh->b_private; 297 struct bitmap *bitmap = bh->b_private;
307 unsigned long flags;
308 298
309 if (!uptodate) { 299 if (!uptodate)
310 spin_lock_irqsave(&bitmap->lock, flags); 300 set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
311 bitmap->flags |= BITMAP_WRITE_ERROR;
312 spin_unlock_irqrestore(&bitmap->lock, flags);
313 }
314 if (atomic_dec_and_test(&bitmap->pending_writes)) 301 if (atomic_dec_and_test(&bitmap->pending_writes))
315 wake_up(&bitmap->write_wait); 302 wake_up(&bitmap->write_wait);
316} 303}
@@ -325,8 +312,12 @@ __clear_page_buffers(struct page *page)
325} 312}
326static void free_buffers(struct page *page) 313static void free_buffers(struct page *page)
327{ 314{
328 struct buffer_head *bh = page_buffers(page); 315 struct buffer_head *bh;
329 316
317 if (!PagePrivate(page))
318 return;
319
320 bh = page_buffers(page);
330 while (bh) { 321 while (bh) {
331 struct buffer_head *next = bh->b_this_page; 322 struct buffer_head *next = bh->b_this_page;
332 free_buffer_head(bh); 323 free_buffer_head(bh);
@@ -343,11 +334,12 @@ static void free_buffers(struct page *page)
343 * This usage is similar to how swap files are handled, and allows us 334 * This usage is similar to how swap files are handled, and allows us
344 * to write to a file with no concerns of memory allocation failing. 335 * to write to a file with no concerns of memory allocation failing.
345 */ 336 */
346static struct page *read_page(struct file *file, unsigned long index, 337static int read_page(struct file *file, unsigned long index,
347 struct bitmap *bitmap, 338 struct bitmap *bitmap,
348 unsigned long count) 339 unsigned long count,
340 struct page *page)
349{ 341{
350 struct page *page = NULL; 342 int ret = 0;
351 struct inode *inode = file->f_path.dentry->d_inode; 343 struct inode *inode = file->f_path.dentry->d_inode;
352 struct buffer_head *bh; 344 struct buffer_head *bh;
353 sector_t block; 345 sector_t block;
@@ -355,16 +347,9 @@ static struct page *read_page(struct file *file, unsigned long index,
355 pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, 347 pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
356 (unsigned long long)index << PAGE_SHIFT); 348 (unsigned long long)index << PAGE_SHIFT);
357 349
358 page = alloc_page(GFP_KERNEL);
359 if (!page)
360 page = ERR_PTR(-ENOMEM);
361 if (IS_ERR(page))
362 goto out;
363
364 bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); 350 bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
365 if (!bh) { 351 if (!bh) {
366 put_page(page); 352 ret = -ENOMEM;
367 page = ERR_PTR(-ENOMEM);
368 goto out; 353 goto out;
369 } 354 }
370 attach_page_buffers(page, bh); 355 attach_page_buffers(page, bh);
@@ -376,8 +361,7 @@ static struct page *read_page(struct file *file, unsigned long index,
376 bh->b_blocknr = bmap(inode, block); 361 bh->b_blocknr = bmap(inode, block);
377 if (bh->b_blocknr == 0) { 362 if (bh->b_blocknr == 0) {
378 /* Cannot use this file! */ 363 /* Cannot use this file! */
379 free_buffers(page); 364 ret = -EINVAL;
380 page = ERR_PTR(-EINVAL);
381 goto out; 365 goto out;
382 } 366 }
383 bh->b_bdev = inode->i_sb->s_bdev; 367 bh->b_bdev = inode->i_sb->s_bdev;
@@ -400,17 +384,15 @@ static struct page *read_page(struct file *file, unsigned long index,
400 384
401 wait_event(bitmap->write_wait, 385 wait_event(bitmap->write_wait,
402 atomic_read(&bitmap->pending_writes)==0); 386 atomic_read(&bitmap->pending_writes)==0);
403 if (bitmap->flags & BITMAP_WRITE_ERROR) { 387 if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
404 free_buffers(page); 388 ret = -EIO;
405 page = ERR_PTR(-EIO);
406 }
407out: 389out:
408 if (IS_ERR(page)) 390 if (ret)
409 printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %ld\n", 391 printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n",
410 (int)PAGE_SIZE, 392 (int)PAGE_SIZE,
411 (unsigned long long)index << PAGE_SHIFT, 393 (unsigned long long)index << PAGE_SHIFT,
412 PTR_ERR(page)); 394 ret);
413 return page; 395 return ret;
414} 396}
415 397
416/* 398/*
@@ -426,9 +408,9 @@ void bitmap_update_sb(struct bitmap *bitmap)
426 return; 408 return;
427 if (bitmap->mddev->bitmap_info.external) 409 if (bitmap->mddev->bitmap_info.external)
428 return; 410 return;
429 if (!bitmap->sb_page) /* no superblock */ 411 if (!bitmap->storage.sb_page) /* no superblock */
430 return; 412 return;
431 sb = kmap_atomic(bitmap->sb_page); 413 sb = kmap_atomic(bitmap->storage.sb_page);
432 sb->events = cpu_to_le64(bitmap->mddev->events); 414 sb->events = cpu_to_le64(bitmap->mddev->events);
433 if (bitmap->mddev->events < bitmap->events_cleared) 415 if (bitmap->mddev->events < bitmap->events_cleared)
434 /* rocking back to read-only */ 416 /* rocking back to read-only */
@@ -438,8 +420,13 @@ void bitmap_update_sb(struct bitmap *bitmap)
438 /* Just in case these have been changed via sysfs: */ 420 /* Just in case these have been changed via sysfs: */
439 sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); 421 sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
440 sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); 422 sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
423 /* This might have been changed by a reshape */
424 sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
425 sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
426 sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
427 bitmap_info.space);
441 kunmap_atomic(sb); 428 kunmap_atomic(sb);
442 write_page(bitmap, bitmap->sb_page, 1); 429 write_page(bitmap, bitmap->storage.sb_page, 1);
443} 430}
444 431
445/* print out the bitmap file superblock */ 432/* print out the bitmap file superblock */
@@ -447,9 +434,9 @@ void bitmap_print_sb(struct bitmap *bitmap)
447{ 434{
448 bitmap_super_t *sb; 435 bitmap_super_t *sb;
449 436
450 if (!bitmap || !bitmap->sb_page) 437 if (!bitmap || !bitmap->storage.sb_page)
451 return; 438 return;
452 sb = kmap_atomic(bitmap->sb_page); 439 sb = kmap_atomic(bitmap->storage.sb_page);
453 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); 440 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
454 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); 441 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
455 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); 442 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
@@ -488,15 +475,15 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
488 unsigned long chunksize, daemon_sleep, write_behind; 475 unsigned long chunksize, daemon_sleep, write_behind;
489 int err = -EINVAL; 476 int err = -EINVAL;
490 477
491 bitmap->sb_page = alloc_page(GFP_KERNEL); 478 bitmap->storage.sb_page = alloc_page(GFP_KERNEL);
492 if (IS_ERR(bitmap->sb_page)) { 479 if (IS_ERR(bitmap->storage.sb_page)) {
493 err = PTR_ERR(bitmap->sb_page); 480 err = PTR_ERR(bitmap->storage.sb_page);
494 bitmap->sb_page = NULL; 481 bitmap->storage.sb_page = NULL;
495 return err; 482 return err;
496 } 483 }
497 bitmap->sb_page->index = 0; 484 bitmap->storage.sb_page->index = 0;
498 485
499 sb = kmap_atomic(bitmap->sb_page); 486 sb = kmap_atomic(bitmap->storage.sb_page);
500 487
501 sb->magic = cpu_to_le32(BITMAP_MAGIC); 488 sb->magic = cpu_to_le32(BITMAP_MAGIC);
502 sb->version = cpu_to_le32(BITMAP_MAJOR_HI); 489 sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
@@ -534,8 +521,8 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
534 521
535 memcpy(sb->uuid, bitmap->mddev->uuid, 16); 522 memcpy(sb->uuid, bitmap->mddev->uuid, 16);
536 523
537 bitmap->flags |= BITMAP_STALE; 524 set_bit(BITMAP_STALE, &bitmap->flags);
538 sb->state |= cpu_to_le32(BITMAP_STALE); 525 sb->state = cpu_to_le32(bitmap->flags);
539 bitmap->events_cleared = bitmap->mddev->events; 526 bitmap->events_cleared = bitmap->mddev->events;
540 sb->events_cleared = cpu_to_le64(bitmap->mddev->events); 527 sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
541 528
@@ -551,31 +538,45 @@ static int bitmap_read_sb(struct bitmap *bitmap)
551 bitmap_super_t *sb; 538 bitmap_super_t *sb;
552 unsigned long chunksize, daemon_sleep, write_behind; 539 unsigned long chunksize, daemon_sleep, write_behind;
553 unsigned long long events; 540 unsigned long long events;
541 unsigned long sectors_reserved = 0;
554 int err = -EINVAL; 542 int err = -EINVAL;
543 struct page *sb_page;
555 544
545 if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
546 chunksize = 128 * 1024 * 1024;
547 daemon_sleep = 5 * HZ;
548 write_behind = 0;
549 set_bit(BITMAP_STALE, &bitmap->flags);
550 err = 0;
551 goto out_no_sb;
552 }
556 /* page 0 is the superblock, read it... */ 553 /* page 0 is the superblock, read it... */
557 if (bitmap->file) { 554 sb_page = alloc_page(GFP_KERNEL);
558 loff_t isize = i_size_read(bitmap->file->f_mapping->host); 555 if (!sb_page)
556 return -ENOMEM;
557 bitmap->storage.sb_page = sb_page;
558
559 if (bitmap->storage.file) {
560 loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
559 int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; 561 int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
560 562
561 bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); 563 err = read_page(bitmap->storage.file, 0,
564 bitmap, bytes, sb_page);
562 } else { 565 } else {
563 bitmap->sb_page = read_sb_page(bitmap->mddev, 566 err = read_sb_page(bitmap->mddev,
564 bitmap->mddev->bitmap_info.offset, 567 bitmap->mddev->bitmap_info.offset,
565 NULL, 568 sb_page,
566 0, sizeof(bitmap_super_t)); 569 0, sizeof(bitmap_super_t));
567 } 570 }
568 if (IS_ERR(bitmap->sb_page)) { 571 if (err)
569 err = PTR_ERR(bitmap->sb_page);
570 bitmap->sb_page = NULL;
571 return err; 572 return err;
572 }
573 573
574 sb = kmap_atomic(bitmap->sb_page); 574 sb = kmap_atomic(sb_page);
575 575
576 chunksize = le32_to_cpu(sb->chunksize); 576 chunksize = le32_to_cpu(sb->chunksize);
577 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; 577 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
578 write_behind = le32_to_cpu(sb->write_behind); 578 write_behind = le32_to_cpu(sb->write_behind);
579 sectors_reserved = le32_to_cpu(sb->sectors_reserved);
579 580
580 /* verify that the bitmap-specific fields are valid */ 581 /* verify that the bitmap-specific fields are valid */
581 if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) 582 if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -618,60 +619,32 @@ static int bitmap_read_sb(struct bitmap *bitmap)
618 "-- forcing full recovery\n", 619 "-- forcing full recovery\n",
619 bmname(bitmap), events, 620 bmname(bitmap), events,
620 (unsigned long long) bitmap->mddev->events); 621 (unsigned long long) bitmap->mddev->events);
621 sb->state |= cpu_to_le32(BITMAP_STALE); 622 set_bit(BITMAP_STALE, &bitmap->flags);
622 } 623 }
623 } 624 }
624 625
625 /* assign fields using values from superblock */ 626 /* assign fields using values from superblock */
626 bitmap->mddev->bitmap_info.chunksize = chunksize;
627 bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
628 bitmap->mddev->bitmap_info.max_write_behind = write_behind;
629 bitmap->flags |= le32_to_cpu(sb->state); 627 bitmap->flags |= le32_to_cpu(sb->state);
630 if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) 628 if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
631 bitmap->flags |= BITMAP_HOSTENDIAN; 629 set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
632 bitmap->events_cleared = le64_to_cpu(sb->events_cleared); 630 bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
633 if (bitmap->flags & BITMAP_STALE)
634 bitmap->events_cleared = bitmap->mddev->events;
635 err = 0; 631 err = 0;
636out: 632out:
637 kunmap_atomic(sb); 633 kunmap_atomic(sb);
634out_no_sb:
635 if (test_bit(BITMAP_STALE, &bitmap->flags))
636 bitmap->events_cleared = bitmap->mddev->events;
637 bitmap->mddev->bitmap_info.chunksize = chunksize;
638 bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
639 bitmap->mddev->bitmap_info.max_write_behind = write_behind;
640 if (bitmap->mddev->bitmap_info.space == 0 ||
641 bitmap->mddev->bitmap_info.space > sectors_reserved)
642 bitmap->mddev->bitmap_info.space = sectors_reserved;
638 if (err) 643 if (err)
639 bitmap_print_sb(bitmap); 644 bitmap_print_sb(bitmap);
640 return err; 645 return err;
641} 646}
642 647
643enum bitmap_mask_op {
644 MASK_SET,
645 MASK_UNSET
646};
647
648/* record the state of the bitmap in the superblock. Return the old value */
649static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
650 enum bitmap_mask_op op)
651{
652 bitmap_super_t *sb;
653 int old;
654
655 if (!bitmap->sb_page) /* can't set the state */
656 return 0;
657 sb = kmap_atomic(bitmap->sb_page);
658 old = le32_to_cpu(sb->state) & bits;
659 switch (op) {
660 case MASK_SET:
661 sb->state |= cpu_to_le32(bits);
662 bitmap->flags |= bits;
663 break;
664 case MASK_UNSET:
665 sb->state &= cpu_to_le32(~bits);
666 bitmap->flags &= ~bits;
667 break;
668 default:
669 BUG();
670 }
671 kunmap_atomic(sb);
672 return old;
673}
674
675/* 648/*
676 * general bitmap file operations 649 * general bitmap file operations
677 */ 650 */
@@ -683,17 +656,19 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
683 * file a page at a time. There's a superblock at the start of the file. 656 * file a page at a time. There's a superblock at the start of the file.
684 */ 657 */
685/* calculate the index of the page that contains this bit */ 658/* calculate the index of the page that contains this bit */
686static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk) 659static inline unsigned long file_page_index(struct bitmap_storage *store,
660 unsigned long chunk)
687{ 661{
688 if (!bitmap->mddev->bitmap_info.external) 662 if (store->sb_page)
689 chunk += sizeof(bitmap_super_t) << 3; 663 chunk += sizeof(bitmap_super_t) << 3;
690 return chunk >> PAGE_BIT_SHIFT; 664 return chunk >> PAGE_BIT_SHIFT;
691} 665}
692 666
693/* calculate the (bit) offset of this bit within a page */ 667/* calculate the (bit) offset of this bit within a page */
694static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk) 668static inline unsigned long file_page_offset(struct bitmap_storage *store,
669 unsigned long chunk)
695{ 670{
696 if (!bitmap->mddev->bitmap_info.external) 671 if (store->sb_page)
697 chunk += sizeof(bitmap_super_t) << 3; 672 chunk += sizeof(bitmap_super_t) << 3;
698 return chunk & (PAGE_BITS - 1); 673 return chunk & (PAGE_BITS - 1);
699} 674}
@@ -705,57 +680,86 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon
705 * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page 680 * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page
706 * 0 or page 1 681 * 0 or page 1
707 */ 682 */
708static inline struct page *filemap_get_page(struct bitmap *bitmap, 683static inline struct page *filemap_get_page(struct bitmap_storage *store,
709 unsigned long chunk) 684 unsigned long chunk)
710{ 685{
711 if (file_page_index(bitmap, chunk) >= bitmap->file_pages) 686 if (file_page_index(store, chunk) >= store->file_pages)
712 return NULL; 687 return NULL;
713 return bitmap->filemap[file_page_index(bitmap, chunk) 688 return store->filemap[file_page_index(store, chunk)
714 - file_page_index(bitmap, 0)]; 689 - file_page_index(store, 0)];
715} 690}
716 691
717static void bitmap_file_unmap(struct bitmap *bitmap) 692static int bitmap_storage_alloc(struct bitmap_storage *store,
693 unsigned long chunks, int with_super)
694{
695 int pnum;
696 unsigned long num_pages;
697 unsigned long bytes;
698
699 bytes = DIV_ROUND_UP(chunks, 8);
700 if (with_super)
701 bytes += sizeof(bitmap_super_t);
702
703 num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
704
705 store->filemap = kmalloc(sizeof(struct page *)
706 * num_pages, GFP_KERNEL);
707 if (!store->filemap)
708 return -ENOMEM;
709
710 if (with_super && !store->sb_page) {
711 store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
712 if (store->sb_page == NULL)
713 return -ENOMEM;
714 store->sb_page->index = 0;
715 }
716 pnum = 0;
717 if (store->sb_page) {
718 store->filemap[0] = store->sb_page;
719 pnum = 1;
720 }
721 for ( ; pnum < num_pages; pnum++) {
722 store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
723 if (!store->filemap[pnum]) {
724 store->file_pages = pnum;
725 return -ENOMEM;
726 }
727 store->filemap[pnum]->index = pnum;
728 }
729 store->file_pages = pnum;
730
731 /* We need 4 bits per page, rounded up to a multiple
732 * of sizeof(unsigned long) */
733 store->filemap_attr = kzalloc(
734 roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
735 GFP_KERNEL);
736 if (!store->filemap_attr)
737 return -ENOMEM;
738
739 store->bytes = bytes;
740
741 return 0;
742}
743
744static void bitmap_file_unmap(struct bitmap_storage *store)
718{ 745{
719 struct page **map, *sb_page; 746 struct page **map, *sb_page;
720 unsigned long *attr;
721 int pages; 747 int pages;
722 unsigned long flags; 748 struct file *file;
723 749
724 spin_lock_irqsave(&bitmap->lock, flags); 750 file = store->file;
725 map = bitmap->filemap; 751 map = store->filemap;
726 bitmap->filemap = NULL; 752 pages = store->file_pages;
727 attr = bitmap->filemap_attr; 753 sb_page = store->sb_page;
728 bitmap->filemap_attr = NULL;
729 pages = bitmap->file_pages;
730 bitmap->file_pages = 0;
731 sb_page = bitmap->sb_page;
732 bitmap->sb_page = NULL;
733 spin_unlock_irqrestore(&bitmap->lock, flags);
734 754
735 while (pages--) 755 while (pages--)
736 if (map[pages] != sb_page) /* 0 is sb_page, release it below */ 756 if (map[pages] != sb_page) /* 0 is sb_page, release it below */
737 free_buffers(map[pages]); 757 free_buffers(map[pages]);
738 kfree(map); 758 kfree(map);
739 kfree(attr); 759 kfree(store->filemap_attr);
740 760
741 if (sb_page) 761 if (sb_page)
742 free_buffers(sb_page); 762 free_buffers(sb_page);
743}
744
745static void bitmap_file_put(struct bitmap *bitmap)
746{
747 struct file *file;
748 unsigned long flags;
749
750 spin_lock_irqsave(&bitmap->lock, flags);
751 file = bitmap->file;
752 bitmap->file = NULL;
753 spin_unlock_irqrestore(&bitmap->lock, flags);
754
755 if (file)
756 wait_event(bitmap->write_wait,
757 atomic_read(&bitmap->pending_writes)==0);
758 bitmap_file_unmap(bitmap);
759 763
760 if (file) { 764 if (file) {
761 struct inode *inode = file->f_path.dentry->d_inode; 765 struct inode *inode = file->f_path.dentry->d_inode;
@@ -773,14 +777,14 @@ static void bitmap_file_kick(struct bitmap *bitmap)
773{ 777{
774 char *path, *ptr = NULL; 778 char *path, *ptr = NULL;
775 779
776 if (bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET) == 0) { 780 if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
777 bitmap_update_sb(bitmap); 781 bitmap_update_sb(bitmap);
778 782
779 if (bitmap->file) { 783 if (bitmap->storage.file) {
780 path = kmalloc(PAGE_SIZE, GFP_KERNEL); 784 path = kmalloc(PAGE_SIZE, GFP_KERNEL);
781 if (path) 785 if (path)
782 ptr = d_path(&bitmap->file->f_path, path, 786 ptr = d_path(&bitmap->storage.file->f_path,
783 PAGE_SIZE); 787 path, PAGE_SIZE);
784 788
785 printk(KERN_ALERT 789 printk(KERN_ALERT
786 "%s: kicking failed bitmap file %s from array!\n", 790 "%s: kicking failed bitmap file %s from array!\n",
@@ -792,10 +796,6 @@ static void bitmap_file_kick(struct bitmap *bitmap)
792 "%s: disabling internal bitmap due to errors\n", 796 "%s: disabling internal bitmap due to errors\n",
793 bmname(bitmap)); 797 bmname(bitmap));
794 } 798 }
795
796 bitmap_file_put(bitmap);
797
798 return;
799} 799}
800 800
801enum bitmap_page_attr { 801enum bitmap_page_attr {
@@ -805,24 +805,30 @@ enum bitmap_page_attr {
805 BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ 805 BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */
806}; 806};
807 807
808static inline void set_page_attr(struct bitmap *bitmap, struct page *page, 808static inline void set_page_attr(struct bitmap *bitmap, int pnum,
809 enum bitmap_page_attr attr) 809 enum bitmap_page_attr attr)
810{ 810{
811 __set_bit((page->index<<2) + attr, bitmap->filemap_attr); 811 set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
812} 812}
813 813
814static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, 814static inline void clear_page_attr(struct bitmap *bitmap, int pnum,
815 enum bitmap_page_attr attr) 815 enum bitmap_page_attr attr)
816{ 816{
817 __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); 817 clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
818} 818}
819 819
820static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, 820static inline int test_page_attr(struct bitmap *bitmap, int pnum,
821 enum bitmap_page_attr attr) 821 enum bitmap_page_attr attr)
822{ 822{
823 return test_bit((page->index<<2) + attr, bitmap->filemap_attr); 823 return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
824} 824}
825 825
826static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum,
827 enum bitmap_page_attr attr)
828{
829 return test_and_clear_bit((pnum<<2) + attr,
830 bitmap->storage.filemap_attr);
831}
826/* 832/*
827 * bitmap_file_set_bit -- called before performing a write to the md device 833 * bitmap_file_set_bit -- called before performing a write to the md device
828 * to set (and eventually sync) a particular bit in the bitmap file 834 * to set (and eventually sync) a particular bit in the bitmap file
@@ -835,26 +841,46 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
835 unsigned long bit; 841 unsigned long bit;
836 struct page *page; 842 struct page *page;
837 void *kaddr; 843 void *kaddr;
838 unsigned long chunk = block >> bitmap->chunkshift; 844 unsigned long chunk = block >> bitmap->counts.chunkshift;
839 845
840 if (!bitmap->filemap) 846 page = filemap_get_page(&bitmap->storage, chunk);
841 return;
842
843 page = filemap_get_page(bitmap, chunk);
844 if (!page) 847 if (!page)
845 return; 848 return;
846 bit = file_page_offset(bitmap, chunk); 849 bit = file_page_offset(&bitmap->storage, chunk);
847 850
848 /* set the bit */ 851 /* set the bit */
849 kaddr = kmap_atomic(page); 852 kaddr = kmap_atomic(page);
850 if (bitmap->flags & BITMAP_HOSTENDIAN) 853 if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
851 set_bit(bit, kaddr); 854 set_bit(bit, kaddr);
852 else 855 else
853 __set_bit_le(bit, kaddr); 856 test_and_set_bit_le(bit, kaddr);
854 kunmap_atomic(kaddr); 857 kunmap_atomic(kaddr);
855 pr_debug("set file bit %lu page %lu\n", bit, page->index); 858 pr_debug("set file bit %lu page %lu\n", bit, page->index);
856 /* record page number so it gets flushed to disk when unplug occurs */ 859 /* record page number so it gets flushed to disk when unplug occurs */
857 set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); 860 set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY);
861}
862
863static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
864{
865 unsigned long bit;
866 struct page *page;
867 void *paddr;
868 unsigned long chunk = block >> bitmap->counts.chunkshift;
869
870 page = filemap_get_page(&bitmap->storage, chunk);
871 if (!page)
872 return;
873 bit = file_page_offset(&bitmap->storage, chunk);
874 paddr = kmap_atomic(page);
875 if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
876 clear_bit(bit, paddr);
877 else
878 test_and_clear_bit_le(bit, paddr);
879 kunmap_atomic(paddr);
880 if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) {
881 set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING);
882 bitmap->allclean = 0;
883 }
858} 884}
859 885
860/* this gets called when the md device is ready to unplug its underlying 886/* this gets called when the md device is ready to unplug its underlying
@@ -862,42 +888,37 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
862 * sync the dirty pages of the bitmap file to disk */ 888 * sync the dirty pages of the bitmap file to disk */
863void bitmap_unplug(struct bitmap *bitmap) 889void bitmap_unplug(struct bitmap *bitmap)
864{ 890{
865 unsigned long i, flags; 891 unsigned long i;
866 int dirty, need_write; 892 int dirty, need_write;
867 struct page *page;
868 int wait = 0; 893 int wait = 0;
869 894
870 if (!bitmap) 895 if (!bitmap || !bitmap->storage.filemap ||
896 test_bit(BITMAP_STALE, &bitmap->flags))
871 return; 897 return;
872 898
873 /* look at each page to see if there are any set bits that need to be 899 /* look at each page to see if there are any set bits that need to be
874 * flushed out to disk */ 900 * flushed out to disk */
875 for (i = 0; i < bitmap->file_pages; i++) { 901 for (i = 0; i < bitmap->storage.file_pages; i++) {
876 spin_lock_irqsave(&bitmap->lock, flags); 902 if (!bitmap->storage.filemap)
877 if (!bitmap->filemap) {
878 spin_unlock_irqrestore(&bitmap->lock, flags);
879 return; 903 return;
904 dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
905 need_write = test_and_clear_page_attr(bitmap, i,
906 BITMAP_PAGE_NEEDWRITE);
907 if (dirty || need_write) {
908 clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
909 write_page(bitmap, bitmap->storage.filemap[i], 0);
880 } 910 }
881 page = bitmap->filemap[i];
882 dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
883 need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
884 clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
885 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
886 if (dirty) 911 if (dirty)
887 wait = 1; 912 wait = 1;
888 spin_unlock_irqrestore(&bitmap->lock, flags);
889
890 if (dirty || need_write)
891 write_page(bitmap, page, 0);
892 } 913 }
893 if (wait) { /* if any writes were performed, we need to wait on them */ 914 if (wait) { /* if any writes were performed, we need to wait on them */
894 if (bitmap->file) 915 if (bitmap->storage.file)
895 wait_event(bitmap->write_wait, 916 wait_event(bitmap->write_wait,
896 atomic_read(&bitmap->pending_writes)==0); 917 atomic_read(&bitmap->pending_writes)==0);
897 else 918 else
898 md_super_wait(bitmap->mddev); 919 md_super_wait(bitmap->mddev);
899 } 920 }
900 if (bitmap->flags & BITMAP_WRITE_ERROR) 921 if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
901 bitmap_file_kick(bitmap); 922 bitmap_file_kick(bitmap);
902} 923}
903EXPORT_SYMBOL(bitmap_unplug); 924EXPORT_SYMBOL(bitmap_unplug);
@@ -917,98 +938,77 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
917static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) 938static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
918{ 939{
919 unsigned long i, chunks, index, oldindex, bit; 940 unsigned long i, chunks, index, oldindex, bit;
920 struct page *page = NULL, *oldpage = NULL; 941 struct page *page = NULL;
921 unsigned long num_pages, bit_cnt = 0; 942 unsigned long bit_cnt = 0;
922 struct file *file; 943 struct file *file;
923 unsigned long bytes, offset; 944 unsigned long offset;
924 int outofdate; 945 int outofdate;
925 int ret = -ENOSPC; 946 int ret = -ENOSPC;
926 void *paddr; 947 void *paddr;
948 struct bitmap_storage *store = &bitmap->storage;
927 949
928 chunks = bitmap->chunks; 950 chunks = bitmap->counts.chunks;
929 file = bitmap->file; 951 file = store->file;
930 952
931 BUG_ON(!file && !bitmap->mddev->bitmap_info.offset); 953 if (!file && !bitmap->mddev->bitmap_info.offset) {
954 /* No permanent bitmap - fill with '1s'. */
955 store->filemap = NULL;
956 store->file_pages = 0;
957 for (i = 0; i < chunks ; i++) {
958 /* if the disk bit is set, set the memory bit */
959 int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift)
960 >= start);
961 bitmap_set_memory_bits(bitmap,
962 (sector_t)i << bitmap->counts.chunkshift,
963 needed);
964 }
965 return 0;
966 }
932 967
933 outofdate = bitmap->flags & BITMAP_STALE; 968 outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
934 if (outofdate) 969 if (outofdate)
935 printk(KERN_INFO "%s: bitmap file is out of date, doing full " 970 printk(KERN_INFO "%s: bitmap file is out of date, doing full "
936 "recovery\n", bmname(bitmap)); 971 "recovery\n", bmname(bitmap));
937 972
938 bytes = DIV_ROUND_UP(bitmap->chunks, 8); 973 if (file && i_size_read(file->f_mapping->host) < store->bytes) {
939 if (!bitmap->mddev->bitmap_info.external)
940 bytes += sizeof(bitmap_super_t);
941
942 num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
943
944 if (file && i_size_read(file->f_mapping->host) < bytes) {
945 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", 974 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
946 bmname(bitmap), 975 bmname(bitmap),
947 (unsigned long) i_size_read(file->f_mapping->host), 976 (unsigned long) i_size_read(file->f_mapping->host),
948 bytes); 977 store->bytes);
949 goto err; 978 goto err;
950 } 979 }
951 980
952 ret = -ENOMEM;
953
954 bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
955 if (!bitmap->filemap)
956 goto err;
957
958 /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */
959 bitmap->filemap_attr = kzalloc(
960 roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
961 GFP_KERNEL);
962 if (!bitmap->filemap_attr)
963 goto err;
964
965 oldindex = ~0L; 981 oldindex = ~0L;
982 offset = 0;
983 if (!bitmap->mddev->bitmap_info.external)
984 offset = sizeof(bitmap_super_t);
966 985
967 for (i = 0; i < chunks; i++) { 986 for (i = 0; i < chunks; i++) {
968 int b; 987 int b;
969 index = file_page_index(bitmap, i); 988 index = file_page_index(&bitmap->storage, i);
970 bit = file_page_offset(bitmap, i); 989 bit = file_page_offset(&bitmap->storage, i);
971 if (index != oldindex) { /* this is a new page, read it in */ 990 if (index != oldindex) { /* this is a new page, read it in */
972 int count; 991 int count;
973 /* unmap the old page, we're done with it */ 992 /* unmap the old page, we're done with it */
974 if (index == num_pages-1) 993 if (index == store->file_pages-1)
975 count = bytes - index * PAGE_SIZE; 994 count = store->bytes - index * PAGE_SIZE;
976 else 995 else
977 count = PAGE_SIZE; 996 count = PAGE_SIZE;
978 if (index == 0 && bitmap->sb_page) { 997 page = store->filemap[index];
979 /* 998 if (file)
980 * if we're here then the superblock page 999 ret = read_page(file, index, bitmap,
981 * contains some bits (PAGE_SIZE != sizeof sb) 1000 count, page);
982 * we've already read it in, so just use it 1001 else
983 */ 1002 ret = read_sb_page(
984 page = bitmap->sb_page; 1003 bitmap->mddev,
985 offset = sizeof(bitmap_super_t); 1004 bitmap->mddev->bitmap_info.offset,
986 if (!file) 1005 page,
987 page = read_sb_page( 1006 index, count);
988 bitmap->mddev, 1007
989 bitmap->mddev->bitmap_info.offset, 1008 if (ret)
990 page,
991 index, count);
992 } else if (file) {
993 page = read_page(file, index, bitmap, count);
994 offset = 0;
995 } else {
996 page = read_sb_page(bitmap->mddev,
997 bitmap->mddev->bitmap_info.offset,
998 NULL,
999 index, count);
1000 offset = 0;
1001 }
1002 if (IS_ERR(page)) { /* read error */
1003 ret = PTR_ERR(page);
1004 goto err; 1009 goto err;
1005 }
1006 1010
1007 oldindex = index; 1011 oldindex = index;
1008 oldpage = page;
1009
1010 bitmap->filemap[bitmap->file_pages++] = page;
1011 bitmap->last_page_size = count;
1012 1012
1013 if (outofdate) { 1013 if (outofdate) {
1014 /* 1014 /*
@@ -1022,39 +1022,33 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
1022 write_page(bitmap, page, 1); 1022 write_page(bitmap, page, 1);
1023 1023
1024 ret = -EIO; 1024 ret = -EIO;
1025 if (bitmap->flags & BITMAP_WRITE_ERROR) 1025 if (test_bit(BITMAP_WRITE_ERROR,
1026 &bitmap->flags))
1026 goto err; 1027 goto err;
1027 } 1028 }
1028 } 1029 }
1029 paddr = kmap_atomic(page); 1030 paddr = kmap_atomic(page);
1030 if (bitmap->flags & BITMAP_HOSTENDIAN) 1031 if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
1031 b = test_bit(bit, paddr); 1032 b = test_bit(bit, paddr);
1032 else 1033 else
1033 b = test_bit_le(bit, paddr); 1034 b = test_bit_le(bit, paddr);
1034 kunmap_atomic(paddr); 1035 kunmap_atomic(paddr);
1035 if (b) { 1036 if (b) {
1036 /* if the disk bit is set, set the memory bit */ 1037 /* if the disk bit is set, set the memory bit */
1037 int needed = ((sector_t)(i+1) << bitmap->chunkshift 1038 int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift
1038 >= start); 1039 >= start);
1039 bitmap_set_memory_bits(bitmap, 1040 bitmap_set_memory_bits(bitmap,
1040 (sector_t)i << bitmap->chunkshift, 1041 (sector_t)i << bitmap->counts.chunkshift,
1041 needed); 1042 needed);
1042 bit_cnt++; 1043 bit_cnt++;
1043 } 1044 }
1044 } 1045 offset = 0;
1045
1046 /* everything went OK */
1047 ret = 0;
1048 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET);
1049
1050 if (bit_cnt) { /* Kick recovery if any bits were set */
1051 set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
1052 md_wakeup_thread(bitmap->mddev->thread);
1053 } 1046 }
1054 1047
1055 printk(KERN_INFO "%s: bitmap initialized from disk: " 1048 printk(KERN_INFO "%s: bitmap initialized from disk: "
1056 "read %lu/%lu pages, set %lu of %lu bits\n", 1049 "read %lu pages, set %lu of %lu bits\n",
1057 bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks); 1050 bmname(bitmap), store->file_pages,
1051 bit_cnt, chunks);
1058 1052
1059 return 0; 1053 return 0;
1060 1054
@@ -1071,22 +1065,38 @@ void bitmap_write_all(struct bitmap *bitmap)
1071 */ 1065 */
1072 int i; 1066 int i;
1073 1067
1074 spin_lock_irq(&bitmap->lock); 1068 if (!bitmap || !bitmap->storage.filemap)
1075 for (i = 0; i < bitmap->file_pages; i++) 1069 return;
1076 set_page_attr(bitmap, bitmap->filemap[i], 1070 if (bitmap->storage.file)
1071 /* Only one copy, so nothing needed */
1072 return;
1073
1074 for (i = 0; i < bitmap->storage.file_pages; i++)
1075 set_page_attr(bitmap, i,
1077 BITMAP_PAGE_NEEDWRITE); 1076 BITMAP_PAGE_NEEDWRITE);
1078 bitmap->allclean = 0; 1077 bitmap->allclean = 0;
1079 spin_unlock_irq(&bitmap->lock);
1080} 1078}
1081 1079
1082static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) 1080static void bitmap_count_page(struct bitmap_counts *bitmap,
1081 sector_t offset, int inc)
1083{ 1082{
1084 sector_t chunk = offset >> bitmap->chunkshift; 1083 sector_t chunk = offset >> bitmap->chunkshift;
1085 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1084 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1086 bitmap->bp[page].count += inc; 1085 bitmap->bp[page].count += inc;
1087 bitmap_checkfree(bitmap, page); 1086 bitmap_checkfree(bitmap, page);
1088} 1087}
1089static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, 1088
1089static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
1090{
1091 sector_t chunk = offset >> bitmap->chunkshift;
1092 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1093 struct bitmap_page *bp = &bitmap->bp[page];
1094
1095 if (!bp->pending)
1096 bp->pending = 1;
1097}
1098
1099static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap,
1090 sector_t offset, sector_t *blocks, 1100 sector_t offset, sector_t *blocks,
1091 int create); 1101 int create);
1092 1102
@@ -1099,10 +1109,9 @@ void bitmap_daemon_work(struct mddev *mddev)
1099{ 1109{
1100 struct bitmap *bitmap; 1110 struct bitmap *bitmap;
1101 unsigned long j; 1111 unsigned long j;
1102 unsigned long flags; 1112 unsigned long nextpage;
1103 struct page *page = NULL, *lastpage = NULL;
1104 sector_t blocks; 1113 sector_t blocks;
1105 void *paddr; 1114 struct bitmap_counts *counts;
1106 1115
1107 /* Use a mutex to guard daemon_work against 1116 /* Use a mutex to guard daemon_work against
1108 * bitmap_destroy. 1117 * bitmap_destroy.
@@ -1124,112 +1133,90 @@ void bitmap_daemon_work(struct mddev *mddev)
1124 } 1133 }
1125 bitmap->allclean = 1; 1134 bitmap->allclean = 1;
1126 1135
1127 spin_lock_irqsave(&bitmap->lock, flags); 1136 /* Any file-page which is PENDING now needs to be written.
1128 for (j = 0; j < bitmap->chunks; j++) { 1137 * So set NEEDWRITE now, then after we make any last-minute changes
1138 * we will write it.
1139 */
1140 for (j = 0; j < bitmap->storage.file_pages; j++)
1141 if (test_and_clear_page_attr(bitmap, j,
1142 BITMAP_PAGE_PENDING))
1143 set_page_attr(bitmap, j,
1144 BITMAP_PAGE_NEEDWRITE);
1145
1146 if (bitmap->need_sync &&
1147 mddev->bitmap_info.external == 0) {
1148 /* Arrange for superblock update as well as
1149 * other changes */
1150 bitmap_super_t *sb;
1151 bitmap->need_sync = 0;
1152 if (bitmap->storage.filemap) {
1153 sb = kmap_atomic(bitmap->storage.sb_page);
1154 sb->events_cleared =
1155 cpu_to_le64(bitmap->events_cleared);
1156 kunmap_atomic(sb);
1157 set_page_attr(bitmap, 0,
1158 BITMAP_PAGE_NEEDWRITE);
1159 }
1160 }
1161 /* Now look at the bitmap counters and if any are '2' or '1',
1162 * decrement and handle accordingly.
1163 */
1164 counts = &bitmap->counts;
1165 spin_lock_irq(&counts->lock);
1166 nextpage = 0;
1167 for (j = 0; j < counts->chunks; j++) {
1129 bitmap_counter_t *bmc; 1168 bitmap_counter_t *bmc;
1130 if (!bitmap->filemap) 1169 sector_t block = (sector_t)j << counts->chunkshift;
1131 /* error or shutdown */
1132 break;
1133 1170
1134 page = filemap_get_page(bitmap, j); 1171 if (j == nextpage) {
1135 1172 nextpage += PAGE_COUNTER_RATIO;
1136 if (page != lastpage) { 1173 if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) {
1137 /* skip this page unless it's marked as needing cleaning */ 1174 j |= PAGE_COUNTER_MASK;
1138 if (!test_page_attr(bitmap, page, BITMAP_PAGE_PENDING)) {
1139 int need_write = test_page_attr(bitmap, page,
1140 BITMAP_PAGE_NEEDWRITE);
1141 if (need_write)
1142 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
1143
1144 spin_unlock_irqrestore(&bitmap->lock, flags);
1145 if (need_write)
1146 write_page(bitmap, page, 0);
1147 spin_lock_irqsave(&bitmap->lock, flags);
1148 j |= (PAGE_BITS - 1);
1149 continue; 1175 continue;
1150 } 1176 }
1151 1177 counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0;
1152 /* grab the new page, sync and release the old */
1153 if (lastpage != NULL) {
1154 if (test_page_attr(bitmap, lastpage,
1155 BITMAP_PAGE_NEEDWRITE)) {
1156 clear_page_attr(bitmap, lastpage,
1157 BITMAP_PAGE_NEEDWRITE);
1158 spin_unlock_irqrestore(&bitmap->lock, flags);
1159 write_page(bitmap, lastpage, 0);
1160 } else {
1161 set_page_attr(bitmap, lastpage,
1162 BITMAP_PAGE_NEEDWRITE);
1163 bitmap->allclean = 0;
1164 spin_unlock_irqrestore(&bitmap->lock, flags);
1165 }
1166 } else
1167 spin_unlock_irqrestore(&bitmap->lock, flags);
1168 lastpage = page;
1169
1170 /* We are possibly going to clear some bits, so make
1171 * sure that events_cleared is up-to-date.
1172 */
1173 if (bitmap->need_sync &&
1174 mddev->bitmap_info.external == 0) {
1175 bitmap_super_t *sb;
1176 bitmap->need_sync = 0;
1177 sb = kmap_atomic(bitmap->sb_page);
1178 sb->events_cleared =
1179 cpu_to_le64(bitmap->events_cleared);
1180 kunmap_atomic(sb);
1181 write_page(bitmap, bitmap->sb_page, 1);
1182 }
1183 spin_lock_irqsave(&bitmap->lock, flags);
1184 if (!bitmap->need_sync)
1185 clear_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
1186 else
1187 bitmap->allclean = 0;
1188 } 1178 }
1189 bmc = bitmap_get_counter(bitmap, 1179 bmc = bitmap_get_counter(counts,
1190 (sector_t)j << bitmap->chunkshift, 1180 block,
1191 &blocks, 0); 1181 &blocks, 0);
1192 if (!bmc) 1182
1183 if (!bmc) {
1193 j |= PAGE_COUNTER_MASK; 1184 j |= PAGE_COUNTER_MASK;
1194 else if (*bmc) { 1185 continue;
1195 if (*bmc == 1 && !bitmap->need_sync) {
1196 /* we can clear the bit */
1197 *bmc = 0;
1198 bitmap_count_page(bitmap,
1199 (sector_t)j << bitmap->chunkshift,
1200 -1);
1201
1202 /* clear the bit */
1203 paddr = kmap_atomic(page);
1204 if (bitmap->flags & BITMAP_HOSTENDIAN)
1205 clear_bit(file_page_offset(bitmap, j),
1206 paddr);
1207 else
1208 __clear_bit_le(
1209 file_page_offset(bitmap,
1210 j),
1211 paddr);
1212 kunmap_atomic(paddr);
1213 } else if (*bmc <= 2) {
1214 *bmc = 1; /* maybe clear the bit next time */
1215 set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
1216 bitmap->allclean = 0;
1217 }
1218 } 1186 }
1219 } 1187 if (*bmc == 1 && !bitmap->need_sync) {
1220 spin_unlock_irqrestore(&bitmap->lock, flags); 1188 /* We can clear the bit */
1221 1189 *bmc = 0;
1222 /* now sync the final page */ 1190 bitmap_count_page(counts, block, -1);
1223 if (lastpage != NULL) { 1191 bitmap_file_clear_bit(bitmap, block);
1224 spin_lock_irqsave(&bitmap->lock, flags); 1192 } else if (*bmc && *bmc <= 2) {
1225 if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { 1193 *bmc = 1;
1226 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1194 bitmap_set_pending(counts, block);
1227 spin_unlock_irqrestore(&bitmap->lock, flags);
1228 write_page(bitmap, lastpage, 0);
1229 } else {
1230 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1231 bitmap->allclean = 0; 1195 bitmap->allclean = 0;
1232 spin_unlock_irqrestore(&bitmap->lock, flags); 1196 }
1197 }
1198 spin_unlock_irq(&counts->lock);
1199
1200 /* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
1201 * DIRTY pages need to be written by bitmap_unplug so it can wait
1202 * for them.
1203 * If we find any DIRTY page we stop there and let bitmap_unplug
1204 * handle all the rest. This is important in the case where
1205 * the first DIRTY page holds the superblock and it has been updated.
1206 * We mustn't write any other blocks before the superblock.
1207 */
1208 for (j = 0;
1209 j < bitmap->storage.file_pages
1210 && !test_bit(BITMAP_STALE, &bitmap->flags);
1211 j++) {
1212
1213 if (test_page_attr(bitmap, j,
1214 BITMAP_PAGE_DIRTY))
1215 /* bitmap_unplug will handle the rest */
1216 break;
1217 if (test_and_clear_page_attr(bitmap, j,
1218 BITMAP_PAGE_NEEDWRITE)) {
1219 write_page(bitmap, bitmap->storage.filemap[j], 0);
1233 } 1220 }
1234 } 1221 }
1235 1222
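
The rewritten daemon loop above splits the old single pass into two: first every file page marked PENDING is promoted to NEEDWRITE, then the in-memory counters are walked and decremented (2 -> 1, then 1 -> 0, at which point the on-disk bit is cleared), and finally all NEEDWRITE pages that are not DIRTY are written out, stopping at the first DIRTY page so that bitmap_unplug() can write the superblock before anything that depends on it. A minimal user-space model of the counter decay follows; daemon_step(), page_pending and NCHUNKS are invented names for this sketch, not kernel symbols.

/*
 * Minimal user-space model of the counter decay; not the kernel code.
 * daemon_step(), page_pending and NCHUNKS are invented for this sketch.
 */
#include <stdio.h>
#include <stdint.h>

#define NCHUNKS 8

static uint16_t counter[NCHUNKS];	/* stands in for bitmap_counter_t */
static int page_pending;		/* stands in for bitmap_page.pending */

static void daemon_step(void)
{
	int i, still_pending = 0;

	if (!page_pending)
		return;			/* clean page: skipped entirely */
	page_pending = 0;
	for (i = 0; i < NCHUNKS; i++) {
		if (counter[i] == 1) {
			counter[i] = 0;	/* chunk idle: clear the on-disk bit */
			printf("chunk %d: clear file bit\n", i);
		} else if (counter[i] == 2) {
			counter[i] = 1;	/* maybe clear it next time */
			still_pending = 1;
		}
	}
	page_pending = still_pending;
}

int main(void)
{
	counter[3] = 2;			/* a write to chunk 3 just completed */
	page_pending = 1;		/* bitmap_set_pending() equivalent */
	daemon_step();			/* 2 -> 1, page stays pending */
	daemon_step();			/* 1 -> 0, file bit cleared */
	daemon_step();			/* nothing pending: no work */
	return 0;
}
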
@@ -1240,7 +1227,7 @@ void bitmap_daemon_work(struct mddev *mddev)
1240 mutex_unlock(&mddev->bitmap_info.mutex); 1227 mutex_unlock(&mddev->bitmap_info.mutex);
1241} 1228}
1242 1229
1243static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, 1230static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap,
1244 sector_t offset, sector_t *blocks, 1231 sector_t offset, sector_t *blocks,
1245 int create) 1232 int create)
1246__releases(bitmap->lock) 1233__releases(bitmap->lock)
@@ -1302,10 +1289,10 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1302 sector_t blocks; 1289 sector_t blocks;
1303 bitmap_counter_t *bmc; 1290 bitmap_counter_t *bmc;
1304 1291
1305 spin_lock_irq(&bitmap->lock); 1292 spin_lock_irq(&bitmap->counts.lock);
1306 bmc = bitmap_get_counter(bitmap, offset, &blocks, 1); 1293 bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1);
1307 if (!bmc) { 1294 if (!bmc) {
1308 spin_unlock_irq(&bitmap->lock); 1295 spin_unlock_irq(&bitmap->counts.lock);
1309 return 0; 1296 return 0;
1310 } 1297 }
1311 1298
@@ -1317,7 +1304,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1317 */ 1304 */
1318 prepare_to_wait(&bitmap->overflow_wait, &__wait, 1305 prepare_to_wait(&bitmap->overflow_wait, &__wait,
1319 TASK_UNINTERRUPTIBLE); 1306 TASK_UNINTERRUPTIBLE);
1320 spin_unlock_irq(&bitmap->lock); 1307 spin_unlock_irq(&bitmap->counts.lock);
1321 io_schedule(); 1308 io_schedule();
1322 finish_wait(&bitmap->overflow_wait, &__wait); 1309 finish_wait(&bitmap->overflow_wait, &__wait);
1323 continue; 1310 continue;
@@ -1326,7 +1313,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1326 switch (*bmc) { 1313 switch (*bmc) {
1327 case 0: 1314 case 0:
1328 bitmap_file_set_bit(bitmap, offset); 1315 bitmap_file_set_bit(bitmap, offset);
1329 bitmap_count_page(bitmap, offset, 1); 1316 bitmap_count_page(&bitmap->counts, offset, 1);
1330 /* fall through */ 1317 /* fall through */
1331 case 1: 1318 case 1:
1332 *bmc = 2; 1319 *bmc = 2;
@@ -1334,7 +1321,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1334 1321
1335 (*bmc)++; 1322 (*bmc)++;
1336 1323
1337 spin_unlock_irq(&bitmap->lock); 1324 spin_unlock_irq(&bitmap->counts.lock);
1338 1325
1339 offset += blocks; 1326 offset += blocks;
1340 if (sectors > blocks) 1327 if (sectors > blocks)
@@ -1364,10 +1351,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1364 unsigned long flags; 1351 unsigned long flags;
1365 bitmap_counter_t *bmc; 1352 bitmap_counter_t *bmc;
1366 1353
1367 spin_lock_irqsave(&bitmap->lock, flags); 1354 spin_lock_irqsave(&bitmap->counts.lock, flags);
1368 bmc = bitmap_get_counter(bitmap, offset, &blocks, 0); 1355 bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 0);
1369 if (!bmc) { 1356 if (!bmc) {
1370 spin_unlock_irqrestore(&bitmap->lock, flags); 1357 spin_unlock_irqrestore(&bitmap->counts.lock, flags);
1371 return; 1358 return;
1372 } 1359 }
1373 1360
@@ -1386,14 +1373,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1386 1373
1387 (*bmc)--; 1374 (*bmc)--;
1388 if (*bmc <= 2) { 1375 if (*bmc <= 2) {
1389 set_page_attr(bitmap, 1376 bitmap_set_pending(&bitmap->counts, offset);
1390 filemap_get_page(
1391 bitmap,
1392 offset >> bitmap->chunkshift),
1393 BITMAP_PAGE_PENDING);
1394 bitmap->allclean = 0; 1377 bitmap->allclean = 0;
1395 } 1378 }
1396 spin_unlock_irqrestore(&bitmap->lock, flags); 1379 spin_unlock_irqrestore(&bitmap->counts.lock, flags);
1397 offset += blocks; 1380 offset += blocks;
1398 if (sectors > blocks) 1381 if (sectors > blocks)
1399 sectors -= blocks; 1382 sectors -= blocks;
@@ -1412,8 +1395,8 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
1412 *blocks = 1024; 1395 *blocks = 1024;
1413 return 1; /* always resync if no bitmap */ 1396 return 1; /* always resync if no bitmap */
1414 } 1397 }
1415 spin_lock_irq(&bitmap->lock); 1398 spin_lock_irq(&bitmap->counts.lock);
1416 bmc = bitmap_get_counter(bitmap, offset, blocks, 0); 1399 bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
1417 rv = 0; 1400 rv = 0;
1418 if (bmc) { 1401 if (bmc) {
1419 /* locked */ 1402 /* locked */
@@ -1427,7 +1410,7 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
1427 } 1410 }
1428 } 1411 }
1429 } 1412 }
1430 spin_unlock_irq(&bitmap->lock); 1413 spin_unlock_irq(&bitmap->counts.lock);
1431 return rv; 1414 return rv;
1432} 1415}
1433 1416
@@ -1464,8 +1447,8 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
1464 *blocks = 1024; 1447 *blocks = 1024;
1465 return; 1448 return;
1466 } 1449 }
1467 spin_lock_irqsave(&bitmap->lock, flags); 1450 spin_lock_irqsave(&bitmap->counts.lock, flags);
1468 bmc = bitmap_get_counter(bitmap, offset, blocks, 0); 1451 bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
1469 if (bmc == NULL) 1452 if (bmc == NULL)
1470 goto unlock; 1453 goto unlock;
1471 /* locked */ 1454 /* locked */
@@ -1476,15 +1459,13 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
1476 *bmc |= NEEDED_MASK; 1459 *bmc |= NEEDED_MASK;
1477 else { 1460 else {
1478 if (*bmc <= 2) { 1461 if (*bmc <= 2) {
1479 set_page_attr(bitmap, 1462 bitmap_set_pending(&bitmap->counts, offset);
1480 filemap_get_page(bitmap, offset >> bitmap->chunkshift),
1481 BITMAP_PAGE_PENDING);
1482 bitmap->allclean = 0; 1463 bitmap->allclean = 0;
1483 } 1464 }
1484 } 1465 }
1485 } 1466 }
1486 unlock: 1467 unlock:
1487 spin_unlock_irqrestore(&bitmap->lock, flags); 1468 spin_unlock_irqrestore(&bitmap->counts.lock, flags);
1488} 1469}
1489EXPORT_SYMBOL(bitmap_end_sync); 1470EXPORT_SYMBOL(bitmap_end_sync);
1490 1471
@@ -1524,7 +1505,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1524 1505
1525 bitmap->mddev->curr_resync_completed = sector; 1506 bitmap->mddev->curr_resync_completed = sector;
1526 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); 1507 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
1527 sector &= ~((1ULL << bitmap->chunkshift) - 1); 1508 sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
1528 s = 0; 1509 s = 0;
1529 while (s < sector && s < bitmap->mddev->resync_max_sectors) { 1510 while (s < sector && s < bitmap->mddev->resync_max_sectors) {
1530 bitmap_end_sync(bitmap, s, &blocks, 0); 1511 bitmap_end_sync(bitmap, s, &blocks, 0);
@@ -1538,27 +1519,25 @@ EXPORT_SYMBOL(bitmap_cond_end_sync);
1538static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) 1519static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
1539{ 1520{
1540 /* For each chunk covered by any of these sectors, set the 1521 /* For each chunk covered by any of these sectors, set the
1541 * counter to 1 and set resync_needed. They should all 1522 * counter to 2 and possibly set resync_needed. They should all
1542 * be 0 at this point 1523 * be 0 at this point
1543 */ 1524 */
1544 1525
1545 sector_t secs; 1526 sector_t secs;
1546 bitmap_counter_t *bmc; 1527 bitmap_counter_t *bmc;
1547 spin_lock_irq(&bitmap->lock); 1528 spin_lock_irq(&bitmap->counts.lock);
1548 bmc = bitmap_get_counter(bitmap, offset, &secs, 1); 1529 bmc = bitmap_get_counter(&bitmap->counts, offset, &secs, 1);
1549 if (!bmc) { 1530 if (!bmc) {
1550 spin_unlock_irq(&bitmap->lock); 1531 spin_unlock_irq(&bitmap->counts.lock);
1551 return; 1532 return;
1552 } 1533 }
1553 if (!*bmc) { 1534 if (!*bmc) {
1554 struct page *page;
1555 *bmc = 2 | (needed ? NEEDED_MASK : 0); 1535 *bmc = 2 | (needed ? NEEDED_MASK : 0);
1556 bitmap_count_page(bitmap, offset, 1); 1536 bitmap_count_page(&bitmap->counts, offset, 1);
1557 page = filemap_get_page(bitmap, offset >> bitmap->chunkshift); 1537 bitmap_set_pending(&bitmap->counts, offset);
1558 set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
1559 bitmap->allclean = 0; 1538 bitmap->allclean = 0;
1560 } 1539 }
1561 spin_unlock_irq(&bitmap->lock); 1540 spin_unlock_irq(&bitmap->counts.lock);
1562} 1541}
1563 1542
1564/* dirty the memory and file bits for bitmap chunks "s" to "e" */ 1543/* dirty the memory and file bits for bitmap chunks "s" to "e" */
@@ -1567,11 +1546,9 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
1567 unsigned long chunk; 1546 unsigned long chunk;
1568 1547
1569 for (chunk = s; chunk <= e; chunk++) { 1548 for (chunk = s; chunk <= e; chunk++) {
1570 sector_t sec = (sector_t)chunk << bitmap->chunkshift; 1549 sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift;
1571 bitmap_set_memory_bits(bitmap, sec, 1); 1550 bitmap_set_memory_bits(bitmap, sec, 1);
1572 spin_lock_irq(&bitmap->lock);
1573 bitmap_file_set_bit(bitmap, sec); 1551 bitmap_file_set_bit(bitmap, sec);
1574 spin_unlock_irq(&bitmap->lock);
1575 if (sec < bitmap->mddev->recovery_cp) 1552 if (sec < bitmap->mddev->recovery_cp)
1576 /* We are asserting that the array is dirty, 1553 /* We are asserting that the array is dirty,
1577 * so move the recovery_cp address back so 1554 * so move the recovery_cp address back so
@@ -1616,11 +1593,15 @@ static void bitmap_free(struct bitmap *bitmap)
1616 if (!bitmap) /* there was no bitmap */ 1593 if (!bitmap) /* there was no bitmap */
1617 return; 1594 return;
1618 1595
1619 /* release the bitmap file and kill the daemon */ 1596 /* Shouldn't be needed - but just in case.... */
1620 bitmap_file_put(bitmap); 1597 wait_event(bitmap->write_wait,
1598 atomic_read(&bitmap->pending_writes) == 0);
1599
1600 /* release the bitmap file */
1601 bitmap_file_unmap(&bitmap->storage);
1621 1602
1622 bp = bitmap->bp; 1603 bp = bitmap->counts.bp;
1623 pages = bitmap->pages; 1604 pages = bitmap->counts.pages;
1624 1605
1625 /* free all allocated memory */ 1606 /* free all allocated memory */
1626 1607
@@ -1659,25 +1640,19 @@ int bitmap_create(struct mddev *mddev)
1659{ 1640{
1660 struct bitmap *bitmap; 1641 struct bitmap *bitmap;
1661 sector_t blocks = mddev->resync_max_sectors; 1642 sector_t blocks = mddev->resync_max_sectors;
1662 unsigned long chunks;
1663 unsigned long pages;
1664 struct file *file = mddev->bitmap_info.file; 1643 struct file *file = mddev->bitmap_info.file;
1665 int err; 1644 int err;
1666 struct sysfs_dirent *bm = NULL; 1645 struct sysfs_dirent *bm = NULL;
1667 1646
1668 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); 1647 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
1669 1648
1670 if (!file
1671 && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
1672 return 0;
1673
1674 BUG_ON(file && mddev->bitmap_info.offset); 1649 BUG_ON(file && mddev->bitmap_info.offset);
1675 1650
1676 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); 1651 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
1677 if (!bitmap) 1652 if (!bitmap)
1678 return -ENOMEM; 1653 return -ENOMEM;
1679 1654
1680 spin_lock_init(&bitmap->lock); 1655 spin_lock_init(&bitmap->counts.lock);
1681 atomic_set(&bitmap->pending_writes, 0); 1656 atomic_set(&bitmap->pending_writes, 0);
1682 init_waitqueue_head(&bitmap->write_wait); 1657 init_waitqueue_head(&bitmap->write_wait);
1683 init_waitqueue_head(&bitmap->overflow_wait); 1658 init_waitqueue_head(&bitmap->overflow_wait);
@@ -1693,7 +1668,7 @@ int bitmap_create(struct mddev *mddev)
1693 } else 1668 } else
1694 bitmap->sysfs_can_clear = NULL; 1669 bitmap->sysfs_can_clear = NULL;
1695 1670
1696 bitmap->file = file; 1671 bitmap->storage.file = file;
1697 if (file) { 1672 if (file) {
1698 get_file(file); 1673 get_file(file);
1699 /* As future accesses to this file will use bmap, 1674 /* As future accesses to this file will use bmap,
@@ -1724,32 +1699,15 @@ int bitmap_create(struct mddev *mddev)
1724 goto error; 1699 goto error;
1725 1700
1726 bitmap->daemon_lastrun = jiffies; 1701 bitmap->daemon_lastrun = jiffies;
1727 bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize) 1702 err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1);
1728 - BITMAP_BLOCK_SHIFT); 1703 if (err)
1729
1730 chunks = (blocks + (1 << bitmap->chunkshift) - 1) >>
1731 bitmap->chunkshift;
1732 pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
1733
1734 BUG_ON(!pages);
1735
1736 bitmap->chunks = chunks;
1737 bitmap->pages = pages;
1738 bitmap->missing_pages = pages;
1739
1740 bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
1741
1742 err = -ENOMEM;
1743 if (!bitmap->bp)
1744 goto error; 1704 goto error;
1745 1705
1746 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", 1706 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
1747 pages, bmname(bitmap)); 1707 bitmap->counts.pages, bmname(bitmap));
1748 1708
1749 mddev->bitmap = bitmap; 1709 mddev->bitmap = bitmap;
1750 1710 return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
1751
1752 return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0;
1753 1711
1754 error: 1712 error:
1755 bitmap_free(bitmap); 1713 bitmap_free(bitmap);
@@ -1790,13 +1748,17 @@ int bitmap_load(struct mddev *mddev)
1790 1748
1791 if (err) 1749 if (err)
1792 goto out; 1750 goto out;
1751 clear_bit(BITMAP_STALE, &bitmap->flags);
1752
1753 /* Kick recovery in case any bits were set */
1754 set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
1793 1755
1794 mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; 1756 mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
1795 md_wakeup_thread(mddev->thread); 1757 md_wakeup_thread(mddev->thread);
1796 1758
1797 bitmap_update_sb(bitmap); 1759 bitmap_update_sb(bitmap);
1798 1760
1799 if (bitmap->flags & BITMAP_WRITE_ERROR) 1761 if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
1800 err = -EIO; 1762 err = -EIO;
1801out: 1763out:
1802 return err; 1764 return err;
@@ -1806,30 +1768,194 @@ EXPORT_SYMBOL_GPL(bitmap_load);
1806void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) 1768void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
1807{ 1769{
1808 unsigned long chunk_kb; 1770 unsigned long chunk_kb;
1809 unsigned long flags; 1771 struct bitmap_counts *counts;
1810 1772
1811 if (!bitmap) 1773 if (!bitmap)
1812 return; 1774 return;
1813 1775
1814 spin_lock_irqsave(&bitmap->lock, flags); 1776 counts = &bitmap->counts;
1777
1815 chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; 1778 chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
1816 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 1779 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
1817 "%lu%s chunk", 1780 "%lu%s chunk",
1818 bitmap->pages - bitmap->missing_pages, 1781 counts->pages - counts->missing_pages,
1819 bitmap->pages, 1782 counts->pages,
1820 (bitmap->pages - bitmap->missing_pages) 1783 (counts->pages - counts->missing_pages)
1821 << (PAGE_SHIFT - 10), 1784 << (PAGE_SHIFT - 10),
1822 chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, 1785 chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
1823 chunk_kb ? "KB" : "B"); 1786 chunk_kb ? "KB" : "B");
1824 if (bitmap->file) { 1787 if (bitmap->storage.file) {
1825 seq_printf(seq, ", file: "); 1788 seq_printf(seq, ", file: ");
1826 seq_path(seq, &bitmap->file->f_path, " \t\n"); 1789 seq_path(seq, &bitmap->storage.file->f_path, " \t\n");
1827 } 1790 }
1828 1791
1829 seq_printf(seq, "\n"); 1792 seq_printf(seq, "\n");
1830 spin_unlock_irqrestore(&bitmap->lock, flags);
1831} 1793}
1832 1794
1795int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
1796 int chunksize, int init)
1797{
1798 /* If chunk_size is 0, choose an appropriate chunk size.
1799 * Then possibly allocate new storage space.
1800 * Then quiesce, copy bits, replace bitmap, and re-start
1801 *
1802 * This function is called both to set up the initial bitmap
1803 * and to resize the bitmap while the array is active.
1804 * If this happens as a result of the array being resized,
1805 * chunksize will be zero, and we need to choose a suitable
1806 * chunksize, otherwise we use what we are given.
1807 */
1808 struct bitmap_storage store;
1809 struct bitmap_counts old_counts;
1810 unsigned long chunks;
1811 sector_t block;
1812 sector_t old_blocks, new_blocks;
1813 int chunkshift;
1814 int ret = 0;
1815 long pages;
1816 struct bitmap_page *new_bp;
1817
1818 if (chunksize == 0) {
1819 /* If there is enough space, leave the chunk size unchanged,
1820 * else increase by factor of two until there is enough space.
1821 */
1822 long bytes;
1823 long space = bitmap->mddev->bitmap_info.space;
1824
1825 if (space == 0) {
1826 /* We don't know how much space there is, so limit
1827 * to current size - in sectors.
1828 */
1829 bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8);
1830 if (!bitmap->mddev->bitmap_info.external)
1831 bytes += sizeof(bitmap_super_t);
1832 space = DIV_ROUND_UP(bytes, 512);
1833 bitmap->mddev->bitmap_info.space = space;
1834 }
1835 chunkshift = bitmap->counts.chunkshift;
1836 chunkshift--;
1837 do {
1838 /* 'chunkshift' is shift from block size to chunk size */
1839 chunkshift++;
1840 chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
1841 bytes = DIV_ROUND_UP(chunks, 8);
1842 if (!bitmap->mddev->bitmap_info.external)
1843 bytes += sizeof(bitmap_super_t);
1844 } while (bytes > (space << 9));
1845 } else
1846 chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT;
1847
1848 chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
1849 memset(&store, 0, sizeof(store));
1850 if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
1851 ret = bitmap_storage_alloc(&store, chunks,
1852 !bitmap->mddev->bitmap_info.external);
1853 if (ret)
1854 goto err;
1855
1856 pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
1857
1858 new_bp = kzalloc(pages * sizeof(*new_bp), GFP_KERNEL);
1859 ret = -ENOMEM;
1860 if (!new_bp) {
1861 bitmap_file_unmap(&store);
1862 goto err;
1863 }
1864
1865 if (!init)
1866 bitmap->mddev->pers->quiesce(bitmap->mddev, 1);
1867
1868 store.file = bitmap->storage.file;
1869 bitmap->storage.file = NULL;
1870
1871 if (store.sb_page && bitmap->storage.sb_page)
1872 memcpy(page_address(store.sb_page),
1873 page_address(bitmap->storage.sb_page),
1874 sizeof(bitmap_super_t));
1875 bitmap_file_unmap(&bitmap->storage);
1876 bitmap->storage = store;
1877
1878 old_counts = bitmap->counts;
1879 bitmap->counts.bp = new_bp;
1880 bitmap->counts.pages = pages;
1881 bitmap->counts.missing_pages = pages;
1882 bitmap->counts.chunkshift = chunkshift;
1883 bitmap->counts.chunks = chunks;
1884 bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift +
1885 BITMAP_BLOCK_SHIFT);
1886
1887 blocks = min(old_counts.chunks << old_counts.chunkshift,
1888 chunks << chunkshift);
1889
1890 spin_lock_irq(&bitmap->counts.lock);
1891 for (block = 0; block < blocks; ) {
1892 bitmap_counter_t *bmc_old, *bmc_new;
1893 int set;
1894
1895 bmc_old = bitmap_get_counter(&old_counts, block,
1896 &old_blocks, 0);
1897 set = bmc_old && NEEDED(*bmc_old);
1898
1899 if (set) {
1900 bmc_new = bitmap_get_counter(&bitmap->counts, block,
1901 &new_blocks, 1);
1902 if (*bmc_new == 0) {
1903 /* need to set on-disk bits too. */
1904 sector_t end = block + new_blocks;
1905 sector_t start = block >> chunkshift;
1906 start <<= chunkshift;
1907 while (start < end) {
1908 bitmap_file_set_bit(bitmap, block);
1909 start += 1 << chunkshift;
1910 }
1911 *bmc_new = 2;
1912 bitmap_count_page(&bitmap->counts,
1913 block, 1);
1914 bitmap_set_pending(&bitmap->counts,
1915 block);
1916 }
1917 *bmc_new |= NEEDED_MASK;
1918 if (new_blocks < old_blocks)
1919 old_blocks = new_blocks;
1920 }
1921 block += old_blocks;
1922 }
1923
1924 if (!init) {
1925 int i;
1926 while (block < (chunks << chunkshift)) {
1927 bitmap_counter_t *bmc;
1928 bmc = bitmap_get_counter(&bitmap->counts, block,
1929 &new_blocks, 1);
1930 if (bmc) {
1931 /* new space. It needs to be resynced, so
1932 * we set NEEDED_MASK.
1933 */
1934 if (*bmc == 0) {
1935 *bmc = NEEDED_MASK | 2;
1936 bitmap_count_page(&bitmap->counts,
1937 block, 1);
1938 bitmap_set_pending(&bitmap->counts,
1939 block);
1940 }
1941 }
1942 block += new_blocks;
1943 }
1944 for (i = 0; i < bitmap->storage.file_pages; i++)
1945 set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
1946 }
1947 spin_unlock_irq(&bitmap->counts.lock);
1948
1949 if (!init) {
1950 bitmap_unplug(bitmap);
1951 bitmap->mddev->pers->quiesce(bitmap->mddev, 0);
1952 }
1953 ret = 0;
1954err:
1955 return ret;
1956}
1957EXPORT_SYMBOL_GPL(bitmap_resize);
1958
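
When bitmap_resize() is asked to pick a chunk size (chunksize == 0), it keeps doubling the chunk size, that is incrementing chunkshift, until one bit per chunk plus the 256-byte superblock fits in the sectors recorded in bitmap_info.space. The following is a user-space sketch of that search, assuming an internal bitmap and made-up sizes (an 8 TiB array and 8 sectors of bitmap space); DIV_ROUND_UP here stands in for the kernel's DIV_ROUND_UP and DIV_ROUND_UP_SECTOR_T helpers.

/* User-space sketch only; sizes are invented and the helpers are local. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long long blocks = 1ULL << 34;	/* array size in 512-byte sectors (8 TiB) */
	unsigned long long space = 8;		/* bitmap_info.space: sectors available (4 KiB) */
	int chunkshift = 13;			/* current shift: 4 MiB chunks */
	unsigned long long chunks, bytes;

	chunkshift--;				/* the loop re-tests the current value first */
	do {
		chunkshift++;
		chunks = DIV_ROUND_UP(blocks, 1ULL << chunkshift);
		bytes = DIV_ROUND_UP(chunks, 8) + 256;	/* one bit per chunk + internal sb */
	} while (bytes > space * 512);

	printf("chunkshift=%d: %llu KiB chunks, %llu chunks, %llu bytes on disk\n",
	       chunkshift, (1ULL << (chunkshift + 9)) >> 10, chunks, bytes);
	return 0;
}
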
1833static ssize_t 1959static ssize_t
1834location_show(struct mddev *mddev, char *page) 1960location_show(struct mddev *mddev, char *page)
1835{ 1961{
@@ -1923,6 +2049,43 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
1923static struct md_sysfs_entry bitmap_location = 2049static struct md_sysfs_entry bitmap_location =
1924__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); 2050__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store);
1925 2051
2052/* 'bitmap/space' is the space available at 'location' for the
2053 * bitmap. This allows the kernel to know when it is safe to
2054 * resize the bitmap to match a resized array.
2055 */
2056static ssize_t
2057space_show(struct mddev *mddev, char *page)
2058{
2059 return sprintf(page, "%lu\n", mddev->bitmap_info.space);
2060}
2061
2062static ssize_t
2063space_store(struct mddev *mddev, const char *buf, size_t len)
2064{
2065 unsigned long sectors;
2066 int rv;
2067
2068 rv = kstrtoul(buf, 10, &sectors);
2069 if (rv)
2070 return rv;
2071
2072 if (sectors == 0)
2073 return -EINVAL;
2074
2075 if (mddev->bitmap &&
2076 sectors < (mddev->bitmap->storage.bytes + 511) >> 9)
2077 return -EFBIG; /* Bitmap is too big for this small space */
2078
2079 /* could make sure it isn't too big, but that isn't really
2080 * needed - user-space should be careful.
2081 */
2082 mddev->bitmap_info.space = sectors;
2083 return len;
2084}
2085
2086static struct md_sysfs_entry bitmap_space =
2087__ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store);
2088
1926static ssize_t 2089static ssize_t
1927timeout_show(struct mddev *mddev, char *page) 2090timeout_show(struct mddev *mddev, char *page)
1928{ 2091{
@@ -2098,6 +2261,7 @@ __ATTR(max_backlog_used, S_IRUGO | S_IWUSR,
2098 2261
2099static struct attribute *md_bitmap_attrs[] = { 2262static struct attribute *md_bitmap_attrs[] = {
2100 &bitmap_location.attr, 2263 &bitmap_location.attr,
2264 &bitmap_space.attr,
2101 &bitmap_timeout.attr, 2265 &bitmap_timeout.attr,
2102 &bitmap_backlog.attr, 2266 &bitmap_backlog.attr,
2103 &bitmap_chunksize.attr, 2267 &bitmap_chunksize.attr,
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index b44b0aba2d47..df4aeb6ac6f0 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -111,9 +111,9 @@ typedef __u16 bitmap_counter_t;
111 111
112/* use these for bitmap->flags and bitmap->sb->state bit-fields */ 112/* use these for bitmap->flags and bitmap->sb->state bit-fields */
113enum bitmap_state { 113enum bitmap_state {
114 BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ 114 BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */
115 BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ 115 BITMAP_WRITE_ERROR = 2, /* A write error has occurred */
116 BITMAP_HOSTENDIAN = 0x8000, 116 BITMAP_HOSTENDIAN =15,
117}; 117};
118 118
119/* the superblock at the front of the bitmap file -- little endian */ 119/* the superblock at the front of the bitmap file -- little endian */
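
The bitmap_state values change from bit masks (0x002, 0x004, 0x8000) to bit numbers (1, 2, 15) because bitmap->flags is now manipulated with set_bit(), test_bit() and test_and_clear_bit(), which take a bit index rather than a mask. A user-space illustration of the new usage; this test_bit() is a local stand-in, not the kernel helper.

/* User-space illustration; this test_bit() is a local stand-in. */
#include <stdio.h>

enum bitmap_state { BITMAP_STALE = 1, BITMAP_WRITE_ERROR = 2, BITMAP_HOSTENDIAN = 15 };

static int test_bit(int nr, const unsigned long *addr)
{
	return (*addr >> nr) & 1;
}

int main(void)
{
	unsigned long flags = 0;

	flags |= 1UL << BITMAP_WRITE_ERROR;			/* set_bit(BITMAP_WRITE_ERROR, &flags) */
	printf("%d\n", test_bit(BITMAP_WRITE_ERROR, &flags));	/* 1 */
	printf("%d\n", test_bit(BITMAP_STALE, &flags));		/* 0 */
	return 0;
}
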
@@ -128,8 +128,10 @@ typedef struct bitmap_super_s {
128 __le32 chunksize; /* 52 the bitmap chunk size in bytes */ 128 __le32 chunksize; /* 52 the bitmap chunk size in bytes */
129 __le32 daemon_sleep; /* 56 seconds between disk flushes */ 129 __le32 daemon_sleep; /* 56 seconds between disk flushes */
130 __le32 write_behind; /* 60 number of outstanding write-behind writes */ 130 __le32 write_behind; /* 60 number of outstanding write-behind writes */
131 __le32 sectors_reserved; /* 64 number of 512-byte sectors that are
132 * reserved for the bitmap. */
131 133
132 __u8 pad[256 - 64]; /* set to zero */ 134 __u8 pad[256 - 68]; /* set to zero */
133} bitmap_super_t; 135} bitmap_super_t;
134 136
135/* notes: 137/* notes:
@@ -160,35 +162,48 @@ struct bitmap_page {
160 */ 162 */
161 unsigned int hijacked:1; 163 unsigned int hijacked:1;
162 /* 164 /*
165 * If any counter in this page is '1' or '2' - and so could be
166 * cleared then that page is marked as 'pending'
167 */
168 unsigned int pending:1;
169 /*
163 * count of dirty bits on the page 170 * count of dirty bits on the page
164 */ 171 */
165 unsigned int count:31; 172 unsigned int count:30;
166}; 173};
167 174
168/* the main bitmap structure - one per mddev */ 175/* the main bitmap structure - one per mddev */
169struct bitmap { 176struct bitmap {
170 struct bitmap_page *bp;
171 unsigned long pages; /* total number of pages in the bitmap */
172 unsigned long missing_pages; /* number of pages not yet allocated */
173 177
174 struct mddev *mddev; /* the md device that the bitmap is for */ 178 struct bitmap_counts {
179 spinlock_t lock;
180 struct bitmap_page *bp;
181 unsigned long pages; /* total number of pages
182 * in the bitmap */
183 unsigned long missing_pages; /* number of pages
184 * not yet allocated */
185 unsigned long chunkshift; /* chunksize = 2^chunkshift
186 * (for bitops) */
187 unsigned long chunks; /* Total number of data
188 * chunks for the array */
189 } counts;
175 190
176 /* bitmap chunksize -- how much data does each bit represent? */ 191 struct mddev *mddev; /* the md device that the bitmap is for */
177 unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */
178 unsigned long chunks; /* total number of data chunks for the array */
179 192
180 __u64 events_cleared; 193 __u64 events_cleared;
181 int need_sync; 194 int need_sync;
182 195
183 /* bitmap spinlock */ 196 struct bitmap_storage {
184 spinlock_t lock; 197 struct file *file; /* backing disk file */
185 198 struct page *sb_page; /* cached copy of the bitmap
186 struct file *file; /* backing disk file */ 199 * file superblock */
187 struct page *sb_page; /* cached copy of the bitmap file superblock */ 200 struct page **filemap; /* list of cache pages for
188 struct page **filemap; /* list of cache pages for the file */ 201 * the file */
189 unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ 202 unsigned long *filemap_attr; /* attributes associated
190 unsigned long file_pages; /* number of pages in the file */ 203 * w/ filemap pages */
191 int last_page_size; /* bytes in the last page */ 204 unsigned long file_pages; /* number of pages in the file*/
205 unsigned long bytes; /* total bytes in the bitmap */
206 } storage;
192 207
193 unsigned long flags; 208 unsigned long flags;
194 209
@@ -242,6 +257,9 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
242 257
243void bitmap_unplug(struct bitmap *bitmap); 258void bitmap_unplug(struct bitmap *bitmap);
244void bitmap_daemon_work(struct mddev *mddev); 259void bitmap_daemon_work(struct mddev *mddev);
260
261int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
262 int chunksize, int init);
245#endif 263#endif
246 264
247#endif 265#endif
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 68965e663248..017c34d78d61 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -155,10 +155,7 @@ static void context_free(struct raid_set *rs)
155 for (i = 0; i < rs->md.raid_disks; i++) { 155 for (i = 0; i < rs->md.raid_disks; i++) {
156 if (rs->dev[i].meta_dev) 156 if (rs->dev[i].meta_dev)
157 dm_put_device(rs->ti, rs->dev[i].meta_dev); 157 dm_put_device(rs->ti, rs->dev[i].meta_dev);
158 if (rs->dev[i].rdev.sb_page) 158 md_rdev_clear(&rs->dev[i].rdev);
159 put_page(rs->dev[i].rdev.sb_page);
160 rs->dev[i].rdev.sb_page = NULL;
161 rs->dev[i].rdev.sb_loaded = 0;
162 if (rs->dev[i].data_dev) 159 if (rs->dev[i].data_dev)
163 dm_put_device(rs->ti, rs->dev[i].data_dev); 160 dm_put_device(rs->ti, rs->dev[i].data_dev);
164 } 161 }
@@ -606,7 +603,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
606 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { 603 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
607 DMERR("Failed to read superblock of device at position %d", 604 DMERR("Failed to read superblock of device at position %d",
608 rdev->raid_disk); 605 rdev->raid_disk);
609 set_bit(Faulty, &rdev->flags); 606 md_error(rdev->mddev, rdev);
610 return -EINVAL; 607 return -EINVAL;
611 } 608 }
612 609
@@ -617,16 +614,18 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
617 614
618static void super_sync(struct mddev *mddev, struct md_rdev *rdev) 615static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
619{ 616{
620 struct md_rdev *r; 617 int i;
621 uint64_t failed_devices; 618 uint64_t failed_devices;
622 struct dm_raid_superblock *sb; 619 struct dm_raid_superblock *sb;
620 struct raid_set *rs = container_of(mddev, struct raid_set, md);
623 621
624 sb = page_address(rdev->sb_page); 622 sb = page_address(rdev->sb_page);
625 failed_devices = le64_to_cpu(sb->failed_devices); 623 failed_devices = le64_to_cpu(sb->failed_devices);
626 624
627 rdev_for_each(r, mddev) 625 for (i = 0; i < mddev->raid_disks; i++)
628 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) 626 if (!rs->dev[i].data_dev ||
629 failed_devices |= (1ULL << r->raid_disk); 627 test_bit(Faulty, &(rs->dev[i].rdev.flags)))
628 failed_devices |= (1ULL << i);
630 629
631 memset(sb, 0, sizeof(*sb)); 630 memset(sb, 0, sizeof(*sb));
632 631
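
super_sync() now derives failed_devices from the raid_set's own device table instead of walking the mddev's rdev list, so slots that were never populated are also recorded as failed. A user-space sketch of that bitmask construction with invented slot data:

/* The arrays stand in for rs->dev[]; slot data is invented. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int raid_disks = 4;
	int has_data_dev[] = { 1, 1, 0, 1 };	/* slot 2 was never provided */
	int faulty[]       = { 0, 1, 0, 0 };	/* slot 1 has failed */
	uint64_t failed_devices = 0;
	int i;

	for (i = 0; i < raid_disks; i++)
		if (!has_data_dev[i] || faulty[i])
			failed_devices |= 1ULL << i;

	printf("failed_devices = 0x%llx\n",
	       (unsigned long long)failed_devices);	/* 0x6 */
	return 0;
}
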
@@ -1252,12 +1251,13 @@ static void raid_resume(struct dm_target *ti)
1252{ 1251{
1253 struct raid_set *rs = ti->private; 1252 struct raid_set *rs = ti->private;
1254 1253
1254 set_bit(MD_CHANGE_DEVS, &rs->md.flags);
1255 if (!rs->bitmap_loaded) { 1255 if (!rs->bitmap_loaded) {
1256 bitmap_load(&rs->md); 1256 bitmap_load(&rs->md);
1257 rs->bitmap_loaded = 1; 1257 rs->bitmap_loaded = 1;
1258 } else 1258 }
1259 md_wakeup_thread(rs->md.thread);
1260 1259
1260 clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
1261 mddev_resume(&rs->md); 1261 mddev_resume(&rs->md);
1262} 1262}
1263 1263
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 01233d855eb2..1c2f9048e1ae 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -402,6 +402,7 @@ void mddev_resume(struct mddev *mddev)
402 wake_up(&mddev->sb_wait); 402 wake_up(&mddev->sb_wait);
403 mddev->pers->quiesce(mddev, 0); 403 mddev->pers->quiesce(mddev, 0);
404 404
405 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
405 md_wakeup_thread(mddev->thread); 406 md_wakeup_thread(mddev->thread);
406 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 407 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
407} 408}
@@ -452,7 +453,7 @@ static void submit_flushes(struct work_struct *ws)
452 atomic_inc(&rdev->nr_pending); 453 atomic_inc(&rdev->nr_pending);
453 atomic_inc(&rdev->nr_pending); 454 atomic_inc(&rdev->nr_pending);
454 rcu_read_unlock(); 455 rcu_read_unlock();
455 bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev); 456 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
456 bi->bi_end_io = md_end_flush; 457 bi->bi_end_io = md_end_flush;
457 bi->bi_private = rdev; 458 bi->bi_private = rdev;
458 bi->bi_bdev = rdev->bdev; 459 bi->bi_bdev = rdev->bdev;
@@ -607,6 +608,7 @@ void mddev_init(struct mddev *mddev)
607 init_waitqueue_head(&mddev->sb_wait); 608 init_waitqueue_head(&mddev->sb_wait);
608 init_waitqueue_head(&mddev->recovery_wait); 609 init_waitqueue_head(&mddev->recovery_wait);
609 mddev->reshape_position = MaxSector; 610 mddev->reshape_position = MaxSector;
611 mddev->reshape_backwards = 0;
610 mddev->resync_min = 0; 612 mddev->resync_min = 0;
611 mddev->resync_max = MaxSector; 613 mddev->resync_max = MaxSector;
612 mddev->level = LEVEL_NONE; 614 mddev->level = LEVEL_NONE;
@@ -802,7 +804,7 @@ static int alloc_disk_sb(struct md_rdev * rdev)
802 return 0; 804 return 0;
803} 805}
804 806
805static void free_disk_sb(struct md_rdev * rdev) 807void md_rdev_clear(struct md_rdev *rdev)
806{ 808{
807 if (rdev->sb_page) { 809 if (rdev->sb_page) {
808 put_page(rdev->sb_page); 810 put_page(rdev->sb_page);
@@ -815,8 +817,10 @@ static void free_disk_sb(struct md_rdev * rdev)
815 put_page(rdev->bb_page); 817 put_page(rdev->bb_page);
816 rdev->bb_page = NULL; 818 rdev->bb_page = NULL;
817 } 819 }
820 kfree(rdev->badblocks.page);
821 rdev->badblocks.page = NULL;
818} 822}
819 823EXPORT_SYMBOL_GPL(md_rdev_clear);
820 824
821static void super_written(struct bio *bio, int error) 825static void super_written(struct bio *bio, int error)
822{ 826{
@@ -887,6 +891,10 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
887 rdev->meta_bdev : rdev->bdev; 891 rdev->meta_bdev : rdev->bdev;
888 if (metadata_op) 892 if (metadata_op)
889 bio->bi_sector = sector + rdev->sb_start; 893 bio->bi_sector = sector + rdev->sb_start;
894 else if (rdev->mddev->reshape_position != MaxSector &&
895 (rdev->mddev->reshape_backwards ==
896 (sector >= rdev->mddev->reshape_position)))
897 bio->bi_sector = sector + rdev->new_data_offset;
890 else 898 else
891 bio->bi_sector = sector + rdev->data_offset; 899 bio->bi_sector = sector + rdev->data_offset;
892 bio_add_page(bio, page, size, 0); 900 bio_add_page(bio, page, size, 0);
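
The new branch in sync_page_io() decides which data offset a sector lives at while a reshape is in progress: sectors the reshape has already passed use new_data_offset, the rest still use data_offset; for a forwards reshape the already-reshaped region lies below reshape_position, for a backwards reshape it lies at or above it. A user-space sketch of that selection; map_sector() and the sample numbers are invented.

/* map_sector() and the numbers below are invented for this sketch. */
#include <stdio.h>

typedef unsigned long long sector_t;
#define MaxSector (~(sector_t)0)

static sector_t map_sector(sector_t sector, sector_t reshape_position,
			   int reshape_backwards,
			   sector_t data_offset, sector_t new_data_offset)
{
	if (reshape_position != MaxSector &&
	    reshape_backwards == (sector >= reshape_position))
		return sector + new_data_offset;
	return sector + data_offset;
}

int main(void)
{
	/* forwards reshape, boundary at sector 1000, offsets 2048 (old) and 4096 (new) */
	printf("%llu\n", map_sector(500, 1000, 0, 2048, 4096));		/* reshaped: 500 + 4096 = 4596 */
	printf("%llu\n", map_sector(1500, 1000, 0, 2048, 4096));	/* not yet: 1500 + 2048 = 3548 */
	return 0;
}
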
@@ -1034,12 +1042,17 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
1034struct super_type { 1042struct super_type {
1035 char *name; 1043 char *name;
1036 struct module *owner; 1044 struct module *owner;
1037 int (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev, 1045 int (*load_super)(struct md_rdev *rdev,
1046 struct md_rdev *refdev,
1038 int minor_version); 1047 int minor_version);
1039 int (*validate_super)(struct mddev *mddev, struct md_rdev *rdev); 1048 int (*validate_super)(struct mddev *mddev,
1040 void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); 1049 struct md_rdev *rdev);
1050 void (*sync_super)(struct mddev *mddev,
1051 struct md_rdev *rdev);
1041 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 1052 unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
1042 sector_t num_sectors); 1053 sector_t num_sectors);
1054 int (*allow_new_offset)(struct md_rdev *rdev,
1055 unsigned long long new_offset);
1043}; 1056};
1044 1057
1045/* 1058/*
@@ -1111,6 +1124,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
1111 1124
1112 rdev->preferred_minor = sb->md_minor; 1125 rdev->preferred_minor = sb->md_minor;
1113 rdev->data_offset = 0; 1126 rdev->data_offset = 0;
1127 rdev->new_data_offset = 0;
1114 rdev->sb_size = MD_SB_BYTES; 1128 rdev->sb_size = MD_SB_BYTES;
1115 rdev->badblocks.shift = -1; 1129 rdev->badblocks.shift = -1;
1116 1130
@@ -1184,7 +1198,11 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1184 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1198 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1185 mddev->events = ev1; 1199 mddev->events = ev1;
1186 mddev->bitmap_info.offset = 0; 1200 mddev->bitmap_info.offset = 0;
1201 mddev->bitmap_info.space = 0;
1202 /* bitmap can use 60 K after the 4K superblocks */
1187 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1203 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1204 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1205 mddev->reshape_backwards = 0;
1188 1206
1189 if (mddev->minor_version >= 91) { 1207 if (mddev->minor_version >= 91) {
1190 mddev->reshape_position = sb->reshape_position; 1208 mddev->reshape_position = sb->reshape_position;
@@ -1192,6 +1210,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1192 mddev->new_level = sb->new_level; 1210 mddev->new_level = sb->new_level;
1193 mddev->new_layout = sb->new_layout; 1211 mddev->new_layout = sb->new_layout;
1194 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1212 mddev->new_chunk_sectors = sb->new_chunk >> 9;
1213 if (mddev->delta_disks < 0)
1214 mddev->reshape_backwards = 1;
1195 } else { 1215 } else {
1196 mddev->reshape_position = MaxSector; 1216 mddev->reshape_position = MaxSector;
1197 mddev->delta_disks = 0; 1217 mddev->delta_disks = 0;
@@ -1218,9 +1238,12 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1218 mddev->max_disks = MD_SB_DISKS; 1238 mddev->max_disks = MD_SB_DISKS;
1219 1239
1220 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1240 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1221 mddev->bitmap_info.file == NULL) 1241 mddev->bitmap_info.file == NULL) {
1222 mddev->bitmap_info.offset = 1242 mddev->bitmap_info.offset =
1223 mddev->bitmap_info.default_offset; 1243 mddev->bitmap_info.default_offset;
1244 mddev->bitmap_info.space =
1245 mddev->bitmap_info.space;
1246 }
1224 1247
1225 } else if (mddev->pers == NULL) { 1248 } else if (mddev->pers == NULL) {
1226 /* Insist on good event counter while assembling, except 1249 /* Insist on good event counter while assembling, except
@@ -1434,6 +1457,12 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1434 return num_sectors; 1457 return num_sectors;
1435} 1458}
1436 1459
1460static int
1461super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1462{
1463 /* non-zero offset changes not possible with v0.90 */
1464 return new_offset == 0;
1465}
1437 1466
1438/* 1467/*
1439 * version 1 superblock 1468 * version 1 superblock
@@ -1469,6 +1498,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
1469 struct mdp_superblock_1 *sb; 1498 struct mdp_superblock_1 *sb;
1470 int ret; 1499 int ret;
1471 sector_t sb_start; 1500 sector_t sb_start;
1501 sector_t sectors;
1472 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1502 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1473 int bmask; 1503 int bmask;
1474 1504
@@ -1523,9 +1553,18 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
1523 bdevname(rdev->bdev,b)); 1553 bdevname(rdev->bdev,b));
1524 return -EINVAL; 1554 return -EINVAL;
1525 } 1555 }
1556 if (sb->pad0 ||
1557 sb->pad3[0] ||
1558 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1559 /* Some padding is non-zero, might be a new feature */
1560 return -EINVAL;
1526 1561
1527 rdev->preferred_minor = 0xffff; 1562 rdev->preferred_minor = 0xffff;
1528 rdev->data_offset = le64_to_cpu(sb->data_offset); 1563 rdev->data_offset = le64_to_cpu(sb->data_offset);
1564 rdev->new_data_offset = rdev->data_offset;
1565 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1566 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1567 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1529 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1568 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1530 1569
1531 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1570 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
@@ -1536,6 +1575,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
1536 if (minor_version 1575 if (minor_version
1537 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1576 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1538 return -EINVAL; 1577 return -EINVAL;
1578 if (minor_version
1579 && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1580 return -EINVAL;
1539 1581
1540 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1582 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1541 rdev->desc_nr = -1; 1583 rdev->desc_nr = -1;
@@ -1607,16 +1649,14 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
1607 else 1649 else
1608 ret = 0; 1650 ret = 0;
1609 } 1651 }
1610 if (minor_version) 1652 if (minor_version) {
1611 rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - 1653 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1612 le64_to_cpu(sb->data_offset); 1654 sectors -= rdev->data_offset;
1613 else 1655 } else
1614 rdev->sectors = rdev->sb_start; 1656 sectors = rdev->sb_start;
1615 if (rdev->sectors < le64_to_cpu(sb->data_size)) 1657 if (sectors < le64_to_cpu(sb->data_size))
1616 return -EINVAL; 1658 return -EINVAL;
1617 rdev->sectors = le64_to_cpu(sb->data_size); 1659 rdev->sectors = le64_to_cpu(sb->data_size);
1618 if (le64_to_cpu(sb->size) > rdev->sectors)
1619 return -EINVAL;
1620 return ret; 1660 return ret;
1621} 1661}
1622 1662
@@ -1644,17 +1684,37 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1644 mddev->dev_sectors = le64_to_cpu(sb->size); 1684 mddev->dev_sectors = le64_to_cpu(sb->size);
1645 mddev->events = ev1; 1685 mddev->events = ev1;
1646 mddev->bitmap_info.offset = 0; 1686 mddev->bitmap_info.offset = 0;
1687 mddev->bitmap_info.space = 0;
1688 /* Default location for bitmap is 1K after superblock
1689 * using 3K - total of 4K
1690 */
1647 mddev->bitmap_info.default_offset = 1024 >> 9; 1691 mddev->bitmap_info.default_offset = 1024 >> 9;
1648 1692 mddev->bitmap_info.default_space = (4096-1024) >> 9;
1693 mddev->reshape_backwards = 0;
1694
1649 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1695 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1650 memcpy(mddev->uuid, sb->set_uuid, 16); 1696 memcpy(mddev->uuid, sb->set_uuid, 16);
1651 1697
1652 mddev->max_disks = (4096-256)/2; 1698 mddev->max_disks = (4096-256)/2;
1653 1699
1654 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1700 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1655 mddev->bitmap_info.file == NULL ) 1701 mddev->bitmap_info.file == NULL) {
1656 mddev->bitmap_info.offset = 1702 mddev->bitmap_info.offset =
1657 (__s32)le32_to_cpu(sb->bitmap_offset); 1703 (__s32)le32_to_cpu(sb->bitmap_offset);
1704 /* Metadata doesn't record how much space is available.
1705 * For 1.0, we assume we can use up to the superblock
1706 * if before, else to 4K beyond superblock.
1707 * For others, assume no change is possible.
1708 */
1709 if (mddev->minor_version > 0)
1710 mddev->bitmap_info.space = 0;
1711 else if (mddev->bitmap_info.offset > 0)
1712 mddev->bitmap_info.space =
1713 8 - mddev->bitmap_info.offset;
1714 else
1715 mddev->bitmap_info.space =
1716 -mddev->bitmap_info.offset;
1717 }
1658 1718
1659 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1719 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1660 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1720 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
@@ -1662,6 +1722,11 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1662 mddev->new_level = le32_to_cpu(sb->new_level); 1722 mddev->new_level = le32_to_cpu(sb->new_level);
1663 mddev->new_layout = le32_to_cpu(sb->new_layout); 1723 mddev->new_layout = le32_to_cpu(sb->new_layout);
1664 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1724 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1725 if (mddev->delta_disks < 0 ||
1726 (mddev->delta_disks == 0 &&
1727 (le32_to_cpu(sb->feature_map)
1728 & MD_FEATURE_RESHAPE_BACKWARDS)))
1729 mddev->reshape_backwards = 1;
1665 } else { 1730 } else {
1666 mddev->reshape_position = MaxSector; 1731 mddev->reshape_position = MaxSector;
1667 mddev->delta_disks = 0; 1732 mddev->delta_disks = 0;
@@ -1735,7 +1800,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1735 sb->feature_map = 0; 1800 sb->feature_map = 0;
1736 sb->pad0 = 0; 1801 sb->pad0 = 0;
1737 sb->recovery_offset = cpu_to_le64(0); 1802 sb->recovery_offset = cpu_to_le64(0);
1738 memset(sb->pad1, 0, sizeof(sb->pad1));
1739 memset(sb->pad3, 0, sizeof(sb->pad3)); 1803 memset(sb->pad3, 0, sizeof(sb->pad3));
1740 1804
1741 sb->utime = cpu_to_le64((__u64)mddev->utime); 1805 sb->utime = cpu_to_le64((__u64)mddev->utime);
@@ -1757,6 +1821,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1757 sb->devflags |= WriteMostly1; 1821 sb->devflags |= WriteMostly1;
1758 else 1822 else
1759 sb->devflags &= ~WriteMostly1; 1823 sb->devflags &= ~WriteMostly1;
1824 sb->data_offset = cpu_to_le64(rdev->data_offset);
1825 sb->data_size = cpu_to_le64(rdev->sectors);
1760 1826
1761 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 1827 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1762 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 1828 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
@@ -1781,6 +1847,16 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1781 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1847 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1782 sb->new_level = cpu_to_le32(mddev->new_level); 1848 sb->new_level = cpu_to_le32(mddev->new_level);
1783 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1849 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1850 if (mddev->delta_disks == 0 &&
1851 mddev->reshape_backwards)
1852 sb->feature_map
1853 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1854 if (rdev->new_data_offset != rdev->data_offset) {
1855 sb->feature_map
1856 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1857 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1858 - rdev->data_offset));
1859 }
1784 } 1860 }
1785 1861
1786 if (rdev->badblocks.count == 0) 1862 if (rdev->badblocks.count == 0)
@@ -1857,6 +1933,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1857 sector_t max_sectors; 1933 sector_t max_sectors;
1858 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1934 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1859 return 0; /* component must fit device */ 1935 return 0; /* component must fit device */
1936 if (rdev->data_offset != rdev->new_data_offset)
1937 return 0; /* too confusing */
1860 if (rdev->sb_start < rdev->data_offset) { 1938 if (rdev->sb_start < rdev->data_offset) {
1861 /* minor versions 1 and 2; superblock before data */ 1939 /* minor versions 1 and 2; superblock before data */
1862 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; 1940 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
@@ -1884,6 +1962,40 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1884 rdev->sb_page); 1962 rdev->sb_page);
1885 md_super_wait(rdev->mddev); 1963 md_super_wait(rdev->mddev);
1886 return num_sectors; 1964 return num_sectors;
1965
1966}
1967
1968static int
1969super_1_allow_new_offset(struct md_rdev *rdev,
1970 unsigned long long new_offset)
1971{
1972 /* All necessary checks on new >= old have been done */
1973 struct bitmap *bitmap;
1974 if (new_offset >= rdev->data_offset)
1975 return 1;
1976
1977 /* with 1.0 metadata, there is no metadata to tread on
1978 * so we can always move back */
1979 if (rdev->mddev->minor_version == 0)
1980 return 1;
1981
1982 /* otherwise we must be sure not to step on
1983 * any metadata, so stay:
1984 * 36K beyond start of superblock
1985 * beyond end of badblocks
1986 * beyond write-intent bitmap
1987 */
1988 if (rdev->sb_start + (32+4)*2 > new_offset)
1989 return 0;
1990 bitmap = rdev->mddev->bitmap;
1991 if (bitmap && !rdev->mddev->bitmap_info.file &&
1992 rdev->sb_start + rdev->mddev->bitmap_info.offset +
1993 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
1994 return 0;
1995 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
1996 return 0;
1997
1998 return 1;
1887} 1999}
1888 2000
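
super_1_allow_new_offset() permits shrinking data_offset with 1.1/1.2 metadata only if the new offset still clears three regions: 36K ((32+4)*2 sectors) after the superblock, the internal write-intent bitmap, and the bad-block log. A user-space sketch of those checks; allow_new_offset() and the sector numbers are invented, and the kernel derives the bitmap and bad-block extents from the rdev rather than taking them as parameters.

/* allow_new_offset() and all sector numbers below are invented. */
#include <stdio.h>

typedef unsigned long long sector_t;

static int allow_new_offset(sector_t sb_start, sector_t new_offset,
			    sector_t bitmap_end, sector_t bb_end)
{
	if (sb_start + (32 + 4) * 2 > new_offset)	/* 36K of metadata space */
		return 0;
	if (bitmap_end > new_offset)			/* would overwrite the bitmap */
		return 0;
	if (bb_end > new_offset)			/* would overwrite the bad-block log */
		return 0;
	return 1;
}

int main(void)
{
	/* superblock at sector 8, bitmap ends at 120, bad-block log ends at 128 */
	printf("%d\n", allow_new_offset(8, 100, 120, 128));	/* 0: inside the bitmap */
	printf("%d\n", allow_new_offset(8, 130, 120, 128));	/* 1: everything clears */
	return 0;
}
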
1889static struct super_type super_types[] = { 2001static struct super_type super_types[] = {
@@ -1894,6 +2006,7 @@ static struct super_type super_types[] = {
1894 .validate_super = super_90_validate, 2006 .validate_super = super_90_validate,
1895 .sync_super = super_90_sync, 2007 .sync_super = super_90_sync,
1896 .rdev_size_change = super_90_rdev_size_change, 2008 .rdev_size_change = super_90_rdev_size_change,
2009 .allow_new_offset = super_90_allow_new_offset,
1897 }, 2010 },
1898 [1] = { 2011 [1] = {
1899 .name = "md-1", 2012 .name = "md-1",
@@ -1902,6 +2015,7 @@ static struct super_type super_types[] = {
1902 .validate_super = super_1_validate, 2015 .validate_super = super_1_validate,
1903 .sync_super = super_1_sync, 2016 .sync_super = super_1_sync,
1904 .rdev_size_change = super_1_rdev_size_change, 2017 .rdev_size_change = super_1_rdev_size_change,
2018 .allow_new_offset = super_1_allow_new_offset,
1905 }, 2019 },
1906}; 2020};
1907 2021
@@ -2105,9 +2219,7 @@ static void unbind_rdev_from_array(struct md_rdev * rdev)
2105 sysfs_remove_link(&rdev->kobj, "block"); 2219 sysfs_remove_link(&rdev->kobj, "block");
2106 sysfs_put(rdev->sysfs_state); 2220 sysfs_put(rdev->sysfs_state);
2107 rdev->sysfs_state = NULL; 2221 rdev->sysfs_state = NULL;
2108 kfree(rdev->badblocks.page);
2109 rdev->badblocks.count = 0; 2222 rdev->badblocks.count = 0;
2110 rdev->badblocks.page = NULL;
2111 /* We need to delay this, otherwise we can deadlock when 2223 /* We need to delay this, otherwise we can deadlock when
2112 * writing to 'remove' to "dev/state". We also need 2224 * writing to 'remove' to "dev/state". We also need
2113 * to delay it due to rcu usage. 2225 * to delay it due to rcu usage.
@@ -2158,7 +2270,7 @@ static void export_rdev(struct md_rdev * rdev)
2158 bdevname(rdev->bdev,b)); 2270 bdevname(rdev->bdev,b));
2159 if (rdev->mddev) 2271 if (rdev->mddev)
2160 MD_BUG(); 2272 MD_BUG();
2161 free_disk_sb(rdev); 2273 md_rdev_clear(rdev);
2162#ifndef MODULE 2274#ifndef MODULE
2163 if (test_bit(AutoDetected, &rdev->flags)) 2275 if (test_bit(AutoDetected, &rdev->flags))
2164 md_autodetect_dev(rdev->bdev->bd_dev); 2276 md_autodetect_dev(rdev->bdev->bd_dev);
@@ -2809,9 +2921,8 @@ offset_show(struct md_rdev *rdev, char *page)
2809static ssize_t 2921static ssize_t
2810offset_store(struct md_rdev *rdev, const char *buf, size_t len) 2922offset_store(struct md_rdev *rdev, const char *buf, size_t len)
2811{ 2923{
2812 char *e; 2924 unsigned long long offset;
2813 unsigned long long offset = simple_strtoull(buf, &e, 10); 2925 if (strict_strtoull(buf, 10, &offset) < 0)
2814 if (e==buf || (*e && *e != '\n'))
2815 return -EINVAL; 2926 return -EINVAL;
2816 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2927 if (rdev->mddev->pers && rdev->raid_disk >= 0)
2817 return -EBUSY; 2928 return -EBUSY;
@@ -2826,6 +2937,63 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len)
2826static struct rdev_sysfs_entry rdev_offset = 2937static struct rdev_sysfs_entry rdev_offset =
2827__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2938__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2828 2939
2940static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
2941{
2942 return sprintf(page, "%llu\n",
2943 (unsigned long long)rdev->new_data_offset);
2944}
2945
2946static ssize_t new_offset_store(struct md_rdev *rdev,
2947 const char *buf, size_t len)
2948{
2949 unsigned long long new_offset;
2950 struct mddev *mddev = rdev->mddev;
2951
2952 if (strict_strtoull(buf, 10, &new_offset) < 0)
2953 return -EINVAL;
2954
2955 if (mddev->sync_thread)
2956 return -EBUSY;
2957 if (new_offset == rdev->data_offset)
2958 /* reset is always permitted */
2959 ;
2960 else if (new_offset > rdev->data_offset) {
2961 /* must not push array size beyond rdev_sectors */
2962 if (new_offset - rdev->data_offset
2963 + mddev->dev_sectors > rdev->sectors)
2964 return -E2BIG;
2965 }
2966 /* Metadata worries about other space details. */
2967
2968 /* decreasing the offset is inconsistent with a backwards
2969 * reshape.
2970 */
2971 if (new_offset < rdev->data_offset &&
2972 mddev->reshape_backwards)
2973 return -EINVAL;
2974 /* Increasing offset is inconsistent with forwards
2975 * reshape. reshape_direction should be set to
2976 * 'backwards' first.
2977 */
2978 if (new_offset > rdev->data_offset &&
2979 !mddev->reshape_backwards)
2980 return -EINVAL;
2981
2982 if (mddev->pers && mddev->persistent &&
2983 !super_types[mddev->major_version]
2984 .allow_new_offset(rdev, new_offset))
2985 return -E2BIG;
2986 rdev->new_data_offset = new_offset;
2987 if (new_offset > rdev->data_offset)
2988 mddev->reshape_backwards = 1;
2989 else if (new_offset < rdev->data_offset)
2990 mddev->reshape_backwards = 0;
2991
2992 return len;
2993}
2994static struct rdev_sysfs_entry rdev_new_offset =
2995__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
2996
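
new_offset_store() ties the direction of the offset change to the reshape direction: raising the data offset is only allowed once a backwards reshape has been requested and only if dev_sectors still fits in the device, while lowering it is rejected during a backwards reshape. A user-space sketch of those rejection rules; struct dev, set_new_offset() and the sizes below are invented for the example.

/* struct dev, set_new_offset() and the sizes below are invented. */
#include <stdio.h>
#include <errno.h>

typedef unsigned long long sector_t;

struct dev {
	sector_t data_offset, new_data_offset, sectors, dev_sectors;
	int reshape_backwards;
};

static int set_new_offset(struct dev *d, sector_t new_offset)
{
	if (new_offset > d->data_offset) {
		/* growing the gap before the data: the array must still fit */
		if (new_offset - d->data_offset + d->dev_sectors > d->sectors)
			return -E2BIG;
		if (!d->reshape_backwards)
			return -EINVAL;	/* set reshape_direction to backwards first */
	} else if (new_offset < d->data_offset && d->reshape_backwards)
		return -EINVAL;		/* shrinking the gap needs a forwards reshape */

	d->new_data_offset = new_offset;
	return 0;
}

int main(void)
{
	struct dev d = { 2048, 2048, 1 << 20, (1 << 20) - 4096, 1 };

	printf("%d\n", set_new_offset(&d, 4096));	/* 0: backwards reshape, still fits */
	d.reshape_backwards = 0;
	printf("%d\n", set_new_offset(&d, 6144));	/* -EINVAL (-22): direction mismatch */
	return 0;
}
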
2829static ssize_t 2997static ssize_t
2830rdev_size_show(struct md_rdev *rdev, char *page) 2998rdev_size_show(struct md_rdev *rdev, char *page)
2831{ 2999{
@@ -2870,6 +3038,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2870 3038
2871 if (strict_blocks_to_sectors(buf, &sectors) < 0) 3039 if (strict_blocks_to_sectors(buf, &sectors) < 0)
2872 return -EINVAL; 3040 return -EINVAL;
3041 if (rdev->data_offset != rdev->new_data_offset)
3042 return -EINVAL; /* too confusing */
2873 if (my_mddev->pers && rdev->raid_disk >= 0) { 3043 if (my_mddev->pers && rdev->raid_disk >= 0) {
2874 if (my_mddev->persistent) { 3044 if (my_mddev->persistent) {
2875 sectors = super_types[my_mddev->major_version]. 3045 sectors = super_types[my_mddev->major_version].
@@ -3006,6 +3176,7 @@ static struct attribute *rdev_default_attrs[] = {
3006 &rdev_errors.attr, 3176 &rdev_errors.attr,
3007 &rdev_slot.attr, 3177 &rdev_slot.attr,
3008 &rdev_offset.attr, 3178 &rdev_offset.attr,
3179 &rdev_new_offset.attr,
3009 &rdev_size.attr, 3180 &rdev_size.attr,
3010 &rdev_recovery_start.attr, 3181 &rdev_recovery_start.attr,
3011 &rdev_bad_blocks.attr, 3182 &rdev_bad_blocks.attr,
@@ -3080,6 +3251,7 @@ int md_rdev_init(struct md_rdev *rdev)
3080 rdev->raid_disk = -1; 3251 rdev->raid_disk = -1;
3081 rdev->flags = 0; 3252 rdev->flags = 0;
3082 rdev->data_offset = 0; 3253 rdev->data_offset = 0;
3254 rdev->new_data_offset = 0;
3083 rdev->sb_events = 0; 3255 rdev->sb_events = 0;
3084 rdev->last_read_error.tv_sec = 0; 3256 rdev->last_read_error.tv_sec = 0;
3085 rdev->last_read_error.tv_nsec = 0; 3257 rdev->last_read_error.tv_nsec = 0;
@@ -3178,8 +3350,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
3178abort_free: 3350abort_free:
3179 if (rdev->bdev) 3351 if (rdev->bdev)
3180 unlock_rdev(rdev); 3352 unlock_rdev(rdev);
3181 free_disk_sb(rdev); 3353 md_rdev_clear(rdev);
3182 kfree(rdev->badblocks.page);
3183 kfree(rdev); 3354 kfree(rdev);
3184 return ERR_PTR(err); 3355 return ERR_PTR(err);
3185} 3356}
@@ -3419,6 +3590,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3419 mddev->new_chunk_sectors = mddev->chunk_sectors; 3590 mddev->new_chunk_sectors = mddev->chunk_sectors;
3420 mddev->raid_disks -= mddev->delta_disks; 3591 mddev->raid_disks -= mddev->delta_disks;
3421 mddev->delta_disks = 0; 3592 mddev->delta_disks = 0;
3593 mddev->reshape_backwards = 0;
3422 module_put(pers->owner); 3594 module_put(pers->owner);
3423 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3595 printk(KERN_WARNING "md: %s: %s would not accept array\n",
3424 mdname(mddev), clevel); 3596 mdname(mddev), clevel);
@@ -3492,6 +3664,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3492 mddev->layout = mddev->new_layout; 3664 mddev->layout = mddev->new_layout;
3493 mddev->chunk_sectors = mddev->new_chunk_sectors; 3665 mddev->chunk_sectors = mddev->new_chunk_sectors;
3494 mddev->delta_disks = 0; 3666 mddev->delta_disks = 0;
3667 mddev->reshape_backwards = 0;
3495 mddev->degraded = 0; 3668 mddev->degraded = 0;
3496 if (mddev->pers->sync_request == NULL) { 3669 if (mddev->pers->sync_request == NULL) {
3497 /* this is now an array without redundancy, so 3670 /* this is now an array without redundancy, so
@@ -3501,10 +3674,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3501 del_timer_sync(&mddev->safemode_timer); 3674 del_timer_sync(&mddev->safemode_timer);
3502 } 3675 }
3503 pers->run(mddev); 3676 pers->run(mddev);
3504 mddev_resume(mddev);
3505 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3677 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3506 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3678 mddev_resume(mddev);
3507 md_wakeup_thread(mddev->thread);
3508 sysfs_notify(&mddev->kobj, NULL, "level"); 3679 sysfs_notify(&mddev->kobj, NULL, "level");
3509 md_new_event(mddev); 3680 md_new_event(mddev);
3510 return rv; 3681 return rv;
@@ -3582,9 +3753,20 @@ raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3582 if (mddev->pers) 3753 if (mddev->pers)
3583 rv = update_raid_disks(mddev, n); 3754 rv = update_raid_disks(mddev, n);
3584 else if (mddev->reshape_position != MaxSector) { 3755 else if (mddev->reshape_position != MaxSector) {
3756 struct md_rdev *rdev;
3585 int olddisks = mddev->raid_disks - mddev->delta_disks; 3757 int olddisks = mddev->raid_disks - mddev->delta_disks;
3758
3759 rdev_for_each(rdev, mddev) {
3760 if (olddisks < n &&
3761 rdev->data_offset < rdev->new_data_offset)
3762 return -EINVAL;
3763 if (olddisks > n &&
3764 rdev->data_offset > rdev->new_data_offset)
3765 return -EINVAL;
3766 }
3586 mddev->delta_disks = n - olddisks; 3767 mddev->delta_disks = n - olddisks;
3587 mddev->raid_disks = n; 3768 mddev->raid_disks = n;
3769 mddev->reshape_backwards = (mddev->delta_disks < 0);
3588 } else 3770 } else
3589 mddev->raid_disks = n; 3771 mddev->raid_disks = n;
3590 return rv ? rv : len; 3772 return rv ? rv : len;
@@ -4266,7 +4448,8 @@ sync_completed_show(struct mddev *mddev, char *page)
4266 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4448 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4267 return sprintf(page, "none\n"); 4449 return sprintf(page, "none\n");
4268 4450
4269 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 4451 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4452 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4270 max_sectors = mddev->resync_max_sectors; 4453 max_sectors = mddev->resync_max_sectors;
4271 else 4454 else
4272 max_sectors = mddev->dev_sectors; 4455 max_sectors = mddev->dev_sectors;
@@ -4428,6 +4611,7 @@ reshape_position_show(struct mddev *mddev, char *page)
4428static ssize_t 4611static ssize_t
4429reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 4612reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4430{ 4613{
4614 struct md_rdev *rdev;
4431 char *e; 4615 char *e;
4432 unsigned long long new = simple_strtoull(buf, &e, 10); 4616 unsigned long long new = simple_strtoull(buf, &e, 10);
4433 if (mddev->pers) 4617 if (mddev->pers)
@@ -4436,9 +4620,12 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4436 return -EINVAL; 4620 return -EINVAL;
4437 mddev->reshape_position = new; 4621 mddev->reshape_position = new;
4438 mddev->delta_disks = 0; 4622 mddev->delta_disks = 0;
4623 mddev->reshape_backwards = 0;
4439 mddev->new_level = mddev->level; 4624 mddev->new_level = mddev->level;
4440 mddev->new_layout = mddev->layout; 4625 mddev->new_layout = mddev->layout;
4441 mddev->new_chunk_sectors = mddev->chunk_sectors; 4626 mddev->new_chunk_sectors = mddev->chunk_sectors;
4627 rdev_for_each(rdev, mddev)
4628 rdev->new_data_offset = rdev->data_offset;
4442 return len; 4629 return len;
4443} 4630}
4444 4631
@@ -4447,6 +4634,42 @@ __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
4447 reshape_position_store); 4634 reshape_position_store);
4448 4635
4449static ssize_t 4636static ssize_t
4637reshape_direction_show(struct mddev *mddev, char *page)
4638{
4639 return sprintf(page, "%s\n",
4640 mddev->reshape_backwards ? "backwards" : "forwards");
4641}
4642
4643static ssize_t
4644reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4645{
4646 int backwards = 0;
4647 if (cmd_match(buf, "forwards"))
4648 backwards = 0;
4649 else if (cmd_match(buf, "backwards"))
4650 backwards = 1;
4651 else
4652 return -EINVAL;
4653 if (mddev->reshape_backwards == backwards)
4654 return len;
4655
4656 /* check if we are allowed to change */
4657 if (mddev->delta_disks)
4658 return -EBUSY;
4659
4660 if (mddev->persistent &&
4661 mddev->major_version == 0)
4662 return -EINVAL;
4663
4664 mddev->reshape_backwards = backwards;
4665 return len;
4666}
4667
4668static struct md_sysfs_entry md_reshape_direction =
4669__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
4670 reshape_direction_store);
4671
4672static ssize_t
4450array_size_show(struct mddev *mddev, char *page) 4673array_size_show(struct mddev *mddev, char *page)
4451{ 4674{
4452 if (mddev->external_size) 4675 if (mddev->external_size)
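
The new reshape_direction attribute is written like any other md sysfs file. A hedged userspace example follows; the /sys/block/md0/md/reshape_direction path is an assumption for illustration and has to match the array being reshaped:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Select the reshape direction before raid_disks or new_offset is changed.
 * The md0 device name is an assumption; adjust for the target array. */
int main(void)
{
	const char *attr = "/sys/block/md0/md/reshape_direction";
	const char *val = "backwards\n";
	int fd = open(attr, O_WRONLY);

	if (fd < 0) {
		perror(attr);
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}

As the store routine shows, the write fails with EBUSY once delta_disks is already set, so the direction has to be chosen before the rest of the reshape is configured.
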
@@ -4501,6 +4724,7 @@ static struct attribute *md_default_attrs[] = {
4501 &md_safe_delay.attr, 4724 &md_safe_delay.attr,
4502 &md_array_state.attr, 4725 &md_array_state.attr,
4503 &md_reshape_position.attr, 4726 &md_reshape_position.attr,
4727 &md_reshape_direction.attr,
4504 &md_array_size.attr, 4728 &md_array_size.attr,
4505 &max_corr_read_errors.attr, 4729 &max_corr_read_errors.attr,
4506 NULL, 4730 NULL,
@@ -4914,7 +5138,8 @@ int md_run(struct mddev *mddev)
4914 err = -EINVAL; 5138 err = -EINVAL;
4915 mddev->pers->stop(mddev); 5139 mddev->pers->stop(mddev);
4916 } 5140 }
4917 if (err == 0 && mddev->pers->sync_request) { 5141 if (err == 0 && mddev->pers->sync_request &&
5142 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
4918 err = bitmap_create(mddev); 5143 err = bitmap_create(mddev);
4919 if (err) { 5144 if (err) {
4920 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 5145 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -5064,6 +5289,7 @@ static void md_clean(struct mddev *mddev)
5064 mddev->events = 0; 5289 mddev->events = 0;
5065 mddev->can_decrease_events = 0; 5290 mddev->can_decrease_events = 0;
5066 mddev->delta_disks = 0; 5291 mddev->delta_disks = 0;
5292 mddev->reshape_backwards = 0;
5067 mddev->new_level = LEVEL_NONE; 5293 mddev->new_level = LEVEL_NONE;
5068 mddev->new_layout = 0; 5294 mddev->new_layout = 0;
5069 mddev->new_chunk_sectors = 0; 5295 mddev->new_chunk_sectors = 0;
@@ -5079,6 +5305,7 @@ static void md_clean(struct mddev *mddev)
5079 mddev->merge_check_needed = 0; 5305 mddev->merge_check_needed = 0;
5080 mddev->bitmap_info.offset = 0; 5306 mddev->bitmap_info.offset = 0;
5081 mddev->bitmap_info.default_offset = 0; 5307 mddev->bitmap_info.default_offset = 0;
5308 mddev->bitmap_info.default_space = 0;
5082 mddev->bitmap_info.chunksize = 0; 5309 mddev->bitmap_info.chunksize = 0;
5083 mddev->bitmap_info.daemon_sleep = 0; 5310 mddev->bitmap_info.daemon_sleep = 0;
5084 mddev->bitmap_info.max_write_behind = 0; 5311 mddev->bitmap_info.max_write_behind = 0;
@@ -5421,7 +5648,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
5421 goto out; 5648 goto out;
5422 5649
5423 /* bitmap disabled, zero the first byte and copy out */ 5650 /* bitmap disabled, zero the first byte and copy out */
5424 if (!mddev->bitmap || !mddev->bitmap->file) { 5651 if (!mddev->bitmap || !mddev->bitmap->storage.file) {
5425 file->pathname[0] = '\0'; 5652 file->pathname[0] = '\0';
5426 goto copy_out; 5653 goto copy_out;
5427 } 5654 }
@@ -5430,7 +5657,8 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
5430 if (!buf) 5657 if (!buf)
5431 goto out; 5658 goto out;
5432 5659
5433 ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); 5660 ptr = d_path(&mddev->bitmap->storage.file->f_path,
5661 buf, sizeof(file->pathname));
5434 if (IS_ERR(ptr)) 5662 if (IS_ERR(ptr))
5435 goto out; 5663 goto out;
5436 5664
@@ -5875,6 +6103,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
5875 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6103 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5876 6104
5877 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 6105 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6106 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
5878 mddev->bitmap_info.offset = 0; 6107 mddev->bitmap_info.offset = 0;
5879 6108
5880 mddev->reshape_position = MaxSector; 6109 mddev->reshape_position = MaxSector;
@@ -5888,6 +6117,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
5888 mddev->new_chunk_sectors = mddev->chunk_sectors; 6117 mddev->new_chunk_sectors = mddev->chunk_sectors;
5889 mddev->new_layout = mddev->layout; 6118 mddev->new_layout = mddev->layout;
5890 mddev->delta_disks = 0; 6119 mddev->delta_disks = 0;
6120 mddev->reshape_backwards = 0;
5891 6121
5892 return 0; 6122 return 0;
5893} 6123}
@@ -5922,11 +6152,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
5922 */ 6152 */
5923 if (mddev->sync_thread) 6153 if (mddev->sync_thread)
5924 return -EBUSY; 6154 return -EBUSY;
5925 if (mddev->bitmap) 6155
5926 /* Sorry, cannot grow a bitmap yet, just remove it,
5927 * grow, and re-add.
5928 */
5929 return -EBUSY;
5930 rdev_for_each(rdev, mddev) { 6156 rdev_for_each(rdev, mddev) {
5931 sector_t avail = rdev->sectors; 6157 sector_t avail = rdev->sectors;
5932 6158
@@ -5944,6 +6170,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
5944static int update_raid_disks(struct mddev *mddev, int raid_disks) 6170static int update_raid_disks(struct mddev *mddev, int raid_disks)
5945{ 6171{
5946 int rv; 6172 int rv;
6173 struct md_rdev *rdev;
5947 /* change the number of raid disks */ 6174 /* change the number of raid disks */
5948 if (mddev->pers->check_reshape == NULL) 6175 if (mddev->pers->check_reshape == NULL)
5949 return -EINVAL; 6176 return -EINVAL;
@@ -5952,11 +6179,27 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
5952 return -EINVAL; 6179 return -EINVAL;
5953 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 6180 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
5954 return -EBUSY; 6181 return -EBUSY;
6182
6183 rdev_for_each(rdev, mddev) {
6184 if (mddev->raid_disks < raid_disks &&
6185 rdev->data_offset < rdev->new_data_offset)
6186 return -EINVAL;
6187 if (mddev->raid_disks > raid_disks &&
6188 rdev->data_offset > rdev->new_data_offset)
6189 return -EINVAL;
6190 }
6191
5955 mddev->delta_disks = raid_disks - mddev->raid_disks; 6192 mddev->delta_disks = raid_disks - mddev->raid_disks;
6193 if (mddev->delta_disks < 0)
6194 mddev->reshape_backwards = 1;
6195 else if (mddev->delta_disks > 0)
6196 mddev->reshape_backwards = 0;
5956 6197
5957 rv = mddev->pers->check_reshape(mddev); 6198 rv = mddev->pers->check_reshape(mddev);
5958 if (rv < 0) 6199 if (rv < 0) {
5959 mddev->delta_disks = 0; 6200 mddev->delta_disks = 0;
6201 mddev->reshape_backwards = 0;
6202 }
5960 return rv; 6203 return rv;
5961} 6204}
5962 6205
@@ -6039,6 +6282,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6039 return -EINVAL; 6282 return -EINVAL;
6040 mddev->bitmap_info.offset = 6283 mddev->bitmap_info.offset =
6041 mddev->bitmap_info.default_offset; 6284 mddev->bitmap_info.default_offset;
6285 mddev->bitmap_info.space =
6286 mddev->bitmap_info.default_space;
6042 mddev->pers->quiesce(mddev, 1); 6287 mddev->pers->quiesce(mddev, 1);
6043 rv = bitmap_create(mddev); 6288 rv = bitmap_create(mddev);
6044 if (!rv) 6289 if (!rv)
@@ -6050,7 +6295,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6050 /* remove the bitmap */ 6295 /* remove the bitmap */
6051 if (!mddev->bitmap) 6296 if (!mddev->bitmap)
6052 return -ENOENT; 6297 return -ENOENT;
6053 if (mddev->bitmap->file) 6298 if (mddev->bitmap->storage.file)
6054 return -EINVAL; 6299 return -EINVAL;
6055 mddev->pers->quiesce(mddev, 1); 6300 mddev->pers->quiesce(mddev, 1);
6056 bitmap_destroy(mddev); 6301 bitmap_destroy(mddev);
@@ -6373,6 +6618,9 @@ static int md_open(struct block_device *bdev, fmode_t mode)
6373 struct mddev *mddev = mddev_find(bdev->bd_dev); 6618 struct mddev *mddev = mddev_find(bdev->bd_dev);
6374 int err; 6619 int err;
6375 6620
6621 if (!mddev)
6622 return -ENODEV;
6623
6376 if (mddev->gendisk != bdev->bd_disk) { 6624 if (mddev->gendisk != bdev->bd_disk) {
6377 /* we are racing with mddev_put which is discarding this 6625 /* we are racing with mddev_put which is discarding this
6378 * bd_disk. 6626 * bd_disk.
@@ -6584,7 +6832,8 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev)
6584 6832
6585 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); 6833 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
6586 6834
6587 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 6835 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
6836 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6588 max_sectors = mddev->resync_max_sectors; 6837 max_sectors = mddev->resync_max_sectors;
6589 else 6838 else
6590 max_sectors = mddev->dev_sectors; 6839 max_sectors = mddev->dev_sectors;
@@ -7147,7 +7396,7 @@ void md_do_sync(struct mddev *mddev)
7147 j = mddev->recovery_cp; 7396 j = mddev->recovery_cp;
7148 7397
7149 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7398 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7150 max_sectors = mddev->dev_sectors; 7399 max_sectors = mddev->resync_max_sectors;
7151 else { 7400 else {
7152 /* recovery follows the physical size of devices */ 7401 /* recovery follows the physical size of devices */
7153 max_sectors = mddev->dev_sectors; 7402 max_sectors = mddev->dev_sectors;
@@ -7598,7 +7847,7 @@ void md_check_recovery(struct mddev *mddev)
7598 goto unlock; 7847 goto unlock;
7599 7848
7600 if (mddev->pers->sync_request) { 7849 if (mddev->pers->sync_request) {
7601 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 7850 if (spares) {
7602 /* We are adding a device or devices to an array 7851 /* We are adding a device or devices to an array
7603 * which has the bitmap stored on all devices. 7852 * which has the bitmap stored on all devices.
7604 * So make sure all bitmap pages get written 7853 * So make sure all bitmap pages get written
@@ -7646,6 +7895,20 @@ void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
7646} 7895}
7647EXPORT_SYMBOL(md_wait_for_blocked_rdev); 7896EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7648 7897
7898void md_finish_reshape(struct mddev *mddev)
7899{
 7900 /* called by personality module when reshape completes. */
7901 struct md_rdev *rdev;
7902
7903 rdev_for_each(rdev, mddev) {
7904 if (rdev->data_offset > rdev->new_data_offset)
7905 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
7906 else
7907 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
7908 rdev->data_offset = rdev->new_data_offset;
7909 }
7910}
7911EXPORT_SYMBOL(md_finish_reshape);
7649 7912
7650/* Bad block management. 7913/* Bad block management.
7651 * We can record which blocks on each device are 'bad' and so just 7914 * We can record which blocks on each device are 'bad' and so just
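
The per-device fixup in md_finish_reshape() is plain arithmetic: whatever space the data start gained or lost comes straight out of, or goes back into, the usable length. A standalone illustration with made-up numbers:

#include <stdio.h>

struct toy_rdev { unsigned long long data_offset, new_data_offset, sectors; };

/* Toy version of the per-device fixup when a reshape finishes: moving the
 * data start forward shrinks the usable area, moving it backward grows it,
 * and the new offset becomes the current one. */
static void toy_finish_reshape(struct toy_rdev *rdev)
{
	if (rdev->data_offset > rdev->new_data_offset)
		rdev->sectors += rdev->data_offset - rdev->new_data_offset;
	else
		rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
	rdev->data_offset = rdev->new_data_offset;
}

int main(void)
{
	/* backwards reshape: the data was relocated to a larger offset */
	struct toy_rdev rdev = { .data_offset = 2048, .new_data_offset = 4096,
				 .sectors = 1000000 };

	toy_finish_reshape(&rdev);
	printf("offset=%llu sectors=%llu\n", rdev.data_offset, rdev.sectors);
	/* prints: offset=4096 sectors=997952 */
	return 0;
}
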
@@ -7894,10 +8157,15 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
7894} 8157}
7895 8158
7896int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8159int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
7897 int acknowledged) 8160 int is_new)
7898{ 8161{
7899 int rv = md_set_badblocks(&rdev->badblocks, 8162 int rv;
7900 s + rdev->data_offset, sectors, acknowledged); 8163 if (is_new)
8164 s += rdev->new_data_offset;
8165 else
8166 s += rdev->data_offset;
8167 rv = md_set_badblocks(&rdev->badblocks,
8168 s, sectors, 0);
7901 if (rv) { 8169 if (rv) {
7902 /* Make sure they get written out promptly */ 8170 /* Make sure they get written out promptly */
7903 sysfs_notify_dirent_safe(rdev->sysfs_state); 8171 sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -8003,11 +8271,15 @@ out:
8003 return rv; 8271 return rv;
8004} 8272}
8005 8273
8006int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors) 8274int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8275 int is_new)
8007{ 8276{
8277 if (is_new)
8278 s += rdev->new_data_offset;
8279 else
8280 s += rdev->data_offset;
8008 return md_clear_badblocks(&rdev->badblocks, 8281 return md_clear_badblocks(&rdev->badblocks,
8009 s + rdev->data_offset, 8282 s, sectors);
8010 sectors);
8011} 8283}
8012EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 8284EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
8013 8285
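
With two data offsets in play during a reshape, callers of the bad-block helpers now say which layout ('is_new') a sector is relative to, and the helper translates it to an absolute device sector before touching the badblocks table. A minimal sketch of just that translation (toy structure, not the kernel implementation):

#include <stdio.h>

struct toy_rdev { unsigned long long data_offset, new_data_offset; };

/* Translate an array-relative sector to a device-absolute one, picking the
 * offset that matches the layout the caller was working in. */
static unsigned long long toy_bb_sector(const struct toy_rdev *rdev,
					unsigned long long s, int is_new)
{
	return s + (is_new ? rdev->new_data_offset : rdev->data_offset);
}

int main(void)
{
	struct toy_rdev rdev = { .data_offset = 2048, .new_data_offset = 6144 };

	printf("old layout: %llu\n", toy_bb_sector(&rdev, 100, 0)); /* 2148 */
	printf("new layout: %llu\n", toy_bb_sector(&rdev, 100, 1)); /* 6244 */
	return 0;
}
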
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1c2063ccf48e..7b4a3c318cae 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -55,6 +55,7 @@ struct md_rdev {
55 int sb_loaded; 55 int sb_loaded;
56 __u64 sb_events; 56 __u64 sb_events;
57 sector_t data_offset; /* start of data in array */ 57 sector_t data_offset; /* start of data in array */
58 sector_t new_data_offset;/* only relevant while reshaping */
58 sector_t sb_start; /* offset of the super block (in 512byte sectors) */ 59 sector_t sb_start; /* offset of the super block (in 512byte sectors) */
59 int sb_size; /* bytes in the superblock */ 60 int sb_size; /* bytes in the superblock */
60 int preferred_minor; /* autorun support */ 61 int preferred_minor; /* autorun support */
@@ -193,8 +194,9 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
193 return 0; 194 return 0;
194} 195}
195extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 196extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
196 int acknowledged); 197 int is_new);
197extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors); 198extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
199 int is_new);
198extern void md_ack_all_badblocks(struct badblocks *bb); 200extern void md_ack_all_badblocks(struct badblocks *bb);
199 201
200struct mddev { 202struct mddev {
@@ -262,6 +264,7 @@ struct mddev {
262 sector_t reshape_position; 264 sector_t reshape_position;
263 int delta_disks, new_level, new_layout; 265 int delta_disks, new_level, new_layout;
264 int new_chunk_sectors; 266 int new_chunk_sectors;
267 int reshape_backwards;
265 268
266 atomic_t plug_cnt; /* If device is expecting 269 atomic_t plug_cnt; /* If device is expecting
267 * more bios soon. 270 * more bios soon.
@@ -390,10 +393,13 @@ struct mddev {
390 * For external metadata, offset 393 * For external metadata, offset
391 * from start of device. 394 * from start of device.
392 */ 395 */
396 unsigned long space; /* space available at this offset */
393 loff_t default_offset; /* this is the offset to use when 397 loff_t default_offset; /* this is the offset to use when
394 * hot-adding a bitmap. It should 398 * hot-adding a bitmap. It should
395 * eventually be settable by sysfs. 399 * eventually be settable by sysfs.
396 */ 400 */
401 unsigned long default_space; /* space available at
402 * default offset */
397 struct mutex mutex; 403 struct mutex mutex;
398 unsigned long chunksize; 404 unsigned long chunksize;
399 unsigned long daemon_sleep; /* how many jiffies between updates? */ 405 unsigned long daemon_sleep; /* how many jiffies between updates? */
@@ -591,6 +597,7 @@ extern void md_write_start(struct mddev *mddev, struct bio *bi);
591extern void md_write_end(struct mddev *mddev); 597extern void md_write_end(struct mddev *mddev);
592extern void md_done_sync(struct mddev *mddev, int blocks, int ok); 598extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
593extern void md_error(struct mddev *mddev, struct md_rdev *rdev); 599extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
600extern void md_finish_reshape(struct mddev *mddev);
594 601
595extern int mddev_congested(struct mddev *mddev, int bits); 602extern int mddev_congested(struct mddev *mddev, int bits);
596extern void md_flush_request(struct mddev *mddev, struct bio *bio); 603extern void md_flush_request(struct mddev *mddev, struct bio *bio);
@@ -615,6 +622,7 @@ extern int md_run(struct mddev *mddev);
615extern void md_stop(struct mddev *mddev); 622extern void md_stop(struct mddev *mddev);
616extern void md_stop_writes(struct mddev *mddev); 623extern void md_stop_writes(struct mddev *mddev);
617extern int md_rdev_init(struct md_rdev *rdev); 624extern int md_rdev_init(struct md_rdev *rdev);
625extern void md_rdev_clear(struct md_rdev *rdev);
618 626
619extern void mddev_suspend(struct mddev *mddev); 627extern void mddev_suspend(struct mddev *mddev);
620extern void mddev_resume(struct mddev *mddev); 628extern void mddev_resume(struct mddev *mddev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 15dd59b84e94..835de7168cd3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1859,7 +1859,9 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
1859 1859
1860 rdev = conf->mirrors[d].rdev; 1860 rdev = conf->mirrors[d].rdev;
1861 if (rdev && 1861 if (rdev &&
1862 test_bit(In_sync, &rdev->flags) && 1862 (test_bit(In_sync, &rdev->flags) ||
1863 (!test_bit(Faulty, &rdev->flags) &&
1864 rdev->recovery_offset >= sect + s)) &&
1863 is_badblock(rdev, sect, s, 1865 is_badblock(rdev, sect, s,
1864 &first_bad, &bad_sectors) == 0 && 1866 &first_bad, &bad_sectors) == 0 &&
1865 sync_page_io(rdev, sect, s<<9, 1867 sync_page_io(rdev, sect, s<<9,
@@ -2024,7 +2026,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
2024 continue; 2026 continue;
2025 if (test_bit(BIO_UPTODATE, &bio->bi_flags) && 2027 if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
2026 test_bit(R1BIO_MadeGood, &r1_bio->state)) { 2028 test_bit(R1BIO_MadeGood, &r1_bio->state)) {
2027 rdev_clear_badblocks(rdev, r1_bio->sector, s); 2029 rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
2028 } 2030 }
2029 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 2031 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
2030 test_bit(R1BIO_WriteError, &r1_bio->state)) { 2032 test_bit(R1BIO_WriteError, &r1_bio->state)) {
@@ -2044,7 +2046,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
2044 struct md_rdev *rdev = conf->mirrors[m].rdev; 2046 struct md_rdev *rdev = conf->mirrors[m].rdev;
2045 rdev_clear_badblocks(rdev, 2047 rdev_clear_badblocks(rdev,
2046 r1_bio->sector, 2048 r1_bio->sector,
2047 r1_bio->sectors); 2049 r1_bio->sectors, 0);
2048 rdev_dec_pending(rdev, conf->mddev); 2050 rdev_dec_pending(rdev, conf->mddev);
2049 } else if (r1_bio->bios[m] != NULL) { 2051 } else if (r1_bio->bios[m] != NULL) {
2050 /* This drive got a write error. We need to 2052 /* This drive got a write error. We need to
@@ -2598,7 +2600,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2598 if (!disk->rdev || 2600 if (!disk->rdev ||
2599 !test_bit(In_sync, &disk->rdev->flags)) { 2601 !test_bit(In_sync, &disk->rdev->flags)) {
2600 disk->head_position = 0; 2602 disk->head_position = 0;
2601 if (disk->rdev) 2603 if (disk->rdev &&
2604 (disk->rdev->saved_raid_disk < 0))
2602 conf->fullsync = 1; 2605 conf->fullsync = 1;
2603 } else if (conf->last_used < 0) 2606 } else if (conf->last_used < 0)
2604 /* 2607 /*
@@ -2750,9 +2753,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
2750 * any io in the removed space completes, but it hardly seems 2753 * any io in the removed space completes, but it hardly seems
2751 * worth it. 2754 * worth it.
2752 */ 2755 */
2753 md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0)); 2756 sector_t newsize = raid1_size(mddev, sectors, 0);
2754 if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) 2757 if (mddev->external_size &&
2758 mddev->array_sectors > newsize)
2755 return -EINVAL; 2759 return -EINVAL;
2760 if (mddev->bitmap) {
2761 int ret = bitmap_resize(mddev->bitmap, newsize, 0, 0);
2762 if (ret)
2763 return ret;
2764 }
2765 md_set_array_sectors(mddev, newsize);
2756 set_capacity(mddev->gendisk, mddev->array_sectors); 2766 set_capacity(mddev->gendisk, mddev->array_sectors);
2757 revalidate_disk(mddev->gendisk); 2767 revalidate_disk(mddev->gendisk);
2758 if (sectors > mddev->dev_sectors && 2768 if (sectors > mddev->dev_sectors &&
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3f91c2e1dfe7..987db37cb875 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -24,6 +24,7 @@
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/seq_file.h> 25#include <linux/seq_file.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h>
27#include "md.h" 28#include "md.h"
28#include "raid10.h" 29#include "raid10.h"
29#include "raid0.h" 30#include "raid0.h"
@@ -68,6 +69,11 @@ static int max_queued_requests = 1024;
68static void allow_barrier(struct r10conf *conf); 69static void allow_barrier(struct r10conf *conf);
69static void lower_barrier(struct r10conf *conf); 70static void lower_barrier(struct r10conf *conf);
70static int enough(struct r10conf *conf, int ignore); 71static int enough(struct r10conf *conf, int ignore);
72static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
73 int *skipped);
74static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
75static void end_reshape_write(struct bio *bio, int error);
76static void end_reshape(struct r10conf *conf);
71 77
72static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 78static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
73{ 79{
@@ -112,7 +118,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
112 if (!r10_bio) 118 if (!r10_bio)
113 return NULL; 119 return NULL;
114 120
115 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) 121 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
122 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
116 nalloc = conf->copies; /* resync */ 123 nalloc = conf->copies; /* resync */
117 else 124 else
118 nalloc = 2; /* recovery */ 125 nalloc = 2; /* recovery */
@@ -140,9 +147,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
140 struct bio *rbio = r10_bio->devs[j].repl_bio; 147 struct bio *rbio = r10_bio->devs[j].repl_bio;
141 bio = r10_bio->devs[j].bio; 148 bio = r10_bio->devs[j].bio;
142 for (i = 0; i < RESYNC_PAGES; i++) { 149 for (i = 0; i < RESYNC_PAGES; i++) {
143 if (j == 1 && !test_bit(MD_RECOVERY_SYNC, 150 if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
144 &conf->mddev->recovery)) { 151 &conf->mddev->recovery)) {
145 /* we can share bv_page's during recovery */ 152 /* we can share bv_page's during recovery
153 * and reshape */
146 struct bio *rbio = r10_bio->devs[0].bio; 154 struct bio *rbio = r10_bio->devs[0].bio;
147 page = rbio->bi_io_vec[i].bv_page; 155 page = rbio->bi_io_vec[i].bv_page;
148 get_page(page); 156 get_page(page);
@@ -165,10 +173,11 @@ out_free_pages:
165 while (j--) 173 while (j--)
166 for (i = 0; i < RESYNC_PAGES ; i++) 174 for (i = 0; i < RESYNC_PAGES ; i++)
167 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); 175 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
168 j = -1; 176 j = 0;
169out_free_bio: 177out_free_bio:
170 while (++j < nalloc) { 178 for ( ; j < nalloc; j++) {
171 bio_put(r10_bio->devs[j].bio); 179 if (r10_bio->devs[j].bio)
180 bio_put(r10_bio->devs[j].bio);
172 if (r10_bio->devs[j].repl_bio) 181 if (r10_bio->devs[j].repl_bio)
173 bio_put(r10_bio->devs[j].repl_bio); 182 bio_put(r10_bio->devs[j].repl_bio);
174 } 183 }
@@ -504,79 +513,96 @@ static void raid10_end_write_request(struct bio *bio, int error)
504 * sector offset to a virtual address 513 * sector offset to a virtual address
505 */ 514 */
506 515
507static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) 516static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
508{ 517{
509 int n,f; 518 int n,f;
510 sector_t sector; 519 sector_t sector;
511 sector_t chunk; 520 sector_t chunk;
512 sector_t stripe; 521 sector_t stripe;
513 int dev; 522 int dev;
514
515 int slot = 0; 523 int slot = 0;
516 524
517 /* now calculate first sector/dev */ 525 /* now calculate first sector/dev */
518 chunk = r10bio->sector >> conf->chunk_shift; 526 chunk = r10bio->sector >> geo->chunk_shift;
519 sector = r10bio->sector & conf->chunk_mask; 527 sector = r10bio->sector & geo->chunk_mask;
520 528
521 chunk *= conf->near_copies; 529 chunk *= geo->near_copies;
522 stripe = chunk; 530 stripe = chunk;
523 dev = sector_div(stripe, conf->raid_disks); 531 dev = sector_div(stripe, geo->raid_disks);
524 if (conf->far_offset) 532 if (geo->far_offset)
525 stripe *= conf->far_copies; 533 stripe *= geo->far_copies;
526 534
527 sector += stripe << conf->chunk_shift; 535 sector += stripe << geo->chunk_shift;
528 536
529 /* and calculate all the others */ 537 /* and calculate all the others */
530 for (n=0; n < conf->near_copies; n++) { 538 for (n = 0; n < geo->near_copies; n++) {
531 int d = dev; 539 int d = dev;
532 sector_t s = sector; 540 sector_t s = sector;
533 r10bio->devs[slot].addr = sector; 541 r10bio->devs[slot].addr = sector;
534 r10bio->devs[slot].devnum = d; 542 r10bio->devs[slot].devnum = d;
535 slot++; 543 slot++;
536 544
537 for (f = 1; f < conf->far_copies; f++) { 545 for (f = 1; f < geo->far_copies; f++) {
538 d += conf->near_copies; 546 d += geo->near_copies;
539 if (d >= conf->raid_disks) 547 if (d >= geo->raid_disks)
540 d -= conf->raid_disks; 548 d -= geo->raid_disks;
541 s += conf->stride; 549 s += geo->stride;
542 r10bio->devs[slot].devnum = d; 550 r10bio->devs[slot].devnum = d;
543 r10bio->devs[slot].addr = s; 551 r10bio->devs[slot].addr = s;
544 slot++; 552 slot++;
545 } 553 }
546 dev++; 554 dev++;
547 if (dev >= conf->raid_disks) { 555 if (dev >= geo->raid_disks) {
548 dev = 0; 556 dev = 0;
549 sector += (conf->chunk_mask + 1); 557 sector += (geo->chunk_mask + 1);
550 } 558 }
551 } 559 }
552 BUG_ON(slot != conf->copies); 560}
561
562static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
563{
564 struct geom *geo = &conf->geo;
565
566 if (conf->reshape_progress != MaxSector &&
567 ((r10bio->sector >= conf->reshape_progress) !=
568 conf->mddev->reshape_backwards)) {
569 set_bit(R10BIO_Previous, &r10bio->state);
570 geo = &conf->prev;
571 } else
572 clear_bit(R10BIO_Previous, &r10bio->state);
573
574 __raid10_find_phys(geo, r10bio);
553} 575}
554 576
555static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) 577static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
556{ 578{
557 sector_t offset, chunk, vchunk; 579 sector_t offset, chunk, vchunk;
580 /* Never use conf->prev as this is only called during resync
581 * or recovery, so reshape isn't happening
582 */
583 struct geom *geo = &conf->geo;
558 584
559 offset = sector & conf->chunk_mask; 585 offset = sector & geo->chunk_mask;
560 if (conf->far_offset) { 586 if (geo->far_offset) {
561 int fc; 587 int fc;
562 chunk = sector >> conf->chunk_shift; 588 chunk = sector >> geo->chunk_shift;
563 fc = sector_div(chunk, conf->far_copies); 589 fc = sector_div(chunk, geo->far_copies);
564 dev -= fc * conf->near_copies; 590 dev -= fc * geo->near_copies;
565 if (dev < 0) 591 if (dev < 0)
566 dev += conf->raid_disks; 592 dev += geo->raid_disks;
567 } else { 593 } else {
568 while (sector >= conf->stride) { 594 while (sector >= geo->stride) {
569 sector -= conf->stride; 595 sector -= geo->stride;
570 if (dev < conf->near_copies) 596 if (dev < geo->near_copies)
571 dev += conf->raid_disks - conf->near_copies; 597 dev += geo->raid_disks - geo->near_copies;
572 else 598 else
573 dev -= conf->near_copies; 599 dev -= geo->near_copies;
574 } 600 }
575 chunk = sector >> conf->chunk_shift; 601 chunk = sector >> geo->chunk_shift;
576 } 602 }
577 vchunk = chunk * conf->raid_disks + dev; 603 vchunk = chunk * geo->raid_disks + dev;
578 sector_div(vchunk, conf->near_copies); 604 sector_div(vchunk, geo->near_copies);
579 return (vchunk << conf->chunk_shift) + offset; 605 return (vchunk << geo->chunk_shift) + offset;
580} 606}
581 607
582/** 608/**
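
The geometry split makes the address math worth a worked example. The userspace sketch below reuses the same arithmetic as __raid10_find_phys() for a hypothetical 4-disk near-2 layout with 64KiB chunks; the field names mirror struct geom but the program is only an illustration:

#include <stdio.h>

/* Trimmed-down copy of the geometry fields used by __raid10_find_phys(). */
struct toy_geom {
	int raid_disks;
	int near_copies, far_copies, far_offset;
	unsigned long long stride;	/* sectors between far copies */
	unsigned long long chunk_mask;
	int chunk_shift;
};

/* Map one array sector to the (device, device-sector) pair of every copy. */
static void toy_find_phys(const struct toy_geom *geo, unsigned long long vsect)
{
	unsigned long long chunk = vsect >> geo->chunk_shift;
	unsigned long long sector = vsect & geo->chunk_mask;
	unsigned long long stripe;
	int dev, n, f;

	chunk *= geo->near_copies;
	stripe = chunk;
	dev = stripe % geo->raid_disks;
	stripe /= geo->raid_disks;
	if (geo->far_offset)
		stripe *= geo->far_copies;
	sector += stripe << geo->chunk_shift;

	for (n = 0; n < geo->near_copies; n++) {
		int d = dev;
		unsigned long long s = sector;

		printf("  copy on dev %d at sector %llu\n", d, s);
		for (f = 1; f < geo->far_copies; f++) {
			d += geo->near_copies;
			if (d >= geo->raid_disks)
				d -= geo->raid_disks;
			s += geo->stride;
			printf("  far copy on dev %d at sector %llu\n", d, s);
		}
		dev++;
		if (dev >= geo->raid_disks) {
			dev = 0;
			sector += geo->chunk_mask + 1;
		}
	}
}

int main(void)
{
	/* 4 disks, 2 near copies, 128-sector (64KiB) chunks: a plain "n2" layout */
	struct toy_geom geo = {
		.raid_disks = 4, .near_copies = 2, .far_copies = 1,
		.far_offset = 0, .stride = 0,
		.chunk_mask = 127, .chunk_shift = 7,
	};

	printf("array sector 300:\n");
	toy_find_phys(&geo, 300);
	return 0;
}

For array sector 300 this reports both copies at device sector 172, on disks 0 and 1, i.e. the second device chunk of a near-2 stripe.
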
@@ -597,10 +623,17 @@ static int raid10_mergeable_bvec(struct request_queue *q,
597 struct r10conf *conf = mddev->private; 623 struct r10conf *conf = mddev->private;
598 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 624 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
599 int max; 625 int max;
600 unsigned int chunk_sectors = mddev->chunk_sectors; 626 unsigned int chunk_sectors;
601 unsigned int bio_sectors = bvm->bi_size >> 9; 627 unsigned int bio_sectors = bvm->bi_size >> 9;
628 struct geom *geo = &conf->geo;
629
630 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
631 if (conf->reshape_progress != MaxSector &&
632 ((sector >= conf->reshape_progress) !=
633 conf->mddev->reshape_backwards))
634 geo = &conf->prev;
602 635
603 if (conf->near_copies < conf->raid_disks) { 636 if (geo->near_copies < geo->raid_disks) {
604 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) 637 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
605 + bio_sectors)) << 9; 638 + bio_sectors)) << 9;
606 if (max < 0) 639 if (max < 0)
@@ -614,6 +647,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
614 if (mddev->merge_check_needed) { 647 if (mddev->merge_check_needed) {
615 struct r10bio r10_bio; 648 struct r10bio r10_bio;
616 int s; 649 int s;
650 if (conf->reshape_progress != MaxSector) {
651 /* Cannot give any guidance during reshape */
652 if (max <= biovec->bv_len && bio_sectors == 0)
653 return biovec->bv_len;
654 return 0;
655 }
617 r10_bio.sector = sector; 656 r10_bio.sector = sector;
618 raid10_find_phys(conf, &r10_bio); 657 raid10_find_phys(conf, &r10_bio);
619 rcu_read_lock(); 658 rcu_read_lock();
@@ -681,6 +720,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
681 struct md_rdev *rdev, *best_rdev; 720 struct md_rdev *rdev, *best_rdev;
682 int do_balance; 721 int do_balance;
683 int best_slot; 722 int best_slot;
723 struct geom *geo = &conf->geo;
684 724
685 raid10_find_phys(conf, r10_bio); 725 raid10_find_phys(conf, r10_bio);
686 rcu_read_lock(); 726 rcu_read_lock();
@@ -761,11 +801,11 @@ retry:
761 * sequential read speed for 'far copies' arrays. So only 801 * sequential read speed for 'far copies' arrays. So only
762 * keep it for 'near' arrays, and review those later. 802 * keep it for 'near' arrays, and review those later.
763 */ 803 */
764 if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) 804 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
765 break; 805 break;
766 806
767 /* for far > 1 always use the lowest address */ 807 /* for far > 1 always use the lowest address */
768 if (conf->far_copies > 1) 808 if (geo->far_copies > 1)
769 new_distance = r10_bio->devs[slot].addr; 809 new_distance = r10_bio->devs[slot].addr;
770 else 810 else
771 new_distance = abs(r10_bio->devs[slot].addr - 811 new_distance = abs(r10_bio->devs[slot].addr -
@@ -812,7 +852,10 @@ static int raid10_congested(void *data, int bits)
812 if (mddev_congested(mddev, bits)) 852 if (mddev_congested(mddev, bits))
813 return 1; 853 return 1;
814 rcu_read_lock(); 854 rcu_read_lock();
815 for (i = 0; i < conf->raid_disks && ret == 0; i++) { 855 for (i = 0;
856 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
857 && ret == 0;
858 i++) {
816 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 859 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
817 if (rdev && !test_bit(Faulty, &rdev->flags)) { 860 if (rdev && !test_bit(Faulty, &rdev->flags)) {
818 struct request_queue *q = bdev_get_queue(rdev->bdev); 861 struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -973,13 +1016,24 @@ static void unfreeze_array(struct r10conf *conf)
973 spin_unlock_irq(&conf->resync_lock); 1016 spin_unlock_irq(&conf->resync_lock);
974} 1017}
975 1018
1019static sector_t choose_data_offset(struct r10bio *r10_bio,
1020 struct md_rdev *rdev)
1021{
1022 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1023 test_bit(R10BIO_Previous, &r10_bio->state))
1024 return rdev->data_offset;
1025 else
1026 return rdev->new_data_offset;
1027}
1028
976static void make_request(struct mddev *mddev, struct bio * bio) 1029static void make_request(struct mddev *mddev, struct bio * bio)
977{ 1030{
978 struct r10conf *conf = mddev->private; 1031 struct r10conf *conf = mddev->private;
979 struct r10bio *r10_bio; 1032 struct r10bio *r10_bio;
980 struct bio *read_bio; 1033 struct bio *read_bio;
981 int i; 1034 int i;
982 int chunk_sects = conf->chunk_mask + 1; 1035 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1036 int chunk_sects = chunk_mask + 1;
983 const int rw = bio_data_dir(bio); 1037 const int rw = bio_data_dir(bio);
984 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 1038 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
985 const unsigned long do_fua = (bio->bi_rw & REQ_FUA); 1039 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
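
choose_data_offset() is the one place raid10 decides which offset a bio uses: if no reshape is running, or the bio was mapped with the previous geometry (R10BIO_Previous), it keeps the old data_offset, otherwise it uses new_data_offset. A compact userspace restatement, with plain flags standing in for the recovery and r10bio state bits:

#include <stdio.h>

struct toy_rdev { unsigned long long data_offset, new_data_offset; };

/* reshaping: is a reshape currently running on the array?
 * previous:  was this bio mapped with the pre-reshape geometry? */
static unsigned long long toy_choose_offset(const struct toy_rdev *rdev,
					    int reshaping, int previous)
{
	if (!reshaping || previous)
		return rdev->data_offset;
	return rdev->new_data_offset;
}

int main(void)
{
	struct toy_rdev rdev = { .data_offset = 2048, .new_data_offset = 6144 };

	printf("no reshape:        %llu\n", toy_choose_offset(&rdev, 0, 0));
	printf("reshape, old part: %llu\n", toy_choose_offset(&rdev, 1, 1));
	printf("reshape, new part: %llu\n", toy_choose_offset(&rdev, 1, 0));
	return 0;
}
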
@@ -988,6 +1042,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
988 int plugged; 1042 int plugged;
989 int sectors_handled; 1043 int sectors_handled;
990 int max_sectors; 1044 int max_sectors;
1045 int sectors;
991 1046
992 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 1047 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
993 md_flush_request(mddev, bio); 1048 md_flush_request(mddev, bio);
@@ -997,9 +1052,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
997 /* If this request crosses a chunk boundary, we need to 1052 /* If this request crosses a chunk boundary, we need to
998 * split it. This will only happen for 1 PAGE (or less) requests. 1053 * split it. This will only happen for 1 PAGE (or less) requests.
999 */ 1054 */
1000 if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) 1055 if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
1001 > chunk_sects && 1056 > chunk_sects
1002 conf->near_copies < conf->raid_disks)) { 1057 && (conf->geo.near_copies < conf->geo.raid_disks
1058 || conf->prev.near_copies < conf->prev.raid_disks))) {
1003 struct bio_pair *bp; 1059 struct bio_pair *bp;
1004 /* Sanity check -- queue functions should prevent this happening */ 1060 /* Sanity check -- queue functions should prevent this happening */
1005 if (bio->bi_vcnt != 1 || 1061 if (bio->bi_vcnt != 1 ||
@@ -1051,10 +1107,41 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1051 */ 1107 */
1052 wait_barrier(conf); 1108 wait_barrier(conf);
1053 1109
1110 sectors = bio->bi_size >> 9;
1111 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1112 bio->bi_sector < conf->reshape_progress &&
1113 bio->bi_sector + sectors > conf->reshape_progress) {
1114 /* IO spans the reshape position. Need to wait for
1115 * reshape to pass
1116 */
1117 allow_barrier(conf);
1118 wait_event(conf->wait_barrier,
1119 conf->reshape_progress <= bio->bi_sector ||
1120 conf->reshape_progress >= bio->bi_sector + sectors);
1121 wait_barrier(conf);
1122 }
1123 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1124 bio_data_dir(bio) == WRITE &&
1125 (mddev->reshape_backwards
1126 ? (bio->bi_sector < conf->reshape_safe &&
1127 bio->bi_sector + sectors > conf->reshape_progress)
1128 : (bio->bi_sector + sectors > conf->reshape_safe &&
1129 bio->bi_sector < conf->reshape_progress))) {
1130 /* Need to update reshape_position in metadata */
1131 mddev->reshape_position = conf->reshape_progress;
1132 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1133 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1134 md_wakeup_thread(mddev->thread);
1135 wait_event(mddev->sb_wait,
1136 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1137
1138 conf->reshape_safe = mddev->reshape_position;
1139 }
1140
1054 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 1141 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1055 1142
1056 r10_bio->master_bio = bio; 1143 r10_bio->master_bio = bio;
1057 r10_bio->sectors = bio->bi_size >> 9; 1144 r10_bio->sectors = sectors;
1058 1145
1059 r10_bio->mddev = mddev; 1146 r10_bio->mddev = mddev;
1060 r10_bio->sector = bio->bi_sector; 1147 r10_bio->sector = bio->bi_sector;
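
The wait added to make_request() boils down to an interval test: an I/O must stall while it straddles reshape_progress, because blocks on either side of that point live in different geometries. A minimal userspace predicate capturing that test:

#include <stdio.h>

/* True when [sector, sector+sectors) straddles the current reshape point,
 * i.e. part of the request is in the old layout and part in the new one. */
static int spans_reshape(unsigned long long sector, unsigned long long sectors,
			 unsigned long long reshape_progress)
{
	return sector < reshape_progress &&
	       sector + sectors > reshape_progress;
}

int main(void)
{
	unsigned long long progress = 10000;

	printf("%d\n", spans_reshape(9990, 8, progress));   /* 0: entirely before */
	printf("%d\n", spans_reshape(9990, 32, progress));  /* 1: straddles, must wait */
	printf("%d\n", spans_reshape(10100, 32, progress)); /* 0: entirely after */
	return 0;
}
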
@@ -1093,7 +1180,7 @@ read_again:
1093 r10_bio->devs[slot].rdev = rdev; 1180 r10_bio->devs[slot].rdev = rdev;
1094 1181
1095 read_bio->bi_sector = r10_bio->devs[slot].addr + 1182 read_bio->bi_sector = r10_bio->devs[slot].addr +
1096 rdev->data_offset; 1183 choose_data_offset(r10_bio, rdev);
1097 read_bio->bi_bdev = rdev->bdev; 1184 read_bio->bi_bdev = rdev->bdev;
1098 read_bio->bi_end_io = raid10_end_read_request; 1185 read_bio->bi_end_io = raid10_end_read_request;
1099 read_bio->bi_rw = READ | do_sync; 1186 read_bio->bi_rw = READ | do_sync;
@@ -1297,7 +1384,8 @@ retry_write:
1297 r10_bio->devs[i].bio = mbio; 1384 r10_bio->devs[i].bio = mbio;
1298 1385
1299 mbio->bi_sector = (r10_bio->devs[i].addr+ 1386 mbio->bi_sector = (r10_bio->devs[i].addr+
1300 conf->mirrors[d].rdev->data_offset); 1387 choose_data_offset(r10_bio,
1388 conf->mirrors[d].rdev));
1301 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 1389 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1302 mbio->bi_end_io = raid10_end_write_request; 1390 mbio->bi_end_io = raid10_end_write_request;
1303 mbio->bi_rw = WRITE | do_sync | do_fua; 1391 mbio->bi_rw = WRITE | do_sync | do_fua;
@@ -1321,8 +1409,10 @@ retry_write:
1321 * so it cannot disappear, so the replacement cannot 1409 * so it cannot disappear, so the replacement cannot
1322 * become NULL here 1410 * become NULL here
1323 */ 1411 */
1324 mbio->bi_sector = (r10_bio->devs[i].addr+ 1412 mbio->bi_sector = (r10_bio->devs[i].addr +
1325 conf->mirrors[d].replacement->data_offset); 1413 choose_data_offset(
1414 r10_bio,
1415 conf->mirrors[d].replacement));
1326 mbio->bi_bdev = conf->mirrors[d].replacement->bdev; 1416 mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
1327 mbio->bi_end_io = raid10_end_write_request; 1417 mbio->bi_end_io = raid10_end_write_request;
1328 mbio->bi_rw = WRITE | do_sync | do_fua; 1418 mbio->bi_rw = WRITE | do_sync | do_fua;
@@ -1368,19 +1458,19 @@ static void status(struct seq_file *seq, struct mddev *mddev)
1368 struct r10conf *conf = mddev->private; 1458 struct r10conf *conf = mddev->private;
1369 int i; 1459 int i;
1370 1460
1371 if (conf->near_copies < conf->raid_disks) 1461 if (conf->geo.near_copies < conf->geo.raid_disks)
1372 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); 1462 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1373 if (conf->near_copies > 1) 1463 if (conf->geo.near_copies > 1)
1374 seq_printf(seq, " %d near-copies", conf->near_copies); 1464 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1375 if (conf->far_copies > 1) { 1465 if (conf->geo.far_copies > 1) {
1376 if (conf->far_offset) 1466 if (conf->geo.far_offset)
1377 seq_printf(seq, " %d offset-copies", conf->far_copies); 1467 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1378 else 1468 else
1379 seq_printf(seq, " %d far-copies", conf->far_copies); 1469 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1380 } 1470 }
1381 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 1471 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1382 conf->raid_disks - mddev->degraded); 1472 conf->geo.raid_disks - mddev->degraded);
1383 for (i = 0; i < conf->raid_disks; i++) 1473 for (i = 0; i < conf->geo.raid_disks; i++)
1384 seq_printf(seq, "%s", 1474 seq_printf(seq, "%s",
1385 conf->mirrors[i].rdev && 1475 conf->mirrors[i].rdev &&
1386 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); 1476 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
@@ -1392,7 +1482,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
1392 * Don't consider the device numbered 'ignore' 1482 * Don't consider the device numbered 'ignore'
1393 * as we might be about to remove it. 1483 * as we might be about to remove it.
1394 */ 1484 */
1395static int enough(struct r10conf *conf, int ignore) 1485static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
1396{ 1486{
1397 int first = 0; 1487 int first = 0;
1398 1488
@@ -1403,7 +1493,7 @@ static int enough(struct r10conf *conf, int ignore)
1403 if (conf->mirrors[first].rdev && 1493 if (conf->mirrors[first].rdev &&
1404 first != ignore) 1494 first != ignore)
1405 cnt++; 1495 cnt++;
1406 first = (first+1) % conf->raid_disks; 1496 first = (first+1) % geo->raid_disks;
1407 } 1497 }
1408 if (cnt == 0) 1498 if (cnt == 0)
1409 return 0; 1499 return 0;
@@ -1411,6 +1501,12 @@ static int enough(struct r10conf *conf, int ignore)
1411 return 1; 1501 return 1;
1412} 1502}
1413 1503
1504static int enough(struct r10conf *conf, int ignore)
1505{
1506 return _enough(conf, &conf->geo, ignore) &&
1507 _enough(conf, &conf->prev, ignore);
1508}
1509
1414static void error(struct mddev *mddev, struct md_rdev *rdev) 1510static void error(struct mddev *mddev, struct md_rdev *rdev)
1415{ 1511{
1416 char b[BDEVNAME_SIZE]; 1512 char b[BDEVNAME_SIZE];
@@ -1445,7 +1541,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1445 "md/raid10:%s: Disk failure on %s, disabling device.\n" 1541 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1446 "md/raid10:%s: Operation continuing on %d devices.\n", 1542 "md/raid10:%s: Operation continuing on %d devices.\n",
1447 mdname(mddev), bdevname(rdev->bdev, b), 1543 mdname(mddev), bdevname(rdev->bdev, b),
1448 mdname(mddev), conf->raid_disks - mddev->degraded); 1544 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1449} 1545}
1450 1546
1451static void print_conf(struct r10conf *conf) 1547static void print_conf(struct r10conf *conf)
@@ -1458,10 +1554,10 @@ static void print_conf(struct r10conf *conf)
1458 printk(KERN_DEBUG "(!conf)\n"); 1554 printk(KERN_DEBUG "(!conf)\n");
1459 return; 1555 return;
1460 } 1556 }
1461 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1557 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1462 conf->raid_disks); 1558 conf->geo.raid_disks);
1463 1559
1464 for (i = 0; i < conf->raid_disks; i++) { 1560 for (i = 0; i < conf->geo.raid_disks; i++) {
1465 char b[BDEVNAME_SIZE]; 1561 char b[BDEVNAME_SIZE];
1466 tmp = conf->mirrors + i; 1562 tmp = conf->mirrors + i;
1467 if (tmp->rdev) 1563 if (tmp->rdev)
@@ -1493,7 +1589,7 @@ static int raid10_spare_active(struct mddev *mddev)
1493 * Find all non-in_sync disks within the RAID10 configuration 1589 * Find all non-in_sync disks within the RAID10 configuration
1494 * and mark them in_sync 1590 * and mark them in_sync
1495 */ 1591 */
1496 for (i = 0; i < conf->raid_disks; i++) { 1592 for (i = 0; i < conf->geo.raid_disks; i++) {
1497 tmp = conf->mirrors + i; 1593 tmp = conf->mirrors + i;
1498 if (tmp->replacement 1594 if (tmp->replacement
1499 && tmp->replacement->recovery_offset == MaxSector 1595 && tmp->replacement->recovery_offset == MaxSector
@@ -1535,7 +1631,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1535 int err = -EEXIST; 1631 int err = -EEXIST;
1536 int mirror; 1632 int mirror;
1537 int first = 0; 1633 int first = 0;
1538 int last = conf->raid_disks - 1; 1634 int last = conf->geo.raid_disks - 1;
1539 struct request_queue *q = bdev_get_queue(rdev->bdev); 1635 struct request_queue *q = bdev_get_queue(rdev->bdev);
1540 1636
1541 if (mddev->recovery_cp < MaxSector) 1637 if (mddev->recovery_cp < MaxSector)
@@ -1543,7 +1639,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1543 * very different from resync 1639 * very different from resync
1544 */ 1640 */
1545 return -EBUSY; 1641 return -EBUSY;
1546 if (rdev->saved_raid_disk < 0 && !enough(conf, -1)) 1642 if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
1547 return -EINVAL; 1643 return -EINVAL;
1548 1644
1549 if (rdev->raid_disk >= 0) 1645 if (rdev->raid_disk >= 0)
@@ -1635,6 +1731,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1635 if (!test_bit(Faulty, &rdev->flags) && 1731 if (!test_bit(Faulty, &rdev->flags) &&
1636 mddev->recovery_disabled != p->recovery_disabled && 1732 mddev->recovery_disabled != p->recovery_disabled &&
1637 (!p->replacement || p->replacement == rdev) && 1733 (!p->replacement || p->replacement == rdev) &&
1734 number < conf->geo.raid_disks &&
1638 enough(conf, -1)) { 1735 enough(conf, -1)) {
1639 err = -EBUSY; 1736 err = -EBUSY;
1640 goto abort; 1737 goto abort;
@@ -1676,7 +1773,11 @@ static void end_sync_read(struct bio *bio, int error)
1676 struct r10conf *conf = r10_bio->mddev->private; 1773 struct r10conf *conf = r10_bio->mddev->private;
1677 int d; 1774 int d;
1678 1775
1679 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); 1776 if (bio == r10_bio->master_bio) {
1777 /* this is a reshape read */
1778 d = r10_bio->read_slot; /* really the read dev */
1779 } else
1780 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1680 1781
1681 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1782 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1682 set_bit(R10BIO_Uptodate, &r10_bio->state); 1783 set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -2218,7 +2319,9 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2218 " (%d sectors at %llu on %s)\n", 2319 " (%d sectors at %llu on %s)\n",
2219 mdname(mddev), s, 2320 mdname(mddev), s,
2220 (unsigned long long)( 2321 (unsigned long long)(
2221 sect + rdev->data_offset), 2322 sect +
2323 choose_data_offset(r10_bio,
2324 rdev)),
2222 bdevname(rdev->bdev, b)); 2325 bdevname(rdev->bdev, b));
2223 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 2326 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2224 "drive\n", 2327 "drive\n",
@@ -2256,7 +2359,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2256 " (%d sectors at %llu on %s)\n", 2359 " (%d sectors at %llu on %s)\n",
2257 mdname(mddev), s, 2360 mdname(mddev), s,
2258 (unsigned long long)( 2361 (unsigned long long)(
2259 sect + rdev->data_offset), 2362 sect +
2363 choose_data_offset(r10_bio, rdev)),
2260 bdevname(rdev->bdev, b)); 2364 bdevname(rdev->bdev, b));
2261 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 2365 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2262 "drive\n", 2366 "drive\n",
@@ -2269,7 +2373,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2269 " (%d sectors at %llu on %s)\n", 2373 " (%d sectors at %llu on %s)\n",
2270 mdname(mddev), s, 2374 mdname(mddev), s,
2271 (unsigned long long)( 2375 (unsigned long long)(
2272 sect + rdev->data_offset), 2376 sect +
2377 choose_data_offset(r10_bio, rdev)),
2273 bdevname(rdev->bdev, b)); 2378 bdevname(rdev->bdev, b));
2274 atomic_add(s, &rdev->corrected_errors); 2379 atomic_add(s, &rdev->corrected_errors);
2275 } 2380 }
@@ -2343,7 +2448,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
2343 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 2448 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2344 md_trim_bio(wbio, sector - bio->bi_sector, sectors); 2449 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2345 wbio->bi_sector = (r10_bio->devs[i].addr+ 2450 wbio->bi_sector = (r10_bio->devs[i].addr+
2346 rdev->data_offset+ 2451 choose_data_offset(r10_bio, rdev) +
2347 (sector - r10_bio->sector)); 2452 (sector - r10_bio->sector));
2348 wbio->bi_bdev = rdev->bdev; 2453 wbio->bi_bdev = rdev->bdev;
2349 if (submit_bio_wait(WRITE, wbio) == 0) 2454 if (submit_bio_wait(WRITE, wbio) == 0)
@@ -2420,7 +2525,7 @@ read_more:
2420 r10_bio->devs[slot].bio = bio; 2525 r10_bio->devs[slot].bio = bio;
2421 r10_bio->devs[slot].rdev = rdev; 2526 r10_bio->devs[slot].rdev = rdev;
2422 bio->bi_sector = r10_bio->devs[slot].addr 2527 bio->bi_sector = r10_bio->devs[slot].addr
2423 + rdev->data_offset; 2528 + choose_data_offset(r10_bio, rdev);
2424 bio->bi_bdev = rdev->bdev; 2529 bio->bi_bdev = rdev->bdev;
2425 bio->bi_rw = READ | do_sync; 2530 bio->bi_rw = READ | do_sync;
2426 bio->bi_private = r10_bio; 2531 bio->bi_private = r10_bio;
@@ -2480,7 +2585,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2480 rdev_clear_badblocks( 2585 rdev_clear_badblocks(
2481 rdev, 2586 rdev,
2482 r10_bio->devs[m].addr, 2587 r10_bio->devs[m].addr,
2483 r10_bio->sectors); 2588 r10_bio->sectors, 0);
2484 } else { 2589 } else {
2485 if (!rdev_set_badblocks( 2590 if (!rdev_set_badblocks(
2486 rdev, 2591 rdev,
@@ -2496,7 +2601,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2496 rdev_clear_badblocks( 2601 rdev_clear_badblocks(
2497 rdev, 2602 rdev,
2498 r10_bio->devs[m].addr, 2603 r10_bio->devs[m].addr,
2499 r10_bio->sectors); 2604 r10_bio->sectors, 0);
2500 } else { 2605 } else {
2501 if (!rdev_set_badblocks( 2606 if (!rdev_set_badblocks(
2502 rdev, 2607 rdev,
@@ -2515,7 +2620,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2515 rdev_clear_badblocks( 2620 rdev_clear_badblocks(
2516 rdev, 2621 rdev,
2517 r10_bio->devs[m].addr, 2622 r10_bio->devs[m].addr,
2518 r10_bio->sectors); 2623 r10_bio->sectors, 0);
2519 rdev_dec_pending(rdev, conf->mddev); 2624 rdev_dec_pending(rdev, conf->mddev);
2520 } else if (bio != NULL && 2625 } else if (bio != NULL &&
2521 !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2626 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2532,7 +2637,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2532 rdev_clear_badblocks( 2637 rdev_clear_badblocks(
2533 rdev, 2638 rdev,
2534 r10_bio->devs[m].addr, 2639 r10_bio->devs[m].addr,
2535 r10_bio->sectors); 2640 r10_bio->sectors, 0);
2536 rdev_dec_pending(rdev, conf->mddev); 2641 rdev_dec_pending(rdev, conf->mddev);
2537 } 2642 }
2538 } 2643 }
@@ -2573,6 +2678,8 @@ static void raid10d(struct mddev *mddev)
2573 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 2678 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2574 test_bit(R10BIO_WriteError, &r10_bio->state)) 2679 test_bit(R10BIO_WriteError, &r10_bio->state))
2575 handle_write_completed(conf, r10_bio); 2680 handle_write_completed(conf, r10_bio);
2681 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2682 reshape_request_write(mddev, r10_bio);
2576 else if (test_bit(R10BIO_IsSync, &r10_bio->state)) 2683 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2577 sync_request_write(mddev, r10_bio); 2684 sync_request_write(mddev, r10_bio);
2578 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 2685 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
@@ -2603,7 +2710,7 @@ static int init_resync(struct r10conf *conf)
2603 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 2710 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2604 BUG_ON(conf->r10buf_pool); 2711 BUG_ON(conf->r10buf_pool);
2605 conf->have_replacement = 0; 2712 conf->have_replacement = 0;
2606 for (i = 0; i < conf->raid_disks; i++) 2713 for (i = 0; i < conf->geo.raid_disks; i++)
2607 if (conf->mirrors[i].replacement) 2714 if (conf->mirrors[i].replacement)
2608 conf->have_replacement = 1; 2715 conf->have_replacement = 1;
2609 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); 2716 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
@@ -2657,6 +2764,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2657 sector_t sync_blocks; 2764 sector_t sync_blocks;
2658 sector_t sectors_skipped = 0; 2765 sector_t sectors_skipped = 0;
2659 int chunks_skipped = 0; 2766 int chunks_skipped = 0;
2767 sector_t chunk_mask = conf->geo.chunk_mask;
2660 2768
2661 if (!conf->r10buf_pool) 2769 if (!conf->r10buf_pool)
2662 if (init_resync(conf)) 2770 if (init_resync(conf))
@@ -2664,7 +2772,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2664 2772
2665 skipped: 2773 skipped:
2666 max_sector = mddev->dev_sectors; 2774 max_sector = mddev->dev_sectors;
2667 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2775 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2776 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2668 max_sector = mddev->resync_max_sectors; 2777 max_sector = mddev->resync_max_sectors;
2669 if (sector_nr >= max_sector) { 2778 if (sector_nr >= max_sector) {
2670 /* If we aborted, we need to abort the 2779 /* If we aborted, we need to abort the
@@ -2676,11 +2785,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2676 * we need to convert that to several 2785 * we need to convert that to several
2677 * virtual addresses. 2786 * virtual addresses.
2678 */ 2787 */
2788 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2789 end_reshape(conf);
2790 return 0;
2791 }
2792
2679 if (mddev->curr_resync < max_sector) { /* aborted */ 2793 if (mddev->curr_resync < max_sector) { /* aborted */
2680 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2794 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2681 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 2795 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2682 &sync_blocks, 1); 2796 &sync_blocks, 1);
2683 else for (i=0; i<conf->raid_disks; i++) { 2797 else for (i = 0; i < conf->geo.raid_disks; i++) {
2684 sector_t sect = 2798 sector_t sect =
2685 raid10_find_virt(conf, mddev->curr_resync, i); 2799 raid10_find_virt(conf, mddev->curr_resync, i);
2686 bitmap_end_sync(mddev->bitmap, sect, 2800 bitmap_end_sync(mddev->bitmap, sect,
@@ -2694,7 +2808,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2694 /* Completed a full sync so the replacements 2808 /* Completed a full sync so the replacements
2695 * are now fully recovered. 2809 * are now fully recovered.
2696 */ 2810 */
2697 for (i = 0; i < conf->raid_disks; i++) 2811 for (i = 0; i < conf->geo.raid_disks; i++)
2698 if (conf->mirrors[i].replacement) 2812 if (conf->mirrors[i].replacement)
2699 conf->mirrors[i].replacement 2813 conf->mirrors[i].replacement
2700 ->recovery_offset 2814 ->recovery_offset
@@ -2707,7 +2821,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2707 *skipped = 1; 2821 *skipped = 1;
2708 return sectors_skipped; 2822 return sectors_skipped;
2709 } 2823 }
2710 if (chunks_skipped >= conf->raid_disks) { 2824
2825 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2826 return reshape_request(mddev, sector_nr, skipped);
2827
2828 if (chunks_skipped >= conf->geo.raid_disks) {
2711 /* if there has been nothing to do on any drive, 2829 /* if there has been nothing to do on any drive,
2712 * then there is nothing to do at all.. 2830 * then there is nothing to do at all..
2713 */ 2831 */
@@ -2721,9 +2839,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2721 /* make sure whole request will fit in a chunk - if chunks 2839 /* make sure whole request will fit in a chunk - if chunks
2722 * are meaningful 2840 * are meaningful
2723 */ 2841 */
2724 if (conf->near_copies < conf->raid_disks && 2842 if (conf->geo.near_copies < conf->geo.raid_disks &&
2725 max_sector > (sector_nr | conf->chunk_mask)) 2843 max_sector > (sector_nr | chunk_mask))
2726 max_sector = (sector_nr | conf->chunk_mask) + 1; 2844 max_sector = (sector_nr | chunk_mask) + 1;
2727 /* 2845 /*
2728 * If there is non-resync activity waiting for us then 2846 * If there is non-resync activity waiting for us then
2729 * put in a delay to throttle resync. 2847 * put in a delay to throttle resync.
@@ -2752,7 +2870,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2752 int j; 2870 int j;
2753 r10_bio = NULL; 2871 r10_bio = NULL;
2754 2872
2755 for (i=0 ; i<conf->raid_disks; i++) { 2873 for (i = 0 ; i < conf->geo.raid_disks; i++) {
2756 int still_degraded; 2874 int still_degraded;
2757 struct r10bio *rb2; 2875 struct r10bio *rb2;
2758 sector_t sect; 2876 sector_t sect;
@@ -2806,7 +2924,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2806 /* Need to check if the array will still be 2924 /* Need to check if the array will still be
2807 * degraded 2925 * degraded
2808 */ 2926 */
2809 for (j=0; j<conf->raid_disks; j++) 2927 for (j = 0; j < conf->geo.raid_disks; j++)
2810 if (conf->mirrors[j].rdev == NULL || 2928 if (conf->mirrors[j].rdev == NULL ||
2811 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { 2929 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
2812 still_degraded = 1; 2930 still_degraded = 1;
@@ -2984,9 +3102,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2984 r10_bio->sector = sector_nr; 3102 r10_bio->sector = sector_nr;
2985 set_bit(R10BIO_IsSync, &r10_bio->state); 3103 set_bit(R10BIO_IsSync, &r10_bio->state);
2986 raid10_find_phys(conf, r10_bio); 3104 raid10_find_phys(conf, r10_bio);
2987 r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; 3105 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
2988 3106
2989 for (i=0; i<conf->copies; i++) { 3107 for (i = 0; i < conf->copies; i++) {
2990 int d = r10_bio->devs[i].devnum; 3108 int d = r10_bio->devs[i].devnum;
2991 sector_t first_bad, sector; 3109 sector_t first_bad, sector;
2992 int bad_sectors; 3110 int bad_sectors;
@@ -3152,16 +3270,17 @@ raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3152 struct r10conf *conf = mddev->private; 3270 struct r10conf *conf = mddev->private;
3153 3271
3154 if (!raid_disks) 3272 if (!raid_disks)
3155 raid_disks = conf->raid_disks; 3273 raid_disks = min(conf->geo.raid_disks,
3274 conf->prev.raid_disks);
3156 if (!sectors) 3275 if (!sectors)
3157 sectors = conf->dev_sectors; 3276 sectors = conf->dev_sectors;
3158 3277
3159 size = sectors >> conf->chunk_shift; 3278 size = sectors >> conf->geo.chunk_shift;
3160 sector_div(size, conf->far_copies); 3279 sector_div(size, conf->geo.far_copies);
3161 size = size * raid_disks; 3280 size = size * raid_disks;
3162 sector_div(size, conf->near_copies); 3281 sector_div(size, conf->geo.near_copies);
3163 3282
3164 return size << conf->chunk_shift; 3283 return size << conf->geo.chunk_shift;
3165} 3284}
3166 3285
3167static void calc_sectors(struct r10conf *conf, sector_t size) 3286static void calc_sectors(struct r10conf *conf, sector_t size)
@@ -3171,10 +3290,10 @@ static void calc_sectors(struct r10conf *conf, sector_t size)
3171 * conf->stride 3290 * conf->stride
3172 */ 3291 */
3173 3292
3174 size = size >> conf->chunk_shift; 3293 size = size >> conf->geo.chunk_shift;
3175 sector_div(size, conf->far_copies); 3294 sector_div(size, conf->geo.far_copies);
3176 size = size * conf->raid_disks; 3295 size = size * conf->geo.raid_disks;
3177 sector_div(size, conf->near_copies); 3296 sector_div(size, conf->geo.near_copies);
3178 /* 'size' is now the number of chunks in the array */ 3297 /* 'size' is now the number of chunks in the array */
3179 /* calculate "used chunks per device" */ 3298 /* calculate "used chunks per device" */
3180 size = size * conf->copies; 3299 size = size * conf->copies;
@@ -3182,38 +3301,76 @@ static void calc_sectors(struct r10conf *conf, sector_t size)
3182 /* We need to round up when dividing by raid_disks to 3301 /* We need to round up when dividing by raid_disks to
3183 * get the stride size. 3302 * get the stride size.
3184 */ 3303 */
3185 size = DIV_ROUND_UP_SECTOR_T(size, conf->raid_disks); 3304 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3186 3305
3187 conf->dev_sectors = size << conf->chunk_shift; 3306 conf->dev_sectors = size << conf->geo.chunk_shift;
3188 3307
3189 if (conf->far_offset) 3308 if (conf->geo.far_offset)
3190 conf->stride = 1 << conf->chunk_shift; 3309 conf->geo.stride = 1 << conf->geo.chunk_shift;
3191 else { 3310 else {
3192 sector_div(size, conf->far_copies); 3311 sector_div(size, conf->geo.far_copies);
3193 conf->stride = size << conf->chunk_shift; 3312 conf->geo.stride = size << conf->geo.chunk_shift;
3194 } 3313 }
3195} 3314}
3196 3315
3316enum geo_type {geo_new, geo_old, geo_start};
3317static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3318{
3319 int nc, fc, fo;
3320 int layout, chunk, disks;
3321 switch (new) {
3322 case geo_old:
3323 layout = mddev->layout;
3324 chunk = mddev->chunk_sectors;
3325 disks = mddev->raid_disks - mddev->delta_disks;
3326 break;
3327 case geo_new:
3328 layout = mddev->new_layout;
3329 chunk = mddev->new_chunk_sectors;
3330 disks = mddev->raid_disks;
3331 break;
3332 default: /* avoid 'may be unused' warnings */
3333 case geo_start: /* new when starting reshape - raid_disks not
3334 * updated yet. */
3335 layout = mddev->new_layout;
3336 chunk = mddev->new_chunk_sectors;
3337 disks = mddev->raid_disks + mddev->delta_disks;
3338 break;
3339 }
3340 if (layout >> 17)
3341 return -1;
3342 if (chunk < (PAGE_SIZE >> 9) ||
3343 !is_power_of_2(chunk))
3344 return -2;
3345 nc = layout & 255;
3346 fc = (layout >> 8) & 255;
3347 fo = layout & (1<<16);
3348 geo->raid_disks = disks;
3349 geo->near_copies = nc;
3350 geo->far_copies = fc;
3351 geo->far_offset = fo;
3352 geo->chunk_mask = chunk - 1;
3353 geo->chunk_shift = ffz(~chunk);
3354 return nc*fc;
3355}
3356
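setup_geo() above packs the raid10 layout into a handful of integers: bits 0-7 of the layout word give near_copies, bits 8-15 give far_copies, bit 16 is the far_offset flag, and any bit at 17 or above is rejected. A minimal user-space sketch of the same decoding (illustrative only, not part of the patch; decode_layout() and struct geo_example are made-up names):

	#include <stdio.h>

	struct geo_example {			/* illustrative stand-in for struct geom */
		int near_copies, far_copies, far_offset;
	};

	/* Mirrors the layout decoding in setup_geo(); returns total copies or -1. */
	static int decode_layout(int layout, struct geo_example *g)
	{
		if (layout >> 17)
			return -1;		/* unknown layout bits */
		g->near_copies = layout & 255;
		g->far_copies  = (layout >> 8) & 255;
		g->far_offset  = !!(layout & (1 << 16));
		return g->near_copies * g->far_copies;
	}

	int main(void)
	{
		struct geo_example g;
		int copies = decode_layout(0x102, &g);	/* the common "n2" layout */

		printf("copies=%d near=%d far=%d offset=%d\n",
		       copies, g.near_copies, g.far_copies, g.far_offset);
		return 0;
	}

For the common "n2" layout (0x102) this prints copies=2 near=2 far=1 offset=0, which is what setup_geo() records in the geom structure.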
3197static struct r10conf *setup_conf(struct mddev *mddev) 3357static struct r10conf *setup_conf(struct mddev *mddev)
3198{ 3358{
3199 struct r10conf *conf = NULL; 3359 struct r10conf *conf = NULL;
3200 int nc, fc, fo;
3201 int err = -EINVAL; 3360 int err = -EINVAL;
3361 struct geom geo;
3362 int copies;
3363
3364 copies = setup_geo(&geo, mddev, geo_new);
3202 3365
3203 if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || 3366 if (copies == -2) {
3204 !is_power_of_2(mddev->new_chunk_sectors)) {
3205 printk(KERN_ERR "md/raid10:%s: chunk size must be " 3367 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3206 "at least PAGE_SIZE(%ld) and be a power of 2.\n", 3368 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3207 mdname(mddev), PAGE_SIZE); 3369 mdname(mddev), PAGE_SIZE);
3208 goto out; 3370 goto out;
3209 } 3371 }
3210 3372
3211 nc = mddev->new_layout & 255; 3373 if (copies < 2 || copies > mddev->raid_disks) {
3212 fc = (mddev->new_layout >> 8) & 255;
3213 fo = mddev->new_layout & (1<<16);
3214
3215 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
3216 (mddev->new_layout >> 17)) {
3217 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 3374 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3218 mdname(mddev), mddev->new_layout); 3375 mdname(mddev), mddev->new_layout);
3219 goto out; 3376 goto out;
@@ -3224,7 +3381,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3224 if (!conf) 3381 if (!conf)
3225 goto out; 3382 goto out;
3226 3383
3227 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, 3384 /* FIXME calc properly */
3385 conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks +
3386 max(0,mddev->delta_disks)),
3228 GFP_KERNEL); 3387 GFP_KERNEL);
3229 if (!conf->mirrors) 3388 if (!conf->mirrors)
3230 goto out; 3389 goto out;
@@ -3233,22 +3392,29 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3233 if (!conf->tmppage) 3392 if (!conf->tmppage)
3234 goto out; 3393 goto out;
3235 3394
3236 3395 conf->geo = geo;
3237 conf->raid_disks = mddev->raid_disks; 3396 conf->copies = copies;
3238 conf->near_copies = nc;
3239 conf->far_copies = fc;
3240 conf->copies = nc*fc;
3241 conf->far_offset = fo;
3242 conf->chunk_mask = mddev->new_chunk_sectors - 1;
3243 conf->chunk_shift = ffz(~mddev->new_chunk_sectors);
3244
3245 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, 3397 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3246 r10bio_pool_free, conf); 3398 r10bio_pool_free, conf);
3247 if (!conf->r10bio_pool) 3399 if (!conf->r10bio_pool)
3248 goto out; 3400 goto out;
3249 3401
3250 calc_sectors(conf, mddev->dev_sectors); 3402 calc_sectors(conf, mddev->dev_sectors);
3251 3403 if (mddev->reshape_position == MaxSector) {
3404 conf->prev = conf->geo;
3405 conf->reshape_progress = MaxSector;
3406 } else {
3407 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3408 err = -EINVAL;
3409 goto out;
3410 }
3411 conf->reshape_progress = mddev->reshape_position;
3412 if (conf->prev.far_offset)
3413 conf->prev.stride = 1 << conf->prev.chunk_shift;
3414 else
3415 /* far_copies must be 1 */
3416 conf->prev.stride = conf->dev_sectors;
3417 }
3252 spin_lock_init(&conf->device_lock); 3418 spin_lock_init(&conf->device_lock);
3253 INIT_LIST_HEAD(&conf->retry_list); 3419 INIT_LIST_HEAD(&conf->retry_list);
3254 3420
@@ -3263,8 +3429,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3263 return conf; 3429 return conf;
3264 3430
3265 out: 3431 out:
3266 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", 3432 if (err == -ENOMEM)
3267 mdname(mddev)); 3433 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3434 mdname(mddev));
3268 if (conf) { 3435 if (conf) {
3269 if (conf->r10bio_pool) 3436 if (conf->r10bio_pool)
3270 mempool_destroy(conf->r10bio_pool); 3437 mempool_destroy(conf->r10bio_pool);
@@ -3282,12 +3449,8 @@ static int run(struct mddev *mddev)
3282 struct mirror_info *disk; 3449 struct mirror_info *disk;
3283 struct md_rdev *rdev; 3450 struct md_rdev *rdev;
3284 sector_t size; 3451 sector_t size;
3285 3452 sector_t min_offset_diff = 0;
3286 /* 3453 int first = 1;
3287 * copy the already verified devices into our private RAID10
3288 * bookkeeping area. [whatever we allocate in run(),
3289 * should be freed in stop()]
3290 */
3291 3454
3292 if (mddev->private == NULL) { 3455 if (mddev->private == NULL) {
3293 conf = setup_conf(mddev); 3456 conf = setup_conf(mddev);
@@ -3304,17 +3467,20 @@ static int run(struct mddev *mddev)
3304 3467
3305 chunk_size = mddev->chunk_sectors << 9; 3468 chunk_size = mddev->chunk_sectors << 9;
3306 blk_queue_io_min(mddev->queue, chunk_size); 3469 blk_queue_io_min(mddev->queue, chunk_size);
3307 if (conf->raid_disks % conf->near_copies) 3470 if (conf->geo.raid_disks % conf->geo.near_copies)
3308 blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); 3471 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3309 else 3472 else
3310 blk_queue_io_opt(mddev->queue, chunk_size * 3473 blk_queue_io_opt(mddev->queue, chunk_size *
3311 (conf->raid_disks / conf->near_copies)); 3474 (conf->geo.raid_disks / conf->geo.near_copies));
3312 3475
3313 rdev_for_each(rdev, mddev) { 3476 rdev_for_each(rdev, mddev) {
3477 long long diff;
3314 3478
3315 disk_idx = rdev->raid_disk; 3479 disk_idx = rdev->raid_disk;
3316 if (disk_idx >= conf->raid_disks 3480 if (disk_idx < 0)
3317 || disk_idx < 0) 3481 continue;
3482 if (disk_idx >= conf->geo.raid_disks &&
3483 disk_idx >= conf->prev.raid_disks)
3318 continue; 3484 continue;
3319 disk = conf->mirrors + disk_idx; 3485 disk = conf->mirrors + disk_idx;
3320 3486
@@ -3327,12 +3493,20 @@ static int run(struct mddev *mddev)
3327 goto out_free_conf; 3493 goto out_free_conf;
3328 disk->rdev = rdev; 3494 disk->rdev = rdev;
3329 } 3495 }
3496 diff = (rdev->new_data_offset - rdev->data_offset);
3497 if (!mddev->reshape_backwards)
3498 diff = -diff;
3499 if (diff < 0)
3500 diff = 0;
3501 if (first || diff < min_offset_diff)
3502 min_offset_diff = diff;
3330 3503
3331 disk_stack_limits(mddev->gendisk, rdev->bdev, 3504 disk_stack_limits(mddev->gendisk, rdev->bdev,
3332 rdev->data_offset << 9); 3505 rdev->data_offset << 9);
3333 3506
3334 disk->head_position = 0; 3507 disk->head_position = 0;
3335 } 3508 }
3509
3336 /* need to check that every block has at least one working mirror */ 3510 /* need to check that every block has at least one working mirror */
3337 if (!enough(conf, -1)) { 3511 if (!enough(conf, -1)) {
3338 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", 3512 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
@@ -3340,8 +3514,21 @@ static int run(struct mddev *mddev)
3340 goto out_free_conf; 3514 goto out_free_conf;
3341 } 3515 }
3342 3516
3517 if (conf->reshape_progress != MaxSector) {
3518 /* must ensure that shape change is supported */
3519 if (conf->geo.far_copies != 1 &&
3520 conf->geo.far_offset == 0)
3521 goto out_free_conf;
3522 if (conf->prev.far_copies != 1 &&
3523 conf->geo.far_offset == 0)
3524 goto out_free_conf;
3525 }
3526
3343 mddev->degraded = 0; 3527 mddev->degraded = 0;
3344 for (i = 0; i < conf->raid_disks; i++) { 3528 for (i = 0;
3529 i < conf->geo.raid_disks
3530 || i < conf->prev.raid_disks;
3531 i++) {
3345 3532
3346 disk = conf->mirrors + i; 3533 disk = conf->mirrors + i;
3347 3534
@@ -3368,8 +3555,8 @@ static int run(struct mddev *mddev)
3368 mdname(mddev)); 3555 mdname(mddev));
3369 printk(KERN_INFO 3556 printk(KERN_INFO
3370 "md/raid10:%s: active with %d out of %d devices\n", 3557 "md/raid10:%s: active with %d out of %d devices\n",
3371 mdname(mddev), conf->raid_disks - mddev->degraded, 3558 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3372 conf->raid_disks); 3559 conf->geo.raid_disks);
3373 /* 3560 /*
3374 * Ok, everything is just fine now 3561 * Ok, everything is just fine now
3375 */ 3562 */
@@ -3386,11 +3573,11 @@ static int run(struct mddev *mddev)
3386 * maybe... 3573 * maybe...
3387 */ 3574 */
3388 { 3575 {
3389 int stripe = conf->raid_disks * 3576 int stripe = conf->geo.raid_disks *
3390 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 3577 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3391 stripe /= conf->near_copies; 3578 stripe /= conf->geo.near_copies;
3392 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) 3579 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3393 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 3580 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3394 } 3581 }
3395 3582
3396 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 3583 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
@@ -3398,6 +3585,30 @@ static int run(struct mddev *mddev)
3398 if (md_integrity_register(mddev)) 3585 if (md_integrity_register(mddev))
3399 goto out_free_conf; 3586 goto out_free_conf;
3400 3587
3588 if (conf->reshape_progress != MaxSector) {
3589 unsigned long before_length, after_length;
3590
3591 before_length = ((1 << conf->prev.chunk_shift) *
3592 conf->prev.far_copies);
3593 after_length = ((1 << conf->geo.chunk_shift) *
3594 conf->geo.far_copies);
3595
3596 if (max(before_length, after_length) > min_offset_diff) {
3597 /* This cannot work */
3598 printk("md/raid10: offset difference not enough to continue reshape\n");
3599 goto out_free_conf;
3600 }
3601 conf->offset_diff = min_offset_diff;
3602
3603 conf->reshape_safe = conf->reshape_progress;
3604 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3605 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3606 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3607 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3608 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3609 "reshape");
3610 }
3611
3401 return 0; 3612 return 0;
3402 3613
3403out_free_conf: 3614out_free_conf:
@@ -3460,14 +3671,23 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
3460 struct r10conf *conf = mddev->private; 3671 struct r10conf *conf = mddev->private;
3461 sector_t oldsize, size; 3672 sector_t oldsize, size;
3462 3673
3463 if (conf->far_copies > 1 && !conf->far_offset) 3674 if (mddev->reshape_position != MaxSector)
3675 return -EBUSY;
3676
3677 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3464 return -EINVAL; 3678 return -EINVAL;
3465 3679
3466 oldsize = raid10_size(mddev, 0, 0); 3680 oldsize = raid10_size(mddev, 0, 0);
3467 size = raid10_size(mddev, sectors, 0); 3681 size = raid10_size(mddev, sectors, 0);
3468 md_set_array_sectors(mddev, size); 3682 if (mddev->external_size &&
3469 if (mddev->array_sectors > size) 3683 mddev->array_sectors > size)
3470 return -EINVAL; 3684 return -EINVAL;
3685 if (mddev->bitmap) {
3686 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3687 if (ret)
3688 return ret;
3689 }
3690 md_set_array_sectors(mddev, size);
3471 set_capacity(mddev->gendisk, mddev->array_sectors); 3691 set_capacity(mddev->gendisk, mddev->array_sectors);
3472 revalidate_disk(mddev->gendisk); 3692 revalidate_disk(mddev->gendisk);
3473 if (sectors > mddev->dev_sectors && 3693 if (sectors > mddev->dev_sectors &&
@@ -3534,6 +3754,758 @@ static void *raid10_takeover(struct mddev *mddev)
3534 return ERR_PTR(-EINVAL); 3754 return ERR_PTR(-EINVAL);
3535} 3755}
3536 3756
3757static int raid10_check_reshape(struct mddev *mddev)
3758{
3759 /* Called when there is a request to change
3760 * - layout (to ->new_layout)
3761 * - chunk size (to ->new_chunk_sectors)
3762 * - raid_disks (by delta_disks)
3763 * or when trying to restart a reshape that was ongoing.
3764 *
3765 * We need to validate the request and possibly allocate
3766 * space if that might be an issue later.
3767 *
3768 * Currently we reject any reshape of a 'far' mode array,
3769 * allow chunk size to change if new is generally acceptable,
3770 * allow raid_disks to increase, and allow
3771 * a switch between 'near' mode and 'offset' mode.
3772 */
3773 struct r10conf *conf = mddev->private;
3774 struct geom geo;
3775
3776 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3777 return -EINVAL;
3778
3779 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3780 /* mustn't change number of copies */
3781 return -EINVAL;
3782 if (geo.far_copies > 1 && !geo.far_offset)
3783 /* Cannot switch to 'far' mode */
3784 return -EINVAL;
3785
3786 if (mddev->array_sectors & geo.chunk_mask)
3787 /* not factor of array size */
3788 return -EINVAL;
3789
3790 if (!enough(conf, -1))
3791 return -EINVAL;
3792
3793 kfree(conf->mirrors_new);
3794 conf->mirrors_new = NULL;
3795 if (mddev->delta_disks > 0) {
3796 /* allocate new 'mirrors' list */
3797 conf->mirrors_new = kzalloc(
3798 sizeof(struct mirror_info)
3799 *(mddev->raid_disks +
3800 mddev->delta_disks),
3801 GFP_KERNEL);
3802 if (!conf->mirrors_new)
3803 return -ENOMEM;
3804 }
3805 return 0;
3806}
3807
3808/*
3809 * Need to check if array has failed when deciding whether to:
3810 * - start an array
3811 * - remove non-faulty devices
3812 * - add a spare
3813 * - allow a reshape
3814 * This determination is simple when no reshape is happening.
3815 * However if there is a reshape, we need to carefully check
3816 * both the before and after sections.
3817 * This is because some failed devices may only affect one
3818 * of the two sections, and some non-in_sync devices may
3819 * be insync in the section most affected by failed devices.
3820 */
3821static int calc_degraded(struct r10conf *conf)
3822{
3823 int degraded, degraded2;
3824 int i;
3825
3826 rcu_read_lock();
3827 degraded = 0;
3828 /* 'prev' section first */
3829 for (i = 0; i < conf->prev.raid_disks; i++) {
3830 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3831 if (!rdev || test_bit(Faulty, &rdev->flags))
3832 degraded++;
3833 else if (!test_bit(In_sync, &rdev->flags))
3834 /* When we can reduce the number of devices in
3835 * an array, this might not contribute to
3836 * 'degraded'. It does now.
3837 */
3838 degraded++;
3839 }
3840 rcu_read_unlock();
3841 if (conf->geo.raid_disks == conf->prev.raid_disks)
3842 return degraded;
3843 rcu_read_lock();
3844 degraded2 = 0;
3845 for (i = 0; i < conf->geo.raid_disks; i++) {
3846 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3847 if (!rdev || test_bit(Faulty, &rdev->flags))
3848 degraded2++;
3849 else if (!test_bit(In_sync, &rdev->flags)) {
3850 /* If reshape is increasing the number of devices,
3851 * this section has already been recovered, so
3852 * it doesn't contribute to degraded.
3853 * else it does.
3854 */
3855 if (conf->geo.raid_disks <= conf->prev.raid_disks)
3856 degraded2++;
3857 }
3858 }
3859 rcu_read_unlock();
3860 if (degraded2 > degraded)
3861 return degraded2;
3862 return degraded;
3863}
3864
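A concrete case (hypothetical numbers, not from the patch): growing a 4-disk array to 5 while the new disk 4 is still recovering, the 'prev' pass only examines disks 0-3 and counts nothing, and the 'geo' pass skips the not-yet-in_sync disk 4 because geo.raid_disks exceeds prev.raid_disks, so calc_degraded() returns 0. A Faulty member, by contrast, is counted in every pass that covers its slot, and the larger of the two totals is reported.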
3865static int raid10_start_reshape(struct mddev *mddev)
3866{
3867 /* A 'reshape' has been requested. This commits
 3868	 * the various 'new' fields and sets MD_RECOVERY_RESHAPE
3869 * This also checks if there are enough spares and adds them
3870 * to the array.
3871 * We currently require enough spares to make the final
3872 * array non-degraded. We also require that the difference
3873 * between old and new data_offset - on each device - is
3874 * enough that we never risk over-writing.
3875 */
3876
3877 unsigned long before_length, after_length;
3878 sector_t min_offset_diff = 0;
3879 int first = 1;
3880 struct geom new;
3881 struct r10conf *conf = mddev->private;
3882 struct md_rdev *rdev;
3883 int spares = 0;
3884 int ret;
3885
3886 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3887 return -EBUSY;
3888
3889 if (setup_geo(&new, mddev, geo_start) != conf->copies)
3890 return -EINVAL;
3891
3892 before_length = ((1 << conf->prev.chunk_shift) *
3893 conf->prev.far_copies);
3894 after_length = ((1 << conf->geo.chunk_shift) *
3895 conf->geo.far_copies);
3896
3897 rdev_for_each(rdev, mddev) {
3898 if (!test_bit(In_sync, &rdev->flags)
3899 && !test_bit(Faulty, &rdev->flags))
3900 spares++;
3901 if (rdev->raid_disk >= 0) {
3902 long long diff = (rdev->new_data_offset
3903 - rdev->data_offset);
3904 if (!mddev->reshape_backwards)
3905 diff = -diff;
3906 if (diff < 0)
3907 diff = 0;
3908 if (first || diff < min_offset_diff)
3909 min_offset_diff = diff;
3910 }
3911 }
3912
3913 if (max(before_length, after_length) > min_offset_diff)
3914 return -EINVAL;
3915
3916 if (spares < mddev->delta_disks)
3917 return -EINVAL;
3918
3919 conf->offset_diff = min_offset_diff;
3920 spin_lock_irq(&conf->device_lock);
3921 if (conf->mirrors_new) {
3922 memcpy(conf->mirrors_new, conf->mirrors,
3923 sizeof(struct mirror_info)*conf->prev.raid_disks);
3924 smp_mb();
3925 kfree(conf->mirrors_old); /* FIXME and elsewhere */
3926 conf->mirrors_old = conf->mirrors;
3927 conf->mirrors = conf->mirrors_new;
3928 conf->mirrors_new = NULL;
3929 }
3930 setup_geo(&conf->geo, mddev, geo_start);
3931 smp_mb();
3932 if (mddev->reshape_backwards) {
3933 sector_t size = raid10_size(mddev, 0, 0);
3934 if (size < mddev->array_sectors) {
3935 spin_unlock_irq(&conf->device_lock);
 3936			printk(KERN_ERR "md/raid10:%s: array size must be reduced before number of disks\n",
3937 mdname(mddev));
3938 return -EINVAL;
3939 }
3940 mddev->resync_max_sectors = size;
3941 conf->reshape_progress = size;
3942 } else
3943 conf->reshape_progress = 0;
3944 spin_unlock_irq(&conf->device_lock);
3945
3946 if (mddev->delta_disks && mddev->bitmap) {
3947 ret = bitmap_resize(mddev->bitmap,
3948 raid10_size(mddev, 0,
3949 conf->geo.raid_disks),
3950 0, 0);
3951 if (ret)
3952 goto abort;
3953 }
3954 if (mddev->delta_disks > 0) {
3955 rdev_for_each(rdev, mddev)
3956 if (rdev->raid_disk < 0 &&
3957 !test_bit(Faulty, &rdev->flags)) {
3958 if (raid10_add_disk(mddev, rdev) == 0) {
3959 if (rdev->raid_disk >=
3960 conf->prev.raid_disks)
3961 set_bit(In_sync, &rdev->flags);
3962 else
3963 rdev->recovery_offset = 0;
3964
3965 if (sysfs_link_rdev(mddev, rdev))
3966 /* Failure here is OK */;
3967 }
3968 } else if (rdev->raid_disk >= conf->prev.raid_disks
3969 && !test_bit(Faulty, &rdev->flags)) {
3970 /* This is a spare that was manually added */
3971 set_bit(In_sync, &rdev->flags);
3972 }
3973 }
3974 /* When a reshape changes the number of devices,
3975 * ->degraded is measured against the larger of the
3976 * pre and post numbers.
3977 */
3978 spin_lock_irq(&conf->device_lock);
3979 mddev->degraded = calc_degraded(conf);
3980 spin_unlock_irq(&conf->device_lock);
3981 mddev->raid_disks = conf->geo.raid_disks;
3982 mddev->reshape_position = conf->reshape_progress;
3983 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3984
3985 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3986 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3987 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3988 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3989
3990 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3991 "reshape");
3992 if (!mddev->sync_thread) {
3993 ret = -EAGAIN;
3994 goto abort;
3995 }
3996 conf->reshape_checkpoint = jiffies;
3997 md_wakeup_thread(mddev->sync_thread);
3998 md_new_event(mddev);
3999 return 0;
4000
4001abort:
4002 mddev->recovery = 0;
4003 spin_lock_irq(&conf->device_lock);
4004 conf->geo = conf->prev;
4005 mddev->raid_disks = conf->geo.raid_disks;
4006 rdev_for_each(rdev, mddev)
4007 rdev->new_data_offset = rdev->data_offset;
4008 smp_wmb();
4009 conf->reshape_progress = MaxSector;
4010 mddev->reshape_position = MaxSector;
4011 spin_unlock_irq(&conf->device_lock);
4012 return ret;
4013}
4014
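To put numbers on the offset check above (illustrative only, not from the patch): with 512 KiB chunks (chunk_shift = 10) and far_copies == 1 in both the old and new geometry, before_length and after_length are each 1024 sectors, so both run() and raid10_start_reshape() require every member's new_data_offset to sit at least 1024 sectors (512 KiB) away from its data_offset. That per-device gap becomes conf->offset_diff, the slack that lets the reshape write ahead of the last recorded 'safe' position without risking an overwrite.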
4015/* Calculate the last device-address that could contain
4016 * any block from the chunk that includes the array-address 's'
4017 * and report the next address.
4018 * i.e. the address returned will be chunk-aligned and after
4019 * any data that is in the chunk containing 's'.
4020 */
4021static sector_t last_dev_address(sector_t s, struct geom *geo)
4022{
4023 s = (s | geo->chunk_mask) + 1;
4024 s >>= geo->chunk_shift;
4025 s *= geo->near_copies;
4026 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4027 s *= geo->far_copies;
4028 s <<= geo->chunk_shift;
4029 return s;
4030}
4031
4032/* Calculate the first device-address that could contain
4033 * any block from the chunk that includes the array-address 's'.
4034 * This too will be the start of a chunk
4035 */
4036static sector_t first_dev_address(sector_t s, struct geom *geo)
4037{
4038 s >>= geo->chunk_shift;
4039 s *= geo->near_copies;
4040 sector_div(s, geo->raid_disks);
4041 s *= geo->far_copies;
4042 s <<= geo->chunk_shift;
4043 return s;
4044}
4045
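Both helpers are pure integer arithmetic, so they are easy to check outside the kernel. A user-space sketch (illustrative only, not part of the patch; last_dev_addr(), first_dev_addr() and struct geo_example are made-up names, and DIV_ROUND_UP_SECTOR_T()/sector_div() are replaced by plain 64-bit division):

	#include <stdio.h>
	#include <stdint.h>

	struct geo_example {
		int raid_disks, near_copies, far_copies, chunk_shift;
		uint64_t chunk_mask;
	};

	/* First device sector past any data from the chunk holding array sector s. */
	static uint64_t last_dev_addr(uint64_t s, const struct geo_example *g)
	{
		s = (s | g->chunk_mask) + 1;			/* round up to next chunk */
		s >>= g->chunk_shift;
		s *= g->near_copies;
		s = (s + g->raid_disks - 1) / g->raid_disks;	/* round up */
		s *= g->far_copies;
		return s << g->chunk_shift;
	}

	/* First device sector that might hold data from that chunk. */
	static uint64_t first_dev_addr(uint64_t s, const struct geo_example *g)
	{
		s >>= g->chunk_shift;
		s *= g->near_copies;
		s /= g->raid_disks;				/* round down */
		s *= g->far_copies;
		return s << g->chunk_shift;
	}

	int main(void)
	{
		/* 4 disks, near_copies=2, 64-sector (32 KiB) chunks */
		struct geo_example g = { 4, 2, 1, 6, 63 };

		printf("first=%llu last=%llu\n",
		       (unsigned long long)first_dev_addr(200, &g),
		       (unsigned long long)last_dev_addr(200, &g));	/* 64 and 128 */
		return 0;
	}

So any block of the chunk containing array sector 200 lives between device sectors 64 and 127 inclusive, which is the kind of window reshape_request() compares against the 'safe' position.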
4046static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4047 int *skipped)
4048{
4049 /* We simply copy at most one chunk (smallest of old and new)
4050 * at a time, possibly less if that exceeds RESYNC_PAGES,
4051 * or we hit a bad block or something.
4052 * This might mean we pause for normal IO in the middle of
 4053	 * a chunk, but that is not a problem as mddev->reshape_position
4054 * can record any location.
4055 *
4056 * If we will want to write to a location that isn't
4057 * yet recorded as 'safe' (i.e. in metadata on disk) then
4058 * we need to flush all reshape requests and update the metadata.
4059 *
4060 * When reshaping forwards (e.g. to more devices), we interpret
4061 * 'safe' as the earliest block which might not have been copied
4062 * down yet. We divide this by previous stripe size and multiply
4063 * by previous stripe length to get lowest device offset that we
4064 * cannot write to yet.
4065 * We interpret 'sector_nr' as an address that we want to write to.
 4066	 * From this we use last_dev_address() to find where we might
 4067	 * write to, and first_dev_address() on the 'safe' position.
4068 * If this 'next' write position is after the 'safe' position,
4069 * we must update the metadata to increase the 'safe' position.
4070 *
4071 * When reshaping backwards, we round in the opposite direction
4072 * and perform the reverse test: next write position must not be
4073 * less than current safe position.
4074 *
4075 * In all this the minimum difference in data offsets
4076 * (conf->offset_diff - always positive) allows a bit of slack,
 4077	 * so next can be after 'safe', but not by more than offset_diff
4078 *
4079 * We need to prepare all the bios here before we start any IO
4080 * to ensure the size we choose is acceptable to all devices.
 4081	 * That means one for each copy for write-out and an extra one for
4082 * read-in.
4083 * We store the read-in bio in ->master_bio and the others in
4084 * ->devs[x].bio and ->devs[x].repl_bio.
4085 */
4086 struct r10conf *conf = mddev->private;
4087 struct r10bio *r10_bio;
4088 sector_t next, safe, last;
4089 int max_sectors;
4090 int nr_sectors;
4091 int s;
4092 struct md_rdev *rdev;
4093 int need_flush = 0;
4094 struct bio *blist;
4095 struct bio *bio, *read_bio;
4096 int sectors_done = 0;
4097
4098 if (sector_nr == 0) {
4099 /* If restarting in the middle, skip the initial sectors */
4100 if (mddev->reshape_backwards &&
4101 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4102 sector_nr = (raid10_size(mddev, 0, 0)
4103 - conf->reshape_progress);
4104 } else if (!mddev->reshape_backwards &&
4105 conf->reshape_progress > 0)
4106 sector_nr = conf->reshape_progress;
4107 if (sector_nr) {
4108 mddev->curr_resync_completed = sector_nr;
4109 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4110 *skipped = 1;
4111 return sector_nr;
4112 }
4113 }
4114
4115 /* We don't use sector_nr to track where we are up to
4116 * as that doesn't work well for ->reshape_backwards.
4117 * So just use ->reshape_progress.
4118 */
4119 if (mddev->reshape_backwards) {
4120 /* 'next' is the earliest device address that we might
4121 * write to for this chunk in the new layout
4122 */
4123 next = first_dev_address(conf->reshape_progress - 1,
4124 &conf->geo);
4125
4126 /* 'safe' is the last device address that we might read from
4127 * in the old layout after a restart
4128 */
4129 safe = last_dev_address(conf->reshape_safe - 1,
4130 &conf->prev);
4131
4132 if (next + conf->offset_diff < safe)
4133 need_flush = 1;
4134
4135 last = conf->reshape_progress - 1;
4136 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4137 & conf->prev.chunk_mask);
4138 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4139 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4140 } else {
4141 /* 'next' is after the last device address that we
4142 * might write to for this chunk in the new layout
4143 */
4144 next = last_dev_address(conf->reshape_progress, &conf->geo);
4145
4146 /* 'safe' is the earliest device address that we might
4147 * read from in the old layout after a restart
4148 */
4149 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4150
4151 /* Need to update metadata if 'next' might be beyond 'safe'
4152 * as that would possibly corrupt data
4153 */
4154 if (next > safe + conf->offset_diff)
4155 need_flush = 1;
4156
4157 sector_nr = conf->reshape_progress;
4158 last = sector_nr | (conf->geo.chunk_mask
4159 & conf->prev.chunk_mask);
4160
4161 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4162 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4163 }
4164
4165 if (need_flush ||
4166 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4167 /* Need to update reshape_position in metadata */
4168 wait_barrier(conf);
4169 mddev->reshape_position = conf->reshape_progress;
4170 if (mddev->reshape_backwards)
4171 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4172 - conf->reshape_progress;
4173 else
4174 mddev->curr_resync_completed = conf->reshape_progress;
4175 conf->reshape_checkpoint = jiffies;
4176 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4177 md_wakeup_thread(mddev->thread);
4178 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4179 kthread_should_stop());
4180 conf->reshape_safe = mddev->reshape_position;
4181 allow_barrier(conf);
4182 }
4183
4184read_more:
4185 /* Now schedule reads for blocks from sector_nr to last */
4186 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4187 raise_barrier(conf, sectors_done != 0);
4188 atomic_set(&r10_bio->remaining, 0);
4189 r10_bio->mddev = mddev;
4190 r10_bio->sector = sector_nr;
4191 set_bit(R10BIO_IsReshape, &r10_bio->state);
4192 r10_bio->sectors = last - sector_nr + 1;
4193 rdev = read_balance(conf, r10_bio, &max_sectors);
4194 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4195
4196 if (!rdev) {
4197 /* Cannot read from here, so need to record bad blocks
4198 * on all the target devices.
4199 */
4200 // FIXME
4201 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4202 return sectors_done;
4203 }
4204
4205 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4206
4207 read_bio->bi_bdev = rdev->bdev;
4208 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4209 + rdev->data_offset);
4210 read_bio->bi_private = r10_bio;
4211 read_bio->bi_end_io = end_sync_read;
4212 read_bio->bi_rw = READ;
4213 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4214 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4215 read_bio->bi_vcnt = 0;
4216 read_bio->bi_idx = 0;
4217 read_bio->bi_size = 0;
4218 r10_bio->master_bio = read_bio;
4219 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4220
4221 /* Now find the locations in the new layout */
4222 __raid10_find_phys(&conf->geo, r10_bio);
4223
4224 blist = read_bio;
4225 read_bio->bi_next = NULL;
4226
4227 for (s = 0; s < conf->copies*2; s++) {
4228 struct bio *b;
4229 int d = r10_bio->devs[s/2].devnum;
4230 struct md_rdev *rdev2;
4231 if (s&1) {
4232 rdev2 = conf->mirrors[d].replacement;
4233 b = r10_bio->devs[s/2].repl_bio;
4234 } else {
4235 rdev2 = conf->mirrors[d].rdev;
4236 b = r10_bio->devs[s/2].bio;
4237 }
4238 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4239 continue;
4240 b->bi_bdev = rdev2->bdev;
4241 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4242 b->bi_private = r10_bio;
4243 b->bi_end_io = end_reshape_write;
4244 b->bi_rw = WRITE;
4245 b->bi_flags &= ~(BIO_POOL_MASK - 1);
4246 b->bi_flags |= 1 << BIO_UPTODATE;
4247 b->bi_next = blist;
4248 b->bi_vcnt = 0;
4249 b->bi_idx = 0;
4250 b->bi_size = 0;
4251 blist = b;
4252 }
4253
4254 /* Now add as many pages as possible to all of these bios. */
4255
4256 nr_sectors = 0;
4257 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4258 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4259 int len = (max_sectors - s) << 9;
4260 if (len > PAGE_SIZE)
4261 len = PAGE_SIZE;
4262 for (bio = blist; bio ; bio = bio->bi_next) {
4263 struct bio *bio2;
4264 if (bio_add_page(bio, page, len, 0))
4265 continue;
4266
4267 /* Didn't fit, must stop */
4268 for (bio2 = blist;
4269 bio2 && bio2 != bio;
4270 bio2 = bio2->bi_next) {
4271 /* Remove last page from this bio */
4272 bio2->bi_vcnt--;
4273 bio2->bi_size -= len;
4274 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4275 }
4276 goto bio_full;
4277 }
4278 sector_nr += len >> 9;
4279 nr_sectors += len >> 9;
4280 }
4281bio_full:
4282 r10_bio->sectors = nr_sectors;
4283
4284 /* Now submit the read */
4285 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4286 atomic_inc(&r10_bio->remaining);
4287 read_bio->bi_next = NULL;
4288 generic_make_request(read_bio);
4289 sector_nr += nr_sectors;
4290 sectors_done += nr_sectors;
4291 if (sector_nr <= last)
4292 goto read_more;
4293
4294 /* Now that we have done the whole section we can
4295 * update reshape_progress
4296 */
4297 if (mddev->reshape_backwards)
4298 conf->reshape_progress -= sectors_done;
4299 else
4300 conf->reshape_progress += sectors_done;
4301
4302 return sectors_done;
4303}
4304
4305static void end_reshape_request(struct r10bio *r10_bio);
4306static int handle_reshape_read_error(struct mddev *mddev,
4307 struct r10bio *r10_bio);
4308static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4309{
4310 /* Reshape read completed. Hopefully we have a block
4311 * to write out.
4312 * If we got a read error then we do sync 1-page reads from
4313 * elsewhere until we find the data - or give up.
4314 */
4315 struct r10conf *conf = mddev->private;
4316 int s;
4317
4318 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4319 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4320 /* Reshape has been aborted */
4321 md_done_sync(mddev, r10_bio->sectors, 0);
4322 return;
4323 }
4324
4325 /* We definitely have the data in the pages, schedule the
4326 * writes.
4327 */
4328 atomic_set(&r10_bio->remaining, 1);
4329 for (s = 0; s < conf->copies*2; s++) {
4330 struct bio *b;
4331 int d = r10_bio->devs[s/2].devnum;
4332 struct md_rdev *rdev;
4333 if (s&1) {
4334 rdev = conf->mirrors[d].replacement;
4335 b = r10_bio->devs[s/2].repl_bio;
4336 } else {
4337 rdev = conf->mirrors[d].rdev;
4338 b = r10_bio->devs[s/2].bio;
4339 }
4340 if (!rdev || test_bit(Faulty, &rdev->flags))
4341 continue;
4342 atomic_inc(&rdev->nr_pending);
4343 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4344 atomic_inc(&r10_bio->remaining);
4345 b->bi_next = NULL;
4346 generic_make_request(b);
4347 }
4348 end_reshape_request(r10_bio);
4349}
4350
4351static void end_reshape(struct r10conf *conf)
4352{
4353 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4354 return;
4355
4356 spin_lock_irq(&conf->device_lock);
4357 conf->prev = conf->geo;
4358 md_finish_reshape(conf->mddev);
4359 smp_wmb();
4360 conf->reshape_progress = MaxSector;
4361 spin_unlock_irq(&conf->device_lock);
4362
4363 /* read-ahead size must cover two whole stripes, which is
 4364	 * 2 * (datadisks) * chunksize, where 'datadisks' is raid_disks / near_copies
4365 */
4366 if (conf->mddev->queue) {
4367 int stripe = conf->geo.raid_disks *
4368 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4369 stripe /= conf->geo.near_copies;
4370 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4371 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4372 }
4373 conf->fullsync = 0;
4374}
4375
4376
4377static int handle_reshape_read_error(struct mddev *mddev,
4378 struct r10bio *r10_bio)
4379{
4380 /* Use sync reads to get the blocks from somewhere else */
4381 int sectors = r10_bio->sectors;
4382 struct r10bio r10b;
4383 struct r10conf *conf = mddev->private;
4384 int slot = 0;
4385 int idx = 0;
4386 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4387
4388 r10b.sector = r10_bio->sector;
4389 __raid10_find_phys(&conf->prev, &r10b);
4390
4391 while (sectors) {
4392 int s = sectors;
4393 int success = 0;
4394 int first_slot = slot;
4395
4396 if (s > (PAGE_SIZE >> 9))
4397 s = PAGE_SIZE >> 9;
4398
4399 while (!success) {
4400 int d = r10b.devs[slot].devnum;
4401 struct md_rdev *rdev = conf->mirrors[d].rdev;
4402 sector_t addr;
4403 if (rdev == NULL ||
4404 test_bit(Faulty, &rdev->flags) ||
4405 !test_bit(In_sync, &rdev->flags))
4406 goto failed;
4407
4408 addr = r10b.devs[slot].addr + idx * PAGE_SIZE;
4409 success = sync_page_io(rdev,
4410 addr,
4411 s << 9,
4412 bvec[idx].bv_page,
4413 READ, false);
4414 if (success)
4415 break;
4416 failed:
4417 slot++;
4418 if (slot >= conf->copies)
4419 slot = 0;
4420 if (slot == first_slot)
4421 break;
4422 }
4423 if (!success) {
4424 /* couldn't read this block, must give up */
4425 set_bit(MD_RECOVERY_INTR,
4426 &mddev->recovery);
4427 return -EIO;
4428 }
4429 sectors -= s;
4430 idx++;
4431 }
4432 return 0;
4433}
4434
4435static void end_reshape_write(struct bio *bio, int error)
4436{
4437 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4438 struct r10bio *r10_bio = bio->bi_private;
4439 struct mddev *mddev = r10_bio->mddev;
4440 struct r10conf *conf = mddev->private;
4441 int d;
4442 int slot;
4443 int repl;
4444 struct md_rdev *rdev = NULL;
4445
4446 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4447 if (repl)
4448 rdev = conf->mirrors[d].replacement;
4449 if (!rdev) {
4450 smp_mb();
4451 rdev = conf->mirrors[d].rdev;
4452 }
4453
4454 if (!uptodate) {
4455 /* FIXME should record badblock */
4456 md_error(mddev, rdev);
4457 }
4458
4459 rdev_dec_pending(rdev, mddev);
4460 end_reshape_request(r10_bio);
4461}
4462
4463static void end_reshape_request(struct r10bio *r10_bio)
4464{
4465 if (!atomic_dec_and_test(&r10_bio->remaining))
4466 return;
4467 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4468 bio_put(r10_bio->master_bio);
4469 put_buf(r10_bio);
4470}
4471
4472static void raid10_finish_reshape(struct mddev *mddev)
4473{
4474 struct r10conf *conf = mddev->private;
4475
4476 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4477 return;
4478
4479 if (mddev->delta_disks > 0) {
4480 sector_t size = raid10_size(mddev, 0, 0);
4481 md_set_array_sectors(mddev, size);
4482 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4483 mddev->recovery_cp = mddev->resync_max_sectors;
4484 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4485 }
4486 mddev->resync_max_sectors = size;
4487 set_capacity(mddev->gendisk, mddev->array_sectors);
4488 revalidate_disk(mddev->gendisk);
4489 } else {
4490 int d;
4491 for (d = conf->geo.raid_disks ;
4492 d < conf->geo.raid_disks - mddev->delta_disks;
4493 d++) {
4494 struct md_rdev *rdev = conf->mirrors[d].rdev;
4495 if (rdev)
4496 clear_bit(In_sync, &rdev->flags);
4497 rdev = conf->mirrors[d].replacement;
4498 if (rdev)
4499 clear_bit(In_sync, &rdev->flags);
4500 }
4501 }
4502 mddev->layout = mddev->new_layout;
4503 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4504 mddev->reshape_position = MaxSector;
4505 mddev->delta_disks = 0;
4506 mddev->reshape_backwards = 0;
4507}
4508
3537static struct md_personality raid10_personality = 4509static struct md_personality raid10_personality =
3538{ 4510{
3539 .name = "raid10", 4511 .name = "raid10",
@@ -3552,6 +4524,9 @@ static struct md_personality raid10_personality =
3552 .size = raid10_size, 4524 .size = raid10_size,
3553 .resize = raid10_resize, 4525 .resize = raid10_resize,
3554 .takeover = raid10_takeover, 4526 .takeover = raid10_takeover,
4527 .check_reshape = raid10_check_reshape,
4528 .start_reshape = raid10_start_reshape,
4529 .finish_reshape = raid10_finish_reshape,
3555}; 4530};
3556 4531
3557static int __init raid_init(void) 4532static int __init raid_init(void)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 7c615613c381..135b1b0a1554 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -14,32 +14,38 @@ struct mirror_info {
14struct r10conf { 14struct r10conf {
15 struct mddev *mddev; 15 struct mddev *mddev;
16 struct mirror_info *mirrors; 16 struct mirror_info *mirrors;
17 int raid_disks; 17 struct mirror_info *mirrors_new, *mirrors_old;
18 spinlock_t device_lock; 18 spinlock_t device_lock;
19 19
20 /* geometry */ 20 /* geometry */
21 int near_copies; /* number of copies laid out 21 struct geom {
22 int raid_disks;
23 int near_copies; /* number of copies laid out
22 * raid0 style */ 24 * raid0 style */
23 int far_copies; /* number of copies laid out 25 int far_copies; /* number of copies laid out
24 * at large strides across drives 26 * at large strides across drives
25 */ 27 */
26 int far_offset; /* far_copies are offset by 1 28 int far_offset; /* far_copies are offset by 1
27 * stripe instead of many 29 * stripe instead of many
28 */ 30 */
29 int copies; /* near_copies * far_copies. 31 sector_t stride; /* distance between far copies.
30 * must be <= raid_disks
31 */
32 sector_t stride; /* distance between far copies.
33 * This is size / far_copies unless 32 * This is size / far_copies unless
34 * far_offset, in which case it is 33 * far_offset, in which case it is
35 * 1 stripe. 34 * 1 stripe.
36 */ 35 */
36 int chunk_shift; /* shift from chunks to sectors */
37 sector_t chunk_mask;
38 } prev, geo;
39 int copies; /* near_copies * far_copies.
40 * must be <= raid_disks
41 */
37 42
38 sector_t dev_sectors; /* temp copy of 43 sector_t dev_sectors; /* temp copy of
39 * mddev->dev_sectors */ 44 * mddev->dev_sectors */
40 45 sector_t reshape_progress;
41 int chunk_shift; /* shift from chunks to sectors */ 46 sector_t reshape_safe;
42 sector_t chunk_mask; 47 unsigned long reshape_checkpoint;
48 sector_t offset_diff;
43 49
44 struct list_head retry_list; 50 struct list_head retry_list;
45 /* queue pending writes and submit them on unplug */ 51 /* queue pending writes and submit them on unplug */
@@ -136,6 +142,7 @@ enum r10bio_state {
136 R10BIO_Uptodate, 142 R10BIO_Uptodate,
137 R10BIO_IsSync, 143 R10BIO_IsSync,
138 R10BIO_IsRecover, 144 R10BIO_IsRecover,
145 R10BIO_IsReshape,
139 R10BIO_Degraded, 146 R10BIO_Degraded,
140/* Set ReadError on bios that experience a read error 147/* Set ReadError on bios that experience a read error
141 * so that raid10d knows what to do with them. 148 * so that raid10d knows what to do with them.
@@ -146,5 +153,10 @@ enum r10bio_state {
146 */ 153 */
147 R10BIO_MadeGood, 154 R10BIO_MadeGood,
148 R10BIO_WriteError, 155 R10BIO_WriteError,
156/* During a reshape we might be performing IO on the
157 * 'previous' part of the array, in which case this
158 * flag is set
159 */
160 R10BIO_Previous,
149}; 161};
150#endif 162#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f351422938e0..d26767246d26 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -488,6 +488,27 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
488 return sh; 488 return sh;
489} 489}
490 490
491/* Determine if 'data_offset' or 'new_data_offset' should be used
492 * in this stripe_head.
493 */
494static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
495{
496 sector_t progress = conf->reshape_progress;
497 /* Need a memory barrier to make sure we see the value
498 * of conf->generation, or ->data_offset that was set before
499 * reshape_progress was updated.
500 */
501 smp_rmb();
502 if (progress == MaxSector)
503 return 0;
504 if (sh->generation == conf->generation - 1)
505 return 0;
506 /* We are in a reshape, and this is a new-generation stripe,
507 * so use new_data_offset.
508 */
509 return 1;
510}
511
491static void 512static void
492raid5_end_read_request(struct bio *bi, int error); 513raid5_end_read_request(struct bio *bi, int error);
493static void 514static void
@@ -518,6 +539,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
518 replace_only = 1; 539 replace_only = 1;
519 } else 540 } else
520 continue; 541 continue;
542 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
543 rw |= REQ_SYNC;
521 544
522 bi = &sh->dev[i].req; 545 bi = &sh->dev[i].req;
523 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 546 rbi = &sh->dev[i].rreq; /* For writing to replacement */
@@ -603,7 +626,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
603 __func__, (unsigned long long)sh->sector, 626 __func__, (unsigned long long)sh->sector,
604 bi->bi_rw, i); 627 bi->bi_rw, i);
605 atomic_inc(&sh->count); 628 atomic_inc(&sh->count);
606 bi->bi_sector = sh->sector + rdev->data_offset; 629 if (use_new_offset(conf, sh))
630 bi->bi_sector = (sh->sector
631 + rdev->new_data_offset);
632 else
633 bi->bi_sector = (sh->sector
634 + rdev->data_offset);
607 bi->bi_flags = 1 << BIO_UPTODATE; 635 bi->bi_flags = 1 << BIO_UPTODATE;
608 bi->bi_idx = 0; 636 bi->bi_idx = 0;
609 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 637 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -627,7 +655,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
627 __func__, (unsigned long long)sh->sector, 655 __func__, (unsigned long long)sh->sector,
628 rbi->bi_rw, i); 656 rbi->bi_rw, i);
629 atomic_inc(&sh->count); 657 atomic_inc(&sh->count);
630 rbi->bi_sector = sh->sector + rrdev->data_offset; 658 if (use_new_offset(conf, sh))
659 rbi->bi_sector = (sh->sector
660 + rrdev->new_data_offset);
661 else
662 rbi->bi_sector = (sh->sector
663 + rrdev->data_offset);
631 rbi->bi_flags = 1 << BIO_UPTODATE; 664 rbi->bi_flags = 1 << BIO_UPTODATE;
632 rbi->bi_idx = 0; 665 rbi->bi_idx = 0;
633 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 666 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -1114,6 +1147,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1114 dev->sector + STRIPE_SECTORS) { 1147 dev->sector + STRIPE_SECTORS) {
1115 if (wbi->bi_rw & REQ_FUA) 1148 if (wbi->bi_rw & REQ_FUA)
1116 set_bit(R5_WantFUA, &dev->flags); 1149 set_bit(R5_WantFUA, &dev->flags);
1150 if (wbi->bi_rw & REQ_SYNC)
1151 set_bit(R5_SyncIO, &dev->flags);
1117 tx = async_copy_data(1, wbi, dev->page, 1152 tx = async_copy_data(1, wbi, dev->page,
1118 dev->sector, tx); 1153 dev->sector, tx);
1119 wbi = r5_next_bio(wbi, dev->sector); 1154 wbi = r5_next_bio(wbi, dev->sector);
@@ -1131,13 +1166,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
1131 int pd_idx = sh->pd_idx; 1166 int pd_idx = sh->pd_idx;
1132 int qd_idx = sh->qd_idx; 1167 int qd_idx = sh->qd_idx;
1133 int i; 1168 int i;
1134 bool fua = false; 1169 bool fua = false, sync = false;
1135 1170
1136 pr_debug("%s: stripe %llu\n", __func__, 1171 pr_debug("%s: stripe %llu\n", __func__,
1137 (unsigned long long)sh->sector); 1172 (unsigned long long)sh->sector);
1138 1173
1139 for (i = disks; i--; ) 1174 for (i = disks; i--; ) {
1140 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1175 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1176 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1177 }
1141 1178
1142 for (i = disks; i--; ) { 1179 for (i = disks; i--; ) {
1143 struct r5dev *dev = &sh->dev[i]; 1180 struct r5dev *dev = &sh->dev[i];
@@ -1146,6 +1183,8 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
1146 set_bit(R5_UPTODATE, &dev->flags); 1183 set_bit(R5_UPTODATE, &dev->flags);
1147 if (fua) 1184 if (fua)
1148 set_bit(R5_WantFUA, &dev->flags); 1185 set_bit(R5_WantFUA, &dev->flags);
1186 if (sync)
1187 set_bit(R5_SyncIO, &dev->flags);
1149 } 1188 }
1150 } 1189 }
1151 1190
@@ -1648,7 +1687,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1648 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1687 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1649 char b[BDEVNAME_SIZE]; 1688 char b[BDEVNAME_SIZE];
1650 struct md_rdev *rdev = NULL; 1689 struct md_rdev *rdev = NULL;
1651 1690 sector_t s;
1652 1691
1653 for (i=0 ; i<disks; i++) 1692 for (i=0 ; i<disks; i++)
1654 if (bi == &sh->dev[i].req) 1693 if (bi == &sh->dev[i].req)
@@ -1671,6 +1710,10 @@ static void raid5_end_read_request(struct bio * bi, int error)
1671 if (!rdev) 1710 if (!rdev)
1672 rdev = conf->disks[i].rdev; 1711 rdev = conf->disks[i].rdev;
1673 1712
1713 if (use_new_offset(conf, sh))
1714 s = sh->sector + rdev->new_data_offset;
1715 else
1716 s = sh->sector + rdev->data_offset;
1674 if (uptodate) { 1717 if (uptodate) {
1675 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1718 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1676 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1719 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
@@ -1683,8 +1726,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1683 "md/raid:%s: read error corrected" 1726 "md/raid:%s: read error corrected"
1684 " (%lu sectors at %llu on %s)\n", 1727 " (%lu sectors at %llu on %s)\n",
1685 mdname(conf->mddev), STRIPE_SECTORS, 1728 mdname(conf->mddev), STRIPE_SECTORS,
1686 (unsigned long long)(sh->sector 1729 (unsigned long long)s,
1687 + rdev->data_offset),
1688 bdevname(rdev->bdev, b)); 1730 bdevname(rdev->bdev, b));
1689 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1731 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1690 clear_bit(R5_ReadError, &sh->dev[i].flags); 1732 clear_bit(R5_ReadError, &sh->dev[i].flags);
@@ -1704,8 +1746,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1704 "md/raid:%s: read error on replacement device " 1746 "md/raid:%s: read error on replacement device "
1705 "(sector %llu on %s).\n", 1747 "(sector %llu on %s).\n",
1706 mdname(conf->mddev), 1748 mdname(conf->mddev),
1707 (unsigned long long)(sh->sector 1749 (unsigned long long)s,
1708 + rdev->data_offset),
1709 bdn); 1750 bdn);
1710 else if (conf->mddev->degraded >= conf->max_degraded) 1751 else if (conf->mddev->degraded >= conf->max_degraded)
1711 printk_ratelimited( 1752 printk_ratelimited(
@@ -1713,8 +1754,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1713 "md/raid:%s: read error not correctable " 1754 "md/raid:%s: read error not correctable "
1714 "(sector %llu on %s).\n", 1755 "(sector %llu on %s).\n",
1715 mdname(conf->mddev), 1756 mdname(conf->mddev),
1716 (unsigned long long)(sh->sector 1757 (unsigned long long)s,
1717 + rdev->data_offset),
1718 bdn); 1758 bdn);
1719 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1759 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1720 /* Oh, no!!! */ 1760 /* Oh, no!!! */
@@ -1723,8 +1763,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1723 "md/raid:%s: read error NOT corrected!! " 1763 "md/raid:%s: read error NOT corrected!! "
1724 "(sector %llu on %s).\n", 1764 "(sector %llu on %s).\n",
1725 mdname(conf->mddev), 1765 mdname(conf->mddev),
1726 (unsigned long long)(sh->sector 1766 (unsigned long long)s,
1727 + rdev->data_offset),
1728 bdn); 1767 bdn);
1729 else if (atomic_read(&rdev->read_errors) 1768 else if (atomic_read(&rdev->read_errors)
1730 > conf->max_nr_stripes) 1769 > conf->max_nr_stripes)
@@ -3561,7 +3600,7 @@ finish:
3561 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 3600 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
3562 rdev = conf->disks[i].rdev; 3601 rdev = conf->disks[i].rdev;
3563 rdev_clear_badblocks(rdev, sh->sector, 3602 rdev_clear_badblocks(rdev, sh->sector,
3564 STRIPE_SECTORS); 3603 STRIPE_SECTORS, 0);
3565 rdev_dec_pending(rdev, conf->mddev); 3604 rdev_dec_pending(rdev, conf->mddev);
3566 } 3605 }
3567 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 3606 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
@@ -3570,7 +3609,7 @@ finish:
3570 /* rdev have been moved down */ 3609 /* rdev have been moved down */
3571 rdev = conf->disks[i].rdev; 3610 rdev = conf->disks[i].rdev;
3572 rdev_clear_badblocks(rdev, sh->sector, 3611 rdev_clear_badblocks(rdev, sh->sector,
3573 STRIPE_SECTORS); 3612 STRIPE_SECTORS, 0);
3574 rdev_dec_pending(rdev, conf->mddev); 3613 rdev_dec_pending(rdev, conf->mddev);
3575 } 3614 }
3576 } 3615 }
@@ -3842,6 +3881,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3842 raid_bio->bi_next = (void*)rdev; 3881 raid_bio->bi_next = (void*)rdev;
3843 align_bi->bi_bdev = rdev->bdev; 3882 align_bi->bi_bdev = rdev->bdev;
3844 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3883 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3884 /* No reshape active, so we can trust rdev->data_offset */
3845 align_bi->bi_sector += rdev->data_offset; 3885 align_bi->bi_sector += rdev->data_offset;
3846 3886
3847 if (!bio_fits_rdev(align_bi) || 3887 if (!bio_fits_rdev(align_bi) ||
@@ -3953,12 +3993,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)
3953 plugged = mddev_check_plugged(mddev); 3993 plugged = mddev_check_plugged(mddev);
3954 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3994 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
3955 DEFINE_WAIT(w); 3995 DEFINE_WAIT(w);
3956 int disks, data_disks;
3957 int previous; 3996 int previous;
3958 3997
3959 retry: 3998 retry:
3960 previous = 0; 3999 previous = 0;
3961 disks = conf->raid_disks;
3962 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 4000 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
3963 if (unlikely(conf->reshape_progress != MaxSector)) { 4001 if (unlikely(conf->reshape_progress != MaxSector)) {
3964 /* spinlock is needed as reshape_progress may be 4002 /* spinlock is needed as reshape_progress may be
@@ -3970,13 +4008,12 @@ static void make_request(struct mddev *mddev, struct bio * bi)
3970 * to check again. 4008 * to check again.
3971 */ 4009 */
3972 spin_lock_irq(&conf->device_lock); 4010 spin_lock_irq(&conf->device_lock);
3973 if (mddev->delta_disks < 0 4011 if (mddev->reshape_backwards
3974 ? logical_sector < conf->reshape_progress 4012 ? logical_sector < conf->reshape_progress
3975 : logical_sector >= conf->reshape_progress) { 4013 : logical_sector >= conf->reshape_progress) {
3976 disks = conf->previous_raid_disks;
3977 previous = 1; 4014 previous = 1;
3978 } else { 4015 } else {
3979 if (mddev->delta_disks < 0 4016 if (mddev->reshape_backwards
3980 ? logical_sector < conf->reshape_safe 4017 ? logical_sector < conf->reshape_safe
3981 : logical_sector >= conf->reshape_safe) { 4018 : logical_sector >= conf->reshape_safe) {
3982 spin_unlock_irq(&conf->device_lock); 4019 spin_unlock_irq(&conf->device_lock);
@@ -3986,7 +4023,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
3986 } 4023 }
3987 spin_unlock_irq(&conf->device_lock); 4024 spin_unlock_irq(&conf->device_lock);
3988 } 4025 }
3989 data_disks = disks - conf->max_degraded;
3990 4026
3991 new_sector = raid5_compute_sector(conf, logical_sector, 4027 new_sector = raid5_compute_sector(conf, logical_sector,
3992 previous, 4028 previous,
@@ -4009,7 +4045,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4009 */ 4045 */
4010 int must_retry = 0; 4046 int must_retry = 0;
4011 spin_lock_irq(&conf->device_lock); 4047 spin_lock_irq(&conf->device_lock);
4012 if (mddev->delta_disks < 0 4048 if (mddev->reshape_backwards
4013 ? logical_sector >= conf->reshape_progress 4049 ? logical_sector >= conf->reshape_progress
4014 : logical_sector < conf->reshape_progress) 4050 : logical_sector < conf->reshape_progress)
4015 /* mismatch, need to try again */ 4051 /* mismatch, need to try again */
@@ -4108,11 +4144,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4108 4144
4109 if (sector_nr == 0) { 4145 if (sector_nr == 0) {
4110 /* If restarting in the middle, skip the initial sectors */ 4146 /* If restarting in the middle, skip the initial sectors */
4111 if (mddev->delta_disks < 0 && 4147 if (mddev->reshape_backwards &&
4112 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4148 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
4113 sector_nr = raid5_size(mddev, 0, 0) 4149 sector_nr = raid5_size(mddev, 0, 0)
4114 - conf->reshape_progress; 4150 - conf->reshape_progress;
4115 } else if (mddev->delta_disks >= 0 && 4151 } else if (!mddev->reshape_backwards &&
4116 conf->reshape_progress > 0) 4152 conf->reshape_progress > 0)
4117 sector_nr = conf->reshape_progress; 4153 sector_nr = conf->reshape_progress;
4118 sector_div(sector_nr, new_data_disks); 4154 sector_div(sector_nr, new_data_disks);
@@ -4133,13 +4169,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4133 else 4169 else
4134 reshape_sectors = mddev->chunk_sectors; 4170 reshape_sectors = mddev->chunk_sectors;
4135 4171
4136 /* we update the metadata when there is more than 3Meg 4172 /* We update the metadata at least every 10 seconds, or when
4137 * in the block range (that is rather arbitrary, should 4173 * the data about to be copied would over-write the source of
4138 * probably be time based) or when the data about to be 4174 * the data at the front of the range. i.e. one new_stripe
4139 * copied would over-write the source of the data at 4175 * along from reshape_progress new_maps to after where
4140 * the front of the range. 4176 * reshape_safe old_maps to
4141 * i.e. one new_stripe along from reshape_progress new_maps
4142 * to after where reshape_safe old_maps to
4143 */ 4177 */
4144 writepos = conf->reshape_progress; 4178 writepos = conf->reshape_progress;
4145 sector_div(writepos, new_data_disks); 4179 sector_div(writepos, new_data_disks);
@@ -4147,7 +4181,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4147 sector_div(readpos, data_disks); 4181 sector_div(readpos, data_disks);
4148 safepos = conf->reshape_safe; 4182 safepos = conf->reshape_safe;
4149 sector_div(safepos, data_disks); 4183 sector_div(safepos, data_disks);
4150 if (mddev->delta_disks < 0) { 4184 if (mddev->reshape_backwards) {
4151 writepos -= min_t(sector_t, reshape_sectors, writepos); 4185 writepos -= min_t(sector_t, reshape_sectors, writepos);
4152 readpos += reshape_sectors; 4186 readpos += reshape_sectors;
4153 safepos += reshape_sectors; 4187 safepos += reshape_sectors;
@@ -4157,11 +4191,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4157 safepos -= min_t(sector_t, reshape_sectors, safepos); 4191 safepos -= min_t(sector_t, reshape_sectors, safepos);
4158 } 4192 }
4159 4193
4194 /* Having calculated the 'writepos' possibly use it
4195 * to set 'stripe_addr' which is where we will write to.
4196 */
4197 if (mddev->reshape_backwards) {
4198 BUG_ON(conf->reshape_progress == 0);
4199 stripe_addr = writepos;
4200 BUG_ON((mddev->dev_sectors &
4201 ~((sector_t)reshape_sectors - 1))
4202 - reshape_sectors - stripe_addr
4203 != sector_nr);
4204 } else {
4205 BUG_ON(writepos != sector_nr + reshape_sectors);
4206 stripe_addr = sector_nr;
4207 }
4208
4160 /* 'writepos' is the most advanced device address we might write. 4209 /* 'writepos' is the most advanced device address we might write.
4161 * 'readpos' is the least advanced device address we might read. 4210 * 'readpos' is the least advanced device address we might read.
4162 * 'safepos' is the least address recorded in the metadata as having 4211 * 'safepos' is the least address recorded in the metadata as having
4163 * been reshaped. 4212 * been reshaped.
4164 * If 'readpos' is behind 'writepos', then there is no way that we can 4213 * If there is a min_offset_diff, these are adjusted either by
4214 * increasing the safepos/readpos if diff is negative, or
4215 * increasing writepos if diff is positive.
4216 * If 'readpos' is then behind 'writepos', there is no way that we can
4165 * ensure safety in the face of a crash - that must be done by userspace 4217 * ensure safety in the face of a crash - that must be done by userspace
4166 * making a backup of the data. So in that case there is no particular 4218 * making a backup of the data. So in that case there is no particular
4167 * rush to update metadata. 4219 * rush to update metadata.
@@ -4174,7 +4226,13 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4174 * Maybe that number should be configurable, but I'm not sure it is 4226 * Maybe that number should be configurable, but I'm not sure it is
4175 * worth it.... maybe it could be a multiple of safemode_delay??? 4227 * worth it.... maybe it could be a multiple of safemode_delay???
4176 */ 4228 */
4177 if ((mddev->delta_disks < 0 4229 if (conf->min_offset_diff < 0) {
4230 safepos += -conf->min_offset_diff;
4231 readpos += -conf->min_offset_diff;
4232 } else
4233 writepos += conf->min_offset_diff;
4234
4235 if ((mddev->reshape_backwards
4178 ? (safepos > writepos && readpos < writepos) 4236 ? (safepos > writepos && readpos < writepos)
4179 : (safepos < writepos && readpos > writepos)) || 4237 : (safepos < writepos && readpos > writepos)) ||
4180 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4238 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
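
The hunk above folds conf->min_offset_diff into the positions being compared before deciding whether a metadata checkpoint is due: a negative diff pushes safepos/readpos up, a positive diff pushes writepos up. Below is a minimal standalone sketch of that decision; the function and variable names are illustrative, not taken from the kernel.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;

    /* Decide whether a metadata checkpoint is needed before copying the next
     * block of a reshape.  'backwards' mirrors mddev->reshape_backwards and
     * 'diff' mirrors conf->min_offset_diff (may be negative). */
    static bool need_checkpoint(bool backwards, sector_t writepos,
                                sector_t readpos, sector_t safepos,
                                long long diff, bool timer_expired)
    {
        if (diff < 0) {
            safepos += -diff;       /* new layout sits at lower device offsets */
            readpos += -diff;
        } else {
            writepos += diff;       /* new layout sits at higher device offsets */
        }

        if (backwards)
            return (safepos > writepos && readpos < writepos) || timer_expired;
        return (safepos < writepos && readpos > writepos) || timer_expired;
    }

    int main(void)
    {
        /* Forward reshape: the write front is past the last safe position and
         * before the read front, so a checkpoint is required. */
        printf("%d\n", need_checkpoint(false, 2048, 4096, 1024, 0, false));
        return 0;
    }
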
@@ -4195,17 +4253,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4195 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4253 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4196 } 4254 }
4197 4255
4198 if (mddev->delta_disks < 0) {
4199 BUG_ON(conf->reshape_progress == 0);
4200 stripe_addr = writepos;
4201 BUG_ON((mddev->dev_sectors &
4202 ~((sector_t)reshape_sectors - 1))
4203 - reshape_sectors - stripe_addr
4204 != sector_nr);
4205 } else {
4206 BUG_ON(writepos != sector_nr + reshape_sectors);
4207 stripe_addr = sector_nr;
4208 }
4209 INIT_LIST_HEAD(&stripes); 4256 INIT_LIST_HEAD(&stripes);
4210 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4257 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
4211 int j; 4258 int j;
@@ -4239,7 +4286,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4239 list_add(&sh->lru, &stripes); 4286 list_add(&sh->lru, &stripes);
4240 } 4287 }
4241 spin_lock_irq(&conf->device_lock); 4288 spin_lock_irq(&conf->device_lock);
4242 if (mddev->delta_disks < 0) 4289 if (mddev->reshape_backwards)
4243 conf->reshape_progress -= reshape_sectors * new_data_disks; 4290 conf->reshape_progress -= reshape_sectors * new_data_disks;
4244 else 4291 else
4245 conf->reshape_progress += reshape_sectors * new_data_disks; 4292 conf->reshape_progress += reshape_sectors * new_data_disks;
@@ -4952,16 +4999,42 @@ static int run(struct mddev *mddev)
4952 struct md_rdev *rdev; 4999 struct md_rdev *rdev;
4953 sector_t reshape_offset = 0; 5000 sector_t reshape_offset = 0;
4954 int i; 5001 int i;
5002 long long min_offset_diff = 0;
5003 int first = 1;
4955 5004
4956 if (mddev->recovery_cp != MaxSector) 5005 if (mddev->recovery_cp != MaxSector)
4957 printk(KERN_NOTICE "md/raid:%s: not clean" 5006 printk(KERN_NOTICE "md/raid:%s: not clean"
4958 " -- starting background reconstruction\n", 5007 " -- starting background reconstruction\n",
4959 mdname(mddev)); 5008 mdname(mddev));
5009
5010 rdev_for_each(rdev, mddev) {
5011 long long diff;
5012 if (rdev->raid_disk < 0)
5013 continue;
5014 diff = (rdev->new_data_offset - rdev->data_offset);
5015 if (first) {
5016 min_offset_diff = diff;
5017 first = 0;
5018 } else if (mddev->reshape_backwards &&
5019 diff < min_offset_diff)
5020 min_offset_diff = diff;
5021 else if (!mddev->reshape_backwards &&
5022 diff > min_offset_diff)
5023 min_offset_diff = diff;
5024 }
5025
4960 if (mddev->reshape_position != MaxSector) { 5026 if (mddev->reshape_position != MaxSector) {
4961 /* Check that we can continue the reshape. 5027 /* Check that we can continue the reshape.
4962 * Currently only disks can change, it must 5028 * Difficulties arise if the stripe we would write to
4963 * increase, and we must be past the point where 5029 * next is at or after the stripe we would read from next.
4964 * a stripe over-writes itself 5030 * For a reshape that changes the number of devices, this
5031 * is only possible for a very short time, and mdadm makes
 5032 * sure that time appears to have passed before assembling
5033 * the array. So we fail if that time hasn't passed.
5034 * For a reshape that keeps the number of devices the same
 5035 * mdadm must be monitoring the reshape and keeping the
5036 * critical areas read-only and backed up. It will start
5037 * the array in read-only mode, so we check for that.
4965 */ 5038 */
4966 sector_t here_new, here_old; 5039 sector_t here_new, here_old;
4967 int old_disks; 5040 int old_disks;
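
The new rdev_for_each() loop records in min_offset_diff the per-device delta (new_data_offset - data_offset) that is closest to zero in the direction the reshape moves: the largest delta for a forward reshape (where mdadm typically lowers the data offset, so deltas are negative) and the smallest for a backward one. A small userspace sketch of the same selection, with made-up delta values:

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Keep the per-device (new_data_offset - data_offset) delta closest to
     * zero: the largest delta for a forward reshape (deltas typically <= 0),
     * the smallest for a backward one (deltas typically >= 0).
     */
    static long long pick_min_offset_diff(const long long *delta, int n,
                                          bool reshape_backwards)
    {
        long long min_diff = 0;
        bool first = true;
        int i;

        for (i = 0; i < n; i++) {
            if (first) {
                min_diff = delta[i];
                first = false;
            } else if (reshape_backwards && delta[i] < min_diff) {
                min_diff = delta[i];
            } else if (!reshape_backwards && delta[i] > min_diff) {
                min_diff = delta[i];
            }
        }
        return min_diff;
    }

    int main(void)
    {
        long long fwd[]  = { -2048, -1024, -4096 };   /* hypothetical deltas */
        long long back[] = {  2048,  1024,  4096 };

        printf("forward:  %lld\n", pick_min_offset_diff(fwd, 3, false));  /* -1024 */
        printf("backward: %lld\n", pick_min_offset_diff(back, 3, true));  /*  1024 */
        return 0;
    }
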
@@ -4993,26 +5066,34 @@ static int run(struct mddev *mddev)
4993 /* here_old is the first stripe that we might need to read 5066 /* here_old is the first stripe that we might need to read
4994 * from */ 5067 * from */
4995 if (mddev->delta_disks == 0) { 5068 if (mddev->delta_disks == 0) {
5069 if ((here_new * mddev->new_chunk_sectors !=
5070 here_old * mddev->chunk_sectors)) {
5071 printk(KERN_ERR "md/raid:%s: reshape position is"
5072 " confused - aborting\n", mdname(mddev));
5073 return -EINVAL;
5074 }
4996 /* We cannot be sure it is safe to start an in-place 5075 /* We cannot be sure it is safe to start an in-place
4997 * reshape. It is only safe if user-space if monitoring 5076 * reshape. It is only safe if user-space is monitoring
4998 * and taking constant backups. 5077 * and taking constant backups.
4999 * mdadm always starts a situation like this in 5078 * mdadm always starts a situation like this in
5000 * readonly mode so it can take control before 5079 * readonly mode so it can take control before
5001 * allowing any writes. So just check for that. 5080 * allowing any writes. So just check for that.
5002 */ 5081 */
5003 if ((here_new * mddev->new_chunk_sectors != 5082 if (abs(min_offset_diff) >= mddev->chunk_sectors &&
5004 here_old * mddev->chunk_sectors) || 5083 abs(min_offset_diff) >= mddev->new_chunk_sectors)
5005 mddev->ro == 0) { 5084 /* not really in-place - so OK */;
5006 printk(KERN_ERR "md/raid:%s: in-place reshape must be started" 5085 else if (mddev->ro == 0) {
5007 " in read-only mode - aborting\n", 5086 printk(KERN_ERR "md/raid:%s: in-place reshape "
5087 "must be started in read-only mode "
5088 "- aborting\n",
5008 mdname(mddev)); 5089 mdname(mddev));
5009 return -EINVAL; 5090 return -EINVAL;
5010 } 5091 }
5011 } else if (mddev->delta_disks < 0 5092 } else if (mddev->reshape_backwards
5012 ? (here_new * mddev->new_chunk_sectors <= 5093 ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
5013 here_old * mddev->chunk_sectors) 5094 here_old * mddev->chunk_sectors)
5014 : (here_new * mddev->new_chunk_sectors >= 5095 : (here_new * mddev->new_chunk_sectors >=
5015 here_old * mddev->chunk_sectors)) { 5096 here_old * mddev->chunk_sectors + (-min_offset_diff))) {
5016 /* Reading from the same stripe as writing to - bad */ 5097 /* Reading from the same stripe as writing to - bad */
5017 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5098 printk(KERN_ERR "md/raid:%s: reshape_position too early for "
5018 "auto-recovery - aborting.\n", 5099 "auto-recovery - aborting.\n",
@@ -5037,6 +5118,7 @@ static int run(struct mddev *mddev)
5037 if (IS_ERR(conf)) 5118 if (IS_ERR(conf))
5038 return PTR_ERR(conf); 5119 return PTR_ERR(conf);
5039 5120
5121 conf->min_offset_diff = min_offset_diff;
5040 mddev->thread = conf->thread; 5122 mddev->thread = conf->thread;
5041 conf->thread = NULL; 5123 conf->thread = NULL;
5042 mddev->private = conf; 5124 mddev->private = conf;
@@ -5182,9 +5264,12 @@ static int run(struct mddev *mddev)
5182 blk_queue_io_opt(mddev->queue, chunk_size * 5264 blk_queue_io_opt(mddev->queue, chunk_size *
5183 (conf->raid_disks - conf->max_degraded)); 5265 (conf->raid_disks - conf->max_degraded));
5184 5266
5185 rdev_for_each(rdev, mddev) 5267 rdev_for_each(rdev, mddev) {
5186 disk_stack_limits(mddev->gendisk, rdev->bdev, 5268 disk_stack_limits(mddev->gendisk, rdev->bdev,
5187 rdev->data_offset << 9); 5269 rdev->data_offset << 9);
5270 disk_stack_limits(mddev->gendisk, rdev->bdev,
5271 rdev->new_data_offset << 9);
5272 }
5188 } 5273 }
5189 5274
5190 return 0; 5275 return 0;
@@ -5418,12 +5503,18 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
5418 * any io in the removed space completes, but it hardly seems 5503 * any io in the removed space completes, but it hardly seems
5419 * worth it. 5504 * worth it.
5420 */ 5505 */
5506 sector_t newsize;
5421 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5507 sectors &= ~((sector_t)mddev->chunk_sectors - 1);
5422 md_set_array_sectors(mddev, raid5_size(mddev, sectors, 5508 newsize = raid5_size(mddev, sectors, mddev->raid_disks);
5423 mddev->raid_disks)); 5509 if (mddev->external_size &&
5424 if (mddev->array_sectors > 5510 mddev->array_sectors > newsize)
5425 raid5_size(mddev, sectors, mddev->raid_disks))
5426 return -EINVAL; 5511 return -EINVAL;
5512 if (mddev->bitmap) {
5513 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
5514 if (ret)
5515 return ret;
5516 }
5517 md_set_array_sectors(mddev, newsize);
5427 set_capacity(mddev->gendisk, mddev->array_sectors); 5518 set_capacity(mddev->gendisk, mddev->array_sectors);
5428 revalidate_disk(mddev->gendisk); 5519 revalidate_disk(mddev->gendisk);
5429 if (sectors > mddev->dev_sectors && 5520 if (sectors > mddev->dev_sectors &&
@@ -5468,9 +5559,6 @@ static int check_reshape(struct mddev *mddev)
5468 mddev->new_layout == mddev->layout && 5559 mddev->new_layout == mddev->layout &&
5469 mddev->new_chunk_sectors == mddev->chunk_sectors) 5560 mddev->new_chunk_sectors == mddev->chunk_sectors)
5470 return 0; /* nothing to do */ 5561 return 0; /* nothing to do */
5471 if (mddev->bitmap)
5472 /* Cannot grow a bitmap yet */
5473 return -EBUSY;
5474 if (has_failed(conf)) 5562 if (has_failed(conf))
5475 return -EINVAL; 5563 return -EINVAL;
5476 if (mddev->delta_disks < 0) { 5564 if (mddev->delta_disks < 0) {
@@ -5505,10 +5593,14 @@ static int raid5_start_reshape(struct mddev *mddev)
5505 if (!check_stripe_cache(mddev)) 5593 if (!check_stripe_cache(mddev))
5506 return -ENOSPC; 5594 return -ENOSPC;
5507 5595
5508 rdev_for_each(rdev, mddev) 5596 if (has_failed(conf))
5597 return -EINVAL;
5598
5599 rdev_for_each(rdev, mddev) {
5509 if (!test_bit(In_sync, &rdev->flags) 5600 if (!test_bit(In_sync, &rdev->flags)
5510 && !test_bit(Faulty, &rdev->flags)) 5601 && !test_bit(Faulty, &rdev->flags))
5511 spares++; 5602 spares++;
5603 }
5512 5604
5513 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5605 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
5514 /* Not enough devices even to make a degraded array 5606 /* Not enough devices even to make a degraded array
@@ -5535,12 +5627,16 @@ static int raid5_start_reshape(struct mddev *mddev)
5535 conf->chunk_sectors = mddev->new_chunk_sectors; 5627 conf->chunk_sectors = mddev->new_chunk_sectors;
5536 conf->prev_algo = conf->algorithm; 5628 conf->prev_algo = conf->algorithm;
5537 conf->algorithm = mddev->new_layout; 5629 conf->algorithm = mddev->new_layout;
5538 if (mddev->delta_disks < 0) 5630 conf->generation++;
5631 /* Code that selects data_offset needs to see the generation update
5632 * if reshape_progress has been set - so a memory barrier needed.
5633 */
5634 smp_mb();
5635 if (mddev->reshape_backwards)
5539 conf->reshape_progress = raid5_size(mddev, 0, 0); 5636 conf->reshape_progress = raid5_size(mddev, 0, 0);
5540 else 5637 else
5541 conf->reshape_progress = 0; 5638 conf->reshape_progress = 0;
5542 conf->reshape_safe = conf->reshape_progress; 5639 conf->reshape_safe = conf->reshape_progress;
5543 conf->generation++;
5544 spin_unlock_irq(&conf->device_lock); 5640 spin_unlock_irq(&conf->device_lock);
5545 5641
5546 /* Add some new drives, as many as will fit. 5642 /* Add some new drives, as many as will fit.
@@ -5592,6 +5688,9 @@ static int raid5_start_reshape(struct mddev *mddev)
5592 mddev->recovery = 0; 5688 mddev->recovery = 0;
5593 spin_lock_irq(&conf->device_lock); 5689 spin_lock_irq(&conf->device_lock);
5594 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 5690 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
5691 rdev_for_each(rdev, mddev)
5692 rdev->new_data_offset = rdev->data_offset;
5693 smp_wmb();
5595 conf->reshape_progress = MaxSector; 5694 conf->reshape_progress = MaxSector;
5596 mddev->reshape_position = MaxSector; 5695 mddev->reshape_position = MaxSector;
5597 spin_unlock_irq(&conf->device_lock); 5696 spin_unlock_irq(&conf->device_lock);
@@ -5610,9 +5709,13 @@ static void end_reshape(struct r5conf *conf)
5610{ 5709{
5611 5710
5612 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 5711 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
5712 struct md_rdev *rdev;
5613 5713
5614 spin_lock_irq(&conf->device_lock); 5714 spin_lock_irq(&conf->device_lock);
5615 conf->previous_raid_disks = conf->raid_disks; 5715 conf->previous_raid_disks = conf->raid_disks;
5716 rdev_for_each(rdev, conf->mddev)
5717 rdev->data_offset = rdev->new_data_offset;
5718 smp_wmb();
5616 conf->reshape_progress = MaxSector; 5719 conf->reshape_progress = MaxSector;
5617 spin_unlock_irq(&conf->device_lock); 5720 spin_unlock_irq(&conf->device_lock);
5618 wake_up(&conf->wait_for_overlap); 5721 wake_up(&conf->wait_for_overlap);
@@ -5652,17 +5755,18 @@ static void raid5_finish_reshape(struct mddev *mddev)
5652 d < conf->raid_disks - mddev->delta_disks; 5755 d < conf->raid_disks - mddev->delta_disks;
5653 d++) { 5756 d++) {
5654 struct md_rdev *rdev = conf->disks[d].rdev; 5757 struct md_rdev *rdev = conf->disks[d].rdev;
5655 if (rdev && 5758 if (rdev)
5656 raid5_remove_disk(mddev, rdev) == 0) { 5759 clear_bit(In_sync, &rdev->flags);
5657 sysfs_unlink_rdev(mddev, rdev); 5760 rdev = conf->disks[d].replacement;
5658 rdev->raid_disk = -1; 5761 if (rdev)
5659 } 5762 clear_bit(In_sync, &rdev->flags);
5660 } 5763 }
5661 } 5764 }
5662 mddev->layout = conf->algorithm; 5765 mddev->layout = conf->algorithm;
5663 mddev->chunk_sectors = conf->chunk_sectors; 5766 mddev->chunk_sectors = conf->chunk_sectors;
5664 mddev->reshape_position = MaxSector; 5767 mddev->reshape_position = MaxSector;
5665 mddev->delta_disks = 0; 5768 mddev->delta_disks = 0;
5769 mddev->reshape_backwards = 0;
5666 } 5770 }
5667} 5771}
5668 5772
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 8d8e13934a48..2164021f3b5f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -285,6 +285,7 @@ enum r5dev_flags {
285 */ 285 */
286 R5_Wantdrain, /* dev->towrite needs to be drained */ 286 R5_Wantdrain, /* dev->towrite needs to be drained */
287 R5_WantFUA, /* Write should be FUA */ 287 R5_WantFUA, /* Write should be FUA */
288 R5_SyncIO, /* The IO is sync */
288 R5_WriteError, /* got a write error - need to record it */ 289 R5_WriteError, /* got a write error - need to record it */
289 R5_MadeGood, /* A bad block has been fixed by writing to it */ 290 R5_MadeGood, /* A bad block has been fixed by writing to it */
290 R5_ReadRepl, /* Will/did read from replacement rather than orig */ 291 R5_ReadRepl, /* Will/did read from replacement rather than orig */
@@ -385,6 +386,12 @@ struct r5conf {
385 short generation; /* increments with every reshape */ 386 short generation; /* increments with every reshape */
386 unsigned long reshape_checkpoint; /* Time we last updated 387 unsigned long reshape_checkpoint; /* Time we last updated
387 * metadata */ 388 * metadata */
389 long long min_offset_diff; /* minimum difference between
390 * data_offset and
391 * new_data_offset across all
392 * devices. May be negative,
393 * but is closest to zero.
394 */
388 395
389 struct list_head handle_list; /* stripes needing handling */ 396 struct list_head handle_list; /* stripes needing handling */
390 struct list_head hold_list; /* preread ready stripes */ 397 struct list_head hold_list; /* preread ready stripes */
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index 8c0a3adc5df5..ee753536ab70 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -233,7 +233,10 @@ struct mdp_superblock_1 {
233 __le32 delta_disks; /* change in number of raid_disks */ 233 __le32 delta_disks; /* change in number of raid_disks */
234 __le32 new_layout; /* new layout */ 234 __le32 new_layout; /* new layout */
235 __le32 new_chunk; /* new chunk size (512byte sectors) */ 235 __le32 new_chunk; /* new chunk size (512byte sectors) */
236 __u8 pad1[128-124]; /* set to 0 when written */ 236 __le32 new_offset; /* signed number to add to data_offset in new
237 * layout. 0 == no-change. This can be
238 * different on each device in the array.
239 */
237 240
238 /* constant this-device information - 64 bytes */ 241 /* constant this-device information - 64 bytes */
239 __le64 data_offset; /* sector start of data, often 0 */ 242 __le64 data_offset; /* sector start of data, often 0 */
@@ -281,10 +284,18 @@ struct mdp_superblock_1 {
281 * active device with same 'role'. 284 * active device with same 'role'.
282 * 'recovery_offset' is also set. 285 * 'recovery_offset' is also set.
283 */ 286 */
287#define MD_FEATURE_RESHAPE_BACKWARDS 32 /* Reshape doesn't change number
288 * of devices, but is going
289 * backwards anyway.
290 */
291#define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */
284#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ 292#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
285 |MD_FEATURE_RECOVERY_OFFSET \ 293 |MD_FEATURE_RECOVERY_OFFSET \
286 |MD_FEATURE_RESHAPE_ACTIVE \ 294 |MD_FEATURE_RESHAPE_ACTIVE \
287 |MD_FEATURE_BAD_BLOCKS \ 295 |MD_FEATURE_BAD_BLOCKS \
288 |MD_FEATURE_REPLACEMENT) 296 |MD_FEATURE_REPLACEMENT \
297 |MD_FEATURE_RESHAPE_BACKWARDS \
298 |MD_FEATURE_NEW_OFFSET \
299 )
289 300
290#endif 301#endif
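
new_offset is stored per device as a signed delta to be added to data_offset for the new layout, and MD_FEATURE_NEW_OFFSET marks superblocks where it must be honoured. The actual handling lives in the v1 superblock code in md.c, which is not shown in this excerpt; the fragment below is only an illustrative sketch, with a trimmed struct, native-endian fields instead of __le32/__le64, and a made-up helper name.

    #include <stdint.h>
    #include <stdio.h>

    #define MD_FEATURE_NEW_OFFSET   64      /* new_offset must be honoured */

    /* Trimmed-down view of the v1 superblock fields used here. */
    struct mini_sb1 {
        uint32_t feature_map;
        uint64_t data_offset;   /* sector start of data */
        int32_t  new_offset;    /* signed delta applied during reshape */
    };

    /* Illustrative helper (not a kernel function): the device's data offset
     * for the new layout. */
    static uint64_t new_data_offset(const struct mini_sb1 *sb)
    {
        if (!(sb->feature_map & MD_FEATURE_NEW_OFFSET))
            return sb->data_offset;                 /* no change requested */
        return sb->data_offset + sb->new_offset;    /* delta may be negative */
    }

    int main(void)
    {
        struct mini_sb1 sb = {
            .feature_map = MD_FEATURE_NEW_OFFSET,
            .data_offset = 262144,      /* hypothetical: 128 MiB in sectors */
            .new_offset  = -2048,       /* data moves 1 MiB towards the start */
        };

        printf("new data offset: %llu\n",
               (unsigned long long)new_data_offset(&sb));
        return 0;
    }
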
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 53272e9860a7..640c69ceec96 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -99,8 +99,20 @@ extern const struct raid6_calls raid6_altivec2;
99extern const struct raid6_calls raid6_altivec4; 99extern const struct raid6_calls raid6_altivec4;
100extern const struct raid6_calls raid6_altivec8; 100extern const struct raid6_calls raid6_altivec8;
101 101
102struct raid6_recov_calls {
103 void (*data2)(int, size_t, int, int, void **);
104 void (*datap)(int, size_t, int, void **);
105 int (*valid)(void);
106 const char *name;
107 int priority;
108};
109
110extern const struct raid6_recov_calls raid6_recov_intx1;
111extern const struct raid6_recov_calls raid6_recov_ssse3;
112
102/* Algorithm list */ 113/* Algorithm list */
103extern const struct raid6_calls * const raid6_algos[]; 114extern const struct raid6_calls * const raid6_algos[];
115extern const struct raid6_recov_calls *const raid6_recov_algos[];
104int raid6_select_algo(void); 116int raid6_select_algo(void);
105 117
106/* Return values from chk_syndrome */ 118/* Return values from chk_syndrome */
@@ -111,14 +123,16 @@ int raid6_select_algo(void);
111 123
112/* Galois field tables */ 124/* Galois field tables */
113extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); 125extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
126extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256)));
114extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); 127extern const u8 raid6_gfexp[256] __attribute__((aligned(256)));
115extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); 128extern const u8 raid6_gfinv[256] __attribute__((aligned(256)));
116extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); 129extern const u8 raid6_gfexi[256] __attribute__((aligned(256)));
117 130
118/* Recovery routines */ 131/* Recovery routines */
119void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, 132extern void (*raid6_2data_recov)(int disks, size_t bytes, int faila, int failb,
120 void **ptrs); 133 void **ptrs);
121void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); 134extern void (*raid6_datap_recov)(int disks, size_t bytes, int faila,
135 void **ptrs);
122void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, 136void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
123 void **ptrs); 137 void **ptrs);
124 138
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 8a38102770f3..de06dfe165b8 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -1,6 +1,6 @@
1obj-$(CONFIG_RAID6_PQ) += raid6_pq.o 1obj-$(CONFIG_RAID6_PQ) += raid6_pq.o
2 2
3raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ 3raid6_pq-y += algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \
4 int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ 4 int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \
5 altivec8.o mmx.o sse1.o sse2.o 5 altivec8.o mmx.o sse1.o sse2.o
6hostprogs-y += mktables 6hostprogs-y += mktables
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 8b02f60ffc86..589f5f50ad2e 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -17,11 +17,11 @@
17 */ 17 */
18 18
19#include <linux/raid/pq.h> 19#include <linux/raid/pq.h>
20#include <linux/module.h>
21#ifndef __KERNEL__ 20#ifndef __KERNEL__
22#include <sys/mman.h> 21#include <sys/mman.h>
23#include <stdio.h> 22#include <stdio.h>
24#else 23#else
24#include <linux/module.h>
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#if !RAID6_USE_EMPTY_ZERO_PAGE 26#if !RAID6_USE_EMPTY_ZERO_PAGE
27/* In .bss so it's zeroed */ 27/* In .bss so it's zeroed */
@@ -34,10 +34,6 @@ struct raid6_calls raid6_call;
34EXPORT_SYMBOL_GPL(raid6_call); 34EXPORT_SYMBOL_GPL(raid6_call);
35 35
36const struct raid6_calls * const raid6_algos[] = { 36const struct raid6_calls * const raid6_algos[] = {
37 &raid6_intx1,
38 &raid6_intx2,
39 &raid6_intx4,
40 &raid6_intx8,
41#if defined(__ia64__) 37#if defined(__ia64__)
42 &raid6_intx16, 38 &raid6_intx16,
43 &raid6_intx32, 39 &raid6_intx32,
@@ -61,6 +57,24 @@ const struct raid6_calls * const raid6_algos[] = {
61 &raid6_altivec4, 57 &raid6_altivec4,
62 &raid6_altivec8, 58 &raid6_altivec8,
63#endif 59#endif
60 &raid6_intx1,
61 &raid6_intx2,
62 &raid6_intx4,
63 &raid6_intx8,
64 NULL
65};
66
67void (*raid6_2data_recov)(int, size_t, int, int, void **);
68EXPORT_SYMBOL_GPL(raid6_2data_recov);
69
70void (*raid6_datap_recov)(int, size_t, int, void **);
71EXPORT_SYMBOL_GPL(raid6_datap_recov);
72
73const struct raid6_recov_calls *const raid6_recov_algos[] = {
74#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
75 &raid6_recov_ssse3,
76#endif
77 &raid6_recov_intx1,
64 NULL 78 NULL
65}; 79};
66 80
@@ -72,59 +86,55 @@ const struct raid6_calls * const raid6_algos[] = {
72#define time_before(x, y) ((x) < (y)) 86#define time_before(x, y) ((x) < (y))
73#endif 87#endif
74 88
75/* Try to pick the best algorithm */ 89static inline const struct raid6_recov_calls *raid6_choose_recov(void)
76/* This code uses the gfmul table as convenient data set to abuse */
77
78int __init raid6_select_algo(void)
79{ 90{
80 const struct raid6_calls * const * algo; 91 const struct raid6_recov_calls *const *algo;
81 const struct raid6_calls * best; 92 const struct raid6_recov_calls *best;
82 char *syndromes;
83 void *dptrs[(65536/PAGE_SIZE)+2];
84 int i, disks;
85 unsigned long perf, bestperf;
86 int bestprefer;
87 unsigned long j0, j1;
88 93
89 disks = (65536/PAGE_SIZE)+2; 94 for (best = NULL, algo = raid6_recov_algos; *algo; algo++)
90 for ( i = 0 ; i < disks-2 ; i++ ) { 95 if (!best || (*algo)->priority > best->priority)
91 dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; 96 if (!(*algo)->valid || (*algo)->valid())
92 } 97 best = *algo;
93 98
94 /* Normal code - use a 2-page allocation to avoid D$ conflict */ 99 if (best) {
95 syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); 100 raid6_2data_recov = best->data2;
101 raid6_datap_recov = best->datap;
96 102
97 if ( !syndromes ) { 103 printk("raid6: using %s recovery algorithm\n", best->name);
98 printk("raid6: Yikes! No memory available.\n"); 104 } else
99 return -ENOMEM; 105 printk("raid6: Yikes! No recovery algorithm found!\n");
100 }
101 106
102 dptrs[disks-2] = syndromes; 107 return best;
103 dptrs[disks-1] = syndromes + PAGE_SIZE; 108}
109
110static inline const struct raid6_calls *raid6_choose_gen(
111 void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks)
112{
113 unsigned long perf, bestperf, j0, j1;
114 const struct raid6_calls *const *algo;
115 const struct raid6_calls *best;
104 116
105 bestperf = 0; bestprefer = 0; best = NULL; 117 for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) {
118 if (!best || (*algo)->prefer >= best->prefer) {
119 if ((*algo)->valid && !(*algo)->valid())
120 continue;
106 121
107 for ( algo = raid6_algos ; *algo ; algo++ ) {
108 if ( !(*algo)->valid || (*algo)->valid() ) {
109 perf = 0; 122 perf = 0;
110 123
111 preempt_disable(); 124 preempt_disable();
112 j0 = jiffies; 125 j0 = jiffies;
113 while ( (j1 = jiffies) == j0 ) 126 while ((j1 = jiffies) == j0)
114 cpu_relax(); 127 cpu_relax();
115 while (time_before(jiffies, 128 while (time_before(jiffies,
116 j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { 129 j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
117 (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); 130 (*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs);
118 perf++; 131 perf++;
119 } 132 }
120 preempt_enable(); 133 preempt_enable();
121 134
122 if ( (*algo)->prefer > bestprefer || 135 if (perf > bestperf) {
123 ((*algo)->prefer == bestprefer &&
124 perf > bestperf) ) {
125 best = *algo;
126 bestprefer = best->prefer;
127 bestperf = perf; 136 bestperf = perf;
137 best = *algo;
128 } 138 }
129 printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, 139 printk("raid6: %-8s %5ld MB/s\n", (*algo)->name,
130 (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); 140 (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
@@ -139,9 +149,46 @@ int __init raid6_select_algo(void)
139 } else 149 } else
140 printk("raid6: Yikes! No algorithm found!\n"); 150 printk("raid6: Yikes! No algorithm found!\n");
141 151
152 return best;
153}
154
155
156/* Try to pick the best algorithm */
157/* This code uses the gfmul table as convenient data set to abuse */
158
159int __init raid6_select_algo(void)
160{
161 const int disks = (65536/PAGE_SIZE)+2;
162
163 const struct raid6_calls *gen_best;
164 const struct raid6_recov_calls *rec_best;
165 char *syndromes;
166 void *dptrs[(65536/PAGE_SIZE)+2];
167 int i;
168
169 for (i = 0; i < disks-2; i++)
170 dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i;
171
172 /* Normal code - use a 2-page allocation to avoid D$ conflict */
173 syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
174
175 if (!syndromes) {
176 printk("raid6: Yikes! No memory available.\n");
177 return -ENOMEM;
178 }
179
180 dptrs[disks-2] = syndromes;
181 dptrs[disks-1] = syndromes + PAGE_SIZE;
182
183 /* select raid gen_syndrome function */
184 gen_best = raid6_choose_gen(&dptrs, disks);
185
186 /* select raid recover functions */
187 rec_best = raid6_choose_recov();
188
142 free_pages((unsigned long)syndromes, 1); 189 free_pages((unsigned long)syndromes, 1);
143 190
144 return best ? 0 : -EINVAL; 191 return gen_best && rec_best ? 0 : -EINVAL;
145} 192}
146 193
147static void raid6_exit(void) 194static void raid6_exit(void)
diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c
index 8a3780902cec..39787db588b0 100644
--- a/lib/raid6/mktables.c
+++ b/lib/raid6/mktables.c
@@ -81,6 +81,31 @@ int main(int argc, char *argv[])
81 printf("EXPORT_SYMBOL(raid6_gfmul);\n"); 81 printf("EXPORT_SYMBOL(raid6_gfmul);\n");
82 printf("#endif\n"); 82 printf("#endif\n");
83 83
84 /* Compute vector multiplication table */
85 printf("\nconst u8 __attribute__((aligned(256)))\n"
86 "raid6_vgfmul[256][32] =\n"
87 "{\n");
88 for (i = 0; i < 256; i++) {
89 printf("\t{\n");
90 for (j = 0; j < 16; j += 8) {
91 printf("\t\t");
92 for (k = 0; k < 8; k++)
93 printf("0x%02x,%c", gfmul(i, j + k),
94 (k == 7) ? '\n' : ' ');
95 }
96 for (j = 0; j < 16; j += 8) {
97 printf("\t\t");
98 for (k = 0; k < 8; k++)
99 printf("0x%02x,%c", gfmul(i, (j + k) << 4),
100 (k == 7) ? '\n' : ' ');
101 }
102 printf("\t},\n");
103 }
104 printf("};\n");
105 printf("#ifdef __KERNEL__\n");
106 printf("EXPORT_SYMBOL(raid6_vgfmul);\n");
107 printf("#endif\n");
108
84 /* Compute power-of-2 table (exponent) */ 109 /* Compute power-of-2 table (exponent) */
85 v = 1; 110 v = 1;
86 printf("\nconst u8 __attribute__((aligned(256)))\n" 111 printf("\nconst u8 __attribute__((aligned(256)))\n"
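
Each raid6_vgfmul[c] row emitted above packs two 16-entry nibble tables: bytes 0-15 hold gfmul(c, x) for a low nibble x and bytes 16-31 hold gfmul(c, x << 4) for a high nibble, which is exactly the layout pshufb wants. Below is a standalone check of that split-nibble multiply; the gfmul() here reimplements the generator's multiply over the 0x11d polynomial.

    #include <stdint.h>
    #include <stdio.h>

    /* GF(2^8) multiply with the RAID-6 polynomial x^8+x^4+x^3+x^2+1 (0x11d). */
    static uint8_t gfmul(uint8_t a, uint8_t b)
    {
        uint8_t v = 0;

        while (b) {
            if (b & 1)
                v ^= a;
            a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
            b >>= 1;
        }
        return v;
    }

    int main(void)
    {
        uint8_t c = 0x1f;       /* arbitrary multiplier constant */
        uint8_t vtab[32];       /* one raid6_vgfmul[c] row */
        int i, x;

        for (i = 0; i < 16; i++) {
            vtab[i]      = gfmul(c, i);         /* low-nibble products  */
            vtab[16 + i] = gfmul(c, i << 4);    /* high-nibble products */
        }

        /* pshufb-style multiply: split x into nibbles, look up, XOR. */
        for (x = 0; x < 256; x++) {
            uint8_t split = vtab[x & 0x0f] ^ vtab[16 + (x >> 4)];

            if (split != gfmul(c, x)) {
                printf("mismatch at %d\n", x);
                return 1;
            }
        }
        printf("nibble-table multiply matches gfmul for c=0x%02x\n",
               (unsigned int)c);
        return 0;
    }
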
diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c
index fe275d7b6b36..1805a5cc5daa 100644
--- a/lib/raid6/recov.c
+++ b/lib/raid6/recov.c
@@ -22,7 +22,7 @@
22#include <linux/raid/pq.h> 22#include <linux/raid/pq.h>
23 23
24/* Recover two failed data blocks. */ 24/* Recover two failed data blocks. */
25void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, 25void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, int failb,
26 void **ptrs) 26 void **ptrs)
27{ 27{
28 u8 *p, *q, *dp, *dq; 28 u8 *p, *q, *dp, *dq;
@@ -64,10 +64,9 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
64 p++; q++; 64 p++; q++;
65 } 65 }
66} 66}
67EXPORT_SYMBOL_GPL(raid6_2data_recov);
68 67
69/* Recover failure of one data block plus the P block */ 68/* Recover failure of one data block plus the P block */
70void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) 69void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, void **ptrs)
71{ 70{
72 u8 *p, *q, *dq; 71 u8 *p, *q, *dq;
73 const u8 *qmul; /* Q multiplier table */ 72 const u8 *qmul; /* Q multiplier table */
@@ -96,7 +95,15 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
96 q++; dq++; 95 q++; dq++;
97 } 96 }
98} 97}
99EXPORT_SYMBOL_GPL(raid6_datap_recov); 98
99
100const struct raid6_recov_calls raid6_recov_intx1 = {
101 .data2 = raid6_2data_recov_intx1,
102 .datap = raid6_datap_recov_intx1,
103 .valid = NULL,
104 .name = "intx1",
105 .priority = 0,
106};
100 107
101#ifndef __KERNEL__ 108#ifndef __KERNEL__
102/* Testing only */ 109/* Testing only */
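
With the direct exports removed, the reference routines are published only through the raid6_recov_intx1 descriptor; any further implementation follows the same pattern and is listed in raid6_recov_algos[], where raid6_choose_recov() picks the valid entry with the highest priority. A sketch of such a provider is below; the "example" names are hypothetical and not part of the commit, only the struct layout matches the one added to pq.h.

    #include <stddef.h>

    /* Same shape as the struct added to <linux/raid/pq.h>. */
    struct raid6_recov_calls {
        void (*data2)(int, size_t, int, int, void **);
        void (*datap)(int, size_t, int, void **);
        int (*valid)(void);
        const char *name;
        int priority;
    };

    /* Hypothetical implementation, not part of the commit. */
    static void example_2data(int disks, size_t bytes, int faila, int failb,
                              void **ptrs)
    { /* ... recover two data blocks ... */ }

    static void example_datap(int disks, size_t bytes, int faila, void **ptrs)
    { /* ... recover one data block plus P ... */ }

    static int example_valid(void)
    {
        return 1;               /* e.g. check a CPU feature here */
    }

    const struct raid6_recov_calls raid6_recov_example = {
        .data2    = example_2data,
        .datap    = example_datap,
        .valid    = example_valid,  /* NULL means "always usable" */
        .name     = "example",
        .priority = 2,              /* higher wins in raid6_choose_recov() */
    };
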
diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c
new file mode 100644
index 000000000000..37ae61930559
--- /dev/null
+++ b/lib/raid6/recov_ssse3.c
@@ -0,0 +1,335 @@
1/*
2 * Copyright (C) 2012 Intel Corporation
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; version 2
7 * of the License.
8 */
9
10#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
11
12#include <linux/raid/pq.h>
13#include "x86.h"
14
15static int raid6_has_ssse3(void)
16{
17 return boot_cpu_has(X86_FEATURE_XMM) &&
18 boot_cpu_has(X86_FEATURE_XMM2) &&
19 boot_cpu_has(X86_FEATURE_SSSE3);
20}
21
22void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, int failb,
23 void **ptrs)
24{
25 u8 *p, *q, *dp, *dq;
26 const u8 *pbmul; /* P multiplier table for B data */
27 const u8 *qmul; /* Q multiplier table (for both) */
28 static const u8 __aligned(16) x0f[16] = {
29 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
30 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f};
31
32 p = (u8 *)ptrs[disks-2];
33 q = (u8 *)ptrs[disks-1];
34
35 /* Compute syndrome with zero for the missing data pages
36 Use the dead data pages as temporary storage for
37 delta p and delta q */
38 dp = (u8 *)ptrs[faila];
39 ptrs[faila] = (void *)raid6_empty_zero_page;
40 ptrs[disks-2] = dp;
41 dq = (u8 *)ptrs[failb];
42 ptrs[failb] = (void *)raid6_empty_zero_page;
43 ptrs[disks-1] = dq;
44
45 raid6_call.gen_syndrome(disks, bytes, ptrs);
46
47 /* Restore pointer table */
48 ptrs[faila] = dp;
49 ptrs[failb] = dq;
50 ptrs[disks-2] = p;
51 ptrs[disks-1] = q;
52
53 /* Now, pick the proper data tables */
54 pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
55 qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
56 raid6_gfexp[failb]]];
57
58 kernel_fpu_begin();
59
60 asm volatile("movdqa %0,%%xmm7" : : "m" (x0f[0]));
61
62#ifdef CONFIG_X86_64
63 asm volatile("movdqa %0,%%xmm6" : : "m" (qmul[0]));
64 asm volatile("movdqa %0,%%xmm14" : : "m" (pbmul[0]));
65 asm volatile("movdqa %0,%%xmm15" : : "m" (pbmul[16]));
66#endif
67
68 /* Now do it... */
69 while (bytes) {
70#ifdef CONFIG_X86_64
71 /* xmm6, xmm14, xmm15 */
72
73 asm volatile("movdqa %0,%%xmm1" : : "m" (q[0]));
74 asm volatile("movdqa %0,%%xmm9" : : "m" (q[16]));
75 asm volatile("movdqa %0,%%xmm0" : : "m" (p[0]));
76 asm volatile("movdqa %0,%%xmm8" : : "m" (p[16]));
77 asm volatile("pxor %0,%%xmm1" : : "m" (dq[0]));
78 asm volatile("pxor %0,%%xmm9" : : "m" (dq[16]));
79 asm volatile("pxor %0,%%xmm0" : : "m" (dp[0]));
80 asm volatile("pxor %0,%%xmm8" : : "m" (dp[16]));
81
82 /* xmm0/8 = px */
83
84 asm volatile("movdqa %xmm6,%xmm4");
85 asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16]));
86 asm volatile("movdqa %xmm6,%xmm12");
87 asm volatile("movdqa %xmm5,%xmm13");
88 asm volatile("movdqa %xmm1,%xmm3");
89 asm volatile("movdqa %xmm9,%xmm11");
90 asm volatile("movdqa %xmm0,%xmm2"); /* xmm2/10 = px */
91 asm volatile("movdqa %xmm8,%xmm10");
92 asm volatile("psraw $4,%xmm1");
93 asm volatile("psraw $4,%xmm9");
94 asm volatile("pand %xmm7,%xmm3");
95 asm volatile("pand %xmm7,%xmm11");
96 asm volatile("pand %xmm7,%xmm1");
97 asm volatile("pand %xmm7,%xmm9");
98 asm volatile("pshufb %xmm3,%xmm4");
99 asm volatile("pshufb %xmm11,%xmm12");
100 asm volatile("pshufb %xmm1,%xmm5");
101 asm volatile("pshufb %xmm9,%xmm13");
102 asm volatile("pxor %xmm4,%xmm5");
103 asm volatile("pxor %xmm12,%xmm13");
104
105 /* xmm5/13 = qx */
106
107 asm volatile("movdqa %xmm14,%xmm4");
108 asm volatile("movdqa %xmm15,%xmm1");
109 asm volatile("movdqa %xmm14,%xmm12");
110 asm volatile("movdqa %xmm15,%xmm9");
111 asm volatile("movdqa %xmm2,%xmm3");
112 asm volatile("movdqa %xmm10,%xmm11");
113 asm volatile("psraw $4,%xmm2");
114 asm volatile("psraw $4,%xmm10");
115 asm volatile("pand %xmm7,%xmm3");
116 asm volatile("pand %xmm7,%xmm11");
117 asm volatile("pand %xmm7,%xmm2");
118 asm volatile("pand %xmm7,%xmm10");
119 asm volatile("pshufb %xmm3,%xmm4");
120 asm volatile("pshufb %xmm11,%xmm12");
121 asm volatile("pshufb %xmm2,%xmm1");
122 asm volatile("pshufb %xmm10,%xmm9");
123 asm volatile("pxor %xmm4,%xmm1");
124 asm volatile("pxor %xmm12,%xmm9");
125
126 /* xmm1/9 = pbmul[px] */
127 asm volatile("pxor %xmm5,%xmm1");
128 asm volatile("pxor %xmm13,%xmm9");
129 /* xmm1/9 = db = DQ */
130 asm volatile("movdqa %%xmm1,%0" : "=m" (dq[0]));
131 asm volatile("movdqa %%xmm9,%0" : "=m" (dq[16]));
132
133 asm volatile("pxor %xmm1,%xmm0");
134 asm volatile("pxor %xmm9,%xmm8");
135 asm volatile("movdqa %%xmm0,%0" : "=m" (dp[0]));
136 asm volatile("movdqa %%xmm8,%0" : "=m" (dp[16]));
137
138 bytes -= 32;
139 p += 32;
140 q += 32;
141 dp += 32;
142 dq += 32;
143#else
144 asm volatile("movdqa %0,%%xmm1" : : "m" (*q));
145 asm volatile("movdqa %0,%%xmm0" : : "m" (*p));
146 asm volatile("pxor %0,%%xmm1" : : "m" (*dq));
147 asm volatile("pxor %0,%%xmm0" : : "m" (*dp));
148
149 /* 1 = dq ^ q
150 * 0 = dp ^ p
151 */
152 asm volatile("movdqa %0,%%xmm4" : : "m" (qmul[0]));
153 asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16]));
154
155 asm volatile("movdqa %xmm1,%xmm3");
156 asm volatile("psraw $4,%xmm1");
157 asm volatile("pand %xmm7,%xmm3");
158 asm volatile("pand %xmm7,%xmm1");
159 asm volatile("pshufb %xmm3,%xmm4");
160 asm volatile("pshufb %xmm1,%xmm5");
161 asm volatile("pxor %xmm4,%xmm5");
162
163 asm volatile("movdqa %xmm0,%xmm2"); /* xmm2 = px */
164
165 /* xmm5 = qx */
166
167 asm volatile("movdqa %0,%%xmm4" : : "m" (pbmul[0]));
168 asm volatile("movdqa %0,%%xmm1" : : "m" (pbmul[16]));
169 asm volatile("movdqa %xmm2,%xmm3");
170 asm volatile("psraw $4,%xmm2");
171 asm volatile("pand %xmm7,%xmm3");
172 asm volatile("pand %xmm7,%xmm2");
173 asm volatile("pshufb %xmm3,%xmm4");
174 asm volatile("pshufb %xmm2,%xmm1");
175 asm volatile("pxor %xmm4,%xmm1");
176
177 /* xmm1 = pbmul[px] */
178 asm volatile("pxor %xmm5,%xmm1");
179 /* xmm1 = db = DQ */
180 asm volatile("movdqa %%xmm1,%0" : "=m" (*dq));
181
182 asm volatile("pxor %xmm1,%xmm0");
183 asm volatile("movdqa %%xmm0,%0" : "=m" (*dp));
184
185 bytes -= 16;
186 p += 16;
187 q += 16;
188 dp += 16;
189 dq += 16;
190#endif
191 }
192
193 kernel_fpu_end();
194}
195
196
197void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, void **ptrs)
198{
199 u8 *p, *q, *dq;
200 const u8 *qmul; /* Q multiplier table */
201 static const u8 __aligned(16) x0f[16] = {
202 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
203 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f};
204
205 p = (u8 *)ptrs[disks-2];
206 q = (u8 *)ptrs[disks-1];
207
208 /* Compute syndrome with zero for the missing data page
209 Use the dead data page as temporary storage for delta q */
210 dq = (u8 *)ptrs[faila];
211 ptrs[faila] = (void *)raid6_empty_zero_page;
212 ptrs[disks-1] = dq;
213
214 raid6_call.gen_syndrome(disks, bytes, ptrs);
215
216 /* Restore pointer table */
217 ptrs[faila] = dq;
218 ptrs[disks-1] = q;
219
220 /* Now, pick the proper data tables */
221 qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
222
223 kernel_fpu_begin();
224
225 asm volatile("movdqa %0, %%xmm7" : : "m" (x0f[0]));
226
227 while (bytes) {
228#ifdef CONFIG_X86_64
229 asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0]));
230 asm volatile("movdqa %0, %%xmm4" : : "m" (dq[16]));
231 asm volatile("pxor %0, %%xmm3" : : "m" (q[0]));
232 asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0]));
233
234 /* xmm3 = q[0] ^ dq[0] */
235
236 asm volatile("pxor %0, %%xmm4" : : "m" (q[16]));
237 asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16]));
238
239 /* xmm4 = q[16] ^ dq[16] */
240
241 asm volatile("movdqa %xmm3, %xmm6");
242 asm volatile("movdqa %xmm4, %xmm8");
243
244 /* xmm4 = xmm8 = q[16] ^ dq[16] */
245
246 asm volatile("psraw $4, %xmm3");
247 asm volatile("pand %xmm7, %xmm6");
248 asm volatile("pand %xmm7, %xmm3");
249 asm volatile("pshufb %xmm6, %xmm0");
250 asm volatile("pshufb %xmm3, %xmm1");
251 asm volatile("movdqa %0, %%xmm10" : : "m" (qmul[0]));
252 asm volatile("pxor %xmm0, %xmm1");
253 asm volatile("movdqa %0, %%xmm11" : : "m" (qmul[16]));
254
255 /* xmm1 = qmul[q[0] ^ dq[0]] */
256
257 asm volatile("psraw $4, %xmm4");
258 asm volatile("pand %xmm7, %xmm8");
259 asm volatile("pand %xmm7, %xmm4");
260 asm volatile("pshufb %xmm8, %xmm10");
261 asm volatile("pshufb %xmm4, %xmm11");
262 asm volatile("movdqa %0, %%xmm2" : : "m" (p[0]));
263 asm volatile("pxor %xmm10, %xmm11");
264 asm volatile("movdqa %0, %%xmm12" : : "m" (p[16]));
265
266 /* xmm11 = qmul[q[16] ^ dq[16]] */
267
268 asm volatile("pxor %xmm1, %xmm2");
269
270 /* xmm2 = p[0] ^ qmul[q[0] ^ dq[0]] */
271
272 asm volatile("pxor %xmm11, %xmm12");
273
274 /* xmm12 = p[16] ^ qmul[q[16] ^ dq[16]] */
275
276 asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0]));
277 asm volatile("movdqa %%xmm11, %0" : "=m" (dq[16]));
278
279 asm volatile("movdqa %%xmm2, %0" : "=m" (p[0]));
280 asm volatile("movdqa %%xmm12, %0" : "=m" (p[16]));
281
282 bytes -= 32;
283 p += 32;
284 q += 32;
285 dq += 32;
286
287#else
288 asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0]));
289 asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0]));
290 asm volatile("pxor %0, %%xmm3" : : "m" (q[0]));
291 asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16]));
292
293 /* xmm3 = *q ^ *dq */
294
295 asm volatile("movdqa %xmm3, %xmm6");
296 asm volatile("movdqa %0, %%xmm2" : : "m" (p[0]));
297 asm volatile("psraw $4, %xmm3");
298 asm volatile("pand %xmm7, %xmm6");
299 asm volatile("pand %xmm7, %xmm3");
300 asm volatile("pshufb %xmm6, %xmm0");
301 asm volatile("pshufb %xmm3, %xmm1");
302 asm volatile("pxor %xmm0, %xmm1");
303
 304 /* xmm1 = qmul[*q ^ *dq] */
305
306 asm volatile("pxor %xmm1, %xmm2");
307
308 /* xmm2 = *p ^ qmul[*q ^ *dq] */
309
310 asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0]));
311 asm volatile("movdqa %%xmm2, %0" : "=m" (p[0]));
312
313 bytes -= 16;
314 p += 16;
315 q += 16;
316 dq += 16;
317#endif
318 }
319
320 kernel_fpu_end();
321}
322
323const struct raid6_recov_calls raid6_recov_ssse3 = {
324 .data2 = raid6_2data_recov_ssse3,
325 .datap = raid6_datap_recov_ssse3,
326 .valid = raid6_has_ssse3,
327#ifdef CONFIG_X86_64
328 .name = "ssse3x2",
329#else
330 .name = "ssse3x1",
331#endif
332 .priority = 1,
333};
334
335#endif
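
The SSSE3 loops are a vectorised form of the byte-wise recovery in recov.c; for the data+P case each byte is computed as dq = qmul[q ^ dq] followed by p ^= dq, with the GF(256) multiply done through the two 16-byte nibble tables and pshufb. Below is a scalar sketch of that inner loop, assuming the raid6_vgfmul row layout (low-nibble products first, then high-nibble products); buffer and table setup are omitted.

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Scalar equivalent of the raid6_datap_recov_ssse3() inner loop:
     * 'qmul' is one raid6_vgfmul row (32 bytes), 'q' is the regenerated
     * syndrome, 'dq' is the partial syndrome computed with the failed
     * block zeroed, 'p' is the parity block.
     */
    static void datap_recov_scalar(uint8_t *p, const uint8_t *q, uint8_t *dq,
                                   const uint8_t *qmul, size_t bytes)
    {
        size_t i;

        for (i = 0; i < bytes; i++) {
            uint8_t t = q[i] ^ dq[i];           /* missing data * g^faila    */
            uint8_t d = qmul[t & 0x0f] ^        /* multiply by g^-faila ...  */
                        qmul[16 + (t >> 4)];    /* ... via the nibble tables */

            dq[i] = d;                          /* recovered data block      */
            p[i] ^= d;                          /* fix P with the new data   */
        }
    }
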
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index aa651697b6dc..c76151d94764 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -23,7 +23,7 @@ RANLIB = ranlib
23all: raid6.a raid6test 23all: raid6.a raid6test
24 24
25raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ 25raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \
26 altivec1.o altivec2.o altivec4.o altivec8.o recov.o algos.o \ 26 altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \
27 tables.o 27 tables.o
28 rm -f $@ 28 rm -f $@
29 $(AR) cq $@ $^ 29 $(AR) cq $@ $^
diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c
index 7a930318b17d..5a485b7a7d3c 100644
--- a/lib/raid6/test/test.c
+++ b/lib/raid6/test/test.c
@@ -90,25 +90,35 @@ static int test_disks(int i, int j)
90int main(int argc, char *argv[]) 90int main(int argc, char *argv[])
91{ 91{
92 const struct raid6_calls *const *algo; 92 const struct raid6_calls *const *algo;
93 const struct raid6_recov_calls *const *ra;
93 int i, j; 94 int i, j;
94 int err = 0; 95 int err = 0;
95 96
96 makedata(); 97 makedata();
97 98
98 for (algo = raid6_algos; *algo; algo++) { 99 for (ra = raid6_recov_algos; *ra; ra++) {
99 if (!(*algo)->valid || (*algo)->valid()) { 100 if ((*ra)->valid && !(*ra)->valid())
100 raid6_call = **algo; 101 continue;
102 raid6_2data_recov = (*ra)->data2;
103 raid6_datap_recov = (*ra)->datap;
101 104
102 /* Nuke syndromes */ 105 printf("using recovery %s\n", (*ra)->name);
103 memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
104 106
105 /* Generate assumed good syndrome */ 107 for (algo = raid6_algos; *algo; algo++) {
106 raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, 108 if (!(*algo)->valid || (*algo)->valid()) {
107 (void **)&dataptrs); 109 raid6_call = **algo;
108 110
109 for (i = 0; i < NDISKS-1; i++) 111 /* Nuke syndromes */
110 for (j = i+1; j < NDISKS; j++) 112 memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
111 err += test_disks(i, j); 113
114 /* Generate assumed good syndrome */
115 raid6_call.gen_syndrome(NDISKS, PAGE_SIZE,
116 (void **)&dataptrs);
117
118 for (i = 0; i < NDISKS-1; i++)
119 for (j = i+1; j < NDISKS; j++)
120 err += test_disks(i, j);
121 }
112 } 122 }
113 printf("\n"); 123 printf("\n");
114 } 124 }
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h
index cb2a8c91c886..d55d63232c55 100644
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -35,24 +35,29 @@ static inline void kernel_fpu_end(void)
35{ 35{
36} 36}
37 37
38#define __aligned(x) __attribute__((aligned(x)))
39
38#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ 40#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
39#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions 41#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions
40 * (fast save and restore) */ 42 * (fast save and restore) */
41#define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ 43#define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */
42#define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ 44#define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */
45#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
46#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */
47#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
43#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ 48#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
44 49
45/* Should work well enough on modern CPUs for testing */ 50/* Should work well enough on modern CPUs for testing */
46static inline int boot_cpu_has(int flag) 51static inline int boot_cpu_has(int flag)
47{ 52{
48 u32 eax = (flag >> 5) ? 0x80000001 : 1; 53 u32 eax = (flag & 0x20) ? 0x80000001 : 1;
49 u32 edx; 54 u32 ecx, edx;
50 55
51 asm volatile("cpuid" 56 asm volatile("cpuid"
52 : "+a" (eax), "=d" (edx) 57 : "+a" (eax), "=d" (edx), "=c" (ecx)
53 : : "ecx", "ebx"); 58 : : "ebx");
54 59
55 return (edx >> (flag & 31)) & 1; 60 return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1;
56} 61}
57 62
58#endif /* ndef __KERNEL__ */ 63#endif /* ndef __KERNEL__ */
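
The updated boot_cpu_has() stub decodes both the feature word and the bit: (flag & 0x20) selects the extended CPUID leaf 0x80000001 instead of leaf 1, and (flag & 0x80) selects ECX instead of EDX, which is enough for the word-0, word-1 and word-4 constants defined above. A tiny standalone sketch (pure arithmetic, no cpuid instruction) of where each constant lands:

    #include <stdio.h>

    #define X86_FEATURE_XMM     (0*32+25)   /* word 0: edx of leaf 1 */
    #define X86_FEATURE_MMXEXT  (1*32+22)   /* word 1: edx of leaf 0x80000001 */
    #define X86_FEATURE_SSSE3   (4*32+ 9)   /* word 4: ecx of leaf 1 */

    static void describe(const char *name, int flag)
    {
        unsigned int leaf = (flag & 0x20) ? 0x80000001 : 1;
        const char *reg = (flag & 0x80) ? "ecx" : "edx";

        printf("%-18s -> cpuid leaf 0x%08x, %s bit %d\n",
               name, leaf, reg, flag & 31);
    }

    int main(void)
    {
        describe("X86_FEATURE_XMM", X86_FEATURE_XMM);
        describe("X86_FEATURE_MMXEXT", X86_FEATURE_MMXEXT);
        describe("X86_FEATURE_SSSE3", X86_FEATURE_SSSE3);
        return 0;
    }
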