Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--  drivers/md/raid5.c  719
1 file changed, 652 insertions(+), 67 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2dba305daf3c..dae740adaf65 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -22,6 +22,7 @@
22#include <linux/raid/raid5.h> 22#include <linux/raid/raid5.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/bitops.h> 24#include <linux/bitops.h>
25#include <linux/kthread.h>
25#include <asm/atomic.h> 26#include <asm/atomic.h>
26 27
27#include <linux/raid/bitmap.h> 28#include <linux/raid/bitmap.h>
@@ -93,11 +94,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
93 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 94 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
94 md_wakeup_thread(conf->mddev->thread); 95 md_wakeup_thread(conf->mddev->thread);
95 } 96 }
96 list_add_tail(&sh->lru, &conf->inactive_list);
97 atomic_dec(&conf->active_stripes); 97 atomic_dec(&conf->active_stripes);
98 if (!conf->inactive_blocked || 98 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
99 atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4)) 99 list_add_tail(&sh->lru, &conf->inactive_list);
100 wake_up(&conf->wait_for_stripe); 100 wake_up(&conf->wait_for_stripe);
101 }
101 } 102 }
102 } 103 }
103} 104}
@@ -178,10 +179,10 @@ static int grow_buffers(struct stripe_head *sh, int num)
178 179
179static void raid5_build_block (struct stripe_head *sh, int i); 180static void raid5_build_block (struct stripe_head *sh, int i);
180 181
181static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) 182static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
182{ 183{
183 raid5_conf_t *conf = sh->raid_conf; 184 raid5_conf_t *conf = sh->raid_conf;
184 int disks = conf->raid_disks, i; 185 int i;
185 186
186 if (atomic_read(&sh->count) != 0) 187 if (atomic_read(&sh->count) != 0)
187 BUG(); 188 BUG();
@@ -198,7 +199,9 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
198 sh->pd_idx = pd_idx; 199 sh->pd_idx = pd_idx;
199 sh->state = 0; 200 sh->state = 0;
200 201
201 for (i=disks; i--; ) { 202 sh->disks = disks;
203
204 for (i = sh->disks; i--; ) {
202 struct r5dev *dev = &sh->dev[i]; 205 struct r5dev *dev = &sh->dev[i];
203 206
204 if (dev->toread || dev->towrite || dev->written || 207 if (dev->toread || dev->towrite || dev->written ||
@@ -215,7 +218,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
215 insert_hash(conf, sh); 218 insert_hash(conf, sh);
216} 219}
217 220
218static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector) 221static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
219{ 222{
220 struct stripe_head *sh; 223 struct stripe_head *sh;
221 struct hlist_node *hn; 224 struct hlist_node *hn;
@@ -223,7 +226,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
223 CHECK_DEVLOCK(); 226 CHECK_DEVLOCK();
224 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); 227 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
225 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 228 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
226 if (sh->sector == sector) 229 if (sh->sector == sector && sh->disks == disks)
227 return sh; 230 return sh;
228 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); 231 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
229 return NULL; 232 return NULL;
@@ -232,8 +235,8 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
232static void unplug_slaves(mddev_t *mddev); 235static void unplug_slaves(mddev_t *mddev);
233static void raid5_unplug_device(request_queue_t *q); 236static void raid5_unplug_device(request_queue_t *q);
234 237
235static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, 238static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
236 int pd_idx, int noblock) 239 int pd_idx, int noblock)
237{ 240{
238 struct stripe_head *sh; 241 struct stripe_head *sh;
239 242
@@ -245,7 +248,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
245 wait_event_lock_irq(conf->wait_for_stripe, 248 wait_event_lock_irq(conf->wait_for_stripe,
246 conf->quiesce == 0, 249 conf->quiesce == 0,
247 conf->device_lock, /* nothing */); 250 conf->device_lock, /* nothing */);
248 sh = __find_stripe(conf, sector); 251 sh = __find_stripe(conf, sector, disks);
249 if (!sh) { 252 if (!sh) {
250 if (!conf->inactive_blocked) 253 if (!conf->inactive_blocked)
251 sh = get_free_stripe(conf); 254 sh = get_free_stripe(conf);
@@ -259,11 +262,11 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
259 < (conf->max_nr_stripes *3/4) 262 < (conf->max_nr_stripes *3/4)
260 || !conf->inactive_blocked), 263 || !conf->inactive_blocked),
261 conf->device_lock, 264 conf->device_lock,
262 unplug_slaves(conf->mddev); 265 unplug_slaves(conf->mddev)
263 ); 266 );
264 conf->inactive_blocked = 0; 267 conf->inactive_blocked = 0;
265 } else 268 } else
266 init_stripe(sh, sector, pd_idx); 269 init_stripe(sh, sector, pd_idx, disks);
267 } else { 270 } else {
268 if (atomic_read(&sh->count)) { 271 if (atomic_read(&sh->count)) {
269 if (!list_empty(&sh->lru)) 272 if (!list_empty(&sh->lru))
@@ -271,9 +274,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
271 } else { 274 } else {
272 if (!test_bit(STRIPE_HANDLE, &sh->state)) 275 if (!test_bit(STRIPE_HANDLE, &sh->state))
273 atomic_inc(&conf->active_stripes); 276 atomic_inc(&conf->active_stripes);
274 if (list_empty(&sh->lru)) 277 if (!list_empty(&sh->lru))
275 BUG(); 278 list_del_init(&sh->lru);
276 list_del_init(&sh->lru);
277 } 279 }
278 } 280 }
279 } while (sh == NULL); 281 } while (sh == NULL);
@@ -300,6 +302,7 @@ static int grow_one_stripe(raid5_conf_t *conf)
300 kmem_cache_free(conf->slab_cache, sh); 302 kmem_cache_free(conf->slab_cache, sh);
301 return 0; 303 return 0;
302 } 304 }
305 sh->disks = conf->raid_disks;
303 /* we just created an active stripe so... */ 306 /* we just created an active stripe so... */
304 atomic_set(&sh->count, 1); 307 atomic_set(&sh->count, 1);
305 atomic_inc(&conf->active_stripes); 308 atomic_inc(&conf->active_stripes);
@@ -313,14 +316,16 @@ static int grow_stripes(raid5_conf_t *conf, int num)
313 kmem_cache_t *sc; 316 kmem_cache_t *sc;
314 int devs = conf->raid_disks; 317 int devs = conf->raid_disks;
315 318
316 sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev)); 319 sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
317 320 sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev));
318 sc = kmem_cache_create(conf->cache_name, 321 conf->active_name = 0;
322 sc = kmem_cache_create(conf->cache_name[conf->active_name],
319 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 323 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
320 0, 0, NULL, NULL); 324 0, 0, NULL, NULL);
321 if (!sc) 325 if (!sc)
322 return 1; 326 return 1;
323 conf->slab_cache = sc; 327 conf->slab_cache = sc;
328 conf->pool_size = devs;
324 while (num--) { 329 while (num--) {
325 if (!grow_one_stripe(conf)) 330 if (!grow_one_stripe(conf))
326 return 1; 331 return 1;
@@ -328,6 +333,129 @@ static int grow_stripes(raid5_conf_t *conf, int num)
328 return 0; 333 return 0;
329} 334}
330 335
336#ifdef CONFIG_MD_RAID5_RESHAPE
337static int resize_stripes(raid5_conf_t *conf, int newsize)
338{
339 /* Make all the stripes able to hold 'newsize' devices.
340 * New slots in each stripe get 'page' set to a new page.
341 *
342 * This happens in stages:
343 * 1/ create a new kmem_cache and allocate the required number of
344 * stripe_heads.
345 * 2/ gather all the old stripe_heads and transfer the pages across
346 * to the new stripe_heads. This will have the side effect of
347 * freezing the array as once all stripe_heads have been collected,
348 * no IO will be possible. Old stripe heads are freed once their
349 * pages have been transferred over, and the old kmem_cache is
350 * freed when all stripes are done.
351 * 3/ reallocate conf->disks to be suitably bigger. If this fails,
352 * we simply return a failure status - no need to clean anything up.
353 * 4/ allocate new pages for the new slots in the new stripe_heads.
354 * If this fails, we don't bother trying to shrink the
355 * stripe_heads down again, we just leave them as they are.
356 * As each stripe_head is processed the new one is released into
357 * active service.
358 *
359 * Once step2 is started, we cannot afford to wait for a write,
360 * so we use GFP_NOIO allocations.
361 */
362 struct stripe_head *osh, *nsh;
363 LIST_HEAD(newstripes);
364 struct disk_info *ndisks;
365 int err = 0;
366 kmem_cache_t *sc;
367 int i;
368
369 if (newsize <= conf->pool_size)
370 return 0; /* never bother to shrink */
371
372 /* Step 1 */
373 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
374 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
375 0, 0, NULL, NULL);
376 if (!sc)
377 return -ENOMEM;
378
379 for (i = conf->max_nr_stripes; i; i--) {
380 nsh = kmem_cache_alloc(sc, GFP_KERNEL);
381 if (!nsh)
382 break;
383
384 memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
385
386 nsh->raid_conf = conf;
387 spin_lock_init(&nsh->lock);
388
389 list_add(&nsh->lru, &newstripes);
390 }
391 if (i) {
392 /* didn't get enough, give up */
393 while (!list_empty(&newstripes)) {
394 nsh = list_entry(newstripes.next, struct stripe_head, lru);
395 list_del(&nsh->lru);
396 kmem_cache_free(sc, nsh);
397 }
398 kmem_cache_destroy(sc);
399 return -ENOMEM;
400 }
401 /* Step 2 - Must use GFP_NOIO now.
402 * OK, we have enough stripes, start collecting inactive
403 * stripes and copying them over
404 */
405 list_for_each_entry(nsh, &newstripes, lru) {
406 spin_lock_irq(&conf->device_lock);
407 wait_event_lock_irq(conf->wait_for_stripe,
408 !list_empty(&conf->inactive_list),
409 conf->device_lock,
410 unplug_slaves(conf->mddev)
411 );
412 osh = get_free_stripe(conf);
413 spin_unlock_irq(&conf->device_lock);
414 atomic_set(&nsh->count, 1);
415 for(i=0; i<conf->pool_size; i++)
416 nsh->dev[i].page = osh->dev[i].page;
417 for( ; i<newsize; i++)
418 nsh->dev[i].page = NULL;
419 kmem_cache_free(conf->slab_cache, osh);
420 }
421 kmem_cache_destroy(conf->slab_cache);
422
423 /* Step 3.
424 * At this point, we are holding all the stripes so the array
425 * is completely stalled, so now is a good time to resize
426 * conf->disks.
427 */
428 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
429 if (ndisks) {
430 for (i=0; i<conf->raid_disks; i++)
431 ndisks[i] = conf->disks[i];
432 kfree(conf->disks);
433 conf->disks = ndisks;
434 } else
435 err = -ENOMEM;
436
437 /* Step 4, return new stripes to service */
438 while(!list_empty(&newstripes)) {
439 nsh = list_entry(newstripes.next, struct stripe_head, lru);
440 list_del_init(&nsh->lru);
441 for (i=conf->raid_disks; i < newsize; i++)
442 if (nsh->dev[i].page == NULL) {
443 struct page *p = alloc_page(GFP_NOIO);
444 nsh->dev[i].page = p;
445 if (!p)
446 err = -ENOMEM;
447 }
448 release_stripe(nsh);
449 }
450 /* critical section passed, GFP_NOIO no longer needed */
451
452 conf->slab_cache = sc;
453 conf->active_name = 1-conf->active_name;
454 conf->pool_size = newsize;
455 return err;
456}
457#endif
458
331static int drop_one_stripe(raid5_conf_t *conf) 459static int drop_one_stripe(raid5_conf_t *conf)
332{ 460{
333 struct stripe_head *sh; 461 struct stripe_head *sh;
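
resize_stripes() cannot grow the existing stripe_heads in place because they are slab objects that may be in active use, so it builds a second pool and migrates the payloads: every new stripe_head is allocated up front (so the operation cannot half-succeed), then the inactive list is drained one stripe at a time and each old stripe's pages are handed over to a new one. A minimal user-space sketch of that grow-only pool swap; the struct names and the 64-byte payload size are illustrative, not taken from the driver:

#include <stdio.h>
#include <stdlib.h>

/* User-space analogue of the grow-only pool swap in resize_stripes().
 * Struct names and payload sizes here are illustrative only. */
struct rec {
	int nslots;
	void *slot[1];			/* grows like stripe_head.dev[] */
};

static struct rec *grow(struct rec *old, int newsize)
{
	struct rec *n;
	int i;

	if (newsize <= old->nslots)
		return old;		/* never bother to shrink */
	n = calloc(1, sizeof(*n) + (newsize - 1) * sizeof(void *));
	if (!n)
		return old;		/* allocation failed: keep the old pool */
	n->nslots = newsize;
	for (i = 0; i < old->nslots; i++)	/* transfer existing payloads */
		n->slot[i] = old->slot[i];
	for (; i < newsize; i++)		/* then populate the new slots */
		n->slot[i] = malloc(64);
	free(old);				/* the old pool is retired last */
	return n;
}

int main(void)
{
	struct rec *r = calloc(1, sizeof(*r) + 3 * sizeof(void *));

	r->nslots = 4;
	r = grow(r, 6);
	printf("pool now has %d slots\n", r->nslots);
	return 0;
}
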
@@ -339,7 +467,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
339 return 0; 467 return 0;
340 if (atomic_read(&sh->count)) 468 if (atomic_read(&sh->count))
341 BUG(); 469 BUG();
342 shrink_buffers(sh, conf->raid_disks); 470 shrink_buffers(sh, conf->pool_size);
343 kmem_cache_free(conf->slab_cache, sh); 471 kmem_cache_free(conf->slab_cache, sh);
344 atomic_dec(&conf->active_stripes); 472 atomic_dec(&conf->active_stripes);
345 return 1; 473 return 1;
@@ -360,7 +488,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
360{ 488{
361 struct stripe_head *sh = bi->bi_private; 489 struct stripe_head *sh = bi->bi_private;
362 raid5_conf_t *conf = sh->raid_conf; 490 raid5_conf_t *conf = sh->raid_conf;
363 int disks = conf->raid_disks, i; 491 int disks = sh->disks, i;
364 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 492 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
365 493
366 if (bi->bi_size) 494 if (bi->bi_size)
@@ -458,7 +586,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
458{ 586{
459 struct stripe_head *sh = bi->bi_private; 587 struct stripe_head *sh = bi->bi_private;
460 raid5_conf_t *conf = sh->raid_conf; 588 raid5_conf_t *conf = sh->raid_conf;
461 int disks = conf->raid_disks, i; 589 int disks = sh->disks, i;
462 unsigned long flags; 590 unsigned long flags;
463 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 591 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
464 592
@@ -612,7 +740,7 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
612static sector_t compute_blocknr(struct stripe_head *sh, int i) 740static sector_t compute_blocknr(struct stripe_head *sh, int i)
613{ 741{
614 raid5_conf_t *conf = sh->raid_conf; 742 raid5_conf_t *conf = sh->raid_conf;
615 int raid_disks = conf->raid_disks, data_disks = raid_disks - 1; 743 int raid_disks = sh->disks, data_disks = raid_disks - 1;
616 sector_t new_sector = sh->sector, check; 744 sector_t new_sector = sh->sector, check;
617 int sectors_per_chunk = conf->chunk_size >> 9; 745 int sectors_per_chunk = conf->chunk_size >> 9;
618 sector_t stripe; 746 sector_t stripe;
@@ -713,8 +841,7 @@ static void copy_data(int frombio, struct bio *bio,
713 841
714static void compute_block(struct stripe_head *sh, int dd_idx) 842static void compute_block(struct stripe_head *sh, int dd_idx)
715{ 843{
716 raid5_conf_t *conf = sh->raid_conf; 844 int i, count, disks = sh->disks;
717 int i, count, disks = conf->raid_disks;
718 void *ptr[MAX_XOR_BLOCKS], *p; 845 void *ptr[MAX_XOR_BLOCKS], *p;
719 846
720 PRINTK("compute_block, stripe %llu, idx %d\n", 847 PRINTK("compute_block, stripe %llu, idx %d\n",
@@ -744,7 +871,7 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
744static void compute_parity(struct stripe_head *sh, int method) 871static void compute_parity(struct stripe_head *sh, int method)
745{ 872{
746 raid5_conf_t *conf = sh->raid_conf; 873 raid5_conf_t *conf = sh->raid_conf;
747 int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; 874 int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
748 void *ptr[MAX_XOR_BLOCKS]; 875 void *ptr[MAX_XOR_BLOCKS];
749 struct bio *chosen; 876 struct bio *chosen;
750 877
@@ -910,6 +1037,20 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
910 return 0; 1037 return 0;
911} 1038}
912 1039
1040static void end_reshape(raid5_conf_t *conf);
1041
1042static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1043{
1044 int sectors_per_chunk = conf->chunk_size >> 9;
1045 sector_t x = stripe;
1046 int pd_idx, dd_idx;
1047 int chunk_offset = sector_div(x, sectors_per_chunk);
1048 stripe = x;
1049 raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
1050 + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
1051 return pd_idx;
1052}
1053
913 1054
914/* 1055/*
915 * handle_stripe - do things to a stripe. 1056 * handle_stripe - do things to a stripe.
@@ -932,11 +1073,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
932static void handle_stripe(struct stripe_head *sh) 1073static void handle_stripe(struct stripe_head *sh)
933{ 1074{
934 raid5_conf_t *conf = sh->raid_conf; 1075 raid5_conf_t *conf = sh->raid_conf;
935 int disks = conf->raid_disks; 1076 int disks = sh->disks;
936 struct bio *return_bi= NULL; 1077 struct bio *return_bi= NULL;
937 struct bio *bi; 1078 struct bio *bi;
938 int i; 1079 int i;
939 int syncing; 1080 int syncing, expanding, expanded;
940 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; 1081 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
941 int non_overwrite = 0; 1082 int non_overwrite = 0;
942 int failed_num=0; 1083 int failed_num=0;
@@ -951,6 +1092,8 @@ static void handle_stripe(struct stripe_head *sh)
951 clear_bit(STRIPE_DELAYED, &sh->state); 1092 clear_bit(STRIPE_DELAYED, &sh->state);
952 1093
953 syncing = test_bit(STRIPE_SYNCING, &sh->state); 1094 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1095 expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1096 expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
954 /* Now to look around and see what can be done */ 1097 /* Now to look around and see what can be done */
955 1098
956 rcu_read_lock(); 1099 rcu_read_lock();
@@ -1143,13 +1286,14 @@ static void handle_stripe(struct stripe_head *sh)
1143 * parity, or to satisfy requests 1286 * parity, or to satisfy requests
1144 * or to load a block that is being partially written. 1287 * or to load a block that is being partially written.
1145 */ 1288 */
1146 if (to_read || non_overwrite || (syncing && (uptodate < disks))) { 1289 if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) {
1147 for (i=disks; i--;) { 1290 for (i=disks; i--;) {
1148 dev = &sh->dev[i]; 1291 dev = &sh->dev[i];
1149 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && 1292 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1150 (dev->toread || 1293 (dev->toread ||
1151 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 1294 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1152 syncing || 1295 syncing ||
1296 expanding ||
1153 (failed && (sh->dev[failed_num].toread || 1297 (failed && (sh->dev[failed_num].toread ||
1154 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) 1298 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
1155 ) 1299 )
@@ -1339,13 +1483,77 @@ static void handle_stripe(struct stripe_head *sh)
1339 set_bit(R5_Wantwrite, &dev->flags); 1483 set_bit(R5_Wantwrite, &dev->flags);
1340 set_bit(R5_ReWrite, &dev->flags); 1484 set_bit(R5_ReWrite, &dev->flags);
1341 set_bit(R5_LOCKED, &dev->flags); 1485 set_bit(R5_LOCKED, &dev->flags);
1486 locked++;
1342 } else { 1487 } else {
1343 /* let's read it back */ 1488 /* let's read it back */
1344 set_bit(R5_Wantread, &dev->flags); 1489 set_bit(R5_Wantread, &dev->flags);
1345 set_bit(R5_LOCKED, &dev->flags); 1490 set_bit(R5_LOCKED, &dev->flags);
1491 locked++;
1346 } 1492 }
1347 } 1493 }
1348 1494
1495 if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
1496 /* Need to write out all blocks after computing parity */
1497 sh->disks = conf->raid_disks;
1498 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
1499 compute_parity(sh, RECONSTRUCT_WRITE);
1500 for (i= conf->raid_disks; i--;) {
1501 set_bit(R5_LOCKED, &sh->dev[i].flags);
1502 locked++;
1503 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1504 }
1505 clear_bit(STRIPE_EXPANDING, &sh->state);
1506 } else if (expanded) {
1507 clear_bit(STRIPE_EXPAND_READY, &sh->state);
1508 atomic_dec(&conf->reshape_stripes);
1509 wake_up(&conf->wait_for_overlap);
1510 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
1511 }
1512
1513 if (expanding && locked == 0) {
1514 /* We have read all the blocks in this stripe and now we need to
1515 * copy some of them into a target stripe for expand.
1516 */
1517 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1518 for (i=0; i< sh->disks; i++)
1519 if (i != sh->pd_idx) {
1520 int dd_idx, pd_idx, j;
1521 struct stripe_head *sh2;
1522
1523 sector_t bn = compute_blocknr(sh, i);
1524 sector_t s = raid5_compute_sector(bn, conf->raid_disks,
1525 conf->raid_disks-1,
1526 &dd_idx, &pd_idx, conf);
1527 sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1);
1528 if (sh2 == NULL)
1529 /* so far only the early blocks of this stripe
1530 * have been requested. When later blocks
1531 * get requested, we will try again
1532 */
1533 continue;
1534 if(!test_bit(STRIPE_EXPANDING, &sh2->state) ||
1535 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
1536 /* must have already done this block */
1537 release_stripe(sh2);
1538 continue;
1539 }
1540 memcpy(page_address(sh2->dev[dd_idx].page),
1541 page_address(sh->dev[i].page),
1542 STRIPE_SIZE);
1543 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
1544 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
1545 for (j=0; j<conf->raid_disks; j++)
1546 if (j != sh2->pd_idx &&
1547 !test_bit(R5_Expanded, &sh2->dev[j].flags))
1548 break;
1549 if (j == conf->raid_disks) {
1550 set_bit(STRIPE_EXPAND_READY, &sh2->state);
1551 set_bit(STRIPE_HANDLE, &sh2->state);
1552 }
1553 release_stripe(sh2);
1554 }
1555 }
1556
1349 spin_unlock(&sh->lock); 1557 spin_unlock(&sh->lock);
1350 1558
1351 while ((bi=return_bi)) { 1559 while ((bi=return_bi)) {
@@ -1384,7 +1592,7 @@ static void handle_stripe(struct stripe_head *sh)
1384 rcu_read_unlock(); 1592 rcu_read_unlock();
1385 1593
1386 if (rdev) { 1594 if (rdev) {
1387 if (syncing) 1595 if (syncing || expanding || expanded)
1388 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 1596 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1389 1597
1390 bi->bi_bdev = rdev->bdev; 1598 bi->bi_bdev = rdev->bdev;
@@ -1526,17 +1734,16 @@ static inline void raid5_plug_device(raid5_conf_t *conf)
1526 spin_unlock_irq(&conf->device_lock); 1734 spin_unlock_irq(&conf->device_lock);
1527} 1735}
1528 1736
1529static int make_request (request_queue_t *q, struct bio * bi) 1737static int make_request(request_queue_t *q, struct bio * bi)
1530{ 1738{
1531 mddev_t *mddev = q->queuedata; 1739 mddev_t *mddev = q->queuedata;
1532 raid5_conf_t *conf = mddev_to_conf(mddev); 1740 raid5_conf_t *conf = mddev_to_conf(mddev);
1533 const unsigned int raid_disks = conf->raid_disks;
1534 const unsigned int data_disks = raid_disks - 1;
1535 unsigned int dd_idx, pd_idx; 1741 unsigned int dd_idx, pd_idx;
1536 sector_t new_sector; 1742 sector_t new_sector;
1537 sector_t logical_sector, last_sector; 1743 sector_t logical_sector, last_sector;
1538 struct stripe_head *sh; 1744 struct stripe_head *sh;
1539 const int rw = bio_data_dir(bi); 1745 const int rw = bio_data_dir(bi);
1746 int remaining;
1540 1747
1541 if (unlikely(bio_barrier(bi))) { 1748 if (unlikely(bio_barrier(bi))) {
1542 bio_endio(bi, bi->bi_size, -EOPNOTSUPP); 1749 bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
@@ -1555,20 +1762,77 @@ static int make_request (request_queue_t *q, struct bio * bi)
1555 1762
1556 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 1763 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1557 DEFINE_WAIT(w); 1764 DEFINE_WAIT(w);
1558 1765 int disks;
1559 new_sector = raid5_compute_sector(logical_sector,
1560 raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1561 1766
1767 retry:
1768 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
1769 if (likely(conf->expand_progress == MaxSector))
1770 disks = conf->raid_disks;
1771 else {
1772 /* spinlock is needed as expand_progress may be
1773 * 64bit on a 32bit platform, and so it might be
1774 * possible to see a half-updated value
1775 * Of course expand_progress could change after
1776 * the lock is dropped, so once we get a reference
1777 * to the stripe that we think it is, we will have
1778 * to check again.
1779 */
1780 spin_lock_irq(&conf->device_lock);
1781 disks = conf->raid_disks;
1782 if (logical_sector >= conf->expand_progress)
1783 disks = conf->previous_raid_disks;
1784 else {
1785 if (logical_sector >= conf->expand_lo) {
1786 spin_unlock_irq(&conf->device_lock);
1787 schedule();
1788 goto retry;
1789 }
1790 }
1791 spin_unlock_irq(&conf->device_lock);
1792 }
1793 new_sector = raid5_compute_sector(logical_sector, disks, disks - 1,
1794 &dd_idx, &pd_idx, conf);
1562 PRINTK("raid5: make_request, sector %llu logical %llu\n", 1795 PRINTK("raid5: make_request, sector %llu logical %llu\n",
1563 (unsigned long long)new_sector, 1796 (unsigned long long)new_sector,
1564 (unsigned long long)logical_sector); 1797 (unsigned long long)logical_sector);
1565 1798
1566 retry: 1799 sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
1567 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
1568 sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
1569 if (sh) { 1800 if (sh) {
1570 if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { 1801 if (unlikely(conf->expand_progress != MaxSector)) {
1571 /* Add failed due to overlap. Flush everything 1802 /* expansion might have moved on while waiting for a
1803 * stripe, so we must do the range check again.
1804 * Expansion could still move past after this
1805 * test, but as we are holding a reference to
1806 * 'sh', we know that if that happens,
1807 * STRIPE_EXPANDING will get set and the expansion
1808 * won't proceed until we finish with the stripe.
1809 */
1810 int must_retry = 0;
1811 spin_lock_irq(&conf->device_lock);
1812 if (logical_sector < conf->expand_progress &&
1813 disks == conf->previous_raid_disks)
1814 /* mismatch, need to try again */
1815 must_retry = 1;
1816 spin_unlock_irq(&conf->device_lock);
1817 if (must_retry) {
1818 release_stripe(sh);
1819 goto retry;
1820 }
1821 }
1822 /* FIXME what if we get a false positive because these
1823 * are being updated.
1824 */
1825 if (logical_sector >= mddev->suspend_lo &&
1826 logical_sector < mddev->suspend_hi) {
1827 release_stripe(sh);
1828 schedule();
1829 goto retry;
1830 }
1831
1832 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
1833 !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
1834 /* Stripe is busy expanding or
1835 * add failed due to overlap. Flush everything
1572 * and wait a while 1836 * and wait a while
1573 */ 1837 */
1574 raid5_unplug_device(mddev->queue); 1838 raid5_unplug_device(mddev->queue);
@@ -1580,7 +1844,6 @@ static int make_request (request_queue_t *q, struct bio * bi)
1580 raid5_plug_device(conf); 1844 raid5_plug_device(conf);
1581 handle_stripe(sh); 1845 handle_stripe(sh);
1582 release_stripe(sh); 1846 release_stripe(sh);
1583
1584 } else { 1847 } else {
1585 /* cannot get stripe for read-ahead, just give-up */ 1848 /* cannot get stripe for read-ahead, just give-up */
1586 clear_bit(BIO_UPTODATE, &bi->bi_flags); 1849 clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1590,7 +1853,9 @@ static int make_request (request_queue_t *q, struct bio * bi)
1590 1853
1591 } 1854 }
1592 spin_lock_irq(&conf->device_lock); 1855 spin_lock_irq(&conf->device_lock);
1593 if (--bi->bi_phys_segments == 0) { 1856 remaining = --bi->bi_phys_segments;
1857 spin_unlock_irq(&conf->device_lock);
1858 if (remaining == 0) {
1594 int bytes = bi->bi_size; 1859 int bytes = bi->bi_size;
1595 1860
1596 if ( bio_data_dir(bi) == WRITE ) 1861 if ( bio_data_dir(bi) == WRITE )
@@ -1598,7 +1863,6 @@ static int make_request (request_queue_t *q, struct bio * bi)
1598 bi->bi_size = 0; 1863 bi->bi_size = 0;
1599 bi->bi_end_io(bi, bytes, 0); 1864 bi->bi_end_io(bi, bytes, 0);
1600 } 1865 }
1601 spin_unlock_irq(&conf->device_lock);
1602 return 0; 1866 return 0;
1603} 1867}
1604 1868
@@ -1607,12 +1871,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1607{ 1871{
1608 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 1872 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1609 struct stripe_head *sh; 1873 struct stripe_head *sh;
1610 int sectors_per_chunk = conf->chunk_size >> 9; 1874 int pd_idx;
1611 sector_t x; 1875 sector_t first_sector, last_sector;
1612 unsigned long stripe;
1613 int chunk_offset;
1614 int dd_idx, pd_idx;
1615 sector_t first_sector;
1616 int raid_disks = conf->raid_disks; 1876 int raid_disks = conf->raid_disks;
1617 int data_disks = raid_disks-1; 1877 int data_disks = raid_disks-1;
1618 sector_t max_sector = mddev->size << 1; 1878 sector_t max_sector = mddev->size << 1;
@@ -1621,6 +1881,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1621 if (sector_nr >= max_sector) { 1881 if (sector_nr >= max_sector) {
1622 /* just being told to finish up .. nothing much to do */ 1882 /* just being told to finish up .. nothing much to do */
1623 unplug_slaves(mddev); 1883 unplug_slaves(mddev);
1884 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
1885 end_reshape(conf);
1886 return 0;
1887 }
1624 1888
1625 if (mddev->curr_resync < max_sector) /* aborted */ 1889 if (mddev->curr_resync < max_sector) /* aborted */
1626 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 1890 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
@@ -1631,6 +1895,123 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1631 1895
1632 return 0; 1896 return 0;
1633 } 1897 }
1898
1899 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
1900 /* reshaping is quite different to recovery/resync so it is
1901 * handled quite separately ... here.
1902 *
1903 * On each call to sync_request, we gather one chunk worth of
1904 * destination stripes and flag them as expanding.
1905 * Then we find all the source stripes and request reads.
1906 * As the reads complete, handle_stripe will copy the data
1907 * into the destination stripe and release that stripe.
1908 */
1909 int i;
1910 int dd_idx;
1911 sector_t writepos, safepos, gap;
1912
1913 if (sector_nr == 0 &&
1914 conf->expand_progress != 0) {
1915 /* restarting in the middle, skip the initial sectors */
1916 sector_nr = conf->expand_progress;
1917 sector_div(sector_nr, conf->raid_disks-1);
1918 *skipped = 1;
1919 return sector_nr;
1920 }
1921
1922 /* we update the metadata when there is more than 3Meg
1923 * in the block range (that is rather arbitrary, should
1924 * probably be time based) or when the data about to be
1925 * copied would over-write the source of the data at
1926 * the front of the range.
1927 * i.e. one new_stripe forward from expand_progress new_maps
1928 * to after where expand_lo old_maps to
1929 */
1930 writepos = conf->expand_progress +
1931 conf->chunk_size/512*(conf->raid_disks-1);
1932 sector_div(writepos, conf->raid_disks-1);
1933 safepos = conf->expand_lo;
1934 sector_div(safepos, conf->previous_raid_disks-1);
1935 gap = conf->expand_progress - conf->expand_lo;
1936
1937 if (writepos >= safepos ||
1938 gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
1939 /* Cannot proceed until we've updated the superblock... */
1940 wait_event(conf->wait_for_overlap,
1941 atomic_read(&conf->reshape_stripes)==0);
1942 mddev->reshape_position = conf->expand_progress;
1943 mddev->sb_dirty = 1;
1944 md_wakeup_thread(mddev->thread);
1945 wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
1946 kthread_should_stop());
1947 spin_lock_irq(&conf->device_lock);
1948 conf->expand_lo = mddev->reshape_position;
1949 spin_unlock_irq(&conf->device_lock);
1950 wake_up(&conf->wait_for_overlap);
1951 }
1952
1953 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
1954 int j;
1955 int skipped = 0;
1956 pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
1957 sh = get_active_stripe(conf, sector_nr+i,
1958 conf->raid_disks, pd_idx, 0);
1959 set_bit(STRIPE_EXPANDING, &sh->state);
1960 atomic_inc(&conf->reshape_stripes);
1961 /* If any of this stripe is beyond the end of the old
1962 * array, then we need to zero those blocks
1963 */
1964 for (j=sh->disks; j--;) {
1965 sector_t s;
1966 if (j == sh->pd_idx)
1967 continue;
1968 s = compute_blocknr(sh, j);
1969 if (s < (mddev->array_size<<1)) {
1970 skipped = 1;
1971 continue;
1972 }
1973 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
1974 set_bit(R5_Expanded, &sh->dev[j].flags);
1975 set_bit(R5_UPTODATE, &sh->dev[j].flags);
1976 }
1977 if (!skipped) {
1978 set_bit(STRIPE_EXPAND_READY, &sh->state);
1979 set_bit(STRIPE_HANDLE, &sh->state);
1980 }
1981 release_stripe(sh);
1982 }
1983 spin_lock_irq(&conf->device_lock);
1984 conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
1985 spin_unlock_irq(&conf->device_lock);
1986 /* Ok, those stripes are ready. We can start scheduling
1987 * reads on the source stripes.
1988 * The source stripes are determined by mapping the first and last
1989 * block on the destination stripes.
1990 */
1991 raid_disks = conf->previous_raid_disks;
1992 data_disks = raid_disks - 1;
1993 first_sector =
1994 raid5_compute_sector(sector_nr*(conf->raid_disks-1),
1995 raid_disks, data_disks,
1996 &dd_idx, &pd_idx, conf);
1997 last_sector =
1998 raid5_compute_sector((sector_nr+conf->chunk_size/512)
1999 *(conf->raid_disks-1) -1,
2000 raid_disks, data_disks,
2001 &dd_idx, &pd_idx, conf);
2002 if (last_sector >= (mddev->size<<1))
2003 last_sector = (mddev->size<<1)-1;
2004 while (first_sector <= last_sector) {
2005 pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
2006 sh = get_active_stripe(conf, first_sector,
2007 conf->previous_raid_disks, pd_idx, 0);
2008 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2009 set_bit(STRIPE_HANDLE, &sh->state);
2010 release_stripe(sh);
2011 first_sector += STRIPE_SECTORS;
2012 }
2013 return conf->chunk_size>>9;
2014 }
1634 /* if there is 1 or more failed drives and we are trying 2015 /* if there is 1 or more failed drives and we are trying
1635 * to resync, then assert that we are finished, because there is 2016 * to resync, then assert that we are finished, because there is
1636 * nothing we can do. 2017 * nothing we can do.
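
The checkpoint test in the reshape branch above compares two per-device offsets: writepos is one chunk beyond the current write front in the new layout, and safepos is where the last checkpointed position (expand_lo) lives in the old layout. If the next write would reach or pass that point, or once the uncheckpointed range exceeds the arbitrary threshold the code labels 3Meg, the superblock must be updated before any more data is moved. A worked restatement of that arithmetic with assumed numbers (64KiB chunks, growing 4 disks to 5):

#include <stdio.h>

int main(void)
{
	/* Assumed reshape state: growing 4 disks to 5, 64KiB chunks.
	 * expand_progress / expand_lo are array data sectors, as in the patch. */
	unsigned long long expand_progress = 208896;
	unsigned long long expand_lo = 204800;
	unsigned int chunk_sectors = (64 * 1024) >> 9;		/* 128 */
	unsigned int raid_disks = 5, previous_raid_disks = 4;

	/* per-device offset one chunk beyond the write front, new layout */
	unsigned long long writepos =
		(expand_progress + chunk_sectors * (raid_disks - 1)) / (raid_disks - 1);
	/* per-device offset of the checkpointed data in the old layout */
	unsigned long long safepos = expand_lo / (previous_raid_disks - 1);
	unsigned long long gap = expand_progress - expand_lo;

	if (writepos >= safepos || gap > (raid_disks - 1) * 3000ULL * 2)
		printf("checkpoint first: writepos=%llu safepos=%llu gap=%llu\n",
		       writepos, safepos, gap);
	else
		printf("keep copying: writepos=%llu < safepos=%llu, gap=%llu\n",
		       writepos, safepos, gap);
	return 0;
}
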
@@ -1649,16 +2030,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1649 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 2030 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
1650 } 2031 }
1651 2032
1652 x = sector_nr; 2033 pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
1653 chunk_offset = sector_div(x, sectors_per_chunk); 2034 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
1654 stripe = x;
1655 BUG_ON(x != stripe);
1656
1657 first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
1658 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1659 sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
1660 if (sh == NULL) { 2035 if (sh == NULL) {
1661 sh = get_active_stripe(conf, sector_nr, pd_idx, 0); 2036 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
1662 /* make sure we don't swamp the stripe cache if someone else 2037 /* make sure we don't swamp the stripe cache if someone else
1663 * is trying to get access 2038 * is trying to get access
1664 */ 2039 */
@@ -1822,11 +2197,64 @@ static int run(mddev_t *mddev)
1822 return -EIO; 2197 return -EIO;
1823 } 2198 }
1824 2199
1825 mddev->private = kzalloc(sizeof (raid5_conf_t) 2200 if (mddev->reshape_position != MaxSector) {
1826 + mddev->raid_disks * sizeof(struct disk_info), 2201 /* Check that we can continue the reshape.
1827 GFP_KERNEL); 2202 * Currently only disks can change, it must
2203 * increase, and we must be past the point where
2204 * a stripe over-writes itself
2205 */
2206 sector_t here_new, here_old;
2207 int old_disks;
2208
2209 if (mddev->new_level != mddev->level ||
2210 mddev->new_layout != mddev->layout ||
2211 mddev->new_chunk != mddev->chunk_size) {
2212 printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n",
2213 mdname(mddev));
2214 return -EINVAL;
2215 }
2216 if (mddev->delta_disks <= 0) {
2217 printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n",
2218 mdname(mddev));
2219 return -EINVAL;
2220 }
2221 old_disks = mddev->raid_disks - mddev->delta_disks;
2222 /* reshape_position must be on a new-stripe boundary, and one
2223 * further up in new geometry must map after here in old geometry.
2224 */
2225 here_new = mddev->reshape_position;
2226 if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) {
2227 printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n");
2228 return -EINVAL;
2229 }
2230 /* here_new is the stripe we will write to */
2231 here_old = mddev->reshape_position;
2232 sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1));
2233 /* here_old is the first stripe that we might need to read from */
2234 if (here_new >= here_old) {
2235 /* Reading from the same stripe as writing to - bad */
2236 printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n");
2237 return -EINVAL;
2238 }
2239 printk(KERN_INFO "raid5: reshape will continue\n");
2240 /* OK, we should be able to continue; */
2241 }
2242
2243
2244 mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
1828 if ((conf = mddev->private) == NULL) 2245 if ((conf = mddev->private) == NULL)
1829 goto abort; 2246 goto abort;
2247 if (mddev->reshape_position == MaxSector) {
2248 conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
2249 } else {
2250 conf->raid_disks = mddev->raid_disks;
2251 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
2252 }
2253
2254 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
2255 GFP_KERNEL);
2256 if (!conf->disks)
2257 goto abort;
1830 2258
1831 conf->mddev = mddev; 2259 conf->mddev = mddev;
1832 2260
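
The restart check in run() is plain stripe arithmetic: reshape_position is converted to a stripe number in both geometries, and resuming is only allowed when the next stripe to be written in the new geometry still lies strictly before the first stripe that may need to be read in the old geometry. A worked example with assumed numbers (64KiB chunks, a 4-to-5-disk grow, and an arbitrary reshape_position):

#include <stdio.h>

int main(void)
{
	unsigned long long reshape_position = 1048576;	/* sectors, assumed */
	unsigned int chunk_sectors = (64 * 1024) >> 9;	/* 128 */
	unsigned int new_disks = 5, old_disks = 4;

	/* must sit on a new-geometry stripe boundary */
	if (reshape_position % (chunk_sectors * (new_disks - 1))) {
		printf("reshape_position not on a stripe boundary\n");
		return 1;
	}
	/* stripe we would write to next, in the new geometry */
	unsigned long long here_new =
		reshape_position / (chunk_sectors * (new_disks - 1));
	/* first stripe we might still need to read, in the old geometry */
	unsigned long long here_old =
		reshape_position / (chunk_sectors * (old_disks - 1));

	if (here_new >= here_old)
		printf("too early for auto-recovery (%llu >= %llu)\n",
		       here_new, here_old);
	else
		printf("reshape can continue (%llu < %llu)\n", here_new, here_old);
	return 0;
}
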
@@ -1847,7 +2275,7 @@ static int run(mddev_t *mddev)
1847 2275
1848 ITERATE_RDEV(mddev,rdev,tmp) { 2276 ITERATE_RDEV(mddev,rdev,tmp) {
1849 raid_disk = rdev->raid_disk; 2277 raid_disk = rdev->raid_disk;
1850 if (raid_disk >= mddev->raid_disks 2278 if (raid_disk >= conf->raid_disks
1851 || raid_disk < 0) 2279 || raid_disk < 0)
1852 continue; 2280 continue;
1853 disk = conf->disks + raid_disk; 2281 disk = conf->disks + raid_disk;
@@ -1863,7 +2291,6 @@ static int run(mddev_t *mddev)
1863 } 2291 }
1864 } 2292 }
1865 2293
1866 conf->raid_disks = mddev->raid_disks;
1867 /* 2294 /*
1868 * 0 for a fully functional array, 1 for a degraded array. 2295 * 0 for a fully functional array, 1 for a degraded array.
1869 */ 2296 */
@@ -1873,6 +2300,7 @@ static int run(mddev_t *mddev)
1873 conf->level = mddev->level; 2300 conf->level = mddev->level;
1874 conf->algorithm = mddev->layout; 2301 conf->algorithm = mddev->layout;
1875 conf->max_nr_stripes = NR_STRIPES; 2302 conf->max_nr_stripes = NR_STRIPES;
2303 conf->expand_progress = mddev->reshape_position;
1876 2304
1877 /* device size must be a multiple of chunk size */ 2305 /* device size must be a multiple of chunk size */
1878 mddev->size &= ~(mddev->chunk_size/1024 -1); 2306 mddev->size &= ~(mddev->chunk_size/1024 -1);
@@ -1945,6 +2373,21 @@ static int run(mddev_t *mddev)
1945 2373
1946 print_raid5_conf(conf); 2374 print_raid5_conf(conf);
1947 2375
2376 if (conf->expand_progress != MaxSector) {
2377 printk("...ok start reshape thread\n");
2378 conf->expand_lo = conf->expand_progress;
2379 atomic_set(&conf->reshape_stripes, 0);
2380 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
2381 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2382 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
2383 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
2384 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
2385 "%s_reshape");
2386 /* FIXME if md_register_thread fails?? */
2387 md_wakeup_thread(mddev->sync_thread);
2388
2389 }
2390
1948 /* read-ahead size must cover two whole stripes, which is 2391 /* read-ahead size must cover two whole stripes, which is
1949 * 2 * (n-1) * chunksize where 'n' is the number of raid devices 2392 * 2 * (n-1) * chunksize where 'n' is the number of raid devices
1950 */ 2393 */
@@ -1960,12 +2403,13 @@ static int run(mddev_t *mddev)
1960 2403
1961 mddev->queue->unplug_fn = raid5_unplug_device; 2404 mddev->queue->unplug_fn = raid5_unplug_device;
1962 mddev->queue->issue_flush_fn = raid5_issue_flush; 2405 mddev->queue->issue_flush_fn = raid5_issue_flush;
2406 mddev->array_size = mddev->size * (conf->previous_raid_disks - 1);
1963 2407
1964 mddev->array_size = mddev->size * (mddev->raid_disks - 1);
1965 return 0; 2408 return 0;
1966abort: 2409abort:
1967 if (conf) { 2410 if (conf) {
1968 print_raid5_conf(conf); 2411 print_raid5_conf(conf);
2412 kfree(conf->disks);
1969 kfree(conf->stripe_hashtbl); 2413 kfree(conf->stripe_hashtbl);
1970 kfree(conf); 2414 kfree(conf);
1971 } 2415 }
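
Note the capacity handling in this hunk: run() now exports the array size based on previous_raid_disks, so a freshly started or resumed reshape does not advertise space that has not been laid out yet; only end_reshape() (added further down) bumps array_size to the new data-disk count. Worked arithmetic with an assumed member size and a 4-to-5-disk grow (mddev->size is the per-device data size, taken here to be in 1KiB units):

#include <stdio.h>

int main(void)
{
	/* Assumed example: 500GiB members, growing from 4 to 5 raid disks. */
	unsigned long long size_kib = 500ULL * 1024 * 1024;	/* per device */
	int previous_raid_disks = 4, raid_disks = 5;

	unsigned long long during = size_kib * (previous_raid_disks - 1);
	unsigned long long after  = size_kib * (raid_disks - 1);

	printf("exported while reshaping: %llu KiB (%llu GiB)\n",
	       during, during >> 20);
	printf("after end_reshape():      %llu KiB (%llu GiB)\n",
	       after, after >> 20);
	return 0;
}
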
@@ -1986,6 +2430,7 @@ static int stop(mddev_t *mddev)
1986 kfree(conf->stripe_hashtbl); 2430 kfree(conf->stripe_hashtbl);
1987 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 2431 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
1988 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); 2432 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
2433 kfree(conf->disks);
1989 kfree(conf); 2434 kfree(conf);
1990 mddev->private = NULL; 2435 mddev->private = NULL;
1991 return 0; 2436 return 0;
@@ -2001,7 +2446,7 @@ static void print_sh (struct stripe_head *sh)
2001 printk("sh %llu, count %d.\n", 2446 printk("sh %llu, count %d.\n",
2002 (unsigned long long)sh->sector, atomic_read(&sh->count)); 2447 (unsigned long long)sh->sector, atomic_read(&sh->count));
2003 printk("sh %llu, ", (unsigned long long)sh->sector); 2448 printk("sh %llu, ", (unsigned long long)sh->sector);
2004 for (i = 0; i < sh->raid_conf->raid_disks; i++) { 2449 for (i = 0; i < sh->disks; i++) {
2005 printk("(cache%d: %p %ld) ", 2450 printk("(cache%d: %p %ld) ",
2006 i, sh->dev[i].page, sh->dev[i].flags); 2451 i, sh->dev[i].page, sh->dev[i].flags);
2007 } 2452 }
@@ -2132,7 +2577,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2132 /* 2577 /*
2133 * find the disk ... 2578 * find the disk ...
2134 */ 2579 */
2135 for (disk=0; disk < mddev->raid_disks; disk++) 2580 for (disk=0; disk < conf->raid_disks; disk++)
2136 if ((p=conf->disks + disk)->rdev == NULL) { 2581 if ((p=conf->disks + disk)->rdev == NULL) {
2137 clear_bit(In_sync, &rdev->flags); 2582 clear_bit(In_sync, &rdev->flags);
2138 rdev->raid_disk = disk; 2583 rdev->raid_disk = disk;
@@ -2168,11 +2613,146 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
2168 return 0; 2613 return 0;
2169} 2614}
2170 2615
2616#ifdef CONFIG_MD_RAID5_RESHAPE
2617static int raid5_check_reshape(mddev_t *mddev)
2618{
2619 raid5_conf_t *conf = mddev_to_conf(mddev);
2620 int err;
2621
2622 if (mddev->delta_disks < 0 ||
2623 mddev->new_level != mddev->level)
2624 return -EINVAL; /* Cannot shrink array or change level yet */
2625 if (mddev->delta_disks == 0)
2626 return 0; /* nothing to do */
2627
2628 /* Can only proceed if there are plenty of stripe_heads.
2629 * We need a minimum of one full stripe, and for sensible progress
2630 * it is best to have about 4 times that.
2631 * If we require 4 times, then the default 256 4K stripe_heads will
2632 * allow for chunk sizes up to 256K, which is probably OK.
2633 * If the chunk size is greater, user-space should request more
2634 * stripe_heads first.
2635 */
2636 if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
2637 (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
2638 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n",
2639 (mddev->chunk_size / STRIPE_SIZE)*4);
2640 return -ENOSPC;
2641 }
2642
2643 err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
2644 if (err)
2645 return err;
2646
2647 /* looks like we might be able to manage this */
2648 return 0;
2649}
2650
2651static int raid5_start_reshape(mddev_t *mddev)
2652{
2653 raid5_conf_t *conf = mddev_to_conf(mddev);
2654 mdk_rdev_t *rdev;
2655 struct list_head *rtmp;
2656 int spares = 0;
2657 int added_devices = 0;
2658
2659 if (mddev->degraded ||
2660 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2661 return -EBUSY;
2662
2663 ITERATE_RDEV(mddev, rdev, rtmp)
2664 if (rdev->raid_disk < 0 &&
2665 !test_bit(Faulty, &rdev->flags))
2666 spares++;
2667
2668 if (spares < mddev->delta_disks-1)
2669 /* Not enough devices even to make a degraded array
2670 * of that size
2671 */
2672 return -EINVAL;
2673
2674 atomic_set(&conf->reshape_stripes, 0);
2675 spin_lock_irq(&conf->device_lock);
2676 conf->previous_raid_disks = conf->raid_disks;
2677 conf->raid_disks += mddev->delta_disks;
2678 conf->expand_progress = 0;
2679 conf->expand_lo = 0;
2680 spin_unlock_irq(&conf->device_lock);
2681
2682 /* Add some new drives, as many as will fit.
2683 * We know there are enough to make the newly sized array work.
2684 */
2685 ITERATE_RDEV(mddev, rdev, rtmp)
2686 if (rdev->raid_disk < 0 &&
2687 !test_bit(Faulty, &rdev->flags)) {
2688 if (raid5_add_disk(mddev, rdev)) {
2689 char nm[20];
2690 set_bit(In_sync, &rdev->flags);
2691 conf->working_disks++;
2692 added_devices++;
2693 sprintf(nm, "rd%d", rdev->raid_disk);
2694 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
2695 } else
2696 break;
2697 }
2698
2699 mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
2700 mddev->raid_disks = conf->raid_disks;
2701 mddev->reshape_position = 0;
2702 mddev->sb_dirty = 1;
2703
2704 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
2705 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2706 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
2707 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
2708 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
2709 "%s_reshape");
2710 if (!mddev->sync_thread) {
2711 mddev->recovery = 0;
2712 spin_lock_irq(&conf->device_lock);
2713 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
2714 conf->expand_progress = MaxSector;
2715 spin_unlock_irq(&conf->device_lock);
2716 return -EAGAIN;
2717 }
2718 md_wakeup_thread(mddev->sync_thread);
2719 md_new_event(mddev);
2720 return 0;
2721}
2722#endif
2723
2724static void end_reshape(raid5_conf_t *conf)
2725{
2726 struct block_device *bdev;
2727
2728 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
2729 conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1);
2730 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
2731 conf->mddev->changed = 1;
2732
2733 bdev = bdget_disk(conf->mddev->gendisk, 0);
2734 if (bdev) {
2735 mutex_lock(&bdev->bd_inode->i_mutex);
2736 i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
2737 mutex_unlock(&bdev->bd_inode->i_mutex);
2738 bdput(bdev);
2739 }
2740 spin_lock_irq(&conf->device_lock);
2741 conf->expand_progress = MaxSector;
2742 spin_unlock_irq(&conf->device_lock);
2743 conf->mddev->reshape_position = MaxSector;
2744 }
2745}
2746
2171static void raid5_quiesce(mddev_t *mddev, int state) 2747static void raid5_quiesce(mddev_t *mddev, int state)
2172{ 2748{
2173 raid5_conf_t *conf = mddev_to_conf(mddev); 2749 raid5_conf_t *conf = mddev_to_conf(mddev);
2174 2750
2175 switch(state) { 2751 switch(state) {
2752 case 2: /* resume for a suspend */
2753 wake_up(&conf->wait_for_overlap);
2754 break;
2755
2176 case 1: /* stop all writes */ 2756 case 1: /* stop all writes */
2177 spin_lock_irq(&conf->device_lock); 2757 spin_lock_irq(&conf->device_lock);
2178 conf->quiesce = 1; 2758 conf->quiesce = 1;
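
raid5_check_reshape() ties feasibility to the stripe cache: one chunk's worth of destination stripes needs chunk_size / STRIPE_SIZE stripe_heads, and the code wants four times that for sensible progress, so the stock cache of 256 4KiB stripe_heads covers chunks up to 256KiB and anything larger needs the cache grown from user space first. The arithmetic, with those default sizes assumed:

#include <stdio.h>

int main(void)
{
	unsigned int stripe_size = 4096;	/* STRIPE_SIZE, one page assumed */
	unsigned int max_nr_stripes = 256;	/* stock NR_STRIPES cache        */
	unsigned int chunk_size = 256 * 1024;	/* candidate chunk size          */

	unsigned int needed = (chunk_size / stripe_size) * 4;

	if (needed > max_nr_stripes)
		printf("reshape refused: need %u stripe_heads, have %u\n",
		       needed, max_nr_stripes);
	else
		printf("ok: %u stripe_heads needed, %u available\n",
		       needed, max_nr_stripes);
	return 0;
}
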
@@ -2186,6 +2766,7 @@ static void raid5_quiesce(mddev_t *mddev, int state)
2186 spin_lock_irq(&conf->device_lock); 2766 spin_lock_irq(&conf->device_lock);
2187 conf->quiesce = 0; 2767 conf->quiesce = 0;
2188 wake_up(&conf->wait_for_stripe); 2768 wake_up(&conf->wait_for_stripe);
2769 wake_up(&conf->wait_for_overlap);
2189 spin_unlock_irq(&conf->device_lock); 2770 spin_unlock_irq(&conf->device_lock);
2190 break; 2771 break;
2191 } 2772 }
@@ -2206,6 +2787,10 @@ static struct mdk_personality raid5_personality =
2206 .spare_active = raid5_spare_active, 2787 .spare_active = raid5_spare_active,
2207 .sync_request = sync_request, 2788 .sync_request = sync_request,
2208 .resize = raid5_resize, 2789 .resize = raid5_resize,
2790#ifdef CONFIG_MD_RAID5_RESHAPE
2791 .check_reshape = raid5_check_reshape,
2792 .start_reshape = raid5_start_reshape,
2793#endif
2209 .quiesce = raid5_quiesce, 2794 .quiesce = raid5_quiesce,
2210}; 2795};
2211 2796