path: root/drivers/md
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bitmap.c     |  54
-rw-r--r--  drivers/md/faulty.c     |   2
-rw-r--r--  drivers/md/linear.c     |  20
-rw-r--r--  drivers/md/md.c         | 615
-rw-r--r--  drivers/md/multipath.c  |  17
-rw-r--r--  drivers/md/raid0.c      |   8
-rw-r--r--  drivers/md/raid1.c      |  30
-rw-r--r--  drivers/md/raid10.c     |  22
-rw-r--r--  drivers/md/raid5.c      | 745
9 files changed, 752 insertions, 761 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index b26927ce889c..621a272a2c74 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -225,7 +225,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
225 || test_bit(Faulty, &rdev->flags)) 225 || test_bit(Faulty, &rdev->flags))
226 continue; 226 continue;
227 227
228 target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512); 228 target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
229 229
230 if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) { 230 if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
231 page->index = index; 231 page->index = index;
@@ -241,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
241static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) 241static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
242{ 242{
243 mdk_rdev_t *rdev; 243 mdk_rdev_t *rdev;
244 struct list_head *tmp;
245 mddev_t *mddev = bitmap->mddev; 244 mddev_t *mddev = bitmap->mddev;
246 245
247 rdev_for_each(rdev, tmp, mddev) 246 rcu_read_lock();
247 rdev_for_each_rcu(rdev, mddev)
248 if (test_bit(In_sync, &rdev->flags) 248 if (test_bit(In_sync, &rdev->flags)
249 && !test_bit(Faulty, &rdev->flags)) { 249 && !test_bit(Faulty, &rdev->flags)) {
250 int size = PAGE_SIZE; 250 int size = PAGE_SIZE;
@@ -260,32 +260,37 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
260 + (long)(page->index * (PAGE_SIZE/512)) 260 + (long)(page->index * (PAGE_SIZE/512))
261 + size/512 > 0) 261 + size/512 > 0)
262 /* bitmap runs in to metadata */ 262 /* bitmap runs in to metadata */
263 return -EINVAL; 263 goto bad_alignment;
264 if (rdev->data_offset + mddev->size*2 264 if (rdev->data_offset + mddev->size*2
265 > rdev->sb_offset*2 + bitmap->offset) 265 > rdev->sb_start + bitmap->offset)
266 /* data runs in to bitmap */ 266 /* data runs in to bitmap */
267 return -EINVAL; 267 goto bad_alignment;
268 } else if (rdev->sb_offset*2 < rdev->data_offset) { 268 } else if (rdev->sb_start < rdev->data_offset) {
269 /* METADATA BITMAP DATA */ 269 /* METADATA BITMAP DATA */
270 if (rdev->sb_offset*2 270 if (rdev->sb_start
271 + bitmap->offset 271 + bitmap->offset
272 + page->index*(PAGE_SIZE/512) + size/512 272 + page->index*(PAGE_SIZE/512) + size/512
273 > rdev->data_offset) 273 > rdev->data_offset)
274 /* bitmap runs in to data */ 274 /* bitmap runs in to data */
275 return -EINVAL; 275 goto bad_alignment;
276 } else { 276 } else {
277 /* DATA METADATA BITMAP - no problems */ 277 /* DATA METADATA BITMAP - no problems */
278 } 278 }
279 md_super_write(mddev, rdev, 279 md_super_write(mddev, rdev,
280 (rdev->sb_offset<<1) + bitmap->offset 280 rdev->sb_start + bitmap->offset
281 + page->index * (PAGE_SIZE/512), 281 + page->index * (PAGE_SIZE/512),
282 size, 282 size,
283 page); 283 page);
284 } 284 }
285 rcu_read_unlock();
285 286
286 if (wait) 287 if (wait)
287 md_super_wait(mddev); 288 md_super_wait(mddev);
288 return 0; 289 return 0;
290
291 bad_alignment:
292 rcu_read_unlock();
293 return -EINVAL;
289} 294}
290 295
291static void bitmap_file_kick(struct bitmap *bitmap); 296static void bitmap_file_kick(struct bitmap *bitmap);
@@ -454,8 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
454 spin_unlock_irqrestore(&bitmap->lock, flags); 459 spin_unlock_irqrestore(&bitmap->lock, flags);
455 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 460 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
456 sb->events = cpu_to_le64(bitmap->mddev->events); 461 sb->events = cpu_to_le64(bitmap->mddev->events);
457 if (!bitmap->mddev->degraded) 462 if (bitmap->mddev->events < bitmap->events_cleared) {
458 sb->events_cleared = cpu_to_le64(bitmap->mddev->events); 463 /* rocking back to read-only */
464 bitmap->events_cleared = bitmap->mddev->events;
465 sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
466 }
459 kunmap_atomic(sb, KM_USER0); 467 kunmap_atomic(sb, KM_USER0);
460 write_page(bitmap, bitmap->sb_page, 1); 468 write_page(bitmap, bitmap->sb_page, 1);
461} 469}
@@ -1085,9 +1093,19 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1085 } else 1093 } else
1086 spin_unlock_irqrestore(&bitmap->lock, flags); 1094 spin_unlock_irqrestore(&bitmap->lock, flags);
1087 lastpage = page; 1095 lastpage = page;
1088/* 1096
1089 printk("bitmap clean at page %lu\n", j); 1097 /* We are possibly going to clear some bits, so make
1090*/ 1098 * sure that events_cleared is up-to-date.
1099 */
1100 if (bitmap->need_sync) {
1101 bitmap_super_t *sb;
1102 bitmap->need_sync = 0;
1103 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
1104 sb->events_cleared =
1105 cpu_to_le64(bitmap->events_cleared);
1106 kunmap_atomic(sb, KM_USER0);
1107 write_page(bitmap, bitmap->sb_page, 1);
1108 }
1091 spin_lock_irqsave(&bitmap->lock, flags); 1109 spin_lock_irqsave(&bitmap->lock, flags);
1092 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); 1110 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1093 } 1111 }
@@ -1257,6 +1275,12 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1257 return; 1275 return;
1258 } 1276 }
1259 1277
1278 if (success &&
1279 bitmap->events_cleared < bitmap->mddev->events) {
1280 bitmap->events_cleared = bitmap->mddev->events;
1281 bitmap->need_sync = 1;
1282 }
1283
1260 if (!success && ! (*bmc & NEEDED_MASK)) 1284 if (!success && ! (*bmc & NEEDED_MASK))
1261 *bmc |= NEEDED_MASK; 1285 *bmc |= NEEDED_MASK;
1262 1286
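The write_sb_page() hunk above switches the rdev walk to rcu_read_lock()/rdev_for_each_rcu() and routes every failure through a single bad_alignment label that drops the RCU read lock before returning. A rough user-space analogue of that single-exit locking pattern, with a pthread mutex standing in for the RCU read lock and every name invented for illustration:

#include <errno.h>
#include <pthread.h>

struct dev {
	long data_start, data_len;
	long bitmap_start, bitmap_len;
	struct dev *next;
};

static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;

static int write_all(struct dev *head)
{
	struct dev *d;

	pthread_mutex_lock(&dev_lock);
	for (d = head; d; d = d->next) {
		if (d->bitmap_start < d->data_start + d->data_len &&
		    d->bitmap_start + d->bitmap_len > d->data_start)
			/* bitmap runs into data: bail out through one exit */
			goto bad_alignment;
		/* ... issue the write for this device's bitmap page ... */
	}
	pthread_mutex_unlock(&dev_lock);
	return 0;

bad_alignment:
	pthread_mutex_unlock(&dev_lock);
	return -EINVAL;
}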
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index d107ddceefcd..268547dbfbd3 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -297,7 +297,7 @@ static int run(mddev_t *mddev)
297 rdev_for_each(rdev, tmp, mddev) 297 rdev_for_each(rdev, tmp, mddev)
298 conf->rdev = rdev; 298 conf->rdev = rdev;
299 299
300 mddev->array_size = mddev->size; 300 mddev->array_sectors = mddev->size * 2;
301 mddev->private = conf; 301 mddev->private = conf;
302 302
303 reconfig(mddev, mddev->layout, -1); 303 reconfig(mddev, mddev->layout, -1);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 6a866d7c8ae5..b1eebf88c209 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -122,13 +122,13 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
122 return NULL; 122 return NULL;
123 123
124 cnt = 0; 124 cnt = 0;
125 conf->array_size = 0; 125 conf->array_sectors = 0;
126 126
127 rdev_for_each(rdev, tmp, mddev) { 127 rdev_for_each(rdev, tmp, mddev) {
128 int j = rdev->raid_disk; 128 int j = rdev->raid_disk;
129 dev_info_t *disk = conf->disks + j; 129 dev_info_t *disk = conf->disks + j;
130 130
131 if (j < 0 || j > raid_disks || disk->rdev) { 131 if (j < 0 || j >= raid_disks || disk->rdev) {
132 printk("linear: disk numbering problem. Aborting!\n"); 132 printk("linear: disk numbering problem. Aborting!\n");
133 goto out; 133 goto out;
134 } 134 }
@@ -146,7 +146,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
146 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 146 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
147 147
148 disk->size = rdev->size; 148 disk->size = rdev->size;
149 conf->array_size += rdev->size; 149 conf->array_sectors += rdev->size * 2;
150 150
151 cnt++; 151 cnt++;
152 } 152 }
@@ -155,7 +155,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
155 goto out; 155 goto out;
156 } 156 }
157 157
158 min_spacing = conf->array_size; 158 min_spacing = conf->array_sectors / 2;
159 sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *)); 159 sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
160 160
161 /* min_spacing is the minimum spacing that will fit the hash 161 /* min_spacing is the minimum spacing that will fit the hash
@@ -164,7 +164,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
164 * that is larger than min_spacing as use the size of that as 164 * that is larger than min_spacing as use the size of that as
165 * the actual spacing 165 * the actual spacing
166 */ 166 */
167 conf->hash_spacing = conf->array_size; 167 conf->hash_spacing = conf->array_sectors / 2;
168 for (i=0; i < cnt-1 ; i++) { 168 for (i=0; i < cnt-1 ; i++) {
169 sector_t sz = 0; 169 sector_t sz = 0;
170 int j; 170 int j;
@@ -194,7 +194,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
194 unsigned round; 194 unsigned round;
195 unsigned long base; 195 unsigned long base;
196 196
197 sz = conf->array_size >> conf->preshift; 197 sz = conf->array_sectors >> (conf->preshift + 1);
198 sz += 1; /* force round-up */ 198 sz += 1; /* force round-up */
199 base = conf->hash_spacing >> conf->preshift; 199 base = conf->hash_spacing >> conf->preshift;
200 round = sector_div(sz, base); 200 round = sector_div(sz, base);
@@ -221,7 +221,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
221 curr_offset = 0; 221 curr_offset = 0;
222 i = 0; 222 i = 0;
223 for (curr_offset = 0; 223 for (curr_offset = 0;
224 curr_offset < conf->array_size; 224 curr_offset < conf->array_sectors / 2;
225 curr_offset += conf->hash_spacing) { 225 curr_offset += conf->hash_spacing) {
226 226
227 while (i < raid_disks-1 && 227 while (i < raid_disks-1 &&
@@ -258,7 +258,7 @@ static int linear_run (mddev_t *mddev)
258 if (!conf) 258 if (!conf)
259 return 1; 259 return 1;
260 mddev->private = conf; 260 mddev->private = conf;
261 mddev->array_size = conf->array_size; 261 mddev->array_sectors = conf->array_sectors;
262 262
263 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); 263 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
264 mddev->queue->unplug_fn = linear_unplug; 264 mddev->queue->unplug_fn = linear_unplug;
@@ -292,8 +292,8 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
292 newconf->prev = mddev_to_conf(mddev); 292 newconf->prev = mddev_to_conf(mddev);
293 mddev->private = newconf; 293 mddev->private = newconf;
294 mddev->raid_disks++; 294 mddev->raid_disks++;
295 mddev->array_size = newconf->array_size; 295 mddev->array_sectors = newconf->array_sectors;
296 set_capacity(mddev->gendisk, mddev->array_size << 1); 296 set_capacity(mddev->gendisk, mddev->array_sectors);
297 return 0; 297 return 0;
298} 298}
299 299
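The linear.c changes above replace the KiB-based array_size bookkeeping with array_sectors counted in 512-byte sectors, which is why each old value picks up a "* 2" and why set_capacity(), which already expects sectors, no longer needs the "<< 1". A small stand-alone sketch of that unit conversion, with invented values:

#include <stdio.h>

typedef unsigned long long sector_t;

static inline sector_t kib_to_sectors(unsigned long long kib) { return kib * 2; }
static inline unsigned long long sectors_to_kib(sector_t s) { return s / 2; }

int main(void)
{
	unsigned long long old_array_size_kib = 1048576;   /* 1 GiB, in KiB */
	sector_t array_sectors = kib_to_sectors(old_array_size_kib);

	/* set_capacity() already takes sectors, so no "<< 1" is needed */
	printf("capacity: %llu sectors (%llu KiB)\n",
	       (unsigned long long)array_sectors,
	       sectors_to_kib(array_sectors));
	return 0;
}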
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2580ac1b9b0f..c2ff77ccec50 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -169,7 +169,6 @@ void md_new_event(mddev_t *mddev)
169{ 169{
170 atomic_inc(&md_event_count); 170 atomic_inc(&md_event_count);
171 wake_up(&md_event_waiters); 171 wake_up(&md_event_waiters);
172 sysfs_notify(&mddev->kobj, NULL, "sync_action");
173} 172}
174EXPORT_SYMBOL_GPL(md_new_event); 173EXPORT_SYMBOL_GPL(md_new_event);
175 174
@@ -274,10 +273,12 @@ static mddev_t * mddev_find(dev_t unit)
274 INIT_LIST_HEAD(&new->all_mddevs); 273 INIT_LIST_HEAD(&new->all_mddevs);
275 init_timer(&new->safemode_timer); 274 init_timer(&new->safemode_timer);
276 atomic_set(&new->active, 1); 275 atomic_set(&new->active, 1);
276 atomic_set(&new->openers, 0);
277 spin_lock_init(&new->write_lock); 277 spin_lock_init(&new->write_lock);
278 init_waitqueue_head(&new->sb_wait); 278 init_waitqueue_head(&new->sb_wait);
279 init_waitqueue_head(&new->recovery_wait); 279 init_waitqueue_head(&new->recovery_wait);
280 new->reshape_position = MaxSector; 280 new->reshape_position = MaxSector;
281 new->resync_min = 0;
281 new->resync_max = MaxSector; 282 new->resync_max = MaxSector;
282 new->level = LEVEL_NONE; 283 new->level = LEVEL_NONE;
283 284
@@ -347,21 +348,20 @@ static struct mdk_personality *find_pers(int level, char *clevel)
347 return NULL; 348 return NULL;
348} 349}
349 350
351/* return the offset of the super block in 512byte sectors */
350static inline sector_t calc_dev_sboffset(struct block_device *bdev) 352static inline sector_t calc_dev_sboffset(struct block_device *bdev)
351{ 353{
352 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 354 sector_t num_sectors = bdev->bd_inode->i_size / 512;
353 return MD_NEW_SIZE_BLOCKS(size); 355 return MD_NEW_SIZE_SECTORS(num_sectors);
354} 356}
355 357
356static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 358static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
357{ 359{
358 sector_t size; 360 sector_t num_sectors = rdev->sb_start;
359
360 size = rdev->sb_offset;
361 361
362 if (chunk_size) 362 if (chunk_size)
363 size &= ~((sector_t)chunk_size/1024 - 1); 363 num_sectors &= ~((sector_t)chunk_size/512 - 1);
364 return size; 364 return num_sectors;
365} 365}
366 366
367static int alloc_disk_sb(mdk_rdev_t * rdev) 367static int alloc_disk_sb(mdk_rdev_t * rdev)
@@ -372,7 +372,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev)
372 rdev->sb_page = alloc_page(GFP_KERNEL); 372 rdev->sb_page = alloc_page(GFP_KERNEL);
373 if (!rdev->sb_page) { 373 if (!rdev->sb_page) {
374 printk(KERN_ALERT "md: out of memory.\n"); 374 printk(KERN_ALERT "md: out of memory.\n");
375 return -EINVAL; 375 return -ENOMEM;
376 } 376 }
377 377
378 return 0; 378 return 0;
@@ -384,7 +384,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
384 put_page(rdev->sb_page); 384 put_page(rdev->sb_page);
385 rdev->sb_loaded = 0; 385 rdev->sb_loaded = 0;
386 rdev->sb_page = NULL; 386 rdev->sb_page = NULL;
387 rdev->sb_offset = 0; 387 rdev->sb_start = 0;
388 rdev->size = 0; 388 rdev->size = 0;
389 } 389 }
390} 390}
@@ -530,7 +530,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
530 return 0; 530 return 0;
531 531
532 532
533 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 533 if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
534 goto fail; 534 goto fail;
535 rdev->sb_loaded = 1; 535 rdev->sb_loaded = 1;
536 return 0; 536 return 0;
@@ -543,17 +543,12 @@ fail:
543 543
544static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 544static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
545{ 545{
546 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 546 return sb1->set_uuid0 == sb2->set_uuid0 &&
547 (sb1->set_uuid1 == sb2->set_uuid1) && 547 sb1->set_uuid1 == sb2->set_uuid1 &&
548 (sb1->set_uuid2 == sb2->set_uuid2) && 548 sb1->set_uuid2 == sb2->set_uuid2 &&
549 (sb1->set_uuid3 == sb2->set_uuid3)) 549 sb1->set_uuid3 == sb2->set_uuid3;
550
551 return 1;
552
553 return 0;
554} 550}
555 551
556
557static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 552static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
558{ 553{
559 int ret; 554 int ret;
@@ -564,7 +559,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
564 559
565 if (!tmp1 || !tmp2) { 560 if (!tmp1 || !tmp2) {
566 ret = 0; 561 ret = 0;
567 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 562 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
568 goto abort; 563 goto abort;
569 } 564 }
570 565
@@ -577,11 +572,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
577 tmp1->nr_disks = 0; 572 tmp1->nr_disks = 0;
578 tmp2->nr_disks = 0; 573 tmp2->nr_disks = 0;
579 574
580 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 575 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
581 ret = 0;
582 else
583 ret = 1;
584
585abort: 576abort:
586 kfree(tmp1); 577 kfree(tmp1);
587 kfree(tmp2); 578 kfree(tmp2);
@@ -658,11 +649,14 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
658 */ 649 */
659 650
660struct super_type { 651struct super_type {
661 char *name; 652 char *name;
662 struct module *owner; 653 struct module *owner;
663 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 654 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
664 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 655 int minor_version);
665 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 656 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
657 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
658 unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev,
659 sector_t num_sectors);
666}; 660};
667 661
668/* 662/*
@@ -673,16 +667,14 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
673 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 667 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
674 mdp_super_t *sb; 668 mdp_super_t *sb;
675 int ret; 669 int ret;
676 sector_t sb_offset;
677 670
678 /* 671 /*
679 * Calculate the position of the superblock, 672 * Calculate the position of the superblock (512byte sectors),
680 * it's at the end of the disk. 673 * it's at the end of the disk.
681 * 674 *
682 * It also happens to be a multiple of 4Kb. 675 * It also happens to be a multiple of 4Kb.
683 */ 676 */
684 sb_offset = calc_dev_sboffset(rdev->bdev); 677 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
685 rdev->sb_offset = sb_offset;
686 678
687 ret = read_disk_sb(rdev, MD_SB_BYTES); 679 ret = read_disk_sb(rdev, MD_SB_BYTES);
688 if (ret) return ret; 680 if (ret) return ret;
@@ -759,7 +751,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
759 else 751 else
760 ret = 0; 752 ret = 0;
761 } 753 }
762 rdev->size = calc_dev_size(rdev, sb->chunk_size); 754 rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
763 755
764 if (rdev->size < sb->size && sb->level > 1) 756 if (rdev->size < sb->size && sb->level > 1)
765 /* "this cannot possibly happen" ... */ 757 /* "this cannot possibly happen" ... */
@@ -1004,6 +996,26 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1004} 996}
1005 997
1006/* 998/*
999 * rdev_size_change for 0.90.0
1000 */
1001static unsigned long long
1002super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1003{
1004 if (num_sectors && num_sectors < rdev->mddev->size * 2)
1005 return 0; /* component must fit device */
1006 if (rdev->mddev->bitmap_offset)
1007 return 0; /* can't move bitmap */
1008 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
1009 if (!num_sectors || num_sectors > rdev->sb_start)
1010 num_sectors = rdev->sb_start;
1011 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1012 rdev->sb_page);
1013 md_super_wait(rdev->mddev);
1014 return num_sectors / 2; /* kB for sysfs */
1015}
1016
1017
1018/*
1007 * version 1 superblock 1019 * version 1 superblock
1008 */ 1020 */
1009 1021
@@ -1034,12 +1046,12 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1034{ 1046{
1035 struct mdp_superblock_1 *sb; 1047 struct mdp_superblock_1 *sb;
1036 int ret; 1048 int ret;
1037 sector_t sb_offset; 1049 sector_t sb_start;
1038 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1050 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1039 int bmask; 1051 int bmask;
1040 1052
1041 /* 1053 /*
1042 * Calculate the position of the superblock. 1054 * Calculate the position of the superblock in 512byte sectors.
1043 * It is always aligned to a 4K boundary and 1055 * It is always aligned to a 4K boundary and
1044 * depeding on minor_version, it can be: 1056 * depeding on minor_version, it can be:
1045 * 0: At least 8K, but less than 12K, from end of device 1057 * 0: At least 8K, but less than 12K, from end of device
@@ -1048,22 +1060,20 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1048 */ 1060 */
1049 switch(minor_version) { 1061 switch(minor_version) {
1050 case 0: 1062 case 0:
1051 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 1063 sb_start = rdev->bdev->bd_inode->i_size >> 9;
1052 sb_offset -= 8*2; 1064 sb_start -= 8*2;
1053 sb_offset &= ~(sector_t)(4*2-1); 1065 sb_start &= ~(sector_t)(4*2-1);
1054 /* convert from sectors to K */
1055 sb_offset /= 2;
1056 break; 1066 break;
1057 case 1: 1067 case 1:
1058 sb_offset = 0; 1068 sb_start = 0;
1059 break; 1069 break;
1060 case 2: 1070 case 2:
1061 sb_offset = 4; 1071 sb_start = 8;
1062 break; 1072 break;
1063 default: 1073 default:
1064 return -EINVAL; 1074 return -EINVAL;
1065 } 1075 }
1066 rdev->sb_offset = sb_offset; 1076 rdev->sb_start = sb_start;
1067 1077
1068 /* superblock is rarely larger than 1K, but it can be larger, 1078 /* superblock is rarely larger than 1K, but it can be larger,
1069 * and it is safe to read 4k, so we do that 1079 * and it is safe to read 4k, so we do that
@@ -1077,7 +1087,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1077 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1087 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1078 sb->major_version != cpu_to_le32(1) || 1088 sb->major_version != cpu_to_le32(1) ||
1079 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1089 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1080 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 1090 le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1081 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1091 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1082 return -EINVAL; 1092 return -EINVAL;
1083 1093
@@ -1113,7 +1123,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1113 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1123 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1114 1124
1115 if (minor_version 1125 if (minor_version
1116 && rdev->data_offset < sb_offset + (rdev->sb_size/512)) 1126 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1117 return -EINVAL; 1127 return -EINVAL;
1118 1128
1119 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1129 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
@@ -1149,7 +1159,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1149 if (minor_version) 1159 if (minor_version)
1150 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 1160 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
1151 else 1161 else
1152 rdev->size = rdev->sb_offset; 1162 rdev->size = rdev->sb_start / 2;
1153 if (rdev->size < le64_to_cpu(sb->data_size)/2) 1163 if (rdev->size < le64_to_cpu(sb->data_size)/2)
1154 return -EINVAL; 1164 return -EINVAL;
1155 rdev->size = le64_to_cpu(sb->data_size)/2; 1165 rdev->size = le64_to_cpu(sb->data_size)/2;
@@ -1328,35 +1338,74 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1328 sb->sb_csum = calc_sb_1_csum(sb); 1338 sb->sb_csum = calc_sb_1_csum(sb);
1329} 1339}
1330 1340
1341static unsigned long long
1342super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1343{
1344 struct mdp_superblock_1 *sb;
1345 sector_t max_sectors;
1346 if (num_sectors && num_sectors < rdev->mddev->size * 2)
1347 return 0; /* component must fit device */
1348 if (rdev->sb_start < rdev->data_offset) {
1349 /* minor versions 1 and 2; superblock before data */
1350 max_sectors = rdev->bdev->bd_inode->i_size >> 9;
1351 max_sectors -= rdev->data_offset;
1352 if (!num_sectors || num_sectors > max_sectors)
1353 num_sectors = max_sectors;
1354 } else if (rdev->mddev->bitmap_offset) {
1355 /* minor version 0 with bitmap we can't move */
1356 return 0;
1357 } else {
1358 /* minor version 0; superblock after data */
1359 sector_t sb_start;
1360 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
1361 sb_start &= ~(sector_t)(4*2 - 1);
1362 max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
1363 if (!num_sectors || num_sectors > max_sectors)
1364 num_sectors = max_sectors;
1365 rdev->sb_start = sb_start;
1366 }
1367 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
1368 sb->data_size = cpu_to_le64(num_sectors);
1369 sb->super_offset = rdev->sb_start;
1370 sb->sb_csum = calc_sb_1_csum(sb);
1371 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1372 rdev->sb_page);
1373 md_super_wait(rdev->mddev);
1374 return num_sectors / 2; /* kB for sysfs */
1375}
1331 1376
1332static struct super_type super_types[] = { 1377static struct super_type super_types[] = {
1333 [0] = { 1378 [0] = {
1334 .name = "0.90.0", 1379 .name = "0.90.0",
1335 .owner = THIS_MODULE, 1380 .owner = THIS_MODULE,
1336 .load_super = super_90_load, 1381 .load_super = super_90_load,
1337 .validate_super = super_90_validate, 1382 .validate_super = super_90_validate,
1338 .sync_super = super_90_sync, 1383 .sync_super = super_90_sync,
1384 .rdev_size_change = super_90_rdev_size_change,
1339 }, 1385 },
1340 [1] = { 1386 [1] = {
1341 .name = "md-1", 1387 .name = "md-1",
1342 .owner = THIS_MODULE, 1388 .owner = THIS_MODULE,
1343 .load_super = super_1_load, 1389 .load_super = super_1_load,
1344 .validate_super = super_1_validate, 1390 .validate_super = super_1_validate,
1345 .sync_super = super_1_sync, 1391 .sync_super = super_1_sync,
1392 .rdev_size_change = super_1_rdev_size_change,
1346 }, 1393 },
1347}; 1394};
1348 1395
1349static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1396static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1350{ 1397{
1351 struct list_head *tmp, *tmp2;
1352 mdk_rdev_t *rdev, *rdev2; 1398 mdk_rdev_t *rdev, *rdev2;
1353 1399
1354 rdev_for_each(rdev, tmp, mddev1) 1400 rcu_read_lock();
1355 rdev_for_each(rdev2, tmp2, mddev2) 1401 rdev_for_each_rcu(rdev, mddev1)
1402 rdev_for_each_rcu(rdev2, mddev2)
1356 if (rdev->bdev->bd_contains == 1403 if (rdev->bdev->bd_contains ==
1357 rdev2->bdev->bd_contains) 1404 rdev2->bdev->bd_contains) {
1405 rcu_read_unlock();
1358 return 1; 1406 return 1;
1359 1407 }
1408 rcu_read_unlock();
1360 return 0; 1409 return 0;
1361} 1410}
1362 1411
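super_types[] above is a per-format dispatch table, and this commit adds an rdev_size_change method to it so callers can resize a component device without knowing which superblock layout is in use. A compact, self-contained sketch of the same table-of-function-pointers idea, with made-up types and return values rather than the kernel's real ones:

#include <stdio.h>

typedef unsigned long long sector_t;

struct sb_ops {
	const char *name;
	/* returns the usable size in KiB, or 0 if the resize is refused */
	unsigned long long (*rdev_size_change)(sector_t num_sectors);
};

static unsigned long long v090_size_change(sector_t num_sectors)
{
	return num_sectors / 2;           /* pretend everything fits */
}

static unsigned long long v1_size_change(sector_t num_sectors)
{
	return num_sectors / 2;
}

static const struct sb_ops sb_ops_table[] = {
	[0] = { .name = "0.90.0", .rdev_size_change = v090_size_change },
	[1] = { .name = "md-1",   .rdev_size_change = v1_size_change },
};

int main(void)
{
	int major_version = 1;            /* which format this array uses */
	sector_t requested = 2048;        /* sectors */

	printf("%s: %llu KiB usable\n",
	       sb_ops_table[major_version].name,
	       sb_ops_table[major_version].rdev_size_change(requested));
	return 0;
}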
@@ -1423,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1423 kobject_del(&rdev->kobj); 1472 kobject_del(&rdev->kobj);
1424 goto fail; 1473 goto fail;
1425 } 1474 }
1426 list_add(&rdev->same_set, &mddev->disks); 1475 list_add_rcu(&rdev->same_set, &mddev->disks);
1427 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1476 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1428 return 0; 1477 return 0;
1429 1478
@@ -1448,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1448 return; 1497 return;
1449 } 1498 }
1450 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1499 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1451 list_del_init(&rdev->same_set); 1500 list_del_rcu(&rdev->same_set);
1452 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1501 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1453 rdev->mddev = NULL; 1502 rdev->mddev = NULL;
1454 sysfs_remove_link(&rdev->kobj, "block"); 1503 sysfs_remove_link(&rdev->kobj, "block");
1455 1504
1456 /* We need to delay this, otherwise we can deadlock when 1505 /* We need to delay this, otherwise we can deadlock when
1457 * writing to 'remove' to "dev/state" 1506 * writing to 'remove' to "dev/state". We also need
1507 * to delay it due to rcu usage.
1458 */ 1508 */
1509 synchronize_rcu();
1459 INIT_WORK(&rdev->del_work, md_delayed_delete); 1510 INIT_WORK(&rdev->del_work, md_delayed_delete);
1460 kobject_get(&rdev->kobj); 1511 kobject_get(&rdev->kobj);
1461 schedule_work(&rdev->del_work); 1512 schedule_work(&rdev->del_work);
@@ -1511,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev)
1511 if (rdev->mddev) 1562 if (rdev->mddev)
1512 MD_BUG(); 1563 MD_BUG();
1513 free_disk_sb(rdev); 1564 free_disk_sb(rdev);
1514 list_del_init(&rdev->same_set);
1515#ifndef MODULE 1565#ifndef MODULE
1516 if (test_bit(AutoDetected, &rdev->flags)) 1566 if (test_bit(AutoDetected, &rdev->flags))
1517 md_autodetect_dev(rdev->bdev->bd_dev); 1567 md_autodetect_dev(rdev->bdev->bd_dev);
@@ -1758,11 +1808,11 @@ repeat:
1758 dprintk("%s ", bdevname(rdev->bdev,b)); 1808 dprintk("%s ", bdevname(rdev->bdev,b));
1759 if (!test_bit(Faulty, &rdev->flags)) { 1809 if (!test_bit(Faulty, &rdev->flags)) {
1760 md_super_write(mddev,rdev, 1810 md_super_write(mddev,rdev,
1761 rdev->sb_offset<<1, rdev->sb_size, 1811 rdev->sb_start, rdev->sb_size,
1762 rdev->sb_page); 1812 rdev->sb_page);
1763 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1813 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1764 bdevname(rdev->bdev,b), 1814 bdevname(rdev->bdev,b),
1765 (unsigned long long)rdev->sb_offset); 1815 (unsigned long long)rdev->sb_start);
1766 rdev->sb_events = mddev->events; 1816 rdev->sb_events = mddev->events;
1767 1817
1768 } else 1818 } else
@@ -1787,7 +1837,7 @@ repeat:
1787 1837
1788} 1838}
1789 1839
1790/* words written to sysfs files may, or my not, be \n terminated. 1840/* words written to sysfs files may, or may not, be \n terminated.
1791 * We want to accept with case. For this we use cmd_match. 1841 * We want to accept with case. For this we use cmd_match.
1792 */ 1842 */
1793static int cmd_match(const char *cmd, const char *str) 1843static int cmd_match(const char *cmd, const char *str)
@@ -1886,6 +1936,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1886 1936
1887 err = 0; 1937 err = 0;
1888 } 1938 }
1939 if (!err)
1940 sysfs_notify(&rdev->kobj, NULL, "state");
1889 return err ? err : len; 1941 return err ? err : len;
1890} 1942}
1891static struct rdev_sysfs_entry rdev_state = 1943static struct rdev_sysfs_entry rdev_state =
@@ -1931,7 +1983,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1931 slot = -1; 1983 slot = -1;
1932 else if (e==buf || (*e && *e!= '\n')) 1984 else if (e==buf || (*e && *e!= '\n'))
1933 return -EINVAL; 1985 return -EINVAL;
1934 if (rdev->mddev->pers) { 1986 if (rdev->mddev->pers && slot == -1) {
1935 /* Setting 'slot' on an active array requires also 1987 /* Setting 'slot' on an active array requires also
1936 * updating the 'rd%d' link, and communicating 1988 * updating the 'rd%d' link, and communicating
1937 * with the personality with ->hot_*_disk. 1989 * with the personality with ->hot_*_disk.
@@ -1939,8 +1991,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1939 * failed/spare devices. This normally happens automatically, 1991 * failed/spare devices. This normally happens automatically,
1940 * but not when the metadata is externally managed. 1992 * but not when the metadata is externally managed.
1941 */ 1993 */
1942 if (slot != -1)
1943 return -EBUSY;
1944 if (rdev->raid_disk == -1) 1994 if (rdev->raid_disk == -1)
1945 return -EEXIST; 1995 return -EEXIST;
1946 /* personality does all needed checks */ 1996 /* personality does all needed checks */
@@ -1954,6 +2004,43 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1954 sysfs_remove_link(&rdev->mddev->kobj, nm); 2004 sysfs_remove_link(&rdev->mddev->kobj, nm);
1955 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2005 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
1956 md_wakeup_thread(rdev->mddev->thread); 2006 md_wakeup_thread(rdev->mddev->thread);
2007 } else if (rdev->mddev->pers) {
2008 mdk_rdev_t *rdev2;
2009 struct list_head *tmp;
2010 /* Activating a spare .. or possibly reactivating
2011 * if we every get bitmaps working here.
2012 */
2013
2014 if (rdev->raid_disk != -1)
2015 return -EBUSY;
2016
2017 if (rdev->mddev->pers->hot_add_disk == NULL)
2018 return -EINVAL;
2019
2020 rdev_for_each(rdev2, tmp, rdev->mddev)
2021 if (rdev2->raid_disk == slot)
2022 return -EEXIST;
2023
2024 rdev->raid_disk = slot;
2025 if (test_bit(In_sync, &rdev->flags))
2026 rdev->saved_raid_disk = slot;
2027 else
2028 rdev->saved_raid_disk = -1;
2029 err = rdev->mddev->pers->
2030 hot_add_disk(rdev->mddev, rdev);
2031 if (err) {
2032 rdev->raid_disk = -1;
2033 return err;
2034 } else
2035 sysfs_notify(&rdev->kobj, NULL, "state");
2036 sprintf(nm, "rd%d", rdev->raid_disk);
2037 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2038 printk(KERN_WARNING
2039 "md: cannot register "
2040 "%s for %s\n",
2041 nm, mdname(rdev->mddev));
2042
2043 /* don't wakeup anyone, leave that to userspace. */
1957 } else { 2044 } else {
1958 if (slot >= rdev->mddev->raid_disks) 2045 if (slot >= rdev->mddev->raid_disks)
1959 return -ENOSPC; 2046 return -ENOSPC;
@@ -1962,6 +2049,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1962 clear_bit(Faulty, &rdev->flags); 2049 clear_bit(Faulty, &rdev->flags);
1963 clear_bit(WriteMostly, &rdev->flags); 2050 clear_bit(WriteMostly, &rdev->flags);
1964 set_bit(In_sync, &rdev->flags); 2051 set_bit(In_sync, &rdev->flags);
2052 sysfs_notify(&rdev->kobj, NULL, "state");
1965 } 2053 }
1966 return len; 2054 return len;
1967} 2055}
@@ -1983,7 +2071,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1983 unsigned long long offset = simple_strtoull(buf, &e, 10); 2071 unsigned long long offset = simple_strtoull(buf, &e, 10);
1984 if (e==buf || (*e && *e != '\n')) 2072 if (e==buf || (*e && *e != '\n'))
1985 return -EINVAL; 2073 return -EINVAL;
1986 if (rdev->mddev->pers) 2074 if (rdev->mddev->pers && rdev->raid_disk >= 0)
1987 return -EBUSY; 2075 return -EBUSY;
1988 if (rdev->size && rdev->mddev->external) 2076 if (rdev->size && rdev->mddev->external)
1989 /* Must set offset before size, so overlap checks 2077 /* Must set offset before size, so overlap checks
@@ -2015,17 +2103,30 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2015static ssize_t 2103static ssize_t
2016rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2104rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2017{ 2105{
2018 char *e; 2106 unsigned long long size;
2019 unsigned long long size = simple_strtoull(buf, &e, 10);
2020 unsigned long long oldsize = rdev->size; 2107 unsigned long long oldsize = rdev->size;
2021 mddev_t *my_mddev = rdev->mddev; 2108 mddev_t *my_mddev = rdev->mddev;
2022 2109
2023 if (e==buf || (*e && *e != '\n')) 2110 if (strict_strtoull(buf, 10, &size) < 0)
2024 return -EINVAL; 2111 return -EINVAL;
2025 if (my_mddev->pers) 2112 if (size < my_mddev->size)
2026 return -EBUSY; 2113 return -EINVAL;
2114 if (my_mddev->pers && rdev->raid_disk >= 0) {
2115 if (my_mddev->persistent) {
2116 size = super_types[my_mddev->major_version].
2117 rdev_size_change(rdev, size * 2);
2118 if (!size)
2119 return -EBUSY;
2120 } else if (!size) {
2121 size = (rdev->bdev->bd_inode->i_size >> 10);
2122 size -= rdev->data_offset/2;
2123 }
2124 if (size < my_mddev->size)
2125 return -EINVAL; /* component must fit device */
2126 }
2127
2027 rdev->size = size; 2128 rdev->size = size;
2028 if (size > oldsize && rdev->mddev->external) { 2129 if (size > oldsize && my_mddev->external) {
2029 /* need to check that all other rdevs with the same ->bdev 2130 /* need to check that all other rdevs with the same ->bdev
2030 * do not overlap. We need to unlock the mddev to avoid 2131 * do not overlap. We need to unlock the mddev to avoid
2031 * a deadlock. We have already changed rdev->size, and if 2132 * a deadlock. We have already changed rdev->size, and if
@@ -2044,8 +2145,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2044 if (test_bit(AllReserved, &rdev2->flags) || 2145 if (test_bit(AllReserved, &rdev2->flags) ||
2045 (rdev->bdev == rdev2->bdev && 2146 (rdev->bdev == rdev2->bdev &&
2046 rdev != rdev2 && 2147 rdev != rdev2 &&
2047 overlaps(rdev->data_offset, rdev->size, 2148 overlaps(rdev->data_offset, rdev->size * 2,
2048 rdev2->data_offset, rdev2->size))) { 2149 rdev2->data_offset,
2150 rdev2->size * 2))) {
2049 overlap = 1; 2151 overlap = 1;
2050 break; 2152 break;
2051 } 2153 }
@@ -2067,8 +2169,6 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2067 return -EBUSY; 2169 return -EBUSY;
2068 } 2170 }
2069 } 2171 }
2070 if (size < my_mddev->size || my_mddev->size == 0)
2071 my_mddev->size = size;
2072 return len; 2172 return len;
2073} 2173}
2074 2174
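rdev_size_store() above swaps the open-coded simple_strtoull()/end-pointer dance for strict_strtoull(), which accepts a decimal number with at most a trailing newline and rejects anything else. A user-space approximation of that stricter parse (illustrative only; the kernel helper's behaviour may differ in corner cases):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_ull(const char *buf, unsigned long long *val)
{
	char *end;

	errno = 0;
	*val = strtoull(buf, &end, 10);
	if (end == buf || errno)
		return -EINVAL;
	if (*end == '\n')
		end++;                     /* allow one trailing newline */
	if (*end != '\0')
		return -EINVAL;            /* anything else is rejected */
	return 0;
}

int main(void)
{
	unsigned long long v;

	printf("\"1024\\n\" -> %d\n", parse_ull("1024\n", &v));  /* 0: accepted */
	printf("\"1024k\"  -> %d\n", parse_ull("1024k", &v));    /* -EINVAL */
	return 0;
}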
@@ -2512,7 +2612,7 @@ __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2512 * When written, doesn't tear down array, but just stops it 2612 * When written, doesn't tear down array, but just stops it
2513 * suspended (not supported yet) 2613 * suspended (not supported yet)
2514 * All IO requests will block. The array can be reconfigured. 2614 * All IO requests will block. The array can be reconfigured.
2515 * Writing this, if accepted, will block until array is quiessent 2615 * Writing this, if accepted, will block until array is quiescent
2516 * readonly 2616 * readonly
2517 * no resync can happen. no superblocks get written. 2617 * no resync can happen. no superblocks get written.
2518 * write requests fail 2618 * write requests fail
@@ -2585,7 +2685,7 @@ array_state_show(mddev_t *mddev, char *page)
2585 return sprintf(page, "%s\n", array_states[st]); 2685 return sprintf(page, "%s\n", array_states[st]);
2586} 2686}
2587 2687
2588static int do_md_stop(mddev_t * mddev, int ro); 2688static int do_md_stop(mddev_t * mddev, int ro, int is_open);
2589static int do_md_run(mddev_t * mddev); 2689static int do_md_run(mddev_t * mddev);
2590static int restart_array(mddev_t *mddev); 2690static int restart_array(mddev_t *mddev);
2591 2691
@@ -2599,16 +2699,16 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
2599 break; 2699 break;
2600 case clear: 2700 case clear:
2601 /* stopping an active array */ 2701 /* stopping an active array */
2602 if (atomic_read(&mddev->active) > 1) 2702 if (atomic_read(&mddev->openers) > 0)
2603 return -EBUSY; 2703 return -EBUSY;
2604 err = do_md_stop(mddev, 0); 2704 err = do_md_stop(mddev, 0, 0);
2605 break; 2705 break;
2606 case inactive: 2706 case inactive:
2607 /* stopping an active array */ 2707 /* stopping an active array */
2608 if (mddev->pers) { 2708 if (mddev->pers) {
2609 if (atomic_read(&mddev->active) > 1) 2709 if (atomic_read(&mddev->openers) > 0)
2610 return -EBUSY; 2710 return -EBUSY;
2611 err = do_md_stop(mddev, 2); 2711 err = do_md_stop(mddev, 2, 0);
2612 } else 2712 } else
2613 err = 0; /* already inactive */ 2713 err = 0; /* already inactive */
2614 break; 2714 break;
@@ -2616,7 +2716,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
2616 break; /* not supported yet */ 2716 break; /* not supported yet */
2617 case readonly: 2717 case readonly:
2618 if (mddev->pers) 2718 if (mddev->pers)
2619 err = do_md_stop(mddev, 1); 2719 err = do_md_stop(mddev, 1, 0);
2620 else { 2720 else {
2621 mddev->ro = 1; 2721 mddev->ro = 1;
2622 set_disk_ro(mddev->gendisk, 1); 2722 set_disk_ro(mddev->gendisk, 1);
@@ -2626,7 +2726,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
2626 case read_auto: 2726 case read_auto:
2627 if (mddev->pers) { 2727 if (mddev->pers) {
2628 if (mddev->ro != 1) 2728 if (mddev->ro != 1)
2629 err = do_md_stop(mddev, 1); 2729 err = do_md_stop(mddev, 1, 0);
2630 else 2730 else
2631 err = restart_array(mddev); 2731 err = restart_array(mddev);
2632 if (err == 0) { 2732 if (err == 0) {
@@ -2681,8 +2781,10 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
2681 } 2781 }
2682 if (err) 2782 if (err)
2683 return err; 2783 return err;
2684 else 2784 else {
2785 sysfs_notify(&mddev->kobj, NULL, "array_state");
2685 return len; 2786 return len;
2787 }
2686} 2788}
2687static struct md_sysfs_entry md_array_state = 2789static struct md_sysfs_entry md_array_state =
2688__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 2790__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
@@ -2785,7 +2887,7 @@ size_show(mddev_t *mddev, char *page)
2785 return sprintf(page, "%llu\n", (unsigned long long)mddev->size); 2887 return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2786} 2888}
2787 2889
2788static int update_size(mddev_t *mddev, unsigned long size); 2890static int update_size(mddev_t *mddev, sector_t num_sectors);
2789 2891
2790static ssize_t 2892static ssize_t
2791size_store(mddev_t *mddev, const char *buf, size_t len) 2893size_store(mddev_t *mddev, const char *buf, size_t len)
@@ -2802,7 +2904,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
2802 return -EINVAL; 2904 return -EINVAL;
2803 2905
2804 if (mddev->pers) { 2906 if (mddev->pers) {
2805 err = update_size(mddev, size); 2907 err = update_size(mddev, size * 2);
2806 md_update_sb(mddev, 1); 2908 md_update_sb(mddev, 1);
2807 } else { 2909 } else {
2808 if (mddev->size == 0 || 2910 if (mddev->size == 0 ||
@@ -2899,7 +3001,7 @@ action_show(mddev_t *mddev, char *page)
2899 type = "check"; 3001 type = "check";
2900 else 3002 else
2901 type = "repair"; 3003 type = "repair";
2902 } else 3004 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
2903 type = "recover"; 3005 type = "recover";
2904 } 3006 }
2905 return sprintf(page, "%s\n", type); 3007 return sprintf(page, "%s\n", type);
@@ -2921,15 +3023,19 @@ action_store(mddev_t *mddev, const char *page, size_t len)
2921 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3023 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2922 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3024 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
2923 return -EBUSY; 3025 return -EBUSY;
2924 else if (cmd_match(page, "resync") || cmd_match(page, "recover")) 3026 else if (cmd_match(page, "resync"))
3027 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3028 else if (cmd_match(page, "recover")) {
3029 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2925 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3030 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2926 else if (cmd_match(page, "reshape")) { 3031 } else if (cmd_match(page, "reshape")) {
2927 int err; 3032 int err;
2928 if (mddev->pers->start_reshape == NULL) 3033 if (mddev->pers->start_reshape == NULL)
2929 return -EINVAL; 3034 return -EINVAL;
2930 err = mddev->pers->start_reshape(mddev); 3035 err = mddev->pers->start_reshape(mddev);
2931 if (err) 3036 if (err)
2932 return err; 3037 return err;
3038 sysfs_notify(&mddev->kobj, NULL, "degraded");
2933 } else { 3039 } else {
2934 if (cmd_match(page, "check")) 3040 if (cmd_match(page, "check"))
2935 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3041 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -2940,6 +3046,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
2940 } 3046 }
2941 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3047 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2942 md_wakeup_thread(mddev->thread); 3048 md_wakeup_thread(mddev->thread);
3049 sysfs_notify(&mddev->kobj, NULL, "sync_action");
2943 return len; 3050 return len;
2944} 3051}
2945 3052
@@ -3049,11 +3156,11 @@ static ssize_t
3049sync_speed_show(mddev_t *mddev, char *page) 3156sync_speed_show(mddev_t *mddev, char *page)
3050{ 3157{
3051 unsigned long resync, dt, db; 3158 unsigned long resync, dt, db;
3052 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)); 3159 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3053 dt = ((jiffies - mddev->resync_mark) / HZ); 3160 dt = (jiffies - mddev->resync_mark) / HZ;
3054 if (!dt) dt++; 3161 if (!dt) dt++;
3055 db = resync - (mddev->resync_mark_cnt); 3162 db = resync - mddev->resync_mark_cnt;
3056 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ 3163 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
3057} 3164}
3058 3165
3059static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 3166static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
@@ -3075,6 +3182,36 @@ sync_completed_show(mddev_t *mddev, char *page)
3075static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3182static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
3076 3183
3077static ssize_t 3184static ssize_t
3185min_sync_show(mddev_t *mddev, char *page)
3186{
3187 return sprintf(page, "%llu\n",
3188 (unsigned long long)mddev->resync_min);
3189}
3190static ssize_t
3191min_sync_store(mddev_t *mddev, const char *buf, size_t len)
3192{
3193 unsigned long long min;
3194 if (strict_strtoull(buf, 10, &min))
3195 return -EINVAL;
3196 if (min > mddev->resync_max)
3197 return -EINVAL;
3198 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3199 return -EBUSY;
3200
3201 /* Must be a multiple of chunk_size */
3202 if (mddev->chunk_size) {
3203 if (min & (sector_t)((mddev->chunk_size>>9)-1))
3204 return -EINVAL;
3205 }
3206 mddev->resync_min = min;
3207
3208 return len;
3209}
3210
3211static struct md_sysfs_entry md_min_sync =
3212__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
3213
3214static ssize_t
3078max_sync_show(mddev_t *mddev, char *page) 3215max_sync_show(mddev_t *mddev, char *page)
3079{ 3216{
3080 if (mddev->resync_max == MaxSector) 3217 if (mddev->resync_max == MaxSector)
@@ -3089,9 +3226,10 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3089 if (strncmp(buf, "max", 3) == 0) 3226 if (strncmp(buf, "max", 3) == 0)
3090 mddev->resync_max = MaxSector; 3227 mddev->resync_max = MaxSector;
3091 else { 3228 else {
3092 char *ep; 3229 unsigned long long max;
3093 unsigned long long max = simple_strtoull(buf, &ep, 10); 3230 if (strict_strtoull(buf, 10, &max))
3094 if (ep == buf || (*ep != 0 && *ep != '\n')) 3231 return -EINVAL;
3232 if (max < mddev->resync_min)
3095 return -EINVAL; 3233 return -EINVAL;
3096 if (max < mddev->resync_max && 3234 if (max < mddev->resync_max &&
3097 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3235 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -3222,6 +3360,7 @@ static struct attribute *md_redundancy_attrs[] = {
3222 &md_sync_speed.attr, 3360 &md_sync_speed.attr,
3223 &md_sync_force_parallel.attr, 3361 &md_sync_force_parallel.attr,
3224 &md_sync_completed.attr, 3362 &md_sync_completed.attr,
3363 &md_min_sync.attr,
3225 &md_max_sync.attr, 3364 &md_max_sync.attr,
3226 &md_suspend_lo.attr, 3365 &md_suspend_lo.attr,
3227 &md_suspend_hi.attr, 3366 &md_suspend_hi.attr,
@@ -3326,9 +3465,9 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
3326 disk->queue = mddev->queue; 3465 disk->queue = mddev->queue;
3327 add_disk(disk); 3466 add_disk(disk);
3328 mddev->gendisk = disk; 3467 mddev->gendisk = disk;
3329 mutex_unlock(&disks_mutex);
3330 error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj, 3468 error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj,
3331 "%s", "md"); 3469 "%s", "md");
3470 mutex_unlock(&disks_mutex);
3332 if (error) 3471 if (error)
3333 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 3472 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3334 disk->disk_name); 3473 disk->disk_name);
@@ -3341,7 +3480,11 @@ static void md_safemode_timeout(unsigned long data)
3341{ 3480{
3342 mddev_t *mddev = (mddev_t *) data; 3481 mddev_t *mddev = (mddev_t *) data;
3343 3482
3344 mddev->safemode = 1; 3483 if (!atomic_read(&mddev->writes_pending)) {
3484 mddev->safemode = 1;
3485 if (mddev->external)
3486 sysfs_notify(&mddev->kobj, NULL, "array_state");
3487 }
3345 md_wakeup_thread(mddev->thread); 3488 md_wakeup_thread(mddev->thread);
3346} 3489}
3347 3490
@@ -3432,22 +3575,23 @@ static int do_md_run(mddev_t * mddev)
3432 * We don't want the data to overlap the metadata, 3575 * We don't want the data to overlap the metadata,
3433 * Internal Bitmap issues has handled elsewhere. 3576 * Internal Bitmap issues has handled elsewhere.
3434 */ 3577 */
3435 if (rdev->data_offset < rdev->sb_offset) { 3578 if (rdev->data_offset < rdev->sb_start) {
3436 if (mddev->size && 3579 if (mddev->size &&
3437 rdev->data_offset + mddev->size*2 3580 rdev->data_offset + mddev->size*2
3438 > rdev->sb_offset*2) { 3581 > rdev->sb_start) {
3439 printk("md: %s: data overlaps metadata\n", 3582 printk("md: %s: data overlaps metadata\n",
3440 mdname(mddev)); 3583 mdname(mddev));
3441 return -EINVAL; 3584 return -EINVAL;
3442 } 3585 }
3443 } else { 3586 } else {
3444 if (rdev->sb_offset*2 + rdev->sb_size/512 3587 if (rdev->sb_start + rdev->sb_size/512
3445 > rdev->data_offset) { 3588 > rdev->data_offset) {
3446 printk("md: %s: metadata overlaps data\n", 3589 printk("md: %s: metadata overlaps data\n",
3447 mdname(mddev)); 3590 mdname(mddev));
3448 return -EINVAL; 3591 return -EINVAL;
3449 } 3592 }
3450 } 3593 }
3594 sysfs_notify(&rdev->kobj, NULL, "state");
3451 } 3595 }
3452 3596
3453 md_probe(mddev->unit, NULL, NULL); 3597 md_probe(mddev->unit, NULL, NULL);
@@ -3519,7 +3663,9 @@ static int do_md_run(mddev_t * mddev)
3519 mddev->ro = 2; /* read-only, but switch on first write */ 3663 mddev->ro = 2; /* read-only, but switch on first write */
3520 3664
3521 err = mddev->pers->run(mddev); 3665 err = mddev->pers->run(mddev);
3522 if (!err && mddev->pers->sync_request) { 3666 if (err)
3667 printk(KERN_ERR "md: pers->run() failed ...\n");
3668 else if (mddev->pers->sync_request) {
3523 err = bitmap_create(mddev); 3669 err = bitmap_create(mddev);
3524 if (err) { 3670 if (err) {
3525 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 3671 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3528,7 +3674,6 @@ static int do_md_run(mddev_t * mddev)
3528 } 3674 }
3529 } 3675 }
3530 if (err) { 3676 if (err) {
3531 printk(KERN_ERR "md: pers->run() failed ...\n");
3532 module_put(mddev->pers->owner); 3677 module_put(mddev->pers->owner);
3533 mddev->pers = NULL; 3678 mddev->pers = NULL;
3534 bitmap_destroy(mddev); 3679 bitmap_destroy(mddev);
@@ -3563,7 +3708,7 @@ static int do_md_run(mddev_t * mddev)
3563 if (mddev->flags) 3708 if (mddev->flags)
3564 md_update_sb(mddev, 0); 3709 md_update_sb(mddev, 0);
3565 3710
3566 set_capacity(disk, mddev->array_size<<1); 3711 set_capacity(disk, mddev->array_sectors);
3567 3712
3568 /* If we call blk_queue_make_request here, it will 3713 /* If we call blk_queue_make_request here, it will
3569 * re-initialise max_sectors etc which may have been 3714 * re-initialise max_sectors etc which may have been
@@ -3608,6 +3753,9 @@ static int do_md_run(mddev_t * mddev)
3608 3753
3609 mddev->changed = 1; 3754 mddev->changed = 1;
3610 md_new_event(mddev); 3755 md_new_event(mddev);
3756 sysfs_notify(&mddev->kobj, NULL, "array_state");
3757 sysfs_notify(&mddev->kobj, NULL, "sync_action");
3758 sysfs_notify(&mddev->kobj, NULL, "degraded");
3611 kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE); 3759 kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
3612 return 0; 3760 return 0;
3613} 3761}
@@ -3615,38 +3763,25 @@ static int do_md_run(mddev_t * mddev)
3615static int restart_array(mddev_t *mddev) 3763static int restart_array(mddev_t *mddev)
3616{ 3764{
3617 struct gendisk *disk = mddev->gendisk; 3765 struct gendisk *disk = mddev->gendisk;
3618 int err;
3619 3766
3620 /* 3767 /* Complain if it has no devices */
3621 * Complain if it has no devices
3622 */
3623 err = -ENXIO;
3624 if (list_empty(&mddev->disks)) 3768 if (list_empty(&mddev->disks))
3625 goto out; 3769 return -ENXIO;
3626 3770 if (!mddev->pers)
3627 if (mddev->pers) { 3771 return -EINVAL;
3628 err = -EBUSY; 3772 if (!mddev->ro)
3629 if (!mddev->ro) 3773 return -EBUSY;
3630 goto out; 3774 mddev->safemode = 0;
3631 3775 mddev->ro = 0;
3632 mddev->safemode = 0; 3776 set_disk_ro(disk, 0);
3633 mddev->ro = 0; 3777 printk(KERN_INFO "md: %s switched to read-write mode.\n",
3634 set_disk_ro(disk, 0); 3778 mdname(mddev));
3635 3779 /* Kick recovery or resync if necessary */
3636 printk(KERN_INFO "md: %s switched to read-write mode.\n", 3780 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3637 mdname(mddev)); 3781 md_wakeup_thread(mddev->thread);
3638 /* 3782 md_wakeup_thread(mddev->sync_thread);
3639 * Kick recovery or resync if necessary 3783 sysfs_notify(&mddev->kobj, NULL, "array_state");
3640 */ 3784 return 0;
3641 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3642 md_wakeup_thread(mddev->thread);
3643 md_wakeup_thread(mddev->sync_thread);
3644 err = 0;
3645 } else
3646 err = -EINVAL;
3647
3648out:
3649 return err;
3650} 3785}
3651 3786
3652/* similar to deny_write_access, but accounts for our holding a reference 3787/* similar to deny_write_access, but accounts for our holding a reference
@@ -3680,16 +3815,17 @@ static void restore_bitmap_write_access(struct file *file)
3680 * 1 - switch to readonly 3815 * 1 - switch to readonly
3681 * 2 - stop but do not disassemble array 3816 * 2 - stop but do not disassemble array
3682 */ 3817 */
3683static int do_md_stop(mddev_t * mddev, int mode) 3818static int do_md_stop(mddev_t * mddev, int mode, int is_open)
3684{ 3819{
3685 int err = 0; 3820 int err = 0;
3686 struct gendisk *disk = mddev->gendisk; 3821 struct gendisk *disk = mddev->gendisk;
3687 3822
3823 if (atomic_read(&mddev->openers) > is_open) {
3824 printk("md: %s still in use.\n",mdname(mddev));
3825 return -EBUSY;
3826 }
3827
3688 if (mddev->pers) { 3828 if (mddev->pers) {
3689 if (atomic_read(&mddev->active)>2) {
3690 printk("md: %s still in use.\n",mdname(mddev));
3691 return -EBUSY;
3692 }
3693 3829
3694 if (mddev->sync_thread) { 3830 if (mddev->sync_thread) {
3695 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3831 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
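do_md_stop() above now takes an is_open argument and compares it against the new mddev->openers count, so a stop request is refused only when someone other than the caller still holds the array open. A minimal user-space model of that check, using C11 atomics in place of the kernel's atomic_t; names and values are illustrative:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int openers;           /* how many holders have the array open */

static int can_stop(int is_open)
{
	/* refuse unless the only opener (if any) is the caller itself */
	return atomic_load(&openers) <= is_open;
}

int main(void)
{
	atomic_store(&openers, 1);
	printf("stop via ioctl (caller holds it open): %s\n",
	       can_stop(1) ? "ok" : "busy");
	printf("stop via sysfs (caller does not):      %s\n",
	       can_stop(0) ? "ok" : "busy");
	return 0;
}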
@@ -3773,10 +3909,11 @@ static int do_md_stop(mddev_t * mddev, int mode)
3773 3909
3774 export_array(mddev); 3910 export_array(mddev);
3775 3911
3776 mddev->array_size = 0; 3912 mddev->array_sectors = 0;
3777 mddev->size = 0; 3913 mddev->size = 0;
3778 mddev->raid_disks = 0; 3914 mddev->raid_disks = 0;
3779 mddev->recovery_cp = 0; 3915 mddev->recovery_cp = 0;
3916 mddev->resync_min = 0;
3780 mddev->resync_max = MaxSector; 3917 mddev->resync_max = MaxSector;
3781 mddev->reshape_position = MaxSector; 3918 mddev->reshape_position = MaxSector;
3782 mddev->external = 0; 3919 mddev->external = 0;
@@ -3811,6 +3948,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
3811 mdname(mddev)); 3948 mdname(mddev));
3812 err = 0; 3949 err = 0;
3813 md_new_event(mddev); 3950 md_new_event(mddev);
3951 sysfs_notify(&mddev->kobj, NULL, "array_state");
3814out: 3952out:
3815 return err; 3953 return err;
3816} 3954}
@@ -3836,7 +3974,7 @@ static void autorun_array(mddev_t *mddev)
3836 err = do_md_run (mddev); 3974 err = do_md_run (mddev);
3837 if (err) { 3975 if (err) {
3838 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 3976 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
3839 do_md_stop (mddev, 0); 3977 do_md_stop (mddev, 0, 0);
3840 } 3978 }
3841} 3979}
3842 3980
@@ -3927,8 +4065,10 @@ static void autorun_devices(int part)
3927 /* on success, candidates will be empty, on error 4065 /* on success, candidates will be empty, on error
3928 * it won't... 4066 * it won't...
3929 */ 4067 */
3930 rdev_for_each_list(rdev, tmp, candidates) 4068 rdev_for_each_list(rdev, tmp, candidates) {
4069 list_del_init(&rdev->same_set);
3931 export_rdev(rdev); 4070 export_rdev(rdev);
4071 }
3932 mddev_put(mddev); 4072 mddev_put(mddev);
3933 } 4073 }
3934 printk(KERN_INFO "md: ... autorun DONE.\n"); 4074 printk(KERN_INFO "md: ... autorun DONE.\n");
@@ -4009,9 +4149,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
4009 char *ptr, *buf = NULL; 4149 char *ptr, *buf = NULL;
4010 int err = -ENOMEM; 4150 int err = -ENOMEM;
4011 4151
4012 md_allow_write(mddev); 4152 if (md_allow_write(mddev))
4153 file = kmalloc(sizeof(*file), GFP_NOIO);
4154 else
4155 file = kmalloc(sizeof(*file), GFP_KERNEL);
4013 4156
4014 file = kmalloc(sizeof(*file), GFP_KERNEL);
4015 if (!file) 4157 if (!file)
4016 goto out; 4158 goto out;
4017 4159
@@ -4044,15 +4186,12 @@ out:
4044static int get_disk_info(mddev_t * mddev, void __user * arg) 4186static int get_disk_info(mddev_t * mddev, void __user * arg)
4045{ 4187{
4046 mdu_disk_info_t info; 4188 mdu_disk_info_t info;
4047 unsigned int nr;
4048 mdk_rdev_t *rdev; 4189 mdk_rdev_t *rdev;
4049 4190
4050 if (copy_from_user(&info, arg, sizeof(info))) 4191 if (copy_from_user(&info, arg, sizeof(info)))
4051 return -EFAULT; 4192 return -EFAULT;
4052 4193
4053 nr = info.number; 4194 rdev = find_rdev_nr(mddev, info.number);
4054
4055 rdev = find_rdev_nr(mddev, nr);
4056 if (rdev) { 4195 if (rdev) {
4057 info.major = MAJOR(rdev->bdev->bd_dev); 4196 info.major = MAJOR(rdev->bdev->bd_dev);
4058 info.minor = MINOR(rdev->bdev->bd_dev); 4197 info.minor = MINOR(rdev->bdev->bd_dev);
@@ -4172,8 +4311,12 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4172 } 4311 }
4173 if (err) 4312 if (err)
4174 export_rdev(rdev); 4313 export_rdev(rdev);
4314 else
4315 sysfs_notify(&rdev->kobj, NULL, "state");
4175 4316
4176 md_update_sb(mddev, 1); 4317 md_update_sb(mddev, 1);
4318 if (mddev->degraded)
4319 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4177 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4320 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4178 md_wakeup_thread(mddev->thread); 4321 md_wakeup_thread(mddev->thread);
4179 return err; 4322 return err;
@@ -4212,10 +4355,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4212 4355
4213 if (!mddev->persistent) { 4356 if (!mddev->persistent) {
4214 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 4357 printk(KERN_INFO "md: nonpersistent superblock ...\n");
4215 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 4358 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4216 } else 4359 } else
4217 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 4360 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4218 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 4361 rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
4219 4362
4220 err = bind_rdev_to_array(rdev, mddev); 4363 err = bind_rdev_to_array(rdev, mddev);
4221 if (err) { 4364 if (err) {
@@ -4232,9 +4375,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
4232 char b[BDEVNAME_SIZE]; 4375 char b[BDEVNAME_SIZE];
4233 mdk_rdev_t *rdev; 4376 mdk_rdev_t *rdev;
4234 4377
4235 if (!mddev->pers)
4236 return -ENODEV;
4237
4238 rdev = find_rdev(mddev, dev); 4378 rdev = find_rdev(mddev, dev);
4239 if (!rdev) 4379 if (!rdev)
4240 return -ENXIO; 4380 return -ENXIO;
@@ -4257,7 +4397,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
4257{ 4397{
4258 char b[BDEVNAME_SIZE]; 4398 char b[BDEVNAME_SIZE];
4259 int err; 4399 int err;
4260 unsigned int size;
4261 mdk_rdev_t *rdev; 4400 mdk_rdev_t *rdev;
4262 4401
4263 if (!mddev->pers) 4402 if (!mddev->pers)
@@ -4285,13 +4424,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
4285 } 4424 }
4286 4425
4287 if (mddev->persistent) 4426 if (mddev->persistent)
4288 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 4427 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4289 else 4428 else
4290 rdev->sb_offset = 4429 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4291 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
4292 4430
4293 size = calc_dev_size(rdev, mddev->chunk_size); 4431 rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
4294 rdev->size = size;
4295 4432
4296 if (test_bit(Faulty, &rdev->flags)) { 4433 if (test_bit(Faulty, &rdev->flags)) {
4297 printk(KERN_WARNING 4434 printk(KERN_WARNING
@@ -4476,24 +4613,24 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4476 return 0; 4613 return 0;
4477} 4614}
4478 4615
4479static int update_size(mddev_t *mddev, unsigned long size) 4616static int update_size(mddev_t *mddev, sector_t num_sectors)
4480{ 4617{
4481 mdk_rdev_t * rdev; 4618 mdk_rdev_t * rdev;
4482 int rv; 4619 int rv;
4483 struct list_head *tmp; 4620 struct list_head *tmp;
4484 int fit = (size == 0); 4621 int fit = (num_sectors == 0);
4485 4622
4486 if (mddev->pers->resize == NULL) 4623 if (mddev->pers->resize == NULL)
4487 return -EINVAL; 4624 return -EINVAL;
4488 /* The "size" is the amount of each device that is used. 4625 /* The "num_sectors" is the number of sectors of each device that
4489 * This can only make sense for arrays with redundancy. 4626 * is used. This can only make sense for arrays with redundancy.
4490 * linear and raid0 always use whatever space is available 4627 * linear and raid0 always use whatever space is available. We can only
4491 * We can only consider changing the size if no resync 4628 * consider changing this number if no resync or reconstruction is
4492 * or reconstruction is happening, and if the new size 4629 * happening, and if the new size is acceptable. It must fit before the
4493 * is acceptable. It must fit before the sb_offset or, 4630 * sb_start or, if that is <data_offset, it must fit before the size
4494 * if that is <data_offset, it must fit before the 4631 * of each device. If num_sectors is zero, we find the largest size
4495 * size of each device. 4632 * that fits.
4496 * If size is zero, we find the largest size that fits. 4633
4497 */ 4634 */
4498 if (mddev->sync_thread) 4635 if (mddev->sync_thread)
4499 return -EBUSY; 4636 return -EBUSY;
@@ -4501,19 +4638,20 @@ static int update_size(mddev_t *mddev, unsigned long size)
4501 sector_t avail; 4638 sector_t avail;
4502 avail = rdev->size * 2; 4639 avail = rdev->size * 2;
4503 4640
4504 if (fit && (size == 0 || size > avail/2)) 4641 if (fit && (num_sectors == 0 || num_sectors > avail))
4505 size = avail/2; 4642 num_sectors = avail;
4506 if (avail < ((sector_t)size << 1)) 4643 if (avail < num_sectors)
4507 return -ENOSPC; 4644 return -ENOSPC;
4508 } 4645 }
4509 rv = mddev->pers->resize(mddev, (sector_t)size *2); 4646 rv = mddev->pers->resize(mddev, num_sectors);
4510 if (!rv) { 4647 if (!rv) {
4511 struct block_device *bdev; 4648 struct block_device *bdev;
4512 4649
4513 bdev = bdget_disk(mddev->gendisk, 0); 4650 bdev = bdget_disk(mddev->gendisk, 0);
4514 if (bdev) { 4651 if (bdev) {
4515 mutex_lock(&bdev->bd_inode->i_mutex); 4652 mutex_lock(&bdev->bd_inode->i_mutex);
4516 i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10); 4653 i_size_write(bdev->bd_inode,
4654 (loff_t)mddev->array_sectors << 9);
4517 mutex_unlock(&bdev->bd_inode->i_mutex); 4655 mutex_unlock(&bdev->bd_inode->i_mutex);
4518 bdput(bdev); 4656 bdput(bdev);
4519 } 4657 }
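
The interface change above is a units change: update_size() now takes a count of 512-byte sectors instead of a size in KiB, while rdev->size and the ioctl's info->size stay in KiB. A condensed restatement of the fitting logic from this hunk, with the conversions made explicit (illustrative only, not compilable on its own):

	sector_t num_sectors = (sector_t)info->size * 2;  /* ioctl KiB -> sectors */
	int fit = (num_sectors == 0);                     /* 0 means "use all available space" */

	rdev_for_each(rdev, tmp, mddev) {
		sector_t avail = rdev->size * 2;          /* rdev->size is still KiB */

		if (fit && (num_sectors == 0 || num_sectors > avail))
			num_sectors = avail;              /* shrink to the smallest device */
		if (avail < num_sectors)
			return -ENOSPC;
	}
	rv = mddev->pers->resize(mddev, num_sectors);     /* personalities now take sectors */
	i_size_write(bdev->bd_inode,
		     (loff_t)mddev->array_sectors << 9);  /* sectors -> bytes */
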
@@ -4588,7 +4726,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4588 return mddev->pers->reconfig(mddev, info->layout, -1); 4726 return mddev->pers->reconfig(mddev, info->layout, -1);
4589 } 4727 }
4590 if (info->size >= 0 && mddev->size != info->size) 4728 if (info->size >= 0 && mddev->size != info->size)
4591 rv = update_size(mddev, info->size); 4729 rv = update_size(mddev, (sector_t)info->size * 2);
4592 4730
4593 if (mddev->raid_disks != info->raid_disks) 4731 if (mddev->raid_disks != info->raid_disks)
4594 rv = update_raid_disks(mddev, info->raid_disks); 4732 rv = update_raid_disks(mddev, info->raid_disks);
@@ -4641,6 +4779,12 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
4641 return 0; 4779 return 0;
4642} 4780}
4643 4781
4782/*
4783 * We have a problem here : there is no easy way to give a CHS
4784 * virtual geometry. We currently pretend that we have a 2 heads
4785 * 4 sectors (with a BIG number of cylinders...). This drives
4786 * dosfs just mad... ;-)
4787 */
4644static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 4788static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4645{ 4789{
4646 mddev_t *mddev = bdev->bd_disk->private_data; 4790 mddev_t *mddev = bdev->bd_disk->private_data;
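
The relocated comment now sits directly above md_getgeo(), whose body is not part of this hunk. For context, the fake geometry it describes boils down to 2 heads and 4 sectors per track, so the cylinder count is simply the capacity divided by 8; a sketch along the lines of the mainline helper, shown here purely for illustration:

	static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
	{
		mddev_t *mddev = bdev->bd_disk->private_data;

		geo->heads = 2;
		geo->sectors = 4;
		geo->cylinders = get_capacity(mddev->gendisk) / 8;  /* 2 * 4 sectors per cylinder */
		return 0;
	}
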
@@ -4785,19 +4929,13 @@ static int md_ioctl(struct inode *inode, struct file *file,
4785 goto done_unlock; 4929 goto done_unlock;
4786 4930
4787 case STOP_ARRAY: 4931 case STOP_ARRAY:
4788 err = do_md_stop (mddev, 0); 4932 err = do_md_stop (mddev, 0, 1);
4789 goto done_unlock; 4933 goto done_unlock;
4790 4934
4791 case STOP_ARRAY_RO: 4935 case STOP_ARRAY_RO:
4792 err = do_md_stop (mddev, 1); 4936 err = do_md_stop (mddev, 1, 1);
4793 goto done_unlock; 4937 goto done_unlock;
4794 4938
4795 /*
4796 * We have a problem here : there is no easy way to give a CHS
4797 * virtual geometry. We currently pretend that we have a 2 heads
4798 * 4 sectors (with a BIG number of cylinders...). This drives
4799 * dosfs just mad... ;-)
4800 */
4801 } 4939 }
4802 4940
4803 /* 4941 /*
@@ -4807,13 +4945,12 @@ static int md_ioctl(struct inode *inode, struct file *file,
4807 * here and hit the 'default' below, so only disallow 4945 * here and hit the 'default' below, so only disallow
4808 * 'md' ioctls, and switch to rw mode if started auto-readonly. 4946 * 'md' ioctls, and switch to rw mode if started auto-readonly.
4809 */ 4947 */
4810 if (_IOC_TYPE(cmd) == MD_MAJOR && 4948 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
4811 mddev->ro && mddev->pers) {
4812 if (mddev->ro == 2) { 4949 if (mddev->ro == 2) {
4813 mddev->ro = 0; 4950 mddev->ro = 0;
4814 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4951 sysfs_notify(&mddev->kobj, NULL, "array_state");
4815 md_wakeup_thread(mddev->thread); 4952 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4816 4953 md_wakeup_thread(mddev->thread);
4817 } else { 4954 } else {
4818 err = -EROFS; 4955 err = -EROFS;
4819 goto abort_unlock; 4956 goto abort_unlock;
@@ -4883,6 +5020,7 @@ static int md_open(struct inode *inode, struct file *file)
4883 5020
4884 err = 0; 5021 err = 0;
4885 mddev_get(mddev); 5022 mddev_get(mddev);
5023 atomic_inc(&mddev->openers);
4886 mddev_unlock(mddev); 5024 mddev_unlock(mddev);
4887 5025
4888 check_disk_change(inode->i_bdev); 5026 check_disk_change(inode->i_bdev);
@@ -4895,6 +5033,7 @@ static int md_release(struct inode *inode, struct file * file)
4895 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 5033 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4896 5034
4897 BUG_ON(!mddev); 5035 BUG_ON(!mddev);
5036 atomic_dec(&mddev->openers);
4898 mddev_put(mddev); 5037 mddev_put(mddev);
4899 5038
4900 return 0; 5039 return 0;
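
md_open() and md_release() now keep an openers count alongside the existing refcount, and do_md_stop() has grown a third argument (0 from the autorun and reboot paths, 1 from the STOP_ARRAY ioctls above). The body of do_md_stop() is not in this diff; presumably the argument tells it how many opens the caller itself holds, so a stop can be refused while anyone else still has the device open. A hypothetical sketch of that check:

	static int do_md_stop(mddev_t *mddev, int mode, int is_open)
	{
		/* Hypothetical: refuse to stop while other openers remain. */
		if (atomic_read(&mddev->openers) > is_open)
			return -EBUSY;
		/* ... existing stop/readonly logic ... */
		return 0;
	}
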
@@ -5029,6 +5168,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5029 if (!mddev->pers->error_handler) 5168 if (!mddev->pers->error_handler)
5030 return; 5169 return;
5031 mddev->pers->error_handler(mddev,rdev); 5170 mddev->pers->error_handler(mddev,rdev);
5171 if (mddev->degraded)
5172 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5173 set_bit(StateChanged, &rdev->flags);
5032 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5174 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5033 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5175 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5034 md_wakeup_thread(mddev->thread); 5176 md_wakeup_thread(mddev->thread);
@@ -5258,10 +5400,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
5258 if (!list_empty(&mddev->disks)) { 5400 if (!list_empty(&mddev->disks)) {
5259 if (mddev->pers) 5401 if (mddev->pers)
5260 seq_printf(seq, "\n %llu blocks", 5402 seq_printf(seq, "\n %llu blocks",
5261 (unsigned long long)mddev->array_size); 5403 (unsigned long long)
5404 mddev->array_sectors / 2);
5262 else 5405 else
5263 seq_printf(seq, "\n %llu blocks", 5406 seq_printf(seq, "\n %llu blocks",
5264 (unsigned long long)size); 5407 (unsigned long long)size);
5265 } 5408 }
5266 if (mddev->persistent) { 5409 if (mddev->persistent) {
5267 if (mddev->major_version != 0 || 5410 if (mddev->major_version != 0 ||
@@ -5391,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p)
5391static int is_mddev_idle(mddev_t *mddev) 5534static int is_mddev_idle(mddev_t *mddev)
5392{ 5535{
5393 mdk_rdev_t * rdev; 5536 mdk_rdev_t * rdev;
5394 struct list_head *tmp;
5395 int idle; 5537 int idle;
5396 long curr_events; 5538 long curr_events;
5397 5539
5398 idle = 1; 5540 idle = 1;
5399 rdev_for_each(rdev, tmp, mddev) { 5541 rcu_read_lock();
5542 rdev_for_each_rcu(rdev, mddev) {
5400 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 5543 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
5401 curr_events = disk_stat_read(disk, sectors[0]) + 5544 curr_events = disk_stat_read(disk, sectors[0]) +
5402 disk_stat_read(disk, sectors[1]) - 5545 disk_stat_read(disk, sectors[1]) -
@@ -5428,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev)
5428 idle = 0; 5571 idle = 0;
5429 } 5572 }
5430 } 5573 }
5574 rcu_read_unlock();
5431 return idle; 5575 return idle;
5432} 5576}
5433 5577
@@ -5451,6 +5595,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
5451 */ 5595 */
5452void md_write_start(mddev_t *mddev, struct bio *bi) 5596void md_write_start(mddev_t *mddev, struct bio *bi)
5453{ 5597{
5598 int did_change = 0;
5454 if (bio_data_dir(bi) != WRITE) 5599 if (bio_data_dir(bi) != WRITE)
5455 return; 5600 return;
5456 5601
@@ -5461,6 +5606,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
5461 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5606 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5462 md_wakeup_thread(mddev->thread); 5607 md_wakeup_thread(mddev->thread);
5463 md_wakeup_thread(mddev->sync_thread); 5608 md_wakeup_thread(mddev->sync_thread);
5609 did_change = 1;
5464 } 5610 }
5465 atomic_inc(&mddev->writes_pending); 5611 atomic_inc(&mddev->writes_pending);
5466 if (mddev->safemode == 1) 5612 if (mddev->safemode == 1)
@@ -5471,10 +5617,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
5471 mddev->in_sync = 0; 5617 mddev->in_sync = 0;
5472 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5618 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5473 md_wakeup_thread(mddev->thread); 5619 md_wakeup_thread(mddev->thread);
5620 did_change = 1;
5474 } 5621 }
5475 spin_unlock_irq(&mddev->write_lock); 5622 spin_unlock_irq(&mddev->write_lock);
5476 sysfs_notify(&mddev->kobj, NULL, "array_state");
5477 } 5623 }
5624 if (did_change)
5625 sysfs_notify(&mddev->kobj, NULL, "array_state");
5478 wait_event(mddev->sb_wait, 5626 wait_event(mddev->sb_wait,
5479 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && 5627 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
5480 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 5628 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
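
The did_change dance above moves the sysfs notification out of the write_lock critical section, presumably because sysfs_notify() may sleep and so cannot be called under a spinlock with interrupts disabled. The resulting pattern, condensed from this hunk:

	int did_change = 0;

	spin_lock_irq(&mddev->write_lock);
	if (mddev->in_sync) {
		mddev->in_sync = 0;
		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
		did_change = 1;
	}
	spin_unlock_irq(&mddev->write_lock);
	if (did_change)		/* notify only after dropping the spinlock */
		sysfs_notify(&mddev->kobj, NULL, "array_state");
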
@@ -5495,13 +5643,18 @@ void md_write_end(mddev_t *mddev)
5495 * may proceed without blocking. It is important to call this before 5643 * may proceed without blocking. It is important to call this before
5496 * attempting a GFP_KERNEL allocation while holding the mddev lock. 5644 * attempting a GFP_KERNEL allocation while holding the mddev lock.
5497 * Must be called with mddev_lock held. 5645 * Must be called with mddev_lock held.
5646 *
5647 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
5648 * is dropped, so return -EAGAIN after notifying userspace.
5498 */ 5649 */
5499void md_allow_write(mddev_t *mddev) 5650int md_allow_write(mddev_t *mddev)
5500{ 5651{
5501 if (!mddev->pers) 5652 if (!mddev->pers)
5502 return; 5653 return 0;
5503 if (mddev->ro) 5654 if (mddev->ro)
5504 return; 5655 return 0;
5656 if (!mddev->pers->sync_request)
5657 return 0;
5505 5658
5506 spin_lock_irq(&mddev->write_lock); 5659 spin_lock_irq(&mddev->write_lock);
5507 if (mddev->in_sync) { 5660 if (mddev->in_sync) {
@@ -5512,14 +5665,14 @@ void md_allow_write(mddev_t *mddev)
5512 mddev->safemode = 1; 5665 mddev->safemode = 1;
5513 spin_unlock_irq(&mddev->write_lock); 5666 spin_unlock_irq(&mddev->write_lock);
5514 md_update_sb(mddev, 0); 5667 md_update_sb(mddev, 0);
5515
5516 sysfs_notify(&mddev->kobj, NULL, "array_state"); 5668 sysfs_notify(&mddev->kobj, NULL, "array_state");
5517 /* wait for the dirty state to be recorded in the metadata */
5518 wait_event(mddev->sb_wait,
5519 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
5520 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
5521 } else 5669 } else
5522 spin_unlock_irq(&mddev->write_lock); 5670 spin_unlock_irq(&mddev->write_lock);
5671
5672 if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
5673 return -EAGAIN;
5674 else
5675 return 0;
5523} 5676}
5524EXPORT_SYMBOL_GPL(md_allow_write); 5677EXPORT_SYMBOL_GPL(md_allow_write);
5525 5678
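
With this change md_allow_write() reports, instead of waiting out, the case where the array could not yet be marked dirty (external metadata). Callers that need a safe GFP_KERNEL allocation under mddev_lock either propagate the error, as raid1_reshape() and resize_stripes() do further down, or fall back to GFP_NOIO as get_bitmap_file() does above. The propagate-the-error pattern, lifted from the raid1 hunk:

	err = md_allow_write(mddev);	/* may return -EAGAIN for external metadata */
	if (err)
		return err;
	/* From here on, GFP_KERNEL allocations under mddev_lock won't deadlock
	 * waiting for the superblock to be marked dirty. */
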
@@ -5625,9 +5778,11 @@ void md_do_sync(mddev_t *mddev)
5625 max_sectors = mddev->resync_max_sectors; 5778 max_sectors = mddev->resync_max_sectors;
5626 mddev->resync_mismatches = 0; 5779 mddev->resync_mismatches = 0;
5627 /* we don't use the checkpoint if there's a bitmap */ 5780 /* we don't use the checkpoint if there's a bitmap */
5628 if (!mddev->bitmap && 5781 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5629 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5782 j = mddev->resync_min;
5783 else if (!mddev->bitmap)
5630 j = mddev->recovery_cp; 5784 j = mddev->recovery_cp;
5785
5631 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5786 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5632 max_sectors = mddev->size << 1; 5787 max_sectors = mddev->size << 1;
5633 else { 5788 else {
@@ -5796,6 +5951,7 @@ void md_do_sync(mddev_t *mddev)
5796 5951
5797 skip: 5952 skip:
5798 mddev->curr_resync = 0; 5953 mddev->curr_resync = 0;
5954 mddev->resync_min = 0;
5799 mddev->resync_max = MaxSector; 5955 mddev->resync_max = MaxSector;
5800 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5956 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5801 wake_up(&resync_wait); 5957 wake_up(&resync_wait);
@@ -5845,7 +6001,8 @@ static int remove_and_add_spares(mddev_t *mddev)
5845 if (rdev->raid_disk < 0 6001 if (rdev->raid_disk < 0
5846 && !test_bit(Faulty, &rdev->flags)) { 6002 && !test_bit(Faulty, &rdev->flags)) {
5847 rdev->recovery_offset = 0; 6003 rdev->recovery_offset = 0;
5848 if (mddev->pers->hot_add_disk(mddev,rdev)) { 6004 if (mddev->pers->
6005 hot_add_disk(mddev, rdev) == 0) {
5849 char nm[20]; 6006 char nm[20];
5850 sprintf(nm, "rd%d", rdev->raid_disk); 6007 sprintf(nm, "rd%d", rdev->raid_disk);
5851 if (sysfs_create_link(&mddev->kobj, 6008 if (sysfs_create_link(&mddev->kobj,
@@ -5920,23 +6077,31 @@ void md_check_recovery(mddev_t *mddev)
5920 int spares = 0; 6077 int spares = 0;
5921 6078
5922 if (!mddev->external) { 6079 if (!mddev->external) {
6080 int did_change = 0;
5923 spin_lock_irq(&mddev->write_lock); 6081 spin_lock_irq(&mddev->write_lock);
5924 if (mddev->safemode && 6082 if (mddev->safemode &&
5925 !atomic_read(&mddev->writes_pending) && 6083 !atomic_read(&mddev->writes_pending) &&
5926 !mddev->in_sync && 6084 !mddev->in_sync &&
5927 mddev->recovery_cp == MaxSector) { 6085 mddev->recovery_cp == MaxSector) {
5928 mddev->in_sync = 1; 6086 mddev->in_sync = 1;
6087 did_change = 1;
5929 if (mddev->persistent) 6088 if (mddev->persistent)
5930 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6089 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5931 } 6090 }
5932 if (mddev->safemode == 1) 6091 if (mddev->safemode == 1)
5933 mddev->safemode = 0; 6092 mddev->safemode = 0;
5934 spin_unlock_irq(&mddev->write_lock); 6093 spin_unlock_irq(&mddev->write_lock);
6094 if (did_change)
6095 sysfs_notify(&mddev->kobj, NULL, "array_state");
5935 } 6096 }
5936 6097
5937 if (mddev->flags) 6098 if (mddev->flags)
5938 md_update_sb(mddev, 0); 6099 md_update_sb(mddev, 0);
5939 6100
6101 rdev_for_each(rdev, rtmp, mddev)
6102 if (test_and_clear_bit(StateChanged, &rdev->flags))
6103 sysfs_notify(&rdev->kobj, NULL, "state");
6104
5940 6105
5941 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 6106 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
5942 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 6107 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
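
The new StateChanged flag connects this hunk with the md_error() hunk above: md_error() only marks the rdev and wakes the raid thread, presumably because it can be called from contexts where issuing sysfs_notify() directly is not safe, and md_check_recovery() then emits the deferred notification here. In condensed form:

	/* md_error(): runs in whatever context noticed the failure */
	set_bit(StateChanged, &rdev->flags);
	md_wakeup_thread(mddev->thread);

	/* md_check_recovery(): runs later in the raid thread, with mddev locked */
	rdev_for_each(rdev, rtmp, mddev)
		if (test_and_clear_bit(StateChanged, &rdev->flags))
			sysfs_notify(&rdev->kobj, NULL, "state");
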
@@ -5951,7 +6116,9 @@ void md_check_recovery(mddev_t *mddev)
5951 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6116 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5952 /* success...*/ 6117 /* success...*/
5953 /* activate any spares */ 6118 /* activate any spares */
5954 mddev->pers->spare_active(mddev); 6119 if (mddev->pers->spare_active(mddev))
6120 sysfs_notify(&mddev->kobj, NULL,
6121 "degraded");
5955 } 6122 }
5956 md_update_sb(mddev, 1); 6123 md_update_sb(mddev, 1);
5957 6124
@@ -5965,13 +6132,18 @@ void md_check_recovery(mddev_t *mddev)
5965 mddev->recovery = 0; 6132 mddev->recovery = 0;
5966 /* flag recovery needed just to double check */ 6133 /* flag recovery needed just to double check */
5967 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6134 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6135 sysfs_notify(&mddev->kobj, NULL, "sync_action");
5968 md_new_event(mddev); 6136 md_new_event(mddev);
5969 goto unlock; 6137 goto unlock;
5970 } 6138 }
6139 /* Set RUNNING before clearing NEEDED to avoid
6140 * any transients in the value of "sync_action".
6141 */
6142 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6143 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5971 /* Clear some bits that don't mean anything, but 6144 /* Clear some bits that don't mean anything, but
5972 * might be left set 6145 * might be left set
5973 */ 6146 */
5974 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5975 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 6147 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5976 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 6148 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5977 6149
@@ -5989,17 +6161,19 @@ void md_check_recovery(mddev_t *mddev)
5989 /* Cannot proceed */ 6161 /* Cannot proceed */
5990 goto unlock; 6162 goto unlock;
5991 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 6163 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6164 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5992 } else if ((spares = remove_and_add_spares(mddev))) { 6165 } else if ((spares = remove_and_add_spares(mddev))) {
5993 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6166 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5994 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 6167 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6168 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5995 } else if (mddev->recovery_cp < MaxSector) { 6169 } else if (mddev->recovery_cp < MaxSector) {
5996 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6170 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6171 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5997 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 6172 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5998 /* nothing to be done ... */ 6173 /* nothing to be done ... */
5999 goto unlock; 6174 goto unlock;
6000 6175
6001 if (mddev->pers->sync_request) { 6176 if (mddev->pers->sync_request) {
6002 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6003 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 6177 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
6004 /* We are adding a device or devices to an array 6178 /* We are adding a device or devices to an array
6005 * which has the bitmap stored on all devices. 6179 * which has the bitmap stored on all devices.
@@ -6018,9 +6192,16 @@ void md_check_recovery(mddev_t *mddev)
6018 mddev->recovery = 0; 6192 mddev->recovery = 0;
6019 } else 6193 } else
6020 md_wakeup_thread(mddev->sync_thread); 6194 md_wakeup_thread(mddev->sync_thread);
6195 sysfs_notify(&mddev->kobj, NULL, "sync_action");
6021 md_new_event(mddev); 6196 md_new_event(mddev);
6022 } 6197 }
6023 unlock: 6198 unlock:
6199 if (!mddev->sync_thread) {
6200 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6201 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
6202 &mddev->recovery))
6203 sysfs_notify(&mddev->kobj, NULL, "sync_action");
6204 }
6024 mddev_unlock(mddev); 6205 mddev_unlock(mddev);
6025 } 6206 }
6026} 6207}
@@ -6047,7 +6228,7 @@ static int md_notify_reboot(struct notifier_block *this,
6047 6228
6048 for_each_mddev(mddev, tmp) 6229 for_each_mddev(mddev, tmp)
6049 if (mddev_trylock(mddev)) { 6230 if (mddev_trylock(mddev)) {
6050 do_md_stop (mddev, 1); 6231 do_md_stop (mddev, 1, 0);
6051 mddev_unlock(mddev); 6232 mddev_unlock(mddev);
6052 } 6233 }
6053 /* 6234 /*
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index e968116e0de9..c4779ccba1c3 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -281,13 +281,18 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
281{ 281{
282 multipath_conf_t *conf = mddev->private; 282 multipath_conf_t *conf = mddev->private;
283 struct request_queue *q; 283 struct request_queue *q;
284 int found = 0; 284 int err = -EEXIST;
285 int path; 285 int path;
286 struct multipath_info *p; 286 struct multipath_info *p;
287 int first = 0;
288 int last = mddev->raid_disks - 1;
289
290 if (rdev->raid_disk >= 0)
291 first = last = rdev->raid_disk;
287 292
288 print_multipath_conf(conf); 293 print_multipath_conf(conf);
289 294
290 for (path=0; path<mddev->raid_disks; path++) 295 for (path = first; path <= last; path++)
291 if ((p=conf->multipaths+path)->rdev == NULL) { 296 if ((p=conf->multipaths+path)->rdev == NULL) {
292 q = rdev->bdev->bd_disk->queue; 297 q = rdev->bdev->bd_disk->queue;
293 blk_queue_stack_limits(mddev->queue, q); 298 blk_queue_stack_limits(mddev->queue, q);
@@ -307,11 +312,13 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
307 rdev->raid_disk = path; 312 rdev->raid_disk = path;
308 set_bit(In_sync, &rdev->flags); 313 set_bit(In_sync, &rdev->flags);
309 rcu_assign_pointer(p->rdev, rdev); 314 rcu_assign_pointer(p->rdev, rdev);
310 found = 1; 315 err = 0;
316 break;
311 } 317 }
312 318
313 print_multipath_conf(conf); 319 print_multipath_conf(conf);
314 return found; 320
321 return err;
315} 322}
316 323
317static int multipath_remove_disk(mddev_t *mddev, int number) 324static int multipath_remove_disk(mddev_t *mddev, int number)
@@ -497,7 +504,7 @@ static int multipath_run (mddev_t *mddev)
497 /* 504 /*
498 * Ok, everything is just fine now 505 * Ok, everything is just fine now
499 */ 506 */
500 mddev->array_size = mddev->size; 507 mddev->array_sectors = mddev->size * 2;
501 508
502 mddev->queue->unplug_fn = multipath_unplug; 509 mddev->queue->unplug_fn = multipath_unplug;
503 mddev->queue->backing_dev_info.congested_fn = multipath_congested; 510 mddev->queue->backing_dev_info.congested_fn = multipath_congested;
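
The same conversion is repeated for raid1 and raid10 below: ->hot_add_disk() used to return a boolean "found" and now returns 0 or a negative errno, optionally honouring rdev->raid_disk as a required slot. The matching caller change already appears in the remove_and_add_spares() hunk above, where success is now tested with "== 0":

	rdev->recovery_offset = 0;
	if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
		/* ... create the "rd%d" sysfs link and start recovery ... */
	}
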
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index bcbb82594a19..183610635661 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -295,16 +295,16 @@ static int raid0_run (mddev_t *mddev)
295 goto out_free_conf; 295 goto out_free_conf;
296 296
297 /* calculate array device size */ 297 /* calculate array device size */
298 mddev->array_size = 0; 298 mddev->array_sectors = 0;
299 rdev_for_each(rdev, tmp, mddev) 299 rdev_for_each(rdev, tmp, mddev)
300 mddev->array_size += rdev->size; 300 mddev->array_sectors += rdev->size * 2;
301 301
302 printk("raid0 : md_size is %llu blocks.\n", 302 printk("raid0 : md_size is %llu blocks.\n",
303 (unsigned long long)mddev->array_size); 303 (unsigned long long)mddev->array_sectors / 2);
304 printk("raid0 : conf->hash_spacing is %llu blocks.\n", 304 printk("raid0 : conf->hash_spacing is %llu blocks.\n",
305 (unsigned long long)conf->hash_spacing); 305 (unsigned long long)conf->hash_spacing);
306 { 306 {
307 sector_t s = mddev->array_size; 307 sector_t s = mddev->array_sectors / 2;
308 sector_t space = conf->hash_spacing; 308 sector_t space = conf->hash_spacing;
309 int round; 309 int round;
310 conf->preshift = 0; 310 conf->preshift = 0;
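
mddev->array_size, which was kept in KiB, is replaced by mddev->array_sectors in 512-byte sectors across all personalities, so the factor of two moves into each assignment and anything still reported in 1K blocks divides by two, as this file's printk shows:

	mddev->array_sectors += rdev->size * 2;		/* rdev->size is KiB */
	set_capacity(mddev->gendisk, mddev->array_sectors);	/* set_capacity() wants sectors */
	printk("raid0 : md_size is %llu blocks.\n",
	       (unsigned long long)mddev->array_sectors / 2);	/* back to 1K blocks */
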
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c610b947218a..03a5ab705c20 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1100,11 +1100,16 @@ static int raid1_spare_active(mddev_t *mddev)
1100static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 1100static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1101{ 1101{
1102 conf_t *conf = mddev->private; 1102 conf_t *conf = mddev->private;
1103 int found = 0; 1103 int err = -EEXIST;
1104 int mirror = 0; 1104 int mirror = 0;
1105 mirror_info_t *p; 1105 mirror_info_t *p;
1106 int first = 0;
1107 int last = mddev->raid_disks - 1;
1106 1108
1107 for (mirror=0; mirror < mddev->raid_disks; mirror++) 1109 if (rdev->raid_disk >= 0)
1110 first = last = rdev->raid_disk;
1111
1112 for (mirror = first; mirror <= last; mirror++)
1108 if ( !(p=conf->mirrors+mirror)->rdev) { 1113 if ( !(p=conf->mirrors+mirror)->rdev) {
1109 1114
1110 blk_queue_stack_limits(mddev->queue, 1115 blk_queue_stack_limits(mddev->queue,
@@ -1119,7 +1124,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1119 1124
1120 p->head_position = 0; 1125 p->head_position = 0;
1121 rdev->raid_disk = mirror; 1126 rdev->raid_disk = mirror;
1122 found = 1; 1127 err = 0;
1123 /* As all devices are equivalent, we don't need a full recovery 1128 /* As all devices are equivalent, we don't need a full recovery
1124 * if this was recently any drive of the array 1129 * if this was recently any drive of the array
1125 */ 1130 */
@@ -1130,7 +1135,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1130 } 1135 }
1131 1136
1132 print_conf(conf); 1137 print_conf(conf);
1133 return found; 1138 return err;
1134} 1139}
1135 1140
1136static int raid1_remove_disk(mddev_t *mddev, int number) 1141static int raid1_remove_disk(mddev_t *mddev, int number)
@@ -2038,7 +2043,7 @@ static int run(mddev_t *mddev)
2038 /* 2043 /*
2039 * Ok, everything is just fine now 2044 * Ok, everything is just fine now
2040 */ 2045 */
2041 mddev->array_size = mddev->size; 2046 mddev->array_sectors = mddev->size * 2;
2042 2047
2043 mddev->queue->unplug_fn = raid1_unplug; 2048 mddev->queue->unplug_fn = raid1_unplug;
2044 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2049 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
@@ -2100,14 +2105,15 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
2100 * any io in the removed space completes, but it hardly seems 2105 * any io in the removed space completes, but it hardly seems
2101 * worth it. 2106 * worth it.
2102 */ 2107 */
2103 mddev->array_size = sectors>>1; 2108 mddev->array_sectors = sectors;
2104 set_capacity(mddev->gendisk, mddev->array_size << 1); 2109 set_capacity(mddev->gendisk, mddev->array_sectors);
2105 mddev->changed = 1; 2110 mddev->changed = 1;
2106 if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) { 2111 if (mddev->array_sectors / 2 > mddev->size &&
2112 mddev->recovery_cp == MaxSector) {
2107 mddev->recovery_cp = mddev->size << 1; 2113 mddev->recovery_cp = mddev->size << 1;
2108 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2114 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2109 } 2115 }
2110 mddev->size = mddev->array_size; 2116 mddev->size = mddev->array_sectors / 2;
2111 mddev->resync_max_sectors = sectors; 2117 mddev->resync_max_sectors = sectors;
2112 return 0; 2118 return 0;
2113} 2119}
@@ -2131,7 +2137,7 @@ static int raid1_reshape(mddev_t *mddev)
2131 conf_t *conf = mddev_to_conf(mddev); 2137 conf_t *conf = mddev_to_conf(mddev);
2132 int cnt, raid_disks; 2138 int cnt, raid_disks;
2133 unsigned long flags; 2139 unsigned long flags;
2134 int d, d2; 2140 int d, d2, err;
2135 2141
2136 /* Cannot change chunk_size, layout, or level */ 2142 /* Cannot change chunk_size, layout, or level */
2137 if (mddev->chunk_size != mddev->new_chunk || 2143 if (mddev->chunk_size != mddev->new_chunk ||
@@ -2143,7 +2149,9 @@ static int raid1_reshape(mddev_t *mddev)
2143 return -EINVAL; 2149 return -EINVAL;
2144 } 2150 }
2145 2151
2146 md_allow_write(mddev); 2152 err = md_allow_write(mddev);
2153 if (err)
2154 return err;
2147 2155
2148 raid_disks = mddev->raid_disks + mddev->delta_disks; 2156 raid_disks = mddev->raid_disks + mddev->delta_disks;
2149 2157
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 22bb2b1b886d..159535d73567 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1114,24 +1114,30 @@ static int raid10_spare_active(mddev_t *mddev)
1114static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 1114static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1115{ 1115{
1116 conf_t *conf = mddev->private; 1116 conf_t *conf = mddev->private;
1117 int found = 0; 1117 int err = -EEXIST;
1118 int mirror; 1118 int mirror;
1119 mirror_info_t *p; 1119 mirror_info_t *p;
1120 int first = 0;
1121 int last = mddev->raid_disks - 1;
1120 1122
1121 if (mddev->recovery_cp < MaxSector) 1123 if (mddev->recovery_cp < MaxSector)
1122 /* only hot-add to in-sync arrays, as recovery is 1124 /* only hot-add to in-sync arrays, as recovery is
1123 * very different from resync 1125 * very different from resync
1124 */ 1126 */
1125 return 0; 1127 return -EBUSY;
1126 if (!enough(conf)) 1128 if (!enough(conf))
1127 return 0; 1129 return -EINVAL;
1130
1131 if (rdev->raid_disk)
1132 first = last = rdev->raid_disk;
1128 1133
1129 if (rdev->saved_raid_disk >= 0 && 1134 if (rdev->saved_raid_disk >= 0 &&
1135 rdev->saved_raid_disk >= first &&
1130 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1136 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1131 mirror = rdev->saved_raid_disk; 1137 mirror = rdev->saved_raid_disk;
1132 else 1138 else
1133 mirror = 0; 1139 mirror = first;
1134 for ( ; mirror < mddev->raid_disks; mirror++) 1140 for ( ; mirror <= last ; mirror++)
1135 if ( !(p=conf->mirrors+mirror)->rdev) { 1141 if ( !(p=conf->mirrors+mirror)->rdev) {
1136 1142
1137 blk_queue_stack_limits(mddev->queue, 1143 blk_queue_stack_limits(mddev->queue,
@@ -1146,7 +1152,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1146 1152
1147 p->head_position = 0; 1153 p->head_position = 0;
1148 rdev->raid_disk = mirror; 1154 rdev->raid_disk = mirror;
1149 found = 1; 1155 err = 0;
1150 if (rdev->saved_raid_disk != mirror) 1156 if (rdev->saved_raid_disk != mirror)
1151 conf->fullsync = 1; 1157 conf->fullsync = 1;
1152 rcu_assign_pointer(p->rdev, rdev); 1158 rcu_assign_pointer(p->rdev, rdev);
@@ -1154,7 +1160,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1154 } 1160 }
1155 1161
1156 print_conf(conf); 1162 print_conf(conf);
1157 return found; 1163 return err;
1158} 1164}
1159 1165
1160static int raid10_remove_disk(mddev_t *mddev, int number) 1166static int raid10_remove_disk(mddev_t *mddev, int number)
@@ -2159,7 +2165,7 @@ static int run(mddev_t *mddev)
2159 /* 2165 /*
2160 * Ok, everything is just fine now 2166 * Ok, everything is just fine now
2161 */ 2167 */
2162 mddev->array_size = size << (conf->chunk_shift-1); 2168 mddev->array_sectors = size << conf->chunk_shift;
2163 mddev->resync_max_sectors = size << conf->chunk_shift; 2169 mddev->resync_max_sectors = size << conf->chunk_shift;
2164 2170
2165 mddev->queue->unplug_fn = raid10_unplug; 2171 mddev->queue->unplug_fn = raid10_unplug;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9ce7154845c6..55e7c56045a0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -115,15 +115,20 @@ static void return_io(struct bio *return_bi)
115 return_bi = bi->bi_next; 115 return_bi = bi->bi_next;
116 bi->bi_next = NULL; 116 bi->bi_next = NULL;
117 bi->bi_size = 0; 117 bi->bi_size = 0;
118 bi->bi_end_io(bi, 118 bio_endio(bi, 0);
119 test_bit(BIO_UPTODATE, &bi->bi_flags)
120 ? 0 : -EIO);
121 bi = return_bi; 119 bi = return_bi;
122 } 120 }
123} 121}
124 122
125static void print_raid5_conf (raid5_conf_t *conf); 123static void print_raid5_conf (raid5_conf_t *conf);
126 124
125static int stripe_operations_active(struct stripe_head *sh)
126{
127 return sh->check_state || sh->reconstruct_state ||
128 test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
129 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
130}
131
127static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 132static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
128{ 133{
129 if (atomic_dec_and_test(&sh->count)) { 134 if (atomic_dec_and_test(&sh->count)) {
@@ -143,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
143 } 148 }
144 md_wakeup_thread(conf->mddev->thread); 149 md_wakeup_thread(conf->mddev->thread);
145 } else { 150 } else {
146 BUG_ON(sh->ops.pending); 151 BUG_ON(stripe_operations_active(sh));
147 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 152 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
148 atomic_dec(&conf->preread_active_stripes); 153 atomic_dec(&conf->preread_active_stripes);
149 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 154 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -245,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
245 250
246 BUG_ON(atomic_read(&sh->count) != 0); 251 BUG_ON(atomic_read(&sh->count) != 0);
247 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 252 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
248 BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); 253 BUG_ON(stripe_operations_active(sh));
249 254
250 CHECK_DEVLOCK(); 255 CHECK_DEVLOCK();
251 pr_debug("init_stripe called, stripe %llu\n", 256 pr_debug("init_stripe called, stripe %llu\n",
@@ -346,62 +351,18 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
346 return sh; 351 return sh;
347} 352}
348 353
349/* test_and_ack_op() ensures that we only dequeue an operation once */
350#define test_and_ack_op(op, pend) \
351do { \
352 if (test_bit(op, &sh->ops.pending) && \
353 !test_bit(op, &sh->ops.complete)) { \
354 if (test_and_set_bit(op, &sh->ops.ack)) \
355 clear_bit(op, &pend); \
356 else \
357 ack++; \
358 } else \
359 clear_bit(op, &pend); \
360} while (0)
361
362/* find new work to run, do not resubmit work that is already
363 * in flight
364 */
365static unsigned long get_stripe_work(struct stripe_head *sh)
366{
367 unsigned long pending;
368 int ack = 0;
369
370 pending = sh->ops.pending;
371
372 test_and_ack_op(STRIPE_OP_BIOFILL, pending);
373 test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending);
374 test_and_ack_op(STRIPE_OP_PREXOR, pending);
375 test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
376 test_and_ack_op(STRIPE_OP_POSTXOR, pending);
377 test_and_ack_op(STRIPE_OP_CHECK, pending);
378 if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
379 ack++;
380
381 sh->ops.count -= ack;
382 if (unlikely(sh->ops.count < 0)) {
383 printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
384 "ops.complete: %#lx\n", pending, sh->ops.pending,
385 sh->ops.ack, sh->ops.complete);
386 BUG();
387 }
388
389 return pending;
390}
391
392static void 354static void
393raid5_end_read_request(struct bio *bi, int error); 355raid5_end_read_request(struct bio *bi, int error);
394static void 356static void
395raid5_end_write_request(struct bio *bi, int error); 357raid5_end_write_request(struct bio *bi, int error);
396 358
397static void ops_run_io(struct stripe_head *sh) 359static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
398{ 360{
399 raid5_conf_t *conf = sh->raid_conf; 361 raid5_conf_t *conf = sh->raid_conf;
400 int i, disks = sh->disks; 362 int i, disks = sh->disks;
401 363
402 might_sleep(); 364 might_sleep();
403 365
404 set_bit(STRIPE_IO_STARTED, &sh->state);
405 for (i = disks; i--; ) { 366 for (i = disks; i--; ) {
406 int rw; 367 int rw;
407 struct bio *bi; 368 struct bio *bi;
@@ -430,11 +391,11 @@ static void ops_run_io(struct stripe_head *sh)
430 rcu_read_unlock(); 391 rcu_read_unlock();
431 392
432 if (rdev) { 393 if (rdev) {
433 if (test_bit(STRIPE_SYNCING, &sh->state) || 394 if (s->syncing || s->expanding || s->expanded)
434 test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
435 test_bit(STRIPE_EXPAND_READY, &sh->state))
436 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 395 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
437 396
397 set_bit(STRIPE_IO_STARTED, &sh->state);
398
438 bi->bi_bdev = rdev->bdev; 399 bi->bi_bdev = rdev->bdev;
439 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 400 pr_debug("%s: for %llu schedule op %ld on disc %d\n",
440 __func__, (unsigned long long)sh->sector, 401 __func__, (unsigned long long)sh->sector,
@@ -528,38 +489,34 @@ static void ops_complete_biofill(void *stripe_head_ref)
528 (unsigned long long)sh->sector); 489 (unsigned long long)sh->sector);
529 490
530 /* clear completed biofills */ 491 /* clear completed biofills */
492 spin_lock_irq(&conf->device_lock);
531 for (i = sh->disks; i--; ) { 493 for (i = sh->disks; i--; ) {
532 struct r5dev *dev = &sh->dev[i]; 494 struct r5dev *dev = &sh->dev[i];
533 495
534 /* acknowledge completion of a biofill operation */ 496 /* acknowledge completion of a biofill operation */
535 /* and check if we need to reply to a read request, 497 /* and check if we need to reply to a read request,
536 * new R5_Wantfill requests are held off until 498 * new R5_Wantfill requests are held off until
537 * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) 499 * !STRIPE_BIOFILL_RUN
538 */ 500 */
539 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 501 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
540 struct bio *rbi, *rbi2; 502 struct bio *rbi, *rbi2;
541 503
542 /* The access to dev->read is outside of the
543 * spin_lock_irq(&conf->device_lock), but is protected
544 * by the STRIPE_OP_BIOFILL pending bit
545 */
546 BUG_ON(!dev->read); 504 BUG_ON(!dev->read);
547 rbi = dev->read; 505 rbi = dev->read;
548 dev->read = NULL; 506 dev->read = NULL;
549 while (rbi && rbi->bi_sector < 507 while (rbi && rbi->bi_sector <
550 dev->sector + STRIPE_SECTORS) { 508 dev->sector + STRIPE_SECTORS) {
551 rbi2 = r5_next_bio(rbi, dev->sector); 509 rbi2 = r5_next_bio(rbi, dev->sector);
552 spin_lock_irq(&conf->device_lock);
553 if (--rbi->bi_phys_segments == 0) { 510 if (--rbi->bi_phys_segments == 0) {
554 rbi->bi_next = return_bi; 511 rbi->bi_next = return_bi;
555 return_bi = rbi; 512 return_bi = rbi;
556 } 513 }
557 spin_unlock_irq(&conf->device_lock);
558 rbi = rbi2; 514 rbi = rbi2;
559 } 515 }
560 } 516 }
561 } 517 }
562 set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); 518 spin_unlock_irq(&conf->device_lock);
519 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
563 520
564 return_io(return_bi); 521 return_io(return_bi);
565 522
@@ -610,13 +567,14 @@ static void ops_complete_compute5(void *stripe_head_ref)
610 set_bit(R5_UPTODATE, &tgt->flags); 567 set_bit(R5_UPTODATE, &tgt->flags);
611 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 568 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
612 clear_bit(R5_Wantcompute, &tgt->flags); 569 clear_bit(R5_Wantcompute, &tgt->flags);
613 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); 570 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
571 if (sh->check_state == check_state_compute_run)
572 sh->check_state = check_state_compute_result;
614 set_bit(STRIPE_HANDLE, &sh->state); 573 set_bit(STRIPE_HANDLE, &sh->state);
615 release_stripe(sh); 574 release_stripe(sh);
616} 575}
617 576
618static struct dma_async_tx_descriptor * 577static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
619ops_run_compute5(struct stripe_head *sh, unsigned long pending)
620{ 578{
621 /* kernel stack size limits the total number of disks */ 579 /* kernel stack size limits the total number of disks */
622 int disks = sh->disks; 580 int disks = sh->disks;
@@ -646,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending)
646 ASYNC_TX_XOR_ZERO_DST, NULL, 604 ASYNC_TX_XOR_ZERO_DST, NULL,
647 ops_complete_compute5, sh); 605 ops_complete_compute5, sh);
648 606
649 /* ack now if postxor is not set to be run */
650 if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
651 async_tx_ack(tx);
652
653 return tx; 607 return tx;
654} 608}
655 609
@@ -659,8 +613,6 @@ static void ops_complete_prexor(void *stripe_head_ref)
659 613
660 pr_debug("%s: stripe %llu\n", __func__, 614 pr_debug("%s: stripe %llu\n", __func__,
661 (unsigned long long)sh->sector); 615 (unsigned long long)sh->sector);
662
663 set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
664} 616}
665 617
666static struct dma_async_tx_descriptor * 618static struct dma_async_tx_descriptor *
@@ -680,7 +632,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
680 for (i = disks; i--; ) { 632 for (i = disks; i--; ) {
681 struct r5dev *dev = &sh->dev[i]; 633 struct r5dev *dev = &sh->dev[i];
682 /* Only process blocks that are known to be uptodate */ 634 /* Only process blocks that are known to be uptodate */
683 if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) 635 if (test_bit(R5_Wantdrain, &dev->flags))
684 xor_srcs[count++] = dev->page; 636 xor_srcs[count++] = dev->page;
685 } 637 }
686 638
@@ -692,16 +644,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
692} 644}
693 645
694static struct dma_async_tx_descriptor * 646static struct dma_async_tx_descriptor *
695ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, 647ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
696 unsigned long pending)
697{ 648{
698 int disks = sh->disks; 649 int disks = sh->disks;
699 int pd_idx = sh->pd_idx, i; 650 int i;
700
701 /* check if prexor is active which means only process blocks
702 * that are part of a read-modify-write (Wantprexor)
703 */
704 int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
705 651
706 pr_debug("%s: stripe %llu\n", __func__, 652 pr_debug("%s: stripe %llu\n", __func__,
707 (unsigned long long)sh->sector); 653 (unsigned long long)sh->sector);
@@ -709,20 +655,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
709 for (i = disks; i--; ) { 655 for (i = disks; i--; ) {
710 struct r5dev *dev = &sh->dev[i]; 656 struct r5dev *dev = &sh->dev[i];
711 struct bio *chosen; 657 struct bio *chosen;
712 int towrite;
713
714 towrite = 0;
715 if (prexor) { /* rmw */
716 if (dev->towrite &&
717 test_bit(R5_Wantprexor, &dev->flags))
718 towrite = 1;
719 } else { /* rcw */
720 if (i != pd_idx && dev->towrite &&
721 test_bit(R5_LOCKED, &dev->flags))
722 towrite = 1;
723 }
724 658
725 if (towrite) { 659 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
726 struct bio *wbi; 660 struct bio *wbi;
727 661
728 spin_lock(&sh->lock); 662 spin_lock(&sh->lock);
@@ -747,18 +681,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
747static void ops_complete_postxor(void *stripe_head_ref) 681static void ops_complete_postxor(void *stripe_head_ref)
748{ 682{
749 struct stripe_head *sh = stripe_head_ref; 683 struct stripe_head *sh = stripe_head_ref;
750
751 pr_debug("%s: stripe %llu\n", __func__,
752 (unsigned long long)sh->sector);
753
754 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
755 set_bit(STRIPE_HANDLE, &sh->state);
756 release_stripe(sh);
757}
758
759static void ops_complete_write(void *stripe_head_ref)
760{
761 struct stripe_head *sh = stripe_head_ref;
762 int disks = sh->disks, i, pd_idx = sh->pd_idx; 684 int disks = sh->disks, i, pd_idx = sh->pd_idx;
763 685
764 pr_debug("%s: stripe %llu\n", __func__, 686 pr_debug("%s: stripe %llu\n", __func__,
@@ -770,16 +692,21 @@ static void ops_complete_write(void *stripe_head_ref)
770 set_bit(R5_UPTODATE, &dev->flags); 692 set_bit(R5_UPTODATE, &dev->flags);
771 } 693 }
772 694
773 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); 695 if (sh->reconstruct_state == reconstruct_state_drain_run)
774 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); 696 sh->reconstruct_state = reconstruct_state_drain_result;
697 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
698 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
699 else {
700 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
701 sh->reconstruct_state = reconstruct_state_result;
702 }
775 703
776 set_bit(STRIPE_HANDLE, &sh->state); 704 set_bit(STRIPE_HANDLE, &sh->state);
777 release_stripe(sh); 705 release_stripe(sh);
778} 706}
779 707
780static void 708static void
781ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, 709ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
782 unsigned long pending)
783{ 710{
784 /* kernel stack size limits the total number of disks */ 711 /* kernel stack size limits the total number of disks */
785 int disks = sh->disks; 712 int disks = sh->disks;
@@ -787,9 +714,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
787 714
788 int count = 0, pd_idx = sh->pd_idx, i; 715 int count = 0, pd_idx = sh->pd_idx, i;
789 struct page *xor_dest; 716 struct page *xor_dest;
790 int prexor = test_bit(STRIPE_OP_PREXOR, &pending); 717 int prexor = 0;
791 unsigned long flags; 718 unsigned long flags;
792 dma_async_tx_callback callback;
793 719
794 pr_debug("%s: stripe %llu\n", __func__, 720 pr_debug("%s: stripe %llu\n", __func__,
795 (unsigned long long)sh->sector); 721 (unsigned long long)sh->sector);
@@ -797,7 +723,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
797 /* check if prexor is active which means only process blocks 723 /* check if prexor is active which means only process blocks
798 * that are part of a read-modify-write (written) 724 * that are part of a read-modify-write (written)
799 */ 725 */
800 if (prexor) { 726 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
727 prexor = 1;
801 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 728 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
802 for (i = disks; i--; ) { 729 for (i = disks; i--; ) {
803 struct r5dev *dev = &sh->dev[i]; 730 struct r5dev *dev = &sh->dev[i];
@@ -813,10 +740,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
813 } 740 }
814 } 741 }
815 742
816 /* check whether this postxor is part of a write */
817 callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ?
818 ops_complete_write : ops_complete_postxor;
819
820 /* 1/ if we prexor'd then the dest is reused as a source 743 /* 1/ if we prexor'd then the dest is reused as a source
821 * 2/ if we did not prexor then we are redoing the parity 744 * 2/ if we did not prexor then we are redoing the parity
822 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 745 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
@@ -830,25 +753,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
830 if (unlikely(count == 1)) { 753 if (unlikely(count == 1)) {
831 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); 754 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
832 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 755 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
833 flags, tx, callback, sh); 756 flags, tx, ops_complete_postxor, sh);
834 } else 757 } else
835 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 758 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
836 flags, tx, callback, sh); 759 flags, tx, ops_complete_postxor, sh);
837} 760}
838 761
839static void ops_complete_check(void *stripe_head_ref) 762static void ops_complete_check(void *stripe_head_ref)
840{ 763{
841 struct stripe_head *sh = stripe_head_ref; 764 struct stripe_head *sh = stripe_head_ref;
842 int pd_idx = sh->pd_idx;
843 765
844 pr_debug("%s: stripe %llu\n", __func__, 766 pr_debug("%s: stripe %llu\n", __func__,
845 (unsigned long long)sh->sector); 767 (unsigned long long)sh->sector);
846 768
847 if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && 769 sh->check_state = check_state_check_result;
848 sh->ops.zero_sum_result == 0)
849 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
850
851 set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
852 set_bit(STRIPE_HANDLE, &sh->state); 770 set_bit(STRIPE_HANDLE, &sh->state);
853 release_stripe(sh); 771 release_stripe(sh);
854} 772}
@@ -875,46 +793,42 @@ static void ops_run_check(struct stripe_head *sh)
875 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 793 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
876 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); 794 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
877 795
878 if (tx)
879 set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
880 else
881 clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
882
883 atomic_inc(&sh->count); 796 atomic_inc(&sh->count);
884 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 797 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
885 ops_complete_check, sh); 798 ops_complete_check, sh);
886} 799}
887 800
888static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) 801static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
889{ 802{
890 int overlap_clear = 0, i, disks = sh->disks; 803 int overlap_clear = 0, i, disks = sh->disks;
891 struct dma_async_tx_descriptor *tx = NULL; 804 struct dma_async_tx_descriptor *tx = NULL;
892 805
893 if (test_bit(STRIPE_OP_BIOFILL, &pending)) { 806 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
894 ops_run_biofill(sh); 807 ops_run_biofill(sh);
895 overlap_clear++; 808 overlap_clear++;
896 } 809 }
897 810
898 if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) 811 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
899 tx = ops_run_compute5(sh, pending); 812 tx = ops_run_compute5(sh);
813 /* terminate the chain if postxor is not set to be run */
814 if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
815 async_tx_ack(tx);
816 }
900 817
901 if (test_bit(STRIPE_OP_PREXOR, &pending)) 818 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
902 tx = ops_run_prexor(sh, tx); 819 tx = ops_run_prexor(sh, tx);
903 820
904 if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { 821 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
905 tx = ops_run_biodrain(sh, tx, pending); 822 tx = ops_run_biodrain(sh, tx);
906 overlap_clear++; 823 overlap_clear++;
907 } 824 }
908 825
909 if (test_bit(STRIPE_OP_POSTXOR, &pending)) 826 if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
910 ops_run_postxor(sh, tx, pending); 827 ops_run_postxor(sh, tx);
911 828
912 if (test_bit(STRIPE_OP_CHECK, &pending)) 829 if (test_bit(STRIPE_OP_CHECK, &ops_request))
913 ops_run_check(sh); 830 ops_run_check(sh);
914 831
915 if (test_bit(STRIPE_OP_IO, &pending))
916 ops_run_io(sh);
917
918 if (overlap_clear) 832 if (overlap_clear)
919 for (i = disks; i--; ) { 833 for (i = disks; i--; ) {
920 struct r5dev *dev = &sh->dev[i]; 834 struct r5dev *dev = &sh->dev[i];
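
raid5_run_ops() now consumes a per-call ops_request word instead of draining sh->ops.pending, and ops_run_io() is no longer triggered through it at all: it takes the stripe_head_state and is issued from the handle_stripe path. That caller side is not in this hunk, but presumably the tail of handle_stripe5() ends up looking roughly like:

	/* Sketch, not part of this hunk: end of handle_stripe5() after the rework. */
	if (s.ops_request)
		raid5_run_ops(sh, s.ops_request);	/* kick off async compute/xor/drain/check */

	ops_run_io(sh, &s);				/* issue any reads/writes that got scheduled */

	return_io(return_bi);
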
@@ -997,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
997 struct stripe_head *osh, *nsh; 911 struct stripe_head *osh, *nsh;
998 LIST_HEAD(newstripes); 912 LIST_HEAD(newstripes);
999 struct disk_info *ndisks; 913 struct disk_info *ndisks;
1000 int err = 0; 914 int err;
1001 struct kmem_cache *sc; 915 struct kmem_cache *sc;
1002 int i; 916 int i;
1003 917
1004 if (newsize <= conf->pool_size) 918 if (newsize <= conf->pool_size)
1005 return 0; /* never bother to shrink */ 919 return 0; /* never bother to shrink */
1006 920
1007 md_allow_write(conf->mddev); 921 err = md_allow_write(conf->mddev);
922 if (err)
923 return err;
1008 924
1009 /* Step 1 */ 925 /* Step 1 */
1010 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 926 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -1703,11 +1619,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1703 } 1619 }
1704} 1620}
1705 1621
1706static int 1622static void
1707handle_write_operations5(struct stripe_head *sh, int rcw, int expand) 1623schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1624 int rcw, int expand)
1708{ 1625{
1709 int i, pd_idx = sh->pd_idx, disks = sh->disks; 1626 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1710 int locked = 0;
1711 1627
1712 if (rcw) { 1628 if (rcw) {
1713 /* if we are not expanding this is a proper write request, and 1629 /* if we are not expanding this is a proper write request, and
@@ -1715,53 +1631,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1715 * stripe cache 1631 * stripe cache
1716 */ 1632 */
1717 if (!expand) { 1633 if (!expand) {
1718 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); 1634 sh->reconstruct_state = reconstruct_state_drain_run;
1719 sh->ops.count++; 1635 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1720 } 1636 } else
1637 sh->reconstruct_state = reconstruct_state_run;
1721 1638
1722 set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); 1639 set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
1723 sh->ops.count++;
1724 1640
1725 for (i = disks; i--; ) { 1641 for (i = disks; i--; ) {
1726 struct r5dev *dev = &sh->dev[i]; 1642 struct r5dev *dev = &sh->dev[i];
1727 1643
1728 if (dev->towrite) { 1644 if (dev->towrite) {
1729 set_bit(R5_LOCKED, &dev->flags); 1645 set_bit(R5_LOCKED, &dev->flags);
1646 set_bit(R5_Wantdrain, &dev->flags);
1730 if (!expand) 1647 if (!expand)
1731 clear_bit(R5_UPTODATE, &dev->flags); 1648 clear_bit(R5_UPTODATE, &dev->flags);
1732 locked++; 1649 s->locked++;
1733 } 1650 }
1734 } 1651 }
1735 if (locked + 1 == disks) 1652 if (s->locked + 1 == disks)
1736 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 1653 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1737 atomic_inc(&sh->raid_conf->pending_full_writes); 1654 atomic_inc(&sh->raid_conf->pending_full_writes);
1738 } else { 1655 } else {
1739 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 1656 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1740 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 1657 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1741 1658
1742 set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); 1659 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1743 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); 1660 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1744 set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); 1661 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1745 1662 set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
1746 sh->ops.count += 3;
1747 1663
1748 for (i = disks; i--; ) { 1664 for (i = disks; i--; ) {
1749 struct r5dev *dev = &sh->dev[i]; 1665 struct r5dev *dev = &sh->dev[i];
1750 if (i == pd_idx) 1666 if (i == pd_idx)
1751 continue; 1667 continue;
1752 1668
1753 /* For a read-modify write there may be blocks that are
1754 * locked for reading while others are ready to be
1755 * written so we distinguish these blocks by the
1756 * R5_Wantprexor bit
1757 */
1758 if (dev->towrite && 1669 if (dev->towrite &&
1759 (test_bit(R5_UPTODATE, &dev->flags) || 1670 (test_bit(R5_UPTODATE, &dev->flags) ||
1760 test_bit(R5_Wantcompute, &dev->flags))) { 1671 test_bit(R5_Wantcompute, &dev->flags))) {
1761 set_bit(R5_Wantprexor, &dev->flags); 1672 set_bit(R5_Wantdrain, &dev->flags);
1762 set_bit(R5_LOCKED, &dev->flags); 1673 set_bit(R5_LOCKED, &dev->flags);
1763 clear_bit(R5_UPTODATE, &dev->flags); 1674 clear_bit(R5_UPTODATE, &dev->flags);
1764 locked++; 1675 s->locked++;
1765 } 1676 }
1766 } 1677 }
1767 } 1678 }
@@ -1771,13 +1682,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1771 */ 1682 */
1772 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 1683 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1773 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1684 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1774 locked++; 1685 s->locked++;
1775 1686
1776 pr_debug("%s: stripe %llu locked: %d pending: %lx\n", 1687 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
1777 __func__, (unsigned long long)sh->sector, 1688 __func__, (unsigned long long)sh->sector,
1778 locked, sh->ops.pending); 1689 s->locked, s->ops_request);
1779
1780 return locked;
1781} 1690}
1782 1691
1783/* 1692/*
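[editor's note] The hunks above replace handle_write_operations5() with schedule_reconstruction5(). Instead of setting bits in sh->ops.pending and bumping sh->ops.count, the new version records the requested work in the caller-supplied stripe_head_state (s->ops_request, s->locked) and tracks overall progress in sh->reconstruct_state; handle_stripe5() later hands s.ops_request to raid5_run_ops() once the stripe lock has been dropped (see the end of that function further down in this diff). Below is a minimal userspace sketch of that request-now-run-later pattern; the names are simplified stand-ins, not the kernel structures.

    #include <stdio.h>

    /* simplified model: each requested operation is one bit in ops_request */
    enum { OP_BIODRAIN, OP_PREXOR, OP_POSTXOR };
    enum recon_state { RECON_IDLE, RECON_RUN, RECON_DRAIN_RUN, RECON_PREXOR_DRAIN_RUN };

    struct stripe_state {              /* stands in for stripe_head_state      */
        unsigned long ops_request;     /* work requested for this pass         */
        int locked;                    /* blocks locked for in-flight I/O      */
    };

    /* decide what to run, but do not run it yet (cf. schedule_reconstruction5) */
    static enum recon_state schedule_write(struct stripe_state *s, int rcw, int expand)
    {
        if (rcw) {
            if (!expand)
                s->ops_request |= 1UL << OP_BIODRAIN;
            s->ops_request |= 1UL << OP_POSTXOR;
            return expand ? RECON_RUN : RECON_DRAIN_RUN;
        }
        /* read-modify-write: subtract old data, drain new data, re-add parity */
        s->ops_request |= (1UL << OP_PREXOR) | (1UL << OP_BIODRAIN) | (1UL << OP_POSTXOR);
        return RECON_PREXOR_DRAIN_RUN;
    }

    int main(void)
    {
        struct stripe_state s = { 0, 0 };
        enum recon_state st = schedule_write(&s, 1, 0);
        printf("state=%d ops_request=%#lx\n", st, s.ops_request);
        return 0;
    }

Because the actual work is executed after the lock is released, schedule_reconstruction5() no longer needs to return how many blocks it locked; the count accumulates in s->locked instead.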
@@ -1876,7 +1785,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1876} 1785}
1877 1786
1878static void 1787static void
1879handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, 1788handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
1880 struct stripe_head_state *s, int disks, 1789 struct stripe_head_state *s, int disks,
1881 struct bio **return_bi) 1790 struct bio **return_bi)
1882{ 1791{
@@ -1967,48 +1876,38 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
1967 md_wakeup_thread(conf->mddev->thread); 1876 md_wakeup_thread(conf->mddev->thread);
1968} 1877}
1969 1878
1970/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks 1879/* fetch_block5 - checks the given member device to see if its data needs
1971 * to process 1880 * to be read or computed to satisfy a request.
1881 *
1882 * Returns 1 when no more member devices need to be checked, otherwise returns
1883 * 0 to tell the loop in handle_stripe_fill5 to continue
1972 */ 1884 */
1973static int __handle_issuing_new_read_requests5(struct stripe_head *sh, 1885static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
1974 struct stripe_head_state *s, int disk_idx, int disks) 1886 int disk_idx, int disks)
1975{ 1887{
1976 struct r5dev *dev = &sh->dev[disk_idx]; 1888 struct r5dev *dev = &sh->dev[disk_idx];
1977 struct r5dev *failed_dev = &sh->dev[s->failed_num]; 1889 struct r5dev *failed_dev = &sh->dev[s->failed_num];
1978 1890
1979 /* don't schedule compute operations or reads on the parity block while
1980 * a check is in flight
1981 */
1982 if ((disk_idx == sh->pd_idx) &&
1983 test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
1984 return ~0;
1985
1986 /* is the data in this block needed, and can we get it? */ 1891 /* is the data in this block needed, and can we get it? */
1987 if (!test_bit(R5_LOCKED, &dev->flags) && 1892 if (!test_bit(R5_LOCKED, &dev->flags) &&
1988 !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || 1893 !test_bit(R5_UPTODATE, &dev->flags) &&
1989 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 1894 (dev->toread ||
1990 s->syncing || s->expanding || (s->failed && 1895 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1991 (failed_dev->toread || (failed_dev->towrite && 1896 s->syncing || s->expanding ||
1992 !test_bit(R5_OVERWRITE, &failed_dev->flags) 1897 (s->failed &&
1993 ))))) { 1898 (failed_dev->toread ||
1994 /* 1/ We would like to get this block, possibly by computing it, 1899 (failed_dev->towrite &&
1995 * but we might not be able to. 1900 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
1996 * 1901 /* We would like to get this block, possibly by computing it,
1997 * 2/ Since parity check operations potentially make the parity 1902 * otherwise read it if the backing disk is insync
1998 * block !uptodate it will need to be refreshed before any
1999 * compute operations on data disks are scheduled.
2000 *
2001 * 3/ We hold off parity block re-reads until check operations
2002 * have quiesced.
2003 */ 1903 */
2004 if ((s->uptodate == disks - 1) && 1904 if ((s->uptodate == disks - 1) &&
2005 (s->failed && disk_idx == s->failed_num) && 1905 (s->failed && disk_idx == s->failed_num)) {
2006 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { 1906 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2007 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); 1907 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2008 set_bit(R5_Wantcompute, &dev->flags); 1908 set_bit(R5_Wantcompute, &dev->flags);
2009 sh->ops.target = disk_idx; 1909 sh->ops.target = disk_idx;
2010 s->req_compute = 1; 1910 s->req_compute = 1;
2011 sh->ops.count++;
2012 /* Careful: from this point on 'uptodate' is in the eye 1911 /* Careful: from this point on 'uptodate' is in the eye
2013 * of raid5_run_ops which services 'compute' operations 1912 * of raid5_run_ops which services 'compute' operations
2014 * before writes. R5_Wantcompute flags a block that will 1913 * before writes. R5_Wantcompute flags a block that will
@@ -2016,53 +1915,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
2016 * subsequent operation. 1915 * subsequent operation.
2017 */ 1916 */
2018 s->uptodate++; 1917 s->uptodate++;
2019 return 0; /* uptodate + compute == disks */ 1918 return 1; /* uptodate + compute == disks */
2020 } else if (test_bit(R5_Insync, &dev->flags)) { 1919 } else if (test_bit(R5_Insync, &dev->flags)) {
2021 set_bit(R5_LOCKED, &dev->flags); 1920 set_bit(R5_LOCKED, &dev->flags);
2022 set_bit(R5_Wantread, &dev->flags); 1921 set_bit(R5_Wantread, &dev->flags);
2023 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2024 sh->ops.count++;
2025 s->locked++; 1922 s->locked++;
2026 pr_debug("Reading block %d (sync=%d)\n", disk_idx, 1923 pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2027 s->syncing); 1924 s->syncing);
2028 } 1925 }
2029 } 1926 }
2030 1927
2031 return ~0; 1928 return 0;
2032} 1929}
2033 1930
2034static void handle_issuing_new_read_requests5(struct stripe_head *sh, 1931/**
1932 * handle_stripe_fill5 - read or compute data to satisfy pending requests.
1933 */
1934static void handle_stripe_fill5(struct stripe_head *sh,
2035 struct stripe_head_state *s, int disks) 1935 struct stripe_head_state *s, int disks)
2036{ 1936{
2037 int i; 1937 int i;
2038 1938
2039 /* Clear completed compute operations. Parity recovery
2040 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
2041 * later on in this routine
2042 */
2043 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2044 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2045 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2046 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2047 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2048 }
2049
2050 /* look for blocks to read/compute, skip this if a compute 1939 /* look for blocks to read/compute, skip this if a compute
2051 * is already in flight, or if the stripe contents are in the 1940 * is already in flight, or if the stripe contents are in the
2052 * midst of changing due to a write 1941 * midst of changing due to a write
2053 */ 1942 */
2054 if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && 1943 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2055 !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && 1944 !sh->reconstruct_state)
2056 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2057 for (i = disks; i--; ) 1945 for (i = disks; i--; )
2058 if (__handle_issuing_new_read_requests5( 1946 if (fetch_block5(sh, s, i, disks))
2059 sh, s, i, disks) == 0)
2060 break; 1947 break;
2061 }
2062 set_bit(STRIPE_HANDLE, &sh->state); 1948 set_bit(STRIPE_HANDLE, &sh->state);
2063} 1949}
2064 1950
2065static void handle_issuing_new_read_requests6(struct stripe_head *sh, 1951static void handle_stripe_fill6(struct stripe_head *sh,
2066 struct stripe_head_state *s, struct r6_state *r6s, 1952 struct stripe_head_state *s, struct r6_state *r6s,
2067 int disks) 1953 int disks)
2068{ 1954{
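[editor's note] __handle_issuing_new_read_requests5() becomes fetch_block5(). Per the new comment, it returns 1 as soon as it has scheduled a compute that will make the stripe fully uptodate (s->uptodate plus the pending compute equals disks) and 0 otherwise, so the loop in handle_stripe_fill5() can stop scanning early; devices that merely need data are given R5_Wantread when R5_Insync is set. A small sketch of that early-exit scan, assuming made-up helper and field names:

    #include <stdbool.h>
    #include <stdio.h>

    #define NDISKS 5

    struct dev_state { bool uptodate, locked, insync, wanted; };

    /* returns true when no more member devices need to be looked at
     * (cf. fetch_block5 returning 1 once a compute covers the last gap) */
    static bool fetch_block(struct dev_state *dev, int *uptodate)
    {
        if (dev->locked || dev->uptodate || !dev->wanted)
            return false;
        if (*uptodate == NDISKS - 1) {
            dev->uptodate = true;      /* will be produced by the compute op */
            (*uptodate)++;
            return true;               /* uptodate + compute == disks        */
        }
        if (dev->insync)
            dev->locked = true;        /* schedule a read instead            */
        return false;
    }

    int main(void)
    {
        struct dev_state dev[NDISKS] = { { .wanted = true, .insync = true } };
        int uptodate = NDISKS - 1, i;

        for (i = 0; i < NDISKS; i++)   /* cf. the loop in handle_stripe_fill5 */
            if (fetch_block(&dev[i], &uptodate))
                break;
        printf("stopped at disk %d, uptodate=%d\n", i, uptodate);
        return 0;
    }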
@@ -2121,12 +2007,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
2121} 2007}
2122 2008
2123 2009
2124/* handle_completed_write_requests 2010/* handle_stripe_clean_event
2125 * any written block on an uptodate or failed drive can be returned. 2011 * any written block on an uptodate or failed drive can be returned.
2126 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2012 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2127 * never LOCKED, so we don't need to test 'failed' directly. 2013 * never LOCKED, so we don't need to test 'failed' directly.
2128 */ 2014 */
2129static void handle_completed_write_requests(raid5_conf_t *conf, 2015static void handle_stripe_clean_event(raid5_conf_t *conf,
2130 struct stripe_head *sh, int disks, struct bio **return_bi) 2016 struct stripe_head *sh, int disks, struct bio **return_bi)
2131{ 2017{
2132 int i; 2018 int i;
@@ -2171,7 +2057,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
2171 md_wakeup_thread(conf->mddev->thread); 2057 md_wakeup_thread(conf->mddev->thread);
2172} 2058}
2173 2059
2174static void handle_issuing_new_write_requests5(raid5_conf_t *conf, 2060static void handle_stripe_dirtying5(raid5_conf_t *conf,
2175 struct stripe_head *sh, struct stripe_head_state *s, int disks) 2061 struct stripe_head *sh, struct stripe_head_state *s, int disks)
2176{ 2062{
2177 int rmw = 0, rcw = 0, i; 2063 int rmw = 0, rcw = 0, i;
@@ -2215,9 +2101,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2215 "%d for r-m-w\n", i); 2101 "%d for r-m-w\n", i);
2216 set_bit(R5_LOCKED, &dev->flags); 2102 set_bit(R5_LOCKED, &dev->flags);
2217 set_bit(R5_Wantread, &dev->flags); 2103 set_bit(R5_Wantread, &dev->flags);
2218 if (!test_and_set_bit(
2219 STRIPE_OP_IO, &sh->ops.pending))
2220 sh->ops.count++;
2221 s->locked++; 2104 s->locked++;
2222 } else { 2105 } else {
2223 set_bit(STRIPE_DELAYED, &sh->state); 2106 set_bit(STRIPE_DELAYED, &sh->state);
@@ -2241,9 +2124,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2241 "%d for Reconstruct\n", i); 2124 "%d for Reconstruct\n", i);
2242 set_bit(R5_LOCKED, &dev->flags); 2125 set_bit(R5_LOCKED, &dev->flags);
2243 set_bit(R5_Wantread, &dev->flags); 2126 set_bit(R5_Wantread, &dev->flags);
2244 if (!test_and_set_bit(
2245 STRIPE_OP_IO, &sh->ops.pending))
2246 sh->ops.count++;
2247 s->locked++; 2127 s->locked++;
2248 } else { 2128 } else {
2249 set_bit(STRIPE_DELAYED, &sh->state); 2129 set_bit(STRIPE_DELAYED, &sh->state);
@@ -2261,14 +2141,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2261 * simultaneously. If this is not the case then new writes need to be 2141 * simultaneously. If this is not the case then new writes need to be
2262 * held off until the compute completes. 2142 * held off until the compute completes.
2263 */ 2143 */
2264 if ((s->req_compute || 2144 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2265 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && 2145 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2266 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2146 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2267 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2147 schedule_reconstruction5(sh, s, rcw == 0, 0);
2268 s->locked += handle_write_operations5(sh, rcw == 0, 0);
2269} 2148}
2270 2149
2271static void handle_issuing_new_write_requests6(raid5_conf_t *conf, 2150static void handle_stripe_dirtying6(raid5_conf_t *conf,
2272 struct stripe_head *sh, struct stripe_head_state *s, 2151 struct stripe_head *sh, struct stripe_head_state *s,
2273 struct r6_state *r6s, int disks) 2152 struct r6_state *r6s, int disks)
2274{ 2153{
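[editor's note] handle_issuing_new_write_requests5() is renamed handle_stripe_dirtying5(); the per-device STRIPE_OP_IO bookkeeping disappears and reconstruction is now kicked off through schedule_reconstruction5() once nothing is locked and either the rmw or the rcw read count has reached zero. As general RAID5 background only (the accounting in this function also considers which blocks are already uptodate, and prefers rcw on a tie), the rmw-versus-rcw choice is a read-cost comparison, illustrated by this toy calculation:

    #include <stdio.h>

    /* toy model of the rmw-vs-rcw decision for one RAID5 stripe:
     * rmw reads the old copies of the blocks being written plus the old parity,
     * rcw reads every data block that is NOT being overwritten and recomputes
     * parity from scratch.  This mirrors handle_stripe_dirtying5 in spirit only. */
    int main(void)
    {
        int data_disks = 4;            /* disks excluding parity */
        int blocks_to_write = 1;       /* partial-stripe write   */

        int rmw_reads = blocks_to_write + 1;           /* old data + old parity */
        int rcw_reads = data_disks - blocks_to_write;  /* untouched data blocks */

        printf("rmw needs %d reads, rcw needs %d reads -> choose %s\n",
               rmw_reads, rcw_reads, rmw_reads < rcw_reads ? "rmw" : "rcw");
        return 0;
    }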
@@ -2371,92 +2250,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
2371static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, 2250static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2372 struct stripe_head_state *s, int disks) 2251 struct stripe_head_state *s, int disks)
2373{ 2252{
2374 int canceled_check = 0; 2253 struct r5dev *dev = NULL;
2375 2254
2376 set_bit(STRIPE_HANDLE, &sh->state); 2255 set_bit(STRIPE_HANDLE, &sh->state);
2377 2256
2378 /* complete a check operation */ 2257 switch (sh->check_state) {
2379 if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { 2258 case check_state_idle:
2380 clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); 2259 /* start a new check operation if there are no failures */
2381 clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
2382 if (s->failed == 0) { 2260 if (s->failed == 0) {
2383 if (sh->ops.zero_sum_result == 0)
2384 /* parity is correct (on disc,
2385 * not in buffer any more)
2386 */
2387 set_bit(STRIPE_INSYNC, &sh->state);
2388 else {
2389 conf->mddev->resync_mismatches +=
2390 STRIPE_SECTORS;
2391 if (test_bit(
2392 MD_RECOVERY_CHECK, &conf->mddev->recovery))
2393 /* don't try to repair!! */
2394 set_bit(STRIPE_INSYNC, &sh->state);
2395 else {
2396 set_bit(STRIPE_OP_COMPUTE_BLK,
2397 &sh->ops.pending);
2398 set_bit(STRIPE_OP_MOD_REPAIR_PD,
2399 &sh->ops.pending);
2400 set_bit(R5_Wantcompute,
2401 &sh->dev[sh->pd_idx].flags);
2402 sh->ops.target = sh->pd_idx;
2403 sh->ops.count++;
2404 s->uptodate++;
2405 }
2406 }
2407 } else
2408 canceled_check = 1; /* STRIPE_INSYNC is not set */
2409 }
2410
2411 /* start a new check operation if there are no failures, the stripe is
2412 * not insync, and a repair is not in flight
2413 */
2414 if (s->failed == 0 &&
2415 !test_bit(STRIPE_INSYNC, &sh->state) &&
2416 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2417 if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
2418 BUG_ON(s->uptodate != disks); 2261 BUG_ON(s->uptodate != disks);
2262 sh->check_state = check_state_run;
2263 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2419 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2264 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2420 sh->ops.count++;
2421 s->uptodate--; 2265 s->uptodate--;
2266 break;
2422 } 2267 }
2423 } 2268 dev = &sh->dev[s->failed_num];
2424 2269 /* fall through */
2425 /* check if we can clear a parity disk reconstruct */ 2270 case check_state_compute_result:
2426 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && 2271 sh->check_state = check_state_idle;
2427 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { 2272 if (!dev)
2428 2273 dev = &sh->dev[sh->pd_idx];
2429 clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); 2274
2430 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); 2275 /* check that a write has not made the stripe insync */
2431 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); 2276 if (test_bit(STRIPE_INSYNC, &sh->state))
2432 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); 2277 break;
2433 }
2434
2435 2278
2436 /* Wait for check parity and compute block operations to complete
2437 * before write-back. If a failure occurred while the check operation
2438 * was in flight we need to cycle this stripe through handle_stripe
2439 * since the parity block may not be uptodate
2440 */
2441 if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
2442 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
2443 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
2444 struct r5dev *dev;
2445 /* either failed parity check, or recovery is happening */ 2279 /* either failed parity check, or recovery is happening */
2446 if (s->failed == 0)
2447 s->failed_num = sh->pd_idx;
2448 dev = &sh->dev[s->failed_num];
2449 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2280 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2450 BUG_ON(s->uptodate != disks); 2281 BUG_ON(s->uptodate != disks);
2451 2282
2452 set_bit(R5_LOCKED, &dev->flags); 2283 set_bit(R5_LOCKED, &dev->flags);
2284 s->locked++;
2453 set_bit(R5_Wantwrite, &dev->flags); 2285 set_bit(R5_Wantwrite, &dev->flags);
2454 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2455 sh->ops.count++;
2456 2286
2457 clear_bit(STRIPE_DEGRADED, &sh->state); 2287 clear_bit(STRIPE_DEGRADED, &sh->state);
2458 s->locked++;
2459 set_bit(STRIPE_INSYNC, &sh->state); 2288 set_bit(STRIPE_INSYNC, &sh->state);
2289 break;
2290 case check_state_run:
2291 break; /* we will be called again upon completion */
2292 case check_state_check_result:
2293 sh->check_state = check_state_idle;
2294
2295 /* if a failure occurred during the check operation, leave
2296 * STRIPE_INSYNC not set and let the stripe be handled again
2297 */
2298 if (s->failed)
2299 break;
2300
2301 /* handle a successful check operation, if parity is correct
2302 * we are done. Otherwise update the mismatch count and repair
2303 * parity if !MD_RECOVERY_CHECK
2304 */
2305 if (sh->ops.zero_sum_result == 0)
2306 /* parity is correct (on disc,
2307 * not in buffer any more)
2308 */
2309 set_bit(STRIPE_INSYNC, &sh->state);
2310 else {
2311 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2312 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2313 /* don't try to repair!! */
2314 set_bit(STRIPE_INSYNC, &sh->state);
2315 else {
2316 sh->check_state = check_state_compute_run;
2317 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2318 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2319 set_bit(R5_Wantcompute,
2320 &sh->dev[sh->pd_idx].flags);
2321 sh->ops.target = sh->pd_idx;
2322 s->uptodate++;
2323 }
2324 }
2325 break;
2326 case check_state_compute_run:
2327 break;
2328 default:
2329 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2330 __func__, sh->check_state,
2331 (unsigned long long) sh->sector);
2332 BUG();
2460 } 2333 }
2461} 2334}
2462 2335
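[editor's note] handle_parity_checks5() is rewritten as an explicit state machine driven by the new sh->check_state field: idle -> run -> check_result, with a detour through compute_run/compute_result when the parity mismatches or a failed device's block has to be recomputed (the idle case falls through to compute_result when s->failed is set). This replaces the old scattered tests on sh->ops.pending/ack/complete. A compact sketch of the repair-path progression, with simplified state names and without the MD_RECOVERY_CHECK and resync_mismatches handling:

    #include <stdio.h>

    enum check_state {
        CHECK_IDLE,            /* nothing in flight                         */
        CHECK_RUN,             /* zero-sum check submitted, wait for result */
        CHECK_RESULT,          /* result available: correct or mismatch     */
        CHECK_COMPUTE_RUN,     /* recomputing parity to repair a mismatch   */
        CHECK_COMPUTE_RESULT,  /* repaired parity ready to be written back  */
    };

    /* one handle_stripe pass: advance the machine one step (cf. the switch in
     * handle_parity_checks5; "mismatch" stands in for ops.zero_sum_result) */
    static enum check_state step(enum check_state st, int mismatch, int *insync)
    {
        switch (st) {
        case CHECK_IDLE:
            return CHECK_RUN;              /* schedule STRIPE_OP_CHECK      */
        case CHECK_RUN:
            return CHECK_RESULT;           /* async check completed         */
        case CHECK_RESULT:
            if (!mismatch) {
                *insync = 1;               /* parity correct, done          */
                return CHECK_IDLE;
            }
            return CHECK_COMPUTE_RUN;      /* schedule a parity recompute   */
        case CHECK_COMPUTE_RUN:
            return CHECK_COMPUTE_RESULT;
        case CHECK_COMPUTE_RESULT:
            *insync = 1;                   /* write-back makes it insync    */
            return CHECK_IDLE;
        }
        return st;
    }

    int main(void)
    {
        enum check_state st = CHECK_IDLE;
        int insync = 0, passes = 0;

        while (!insync && passes++ < 10)
            st = step(st, /*mismatch=*/1, &insync);
        printf("insync after %d passes\n", passes);
        return 0;
    }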
@@ -2641,15 +2514,14 @@ static void handle_stripe5(struct stripe_head *sh)
2641 struct bio *return_bi = NULL; 2514 struct bio *return_bi = NULL;
2642 struct stripe_head_state s; 2515 struct stripe_head_state s;
2643 struct r5dev *dev; 2516 struct r5dev *dev;
2644 unsigned long pending = 0;
2645 mdk_rdev_t *blocked_rdev = NULL; 2517 mdk_rdev_t *blocked_rdev = NULL;
2646 int prexor; 2518 int prexor;
2647 2519
2648 memset(&s, 0, sizeof(s)); 2520 memset(&s, 0, sizeof(s));
2649 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " 2521 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
2650 "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, 2522 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
2651 atomic_read(&sh->count), sh->pd_idx, 2523 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
2652 sh->ops.pending, sh->ops.ack, sh->ops.complete); 2524 sh->reconstruct_state);
2653 2525
2654 spin_lock(&sh->lock); 2526 spin_lock(&sh->lock);
2655 clear_bit(STRIPE_HANDLE, &sh->state); 2527 clear_bit(STRIPE_HANDLE, &sh->state);
@@ -2658,15 +2530,8 @@ static void handle_stripe5(struct stripe_head *sh)
2658 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 2530 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
2659 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2531 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2660 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2532 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2661 /* Now to look around and see what can be done */
2662
2663 /* clean-up completed biofill operations */
2664 if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
2665 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
2666 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
2667 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
2668 }
2669 2533
2534 /* Now to look around and see what can be done */
2670 rcu_read_lock(); 2535 rcu_read_lock();
2671 for (i=disks; i--; ) { 2536 for (i=disks; i--; ) {
2672 mdk_rdev_t *rdev; 2537 mdk_rdev_t *rdev;
@@ -2680,10 +2545,10 @@ static void handle_stripe5(struct stripe_head *sh)
2680 /* maybe we can request a biofill operation 2545 /* maybe we can request a biofill operation
2681 * 2546 *
2682 * new wantfill requests are only permitted while 2547 * new wantfill requests are only permitted while
2683 * STRIPE_OP_BIOFILL is clear 2548 * ops_complete_biofill is guaranteed to be inactive
2684 */ 2549 */
2685 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 2550 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
2686 !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) 2551 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
2687 set_bit(R5_Wantfill, &dev->flags); 2552 set_bit(R5_Wantfill, &dev->flags);
2688 2553
2689 /* now count some things */ 2554 /* now count some things */
@@ -2727,8 +2592,10 @@ static void handle_stripe5(struct stripe_head *sh)
2727 goto unlock; 2592 goto unlock;
2728 } 2593 }
2729 2594
2730 if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) 2595 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
2731 sh->ops.count++; 2596 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
2597 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
2598 }
2732 2599
2733 pr_debug("locked=%d uptodate=%d to_read=%d" 2600 pr_debug("locked=%d uptodate=%d to_read=%d"
2734 " to_write=%d failed=%d failed_num=%d\n", 2601 " to_write=%d failed=%d failed_num=%d\n",
@@ -2738,8 +2605,7 @@ static void handle_stripe5(struct stripe_head *sh)
2738 * need to be failed 2605 * need to be failed
2739 */ 2606 */
2740 if (s.failed > 1 && s.to_read+s.to_write+s.written) 2607 if (s.failed > 1 && s.to_read+s.to_write+s.written)
2741 handle_requests_to_failed_array(conf, sh, &s, disks, 2608 handle_failed_stripe(conf, sh, &s, disks, &return_bi);
2742 &return_bi);
2743 if (s.failed > 1 && s.syncing) { 2609 if (s.failed > 1 && s.syncing) {
2744 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2610 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2745 clear_bit(STRIPE_SYNCING, &sh->state); 2611 clear_bit(STRIPE_SYNCING, &sh->state);
@@ -2755,48 +2621,25 @@ static void handle_stripe5(struct stripe_head *sh)
2755 !test_bit(R5_LOCKED, &dev->flags) && 2621 !test_bit(R5_LOCKED, &dev->flags) &&
2756 test_bit(R5_UPTODATE, &dev->flags)) || 2622 test_bit(R5_UPTODATE, &dev->flags)) ||
2757 (s.failed == 1 && s.failed_num == sh->pd_idx))) 2623 (s.failed == 1 && s.failed_num == sh->pd_idx)))
2758 handle_completed_write_requests(conf, sh, disks, &return_bi); 2624 handle_stripe_clean_event(conf, sh, disks, &return_bi);
2759 2625
2760 /* Now we might consider reading some blocks, either to check/generate 2626 /* Now we might consider reading some blocks, either to check/generate
2761 * parity, or to satisfy requests 2627 * parity, or to satisfy requests
2762 * or to load a block that is being partially written. 2628 * or to load a block that is being partially written.
2763 */ 2629 */
2764 if (s.to_read || s.non_overwrite || 2630 if (s.to_read || s.non_overwrite ||
2765 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || 2631 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
2766 test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) 2632 handle_stripe_fill5(sh, &s, disks);
2767 handle_issuing_new_read_requests5(sh, &s, disks);
2768 2633
2769 /* Now we check to see if any write operations have recently 2634 /* Now we check to see if any write operations have recently
2770 * completed 2635 * completed
2771 */ 2636 */
2772
2773 /* leave prexor set until postxor is done, allows us to distinguish
2774 * a rmw from a rcw during biodrain
2775 */
2776 prexor = 0; 2637 prexor = 0;
2777 if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && 2638 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
2778 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
2779
2780 prexor = 1; 2639 prexor = 1;
2781 clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); 2640 if (sh->reconstruct_state == reconstruct_state_drain_result ||
2782 clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); 2641 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
2783 clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); 2642 sh->reconstruct_state = reconstruct_state_idle;
2784
2785 for (i = disks; i--; )
2786 clear_bit(R5_Wantprexor, &sh->dev[i].flags);
2787 }
2788
2789 /* if only POSTXOR is set then this is an 'expand' postxor */
2790 if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
2791 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
2792
2793 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
2794 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
2795 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
2796
2797 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2798 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2799 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2800 2643
2801 /* All the 'written' buffers and the parity block are ready to 2644 /* All the 'written' buffers and the parity block are ready to
2802 * be written back to disk 2645 * be written back to disk
@@ -2808,9 +2651,6 @@ static void handle_stripe5(struct stripe_head *sh)
2808 (i == sh->pd_idx || dev->written)) { 2651 (i == sh->pd_idx || dev->written)) {
2809 pr_debug("Writing block %d\n", i); 2652 pr_debug("Writing block %d\n", i);
2810 set_bit(R5_Wantwrite, &dev->flags); 2653 set_bit(R5_Wantwrite, &dev->flags);
2811 if (!test_and_set_bit(
2812 STRIPE_OP_IO, &sh->ops.pending))
2813 sh->ops.count++;
2814 if (prexor) 2654 if (prexor)
2815 continue; 2655 continue;
2816 if (!test_bit(R5_Insync, &dev->flags) || 2656 if (!test_bit(R5_Insync, &dev->flags) ||
@@ -2832,20 +2672,18 @@ static void handle_stripe5(struct stripe_head *sh)
2832 * 2/ A 'check' operation is in flight, as it may clobber the parity 2672 * 2/ A 'check' operation is in flight, as it may clobber the parity
2833 * block. 2673 * block.
2834 */ 2674 */
2835 if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && 2675 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
2836 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) 2676 handle_stripe_dirtying5(conf, sh, &s, disks);
2837 handle_issuing_new_write_requests5(conf, sh, &s, disks);
2838 2677
2839 /* maybe we need to check and possibly fix the parity for this stripe 2678 /* maybe we need to check and possibly fix the parity for this stripe
2840 * Any reads will already have been scheduled, so we just see if enough 2679 * Any reads will already have been scheduled, so we just see if enough
2841 * data is available. The parity check is held off while parity 2680 * data is available. The parity check is held off while parity
2842 * dependent operations are in flight. 2681 * dependent operations are in flight.
2843 */ 2682 */
2844 if ((s.syncing && s.locked == 0 && 2683 if (sh->check_state ||
2845 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && 2684 (s.syncing && s.locked == 0 &&
2846 !test_bit(STRIPE_INSYNC, &sh->state)) || 2685 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
2847 test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || 2686 !test_bit(STRIPE_INSYNC, &sh->state)))
2848 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
2849 handle_parity_checks5(conf, sh, &s, disks); 2687 handle_parity_checks5(conf, sh, &s, disks);
2850 2688
2851 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 2689 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -2864,52 +2702,35 @@ static void handle_stripe5(struct stripe_head *sh)
2864 dev = &sh->dev[s.failed_num]; 2702 dev = &sh->dev[s.failed_num];
2865 if (!test_bit(R5_ReWrite, &dev->flags)) { 2703 if (!test_bit(R5_ReWrite, &dev->flags)) {
2866 set_bit(R5_Wantwrite, &dev->flags); 2704 set_bit(R5_Wantwrite, &dev->flags);
2867 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2868 sh->ops.count++;
2869 set_bit(R5_ReWrite, &dev->flags); 2705 set_bit(R5_ReWrite, &dev->flags);
2870 set_bit(R5_LOCKED, &dev->flags); 2706 set_bit(R5_LOCKED, &dev->flags);
2871 s.locked++; 2707 s.locked++;
2872 } else { 2708 } else {
2873 /* let's read it back */ 2709 /* let's read it back */
2874 set_bit(R5_Wantread, &dev->flags); 2710 set_bit(R5_Wantread, &dev->flags);
2875 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2876 sh->ops.count++;
2877 set_bit(R5_LOCKED, &dev->flags); 2711 set_bit(R5_LOCKED, &dev->flags);
2878 s.locked++; 2712 s.locked++;
2879 } 2713 }
2880 } 2714 }
2881 2715
2882 /* Finish postxor operations initiated by the expansion 2716 /* Finish reconstruct operations initiated by the expansion process */
2883 * process 2717 if (sh->reconstruct_state == reconstruct_state_result) {
2884 */ 2718 sh->reconstruct_state = reconstruct_state_idle;
2885 if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
2886 !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
2887
2888 clear_bit(STRIPE_EXPANDING, &sh->state); 2719 clear_bit(STRIPE_EXPANDING, &sh->state);
2889 2720 for (i = conf->raid_disks; i--; )
2890 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2891 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2892 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2893
2894 for (i = conf->raid_disks; i--; ) {
2895 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2721 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2896 set_bit(R5_LOCKED, &dev->flags); 2722 set_bit(R5_LOCKED, &dev->flags);
2897 s.locked++; 2723 s.locked++;
2898 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2899 sh->ops.count++;
2900 }
2901 } 2724 }
2902 2725
2903 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 2726 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
2904 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { 2727 !sh->reconstruct_state) {
2905 /* Need to write out all blocks after computing parity */ 2728 /* Need to write out all blocks after computing parity */
2906 sh->disks = conf->raid_disks; 2729 sh->disks = conf->raid_disks;
2907 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 2730 sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
2908 conf->raid_disks); 2731 conf->raid_disks);
2909 s.locked += handle_write_operations5(sh, 1, 1); 2732 schedule_reconstruction5(sh, &s, 1, 1);
2910 } else if (s.expanded && 2733 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
2911 s.locked == 0 &&
2912 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2913 clear_bit(STRIPE_EXPAND_READY, &sh->state); 2734 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2914 atomic_dec(&conf->reshape_stripes); 2735 atomic_dec(&conf->reshape_stripes);
2915 wake_up(&conf->wait_for_overlap); 2736 wake_up(&conf->wait_for_overlap);
@@ -2917,12 +2738,9 @@ static void handle_stripe5(struct stripe_head *sh)
2917 } 2738 }
2918 2739
2919 if (s.expanding && s.locked == 0 && 2740 if (s.expanding && s.locked == 0 &&
2920 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) 2741 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
2921 handle_stripe_expansion(conf, sh, NULL); 2742 handle_stripe_expansion(conf, sh, NULL);
2922 2743
2923 if (sh->ops.count)
2924 pending = get_stripe_work(sh);
2925
2926 unlock: 2744 unlock:
2927 spin_unlock(&sh->lock); 2745 spin_unlock(&sh->lock);
2928 2746
@@ -2930,11 +2748,12 @@ static void handle_stripe5(struct stripe_head *sh)
2930 if (unlikely(blocked_rdev)) 2748 if (unlikely(blocked_rdev))
2931 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 2749 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2932 2750
2933 if (pending) 2751 if (s.ops_request)
2934 raid5_run_ops(sh, pending); 2752 raid5_run_ops(sh, s.ops_request);
2935 2753
2936 return_io(return_bi); 2754 ops_run_io(sh, &s);
2937 2755
2756 return_io(return_bi);
2938} 2757}
2939 2758
2940static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 2759static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
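[editor's note] In handle_stripe5() the same theme repeats: the pending/ack/complete bookkeeping and get_stripe_work() are gone, decisions made under sh->lock accumulate in s.ops_request, and after the unlock the stripe is pushed through raid5_run_ops() and the new ops_run_io() helper. A toy model of that collect-under-lock, execute-after-unlock shape, with names invented for the sketch:

    #include <pthread.h>
    #include <stdio.h>

    enum { OP_BIOFILL, OP_COMPUTE, OP_CHECK };

    static pthread_mutex_t stripe_lock = PTHREAD_MUTEX_INITIALIZER;

    static void run_ops(unsigned long ops_request)   /* cf. raid5_run_ops */
    {
        if (ops_request & (1UL << OP_BIOFILL))
            printf("running biofill\n");
        if (ops_request & (1UL << OP_COMPUTE))
            printf("running compute\n");
        if (ops_request & (1UL << OP_CHECK))
            printf("running parity check\n");
    }

    static void handle_stripe(int need_fill, int need_check)
    {
        unsigned long ops_request = 0;   /* cf. stripe_head_state.ops_request */

        pthread_mutex_lock(&stripe_lock);
        if (need_fill)
            ops_request |= 1UL << OP_BIOFILL;
        if (need_check)
            ops_request |= 1UL << OP_CHECK;
        pthread_mutex_unlock(&stripe_lock);

        if (ops_request)                 /* executed only after the unlock     */
            run_ops(ops_request);
    }

    int main(void)
    {
        handle_stripe(1, 1);
        return 0;
    }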
@@ -3042,8 +2861,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3042 * might need to be failed 2861 * might need to be failed
3043 */ 2862 */
3044 if (s.failed > 2 && s.to_read+s.to_write+s.written) 2863 if (s.failed > 2 && s.to_read+s.to_write+s.written)
3045 handle_requests_to_failed_array(conf, sh, &s, disks, 2864 handle_failed_stripe(conf, sh, &s, disks, &return_bi);
3046 &return_bi);
3047 if (s.failed > 2 && s.syncing) { 2865 if (s.failed > 2 && s.syncing) {
3048 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2866 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
3049 clear_bit(STRIPE_SYNCING, &sh->state); 2867 clear_bit(STRIPE_SYNCING, &sh->state);
@@ -3068,7 +2886,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3068 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 2886 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3069 && !test_bit(R5_LOCKED, &qdev->flags) 2887 && !test_bit(R5_LOCKED, &qdev->flags)
3070 && test_bit(R5_UPTODATE, &qdev->flags))))) 2888 && test_bit(R5_UPTODATE, &qdev->flags)))))
3071 handle_completed_write_requests(conf, sh, disks, &return_bi); 2889 handle_stripe_clean_event(conf, sh, disks, &return_bi);
3072 2890
3073 /* Now we might consider reading some blocks, either to check/generate 2891 /* Now we might consider reading some blocks, either to check/generate
3074 * parity, or to satisfy requests 2892 * parity, or to satisfy requests
@@ -3076,11 +2894,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3076 */ 2894 */
3077 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 2895 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3078 (s.syncing && (s.uptodate < disks)) || s.expanding) 2896 (s.syncing && (s.uptodate < disks)) || s.expanding)
3079 handle_issuing_new_read_requests6(sh, &s, &r6s, disks); 2897 handle_stripe_fill6(sh, &s, &r6s, disks);
3080 2898
3081 /* now to consider writing and what else, if anything should be read */ 2899 /* now to consider writing and what else, if anything should be read */
3082 if (s.to_write) 2900 if (s.to_write)
3083 handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); 2901 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3084 2902
3085 /* maybe we need to check and possibly fix the parity for this stripe 2903 /* maybe we need to check and possibly fix the parity for this stripe
3086 * Any reads will already have been scheduled, so we just see if enough 2904 * Any reads will already have been scheduled, so we just see if enough
@@ -3136,7 +2954,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3136 } 2954 }
3137 2955
3138 if (s.expanding && s.locked == 0 && 2956 if (s.expanding && s.locked == 0 &&
3139 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) 2957 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3140 handle_stripe_expansion(conf, sh, &r6s); 2958 handle_stripe_expansion(conf, sh, &r6s);
3141 2959
3142 unlock: 2960 unlock:
@@ -3146,68 +2964,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3146 if (unlikely(blocked_rdev)) 2964 if (unlikely(blocked_rdev))
3147 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 2965 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3148 2966
3149 return_io(return_bi); 2967 ops_run_io(sh, &s);
3150
3151 for (i=disks; i-- ;) {
3152 int rw;
3153 struct bio *bi;
3154 mdk_rdev_t *rdev;
3155 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
3156 rw = WRITE;
3157 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
3158 rw = READ;
3159 else
3160 continue;
3161
3162 set_bit(STRIPE_IO_STARTED, &sh->state);
3163
3164 bi = &sh->dev[i].req;
3165
3166 bi->bi_rw = rw;
3167 if (rw == WRITE)
3168 bi->bi_end_io = raid5_end_write_request;
3169 else
3170 bi->bi_end_io = raid5_end_read_request;
3171
3172 rcu_read_lock();
3173 rdev = rcu_dereference(conf->disks[i].rdev);
3174 if (rdev && test_bit(Faulty, &rdev->flags))
3175 rdev = NULL;
3176 if (rdev)
3177 atomic_inc(&rdev->nr_pending);
3178 rcu_read_unlock();
3179 2968
3180 if (rdev) { 2969 return_io(return_bi);
3181 if (s.syncing || s.expanding || s.expanded)
3182 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
3183
3184 bi->bi_bdev = rdev->bdev;
3185 pr_debug("for %llu schedule op %ld on disc %d\n",
3186 (unsigned long long)sh->sector, bi->bi_rw, i);
3187 atomic_inc(&sh->count);
3188 bi->bi_sector = sh->sector + rdev->data_offset;
3189 bi->bi_flags = 1 << BIO_UPTODATE;
3190 bi->bi_vcnt = 1;
3191 bi->bi_max_vecs = 1;
3192 bi->bi_idx = 0;
3193 bi->bi_io_vec = &sh->dev[i].vec;
3194 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
3195 bi->bi_io_vec[0].bv_offset = 0;
3196 bi->bi_size = STRIPE_SIZE;
3197 bi->bi_next = NULL;
3198 if (rw == WRITE &&
3199 test_bit(R5_ReWrite, &sh->dev[i].flags))
3200 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
3201 generic_make_request(bi);
3202 } else {
3203 if (rw == WRITE)
3204 set_bit(STRIPE_DEGRADED, &sh->state);
3205 pr_debug("skip op %ld on disc %d for sector %llu\n",
3206 bi->bi_rw, i, (unsigned long long)sh->sector);
3207 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3208 set_bit(STRIPE_HANDLE, &sh->state);
3209 }
3210 }
3211} 2970}
3212 2971
3213static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) 2972static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
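[editor's note] The long per-device loop deleted at the end of handle_stripe6() above (picking READ or WRITE from R5_Wantread/R5_Wantwrite, filling in the bio and calling generic_make_request()) is not lost; it moves into the shared ops_run_io() helper so that handle_stripe5() and handle_stripe6() issue I/O the same way. The selection logic it centralises looks roughly like this simplified sketch:

    #include <stdio.h>

    #define R5_Wantwrite (1u << 0)
    #define R5_Wantread  (1u << 1)

    /* per-device decision formerly at the bottom of handle_stripe6 and now in
     * the shared ops_run_io() helper: Wantwrite wins, then Wantread, otherwise
     * the device needs no I/O on this pass (flag handling simplified) */
    static const char *pick_io(unsigned *flags)
    {
        if (*flags & R5_Wantwrite) {
            *flags &= ~R5_Wantwrite;     /* cf. test_and_clear_bit() */
            return "WRITE";
        }
        if (*flags & R5_Wantread) {
            *flags &= ~R5_Wantread;
            return "READ";
        }
        return NULL;
    }

    int main(void)
    {
        unsigned dev[3] = { R5_Wantwrite, R5_Wantread, 0 };
        for (int i = 0; i < 3; i++) {
            const char *op = pick_io(&dev[i]);
            printf("disk %d: %s\n", i, op ? op : "no I/O");
        }
        return 0;
    }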
@@ -3697,9 +3456,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
3697 if ( rw == WRITE ) 3456 if ( rw == WRITE )
3698 md_write_end(mddev); 3457 md_write_end(mddev);
3699 3458
3700 bi->bi_end_io(bi, 3459 bio_endio(bi, 0);
3701 test_bit(BIO_UPTODATE, &bi->bi_flags)
3702 ? 0 : -EIO);
3703 } 3460 }
3704 return 0; 3461 return 0;
3705} 3462}
@@ -3785,7 +3542,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3785 j == raid6_next_disk(sh->pd_idx, sh->disks)) 3542 j == raid6_next_disk(sh->pd_idx, sh->disks))
3786 continue; 3543 continue;
3787 s = compute_blocknr(sh, j); 3544 s = compute_blocknr(sh, j);
3788 if (s < (mddev->array_size<<1)) { 3545 if (s < mddev->array_sectors) {
3789 skipped = 1; 3546 skipped = 1;
3790 continue; 3547 continue;
3791 } 3548 }
@@ -4002,12 +3759,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4002 spin_lock_irq(&conf->device_lock); 3759 spin_lock_irq(&conf->device_lock);
4003 remaining = --raid_bio->bi_phys_segments; 3760 remaining = --raid_bio->bi_phys_segments;
4004 spin_unlock_irq(&conf->device_lock); 3761 spin_unlock_irq(&conf->device_lock);
4005 if (remaining == 0) { 3762 if (remaining == 0)
4006 3763 bio_endio(raid_bio, 0);
4007 raid_bio->bi_end_io(raid_bio,
4008 test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
4009 ? 0 : -EIO);
4010 }
4011 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3764 if (atomic_dec_and_test(&conf->active_aligned_reads))
4012 wake_up(&conf->wait_for_stripe); 3765 wake_up(&conf->wait_for_stripe);
4013 return handled; 3766 return handled;
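[editor's note] Both make_request() and retry_aligned_read() stop open-coding the completion call (bi->bi_end_io(bi, test_bit(BIO_UPTODATE, ...) ? 0 : -EIO)) and simply call bio_endio(bi, 0), letting the helper translate a cleared BIO_UPTODATE into -EIO. A userspace model of what the helper centralises, using fake types rather than the real bio API:

    #include <errno.h>
    #include <stdio.h>

    struct fake_bio {
        int uptodate;                           /* stands in for BIO_UPTODATE */
        void (*end_io)(struct fake_bio *, int);
    };

    static void my_end_io(struct fake_bio *bio, int error)
    {
        printf("completed: uptodate=%d error=%d\n", bio->uptodate, error);
    }

    /* the helper now owns the uptodate-to-errno translation */
    static void fake_bio_endio(struct fake_bio *bio, int error)
    {
        if (!error && !bio->uptodate)
            error = -EIO;
        bio->end_io(bio, error);
    }

    int main(void)
    {
        struct fake_bio ok   = { 1, my_end_io };
        struct fake_bio fail = { 0, my_end_io };

        fake_bio_endio(&ok, 0);    /* error = 0    */
        fake_bio_endio(&fail, 0);  /* error = -EIO */
        return 0;
    }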
@@ -4094,6 +3847,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
4094{ 3847{
4095 raid5_conf_t *conf = mddev_to_conf(mddev); 3848 raid5_conf_t *conf = mddev_to_conf(mddev);
4096 unsigned long new; 3849 unsigned long new;
3850 int err;
3851
4097 if (len >= PAGE_SIZE) 3852 if (len >= PAGE_SIZE)
4098 return -EINVAL; 3853 return -EINVAL;
4099 if (!conf) 3854 if (!conf)
@@ -4109,7 +3864,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
4109 else 3864 else
4110 break; 3865 break;
4111 } 3866 }
4112 md_allow_write(mddev); 3867 err = md_allow_write(mddev);
3868 if (err)
3869 return err;
4113 while (new > conf->max_nr_stripes) { 3870 while (new > conf->max_nr_stripes) {
4114 if (grow_one_stripe(conf)) 3871 if (grow_one_stripe(conf))
4115 conf->max_nr_stripes++; 3872 conf->max_nr_stripes++;
@@ -4434,7 +4191,7 @@ static int run(mddev_t *mddev)
4434 mddev->queue->backing_dev_info.congested_data = mddev; 4191 mddev->queue->backing_dev_info.congested_data = mddev;
4435 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 4192 mddev->queue->backing_dev_info.congested_fn = raid5_congested;
4436 4193
4437 mddev->array_size = mddev->size * (conf->previous_raid_disks - 4194 mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
4438 conf->max_degraded); 4195 conf->max_degraded);
4439 4196
4440 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 4197 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
@@ -4609,35 +4366,41 @@ abort:
4609static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 4366static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
4610{ 4367{
4611 raid5_conf_t *conf = mddev->private; 4368 raid5_conf_t *conf = mddev->private;
4612 int found = 0; 4369 int err = -EEXIST;
4613 int disk; 4370 int disk;
4614 struct disk_info *p; 4371 struct disk_info *p;
4372 int first = 0;
4373 int last = conf->raid_disks - 1;
4615 4374
4616 if (mddev->degraded > conf->max_degraded) 4375 if (mddev->degraded > conf->max_degraded)
4617 /* no point adding a device */ 4376 /* no point adding a device */
4618 return 0; 4377 return -EINVAL;
4378
4379 if (rdev->raid_disk >= 0)
4380 first = last = rdev->raid_disk;
4619 4381
4620 /* 4382 /*
4621 * find the disk ... but prefer rdev->saved_raid_disk 4383 * find the disk ... but prefer rdev->saved_raid_disk
4622 * if possible. 4384 * if possible.
4623 */ 4385 */
4624 if (rdev->saved_raid_disk >= 0 && 4386 if (rdev->saved_raid_disk >= 0 &&
4387 rdev->saved_raid_disk >= first &&
4625 conf->disks[rdev->saved_raid_disk].rdev == NULL) 4388 conf->disks[rdev->saved_raid_disk].rdev == NULL)
4626 disk = rdev->saved_raid_disk; 4389 disk = rdev->saved_raid_disk;
4627 else 4390 else
4628 disk = 0; 4391 disk = first;
4629 for ( ; disk < conf->raid_disks; disk++) 4392 for ( ; disk <= last ; disk++)
4630 if ((p=conf->disks + disk)->rdev == NULL) { 4393 if ((p=conf->disks + disk)->rdev == NULL) {
4631 clear_bit(In_sync, &rdev->flags); 4394 clear_bit(In_sync, &rdev->flags);
4632 rdev->raid_disk = disk; 4395 rdev->raid_disk = disk;
4633 found = 1; 4396 err = 0;
4634 if (rdev->saved_raid_disk != disk) 4397 if (rdev->saved_raid_disk != disk)
4635 conf->fullsync = 1; 4398 conf->fullsync = 1;
4636 rcu_assign_pointer(p->rdev, rdev); 4399 rcu_assign_pointer(p->rdev, rdev);
4637 break; 4400 break;
4638 } 4401 }
4639 print_raid5_conf(conf); 4402 print_raid5_conf(conf);
4640 return found; 4403 return err;
4641} 4404}
4642 4405
4643static int raid5_resize(mddev_t *mddev, sector_t sectors) 4406static int raid5_resize(mddev_t *mddev, sector_t sectors)
@@ -4652,8 +4415,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
4652 raid5_conf_t *conf = mddev_to_conf(mddev); 4415 raid5_conf_t *conf = mddev_to_conf(mddev);
4653 4416
4654 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 4417 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
4655 mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1; 4418 mddev->array_sectors = sectors * (mddev->raid_disks
4656 set_capacity(mddev->gendisk, mddev->array_size << 1); 4419 - conf->max_degraded);
4420 set_capacity(mddev->gendisk, mddev->array_sectors);
4657 mddev->changed = 1; 4421 mddev->changed = 1;
4658 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { 4422 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
4659 mddev->recovery_cp = mddev->size << 1; 4423 mddev->recovery_cp = mddev->size << 1;
@@ -4738,7 +4502,7 @@ static int raid5_start_reshape(mddev_t *mddev)
4738 rdev_for_each(rdev, rtmp, mddev) 4502 rdev_for_each(rdev, rtmp, mddev)
4739 if (rdev->raid_disk < 0 && 4503 if (rdev->raid_disk < 0 &&
4740 !test_bit(Faulty, &rdev->flags)) { 4504 !test_bit(Faulty, &rdev->flags)) {
4741 if (raid5_add_disk(mddev, rdev)) { 4505 if (raid5_add_disk(mddev, rdev) == 0) {
4742 char nm[20]; 4506 char nm[20];
4743 set_bit(In_sync, &rdev->flags); 4507 set_bit(In_sync, &rdev->flags);
4744 added_devices++; 4508 added_devices++;
@@ -4786,15 +4550,16 @@ static void end_reshape(raid5_conf_t *conf)
4786 struct block_device *bdev; 4550 struct block_device *bdev;
4787 4551
4788 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 4552 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
4789 conf->mddev->array_size = conf->mddev->size * 4553 conf->mddev->array_sectors = 2 * conf->mddev->size *
4790 (conf->raid_disks - conf->max_degraded); 4554 (conf->raid_disks - conf->max_degraded);
4791 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); 4555 set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
4792 conf->mddev->changed = 1; 4556 conf->mddev->changed = 1;
4793 4557
4794 bdev = bdget_disk(conf->mddev->gendisk, 0); 4558 bdev = bdget_disk(conf->mddev->gendisk, 0);
4795 if (bdev) { 4559 if (bdev) {
4796 mutex_lock(&bdev->bd_inode->i_mutex); 4560 mutex_lock(&bdev->bd_inode->i_mutex);
4797 i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10); 4561 i_size_write(bdev->bd_inode,
4562 (loff_t)conf->mddev->array_sectors << 9);
4798 mutex_unlock(&bdev->bd_inode->i_mutex); 4563 mutex_unlock(&bdev->bd_inode->i_mutex);
4799 bdput(bdev); 4564 bdput(bdev);
4800 } 4565 }
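[editor's note] The remaining hunks convert mddev->array_size (KiB) to mddev->array_sectors (512-byte sectors), which is why every user gains or loses a shift: run() and end_reshape() multiply by 2 instead of storing KiB, set_capacity() takes array_sectors directly instead of array_size << 1, raid5_resize() drops its >> 1, and i_size_write() uses << 9 instead of << 10. The arithmetic, spelled out as a small standalone example:

    #include <stdio.h>

    /* unit change only: mddev->size is in KiB, a sector is 512 bytes, so
     *   sectors = 2 * KiB        (old code shifted array_size left by 1)
     *   bytes   = sectors << 9   (old code shifted array_size left by 10)  */
    int main(void)
    {
        unsigned long long size_kib = 4ULL * 1024 * 1024;  /* 4 GiB per device */
        int data_disks = 3;                                /* raid_disks - max_degraded */

        unsigned long long array_sectors = 2ULL * size_kib * data_disks;
        unsigned long long array_bytes   = array_sectors << 9;

        printf("array: %llu sectors, %llu bytes\n", array_sectors, array_bytes);
        return 0;
    }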