author    Linus Torvalds <torvalds@linux-foundation.org>  2011-05-23 01:03:03 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2011-05-23 01:03:03 -0400
commit    4b382d0643603819e8b48da58efc254cabc22574 (patch)
tree      71ae8bc989af8a0137c065e4741a76dc4e4d4cb8 /drivers
parent    bdfbe804c2303cb4b178bb4b5c3e855892472033 (diff)
parent    b098636cf04c89db4036fedc778da0acc666ad1a (diff)
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md:
  md: allow resync_start to be set while an array is active.
  md/raid10: reformat some loops with less indenting.
  md/raid10: remove unused variable.
  md/raid10: make more use of 'slot' in raid10d.
  md/raid10: some tidying up in fix_read_error
  md/raid1: improve handling of pages allocated for write-behind.
  md/raid1: try fix_sync_read_error before process_checks.
  md/raid1: tidy up new functions: process_checks and fix_sync_read_error.
  md/raid1: split out two sub-functions from sync_request_write
  md: make error_handler functions more uniform and correct.
  md/multipath: discard ->working_disks in favour of ->degraded
  md/raid1: clean up read_balance.
  md: simplify raid10 read_balance
  md/bitmap: fix saving of events_cleared and other state.
  md: reject a re-add request that cannot be honoured.
  md: Fix race when creating a new md device.
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/bitmap.c    |  10
-rw-r--r--  drivers/md/md.c        |  23
-rw-r--r--  drivers/md/multipath.c |  60
-rw-r--r--  drivers/md/multipath.h |   1
-rw-r--r--  drivers/md/raid1.c     | 506
-rw-r--r--  drivers/md/raid1.h     |   4
-rw-r--r--  drivers/md/raid10.c    | 424
-rw-r--r--  drivers/md/raid5.c     |  41
8 files changed, 535 insertions(+), 534 deletions(-)
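The largest change in this merge is the raid1/raid10 read_balance() cleanup: instead of first picking any operational disk and then re-scanning for a closer head position, the rewritten code keeps a running best candidate while making a single pass over the mirrors. A minimal stand-alone sketch of that selection pattern (simplified types and field names, not the kernel code itself):

#include <stdlib.h>

/* Simplified mirror descriptor; the real code uses conf->mirrors[] and rdev flags. */
struct mirror {
	int in_sync;              /* device holds valid data */
	int write_mostly;         /* prefer other disks for reads */
	long long head_position;  /* last known head position, in sectors */
	int nr_pending;           /* in-flight requests */
};

/* One-pass selection: an idle or zero-distance disk wins immediately,
 * write-mostly disks are only a last resort, otherwise the smallest
 * head distance wins.  Returns an index, or -1 if nothing is usable. */
static int pick_read_disk(struct mirror *m, int ndisks, long long sector)
{
	long long best_dist = -1;
	int best = -1, i;

	for (i = 0; i < ndisks; i++) {
		long long dist;

		if (!m[i].in_sync)
			continue;
		if (m[i].write_mostly) {
			if (best < 0)
				best = i;   /* last resort only */
			continue;
		}
		dist = llabs(sector - m[i].head_position);
		if (dist == 0 || m[i].nr_pending == 0)
			return i;           /* already there, or idle */
		if (best_dist < 0 || dist < best_dist) {
			best_dist = dist;
			best = i;
		}
	}
	return best;
}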
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 5c9362792f1d..70bd738b8b99 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -493,11 +493,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 	sb->events = cpu_to_le64(bitmap->mddev->events);
-	if (bitmap->mddev->events < bitmap->events_cleared) {
+	if (bitmap->mddev->events < bitmap->events_cleared)
 		/* rocking back to read-only */
 		bitmap->events_cleared = bitmap->mddev->events;
-		sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
-	}
+	sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
+	sb->state = cpu_to_le32(bitmap->flags);
 	/* Just in case these have been changed via sysfs: */
 	sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
 	sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
@@ -618,7 +618,7 @@ success:
 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
 		bitmap->flags |= BITMAP_HOSTENDIAN;
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
-	if (sb->state & cpu_to_le32(BITMAP_STALE))
+	if (bitmap->flags & BITMAP_STALE)
 		bitmap->events_cleared = bitmap->mddev->events;
 	err = 0;
 out:
@@ -652,9 +652,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 	switch (op) {
 	case MASK_SET:
 		sb->state |= cpu_to_le32(bits);
+		bitmap->flags |= bits;
 		break;
 	case MASK_UNSET:
 		sb->state &= cpu_to_le32(~bits);
+		bitmap->flags &= ~bits;
 		break;
 	default:
 		BUG();
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7d6f7f18a920..aa640a85bb21 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3324,7 +3324,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
 	char *e;
 	unsigned long long n = simple_strtoull(buf, &e, 10);
 
-	if (mddev->pers)
+	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
 		return -EBUSY;
 	if (cmd_match(buf, "none"))
 		n = MaxSector;
@@ -4347,13 +4347,19 @@ static int md_alloc(dev_t dev, char *name)
 	disk->fops = &md_fops;
 	disk->private_data = mddev;
 	disk->queue = mddev->queue;
+	blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
 	/* Allow extended partitions.  This makes the
 	 * 'mdp' device redundant, but we can't really
 	 * remove it now.
 	 */
 	disk->flags |= GENHD_FL_EXT_DEVT;
-	add_disk(disk);
 	mddev->gendisk = disk;
+	/* As soon as we call add_disk(), another thread could get
+	 * through to md_open, so make sure it doesn't get too far
+	 */
+	mutex_lock(&mddev->open_mutex);
+	add_disk(disk);
+
 	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
 				     &disk_to_dev(disk)->kobj, "%s", "md");
 	if (error) {
@@ -4367,8 +4373,7 @@ static int md_alloc(dev_t dev, char *name)
 	if (mddev->kobj.sd &&
 	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
 		printk(KERN_DEBUG "pointless warning\n");
-
-	blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
+	mutex_unlock(&mddev->open_mutex);
  abort:
 	mutex_unlock(&disks_mutex);
 	if (!error && mddev->kobj.sd) {
@@ -5211,6 +5216,16 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		} else
 			super_types[mddev->major_version].
 				validate_super(mddev, rdev);
+		if ((info->state & (1<<MD_DISK_SYNC)) &&
+		    (!test_bit(In_sync, &rdev->flags) ||
+		     rdev->raid_disk != info->raid_disk)) {
+			/* This was a hot-add request, but events doesn't
+			 * match, so reject it.
+			 */
+			export_rdev(rdev);
+			return -EINVAL;
+		}
+
 		if (test_bit(In_sync, &rdev->flags))
 			rdev->saved_raid_disk = rdev->raid_disk;
 		else
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index c35890990985..3535c23af288 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -146,7 +146,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev)
 	int i;
 
 	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
-		    conf->working_disks);
+		    conf->raid_disks - mddev->degraded);
 	for (i = 0; i < conf->raid_disks; i++)
 		seq_printf (seq, "%s",
 			    conf->multipaths[i].rdev &&
@@ -186,35 +186,36 @@ static int multipath_congested(void *data, int bits)
 static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	multipath_conf_t *conf = mddev->private;
+	char b[BDEVNAME_SIZE];
 
-	if (conf->working_disks <= 1) {
+	if (conf->raid_disks - mddev->degraded <= 1) {
 		/*
 		 * Uh oh, we can do nothing if this is our last path, but
 		 * first check if this is a queued request for a device
 		 * which has just failed.
 		 */
 		printk(KERN_ALERT
 		       "multipath: only one IO path left and IO error.\n");
 		/* leave it active... it's all we have */
-	} else {
-		/*
-		 * Mark disk as unusable
-		 */
-		if (!test_bit(Faulty, &rdev->flags)) {
-			char b[BDEVNAME_SIZE];
-			clear_bit(In_sync, &rdev->flags);
-			set_bit(Faulty, &rdev->flags);
-			set_bit(MD_CHANGE_DEVS, &mddev->flags);
-			conf->working_disks--;
-			mddev->degraded++;
-			printk(KERN_ALERT "multipath: IO failure on %s,"
-				" disabling IO path.\n"
-				"multipath: Operation continuing"
-				" on %d IO paths.\n",
-				bdevname (rdev->bdev,b),
-				conf->working_disks);
-		}
+		return;
 	}
+	/*
+	 * Mark disk as unusable
+	 */
+	if (test_and_clear_bit(In_sync, &rdev->flags)) {
+		unsigned long flags;
+		spin_lock_irqsave(&conf->device_lock, flags);
+		mddev->degraded++;
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+	}
+	set_bit(Faulty, &rdev->flags);
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	printk(KERN_ALERT "multipath: IO failure on %s,"
+	       " disabling IO path.\n"
+	       "multipath: Operation continuing"
+	       " on %d IO paths.\n",
+	       bdevname(rdev->bdev, b),
+	       conf->raid_disks - mddev->degraded);
 }
 
 static void print_multipath_conf (multipath_conf_t *conf)
@@ -227,7 +228,7 @@ static void print_multipath_conf (multipath_conf_t *conf)
227 printk("(conf==NULL)\n"); 228 printk("(conf==NULL)\n");
228 return; 229 return;
229 } 230 }
230 printk(" --- wd:%d rd:%d\n", conf->working_disks, 231 printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
231 conf->raid_disks); 232 conf->raid_disks);
232 233
233 for (i = 0; i < conf->raid_disks; i++) { 234 for (i = 0; i < conf->raid_disks; i++) {
@@ -274,10 +275,11 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 				  PAGE_CACHE_SIZE - 1);
 			}
 
-			conf->working_disks++;
+			spin_lock_irq(&conf->device_lock);
 			mddev->degraded--;
 			rdev->raid_disk = path;
 			set_bit(In_sync, &rdev->flags);
+			spin_unlock_irq(&conf->device_lock);
 			rcu_assign_pointer(p->rdev, rdev);
 			err = 0;
 			md_integrity_add_rdev(rdev, mddev);
@@ -391,6 +393,7 @@ static int multipath_run (mddev_t *mddev)
 	int disk_idx;
 	struct multipath_info *disk;
 	mdk_rdev_t *rdev;
+	int working_disks;
 
 	if (md_check_no_bitmap(mddev))
 		return -EINVAL;
@@ -424,7 +427,7 @@ static int multipath_run (mddev_t *mddev)
 		goto out_free_conf;
 	}
 
-	conf->working_disks = 0;
+	working_disks = 0;
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		disk_idx = rdev->raid_disk;
 		if (disk_idx < 0 ||
@@ -446,7 +449,7 @@ static int multipath_run (mddev_t *mddev)
 		}
 
 		if (!test_bit(Faulty, &rdev->flags))
-			conf->working_disks++;
+			working_disks++;
 	}
 
 	conf->raid_disks = mddev->raid_disks;
@@ -454,12 +457,12 @@ static int multipath_run (mddev_t *mddev)
 	spin_lock_init(&conf->device_lock);
 	INIT_LIST_HEAD(&conf->retry_list);
 
-	if (!conf->working_disks) {
+	if (!working_disks) {
 		printk(KERN_ERR "multipath: no operational IO paths for %s\n",
 		       mdname(mddev));
 		goto out_free_conf;
 	}
-	mddev->degraded = conf->raid_disks - conf->working_disks;
+	mddev->degraded = conf->raid_disks - working_disks;
 
 	conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
 						 sizeof(struct multipath_bh));
@@ -481,7 +484,8 @@ static int multipath_run (mddev_t *mddev)
 
 	printk(KERN_INFO
 		"multipath: array %s active with %d out of %d IO paths\n",
-		mdname(mddev), conf->working_disks, mddev->raid_disks);
+		mdname(mddev), conf->raid_disks - mddev->degraded,
+		mddev->raid_disks);
 	/*
 	 * Ok, everything is just fine now
 	 */
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
index d1c2a8d78395..3c5a45eb5f8a 100644
--- a/drivers/md/multipath.h
+++ b/drivers/md/multipath.h
@@ -9,7 +9,6 @@ struct multipath_private_data {
 	mddev_t			*mddev;
 	struct multipath_info	*multipaths;
 	int			raid_disks;
-	int			working_disks;
 	spinlock_t		device_lock;
 	struct list_head	retry_list;
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2b7a7ff401dc..5d096096f958 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -297,23 +297,24 @@ static void raid1_end_read_request(struct bio *bio, int error)
 	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 }
 
-static void r1_bio_write_done(r1bio_t *r1_bio, int vcnt, struct bio_vec *bv,
-			      int behind)
+static void r1_bio_write_done(r1bio_t *r1_bio)
 {
 	if (atomic_dec_and_test(&r1_bio->remaining))
 	{
 		/* it really is the end of this request */
 		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
 			/* free extra copy of the data pages */
-			int i = vcnt;
+			int i = r1_bio->behind_page_count;
 			while (i--)
-				safe_put_page(bv[i].bv_page);
+				safe_put_page(r1_bio->behind_pages[i]);
+			kfree(r1_bio->behind_pages);
+			r1_bio->behind_pages = NULL;
 		}
 		/* clear the bitmap if all writes complete successfully */
 		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
 				r1_bio->sectors,
 				!test_bit(R1BIO_Degraded, &r1_bio->state),
-				behind);
+				test_bit(R1BIO_BehindIO, &r1_bio->state));
 		md_write_end(r1_bio->mddev);
 		raid_end_bio_io(r1_bio);
 	}
@@ -386,7 +387,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
 	 * Let's see if all mirrored write operations have finished
 	 * already.
 	 */
-	r1_bio_write_done(r1_bio, bio->bi_vcnt, bio->bi_io_vec, behind);
+	r1_bio_write_done(r1_bio);
 
 	if (to_put)
 		bio_put(to_put);
@@ -411,10 +412,10 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 {
 	const sector_t this_sector = r1_bio->sector;
 	const int sectors = r1_bio->sectors;
-	int new_disk = -1;
 	int start_disk;
+	int best_disk;
 	int i;
-	sector_t new_distance, current_distance;
+	sector_t best_dist;
 	mdk_rdev_t *rdev;
 	int choose_first;
 
@@ -425,6 +426,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 	 * We take the first readable disk when above the resync window.
 	 */
  retry:
+	best_disk = -1;
+	best_dist = MaxSector;
 	if (conf->mddev->recovery_cp < MaxSector &&
 	    (this_sector + sectors >= conf->next_resync)) {
 		choose_first = 1;
@@ -434,8 +437,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 		start_disk = conf->last_used;
 	}
 
-	/* make sure the disk is operational */
 	for (i = 0 ; i < conf->raid_disks ; i++) {
+		sector_t dist;
 		int disk = start_disk + i;
 		if (disk >= conf->raid_disks)
 			disk -= conf->raid_disks;
@@ -443,60 +446,43 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
 		    || rdev == NULL
-		    || !test_bit(In_sync, &rdev->flags))
+		    || test_bit(Faulty, &rdev->flags))
 			continue;
-
-		new_disk = disk;
-		if (!test_bit(WriteMostly, &rdev->flags))
-			break;
-	}
-
-	if (new_disk < 0 || choose_first)
-		goto rb_out;
-
-	/*
-	 * Don't change to another disk for sequential reads:
-	 */
-	if (conf->next_seq_sect == this_sector)
-		goto rb_out;
-	if (this_sector == conf->mirrors[new_disk].head_position)
-		goto rb_out;
-
-	current_distance = abs(this_sector
-			       - conf->mirrors[new_disk].head_position);
-
-	/* look for a better disk - i.e. head is closer */
-	start_disk = new_disk;
-	for (i = 1; i < conf->raid_disks; i++) {
-		int disk = start_disk + 1;
-		if (disk >= conf->raid_disks)
-			disk -= conf->raid_disks;
-
-		rdev = rcu_dereference(conf->mirrors[disk].rdev);
-		if (r1_bio->bios[disk] == IO_BLOCKED
-		    || rdev == NULL
-		    || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(WriteMostly, &rdev->flags))
+		if (!test_bit(In_sync, &rdev->flags) &&
+		    rdev->recovery_offset < this_sector + sectors)
 			continue;
-
-		if (!atomic_read(&rdev->nr_pending)) {
-			new_disk = disk;
+		if (test_bit(WriteMostly, &rdev->flags)) {
+			/* Don't balance among write-mostly, just
+			 * use the first as a last resort */
+			if (best_disk < 0)
+				best_disk = disk;
+			continue;
+		}
+		/* This is a reasonable device to use.  It might
+		 * even be best.
+		 */
+		dist = abs(this_sector - conf->mirrors[disk].head_position);
+		if (choose_first
+		    /* Don't change to another disk for sequential reads */
+		    || conf->next_seq_sect == this_sector
+		    || dist == 0
+		    /* If device is idle, use it */
+		    || atomic_read(&rdev->nr_pending) == 0) {
+			best_disk = disk;
 			break;
 		}
-		new_distance = abs(this_sector - conf->mirrors[disk].head_position);
-		if (new_distance < current_distance) {
-			current_distance = new_distance;
-			new_disk = disk;
+		if (dist < best_dist) {
+			best_dist = dist;
+			best_disk = disk;
 		}
 	}
 
-rb_out:
-	if (new_disk >= 0) {
-		rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
+	if (best_disk >= 0) {
+		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
 		if (!rdev)
 			goto retry;
 		atomic_inc(&rdev->nr_pending);
-		if (!test_bit(In_sync, &rdev->flags)) {
+		if (test_bit(Faulty, &rdev->flags)) {
 			/* cannot risk returning a device that failed
 			 * before we inc'ed nr_pending
 			 */
513 499
514static int raid1_congested(void *data, int bits) 500static int raid1_congested(void *data, int bits)
@@ -675,37 +661,36 @@ static void unfreeze_array(conf_t *conf)
 
 
 /* duplicate the data pages for behind I/O
- * We return a list of bio_vec rather than just page pointers
- * as it makes freeing easier
  */
-static struct bio_vec *alloc_behind_pages(struct bio *bio)
+static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
 {
 	int i;
 	struct bio_vec *bvec;
-	struct bio_vec *pages = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
+	struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
 					GFP_NOIO);
 	if (unlikely(!pages))
-		goto do_sync_io;
+		return;
 
 	bio_for_each_segment(bvec, bio, i) {
-		pages[i].bv_page = alloc_page(GFP_NOIO);
-		if (unlikely(!pages[i].bv_page))
+		pages[i] = alloc_page(GFP_NOIO);
+		if (unlikely(!pages[i]))
 			goto do_sync_io;
-		memcpy(kmap(pages[i].bv_page) + bvec->bv_offset,
+		memcpy(kmap(pages[i]) + bvec->bv_offset,
 		       kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
-		kunmap(pages[i].bv_page);
+		kunmap(pages[i]);
 		kunmap(bvec->bv_page);
 	}
-
-	return pages;
+	r1_bio->behind_pages = pages;
+	r1_bio->behind_page_count = bio->bi_vcnt;
+	set_bit(R1BIO_BehindIO, &r1_bio->state);
+	return;
 
 do_sync_io:
-	if (pages)
-		for (i = 0; i < bio->bi_vcnt && pages[i].bv_page; i++)
-			put_page(pages[i].bv_page);
+	for (i = 0; i < bio->bi_vcnt; i++)
+		if (pages[i])
+			put_page(pages[i]);
 	kfree(pages);
 	PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
-	return NULL;
 }
 
 static int make_request(mddev_t *mddev, struct bio * bio)
@@ -717,7 +702,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	int i, targets = 0, disks;
 	struct bitmap *bitmap;
 	unsigned long flags;
-	struct bio_vec *behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
@@ -870,9 +854,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	if (bitmap &&
 	    (atomic_read(&bitmap->behind_writes)
 	     < mddev->bitmap_info.max_write_behind) &&
-	    !waitqueue_active(&bitmap->behind_wait) &&
-	    (behind_pages = alloc_behind_pages(bio)) != NULL)
-		set_bit(R1BIO_BehindIO, &r1_bio->state);
+	    !waitqueue_active(&bitmap->behind_wait))
+		alloc_behind_pages(bio, r1_bio);
 
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
@@ -893,7 +876,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
 		mbio->bi_private = r1_bio;
 
-		if (behind_pages) {
+		if (r1_bio->behind_pages) {
 			struct bio_vec *bvec;
 			int j;
 
@@ -905,7 +888,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 			 * them all
 			 */
 			__bio_for_each_segment(bvec, mbio, j, 0)
-				bvec->bv_page = behind_pages[j].bv_page;
+				bvec->bv_page = r1_bio->behind_pages[j];
 			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
 				atomic_inc(&r1_bio->behind_remaining);
 		}
@@ -915,8 +898,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		bio_list_add(&conf->pending_bio_list, mbio);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
-	r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
-	kfree(behind_pages); /* the behind pages are attached to the bios now */
+	r1_bio_write_done(r1_bio);
 
 	/* In case raid1d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
@@ -1196,194 +1178,210 @@ static void end_sync_write(struct bio *bio, int error)
1196 } 1178 }
1197} 1179}
1198 1180
1199static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) 1181static int fix_sync_read_error(r1bio_t *r1_bio)
1200{ 1182{
1183 /* Try some synchronous reads of other devices to get
1184 * good data, much like with normal read errors. Only
1185 * read into the pages we already have so we don't
1186 * need to re-issue the read request.
1187 * We don't need to freeze the array, because being in an
1188 * active sync request, there is no normal IO, and
1189 * no overlapping syncs.
1190 */
1191 mddev_t *mddev = r1_bio->mddev;
1201 conf_t *conf = mddev->private; 1192 conf_t *conf = mddev->private;
1202 int i; 1193 struct bio *bio = r1_bio->bios[r1_bio->read_disk];
1203 int disks = conf->raid_disks; 1194 sector_t sect = r1_bio->sector;
1204 struct bio *bio, *wbio; 1195 int sectors = r1_bio->sectors;
1205 1196 int idx = 0;
1206 bio = r1_bio->bios[r1_bio->read_disk];
1207 1197
1198 while(sectors) {
1199 int s = sectors;
1200 int d = r1_bio->read_disk;
1201 int success = 0;
1202 mdk_rdev_t *rdev;
1203 int start;
1208 1204
1209 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 1205 if (s > (PAGE_SIZE>>9))
1210 /* We have read all readable devices. If we haven't 1206 s = PAGE_SIZE >> 9;
1211 * got the block, then there is no hope left. 1207 do {
1212 * If we have, then we want to do a comparison 1208 if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
1213 * and skip the write if everything is the same. 1209 /* No rcu protection needed here devices
1214 * If any blocks failed to read, then we need to 1210 * can only be removed when no resync is
1215 * attempt an over-write 1211 * active, and resync is currently active
1216 */ 1212 */
1217 int primary; 1213 rdev = conf->mirrors[d].rdev;
1218 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { 1214 if (sync_page_io(rdev,
1219 for (i=0; i<mddev->raid_disks; i++) 1215 sect,
1220 if (r1_bio->bios[i]->bi_end_io == end_sync_read) 1216 s<<9,
1221 md_error(mddev, conf->mirrors[i].rdev); 1217 bio->bi_io_vec[idx].bv_page,
1218 READ, false)) {
1219 success = 1;
1220 break;
1221 }
1222 }
1223 d++;
1224 if (d == conf->raid_disks)
1225 d = 0;
1226 } while (!success && d != r1_bio->read_disk);
1222 1227
1223 md_done_sync(mddev, r1_bio->sectors, 1); 1228 if (!success) {
1229 char b[BDEVNAME_SIZE];
1230 /* Cannot read from anywhere, array is toast */
1231 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1232 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1233 " for block %llu\n",
1234 mdname(mddev),
1235 bdevname(bio->bi_bdev, b),
1236 (unsigned long long)r1_bio->sector);
1237 md_done_sync(mddev, r1_bio->sectors, 0);
1224 put_buf(r1_bio); 1238 put_buf(r1_bio);
1225 return; 1239 return 0;
1226 } 1240 }
1227 for (primary=0; primary<mddev->raid_disks; primary++)
1228 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1229 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1230 r1_bio->bios[primary]->bi_end_io = NULL;
1231 rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1232 break;
1233 }
1234 r1_bio->read_disk = primary;
1235 for (i=0; i<mddev->raid_disks; i++)
1236 if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
1237 int j;
1238 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1239 struct bio *pbio = r1_bio->bios[primary];
1240 struct bio *sbio = r1_bio->bios[i];
1241
1242 if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
1243 for (j = vcnt; j-- ; ) {
1244 struct page *p, *s;
1245 p = pbio->bi_io_vec[j].bv_page;
1246 s = sbio->bi_io_vec[j].bv_page;
1247 if (memcmp(page_address(p),
1248 page_address(s),
1249 PAGE_SIZE))
1250 break;
1251 }
1252 } else
1253 j = 0;
1254 if (j >= 0)
1255 mddev->resync_mismatches += r1_bio->sectors;
1256 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
1257 && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
1258 sbio->bi_end_io = NULL;
1259 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1260 } else {
1261 /* fixup the bio for reuse */
1262 int size;
1263 sbio->bi_vcnt = vcnt;
1264 sbio->bi_size = r1_bio->sectors << 9;
1265 sbio->bi_idx = 0;
1266 sbio->bi_phys_segments = 0;
1267 sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1268 sbio->bi_flags |= 1 << BIO_UPTODATE;
1269 sbio->bi_next = NULL;
1270 sbio->bi_sector = r1_bio->sector +
1271 conf->mirrors[i].rdev->data_offset;
1272 sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1273 size = sbio->bi_size;
1274 for (j = 0; j < vcnt ; j++) {
1275 struct bio_vec *bi;
1276 bi = &sbio->bi_io_vec[j];
1277 bi->bv_offset = 0;
1278 if (size > PAGE_SIZE)
1279 bi->bv_len = PAGE_SIZE;
1280 else
1281 bi->bv_len = size;
1282 size -= PAGE_SIZE;
1283 memcpy(page_address(bi->bv_page),
1284 page_address(pbio->bi_io_vec[j].bv_page),
1285 PAGE_SIZE);
1286 }
1287 1241
1288 } 1242 start = d;
1289 } 1243 /* write it back and re-read */
1244 while (d != r1_bio->read_disk) {
1245 if (d == 0)
1246 d = conf->raid_disks;
1247 d--;
1248 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1249 continue;
1250 rdev = conf->mirrors[d].rdev;
1251 if (sync_page_io(rdev,
1252 sect,
1253 s<<9,
1254 bio->bi_io_vec[idx].bv_page,
1255 WRITE, false) == 0) {
1256 r1_bio->bios[d]->bi_end_io = NULL;
1257 rdev_dec_pending(rdev, mddev);
1258 md_error(mddev, rdev);
1259 } else
1260 atomic_add(s, &rdev->corrected_errors);
1261 }
1262 d = start;
1263 while (d != r1_bio->read_disk) {
1264 if (d == 0)
1265 d = conf->raid_disks;
1266 d--;
1267 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1268 continue;
1269 rdev = conf->mirrors[d].rdev;
1270 if (sync_page_io(rdev,
1271 sect,
1272 s<<9,
1273 bio->bi_io_vec[idx].bv_page,
1274 READ, false) == 0)
1275 md_error(mddev, rdev);
1276 }
1277 sectors -= s;
1278 sect += s;
1279 idx ++;
1290 } 1280 }
1291 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { 1281 set_bit(R1BIO_Uptodate, &r1_bio->state);
1292 /* ouch - failed to read all of that. 1282 set_bit(BIO_UPTODATE, &bio->bi_flags);
1293 * Try some synchronous reads of other devices to get 1283 return 1;
1294 * good data, much like with normal read errors. Only 1284}
1295 * read into the pages we already have so we don't 1285
1296 * need to re-issue the read request. 1286static int process_checks(r1bio_t *r1_bio)
1297 * We don't need to freeze the array, because being in an 1287{
1298 * active sync request, there is no normal IO, and 1288 /* We have read all readable devices. If we haven't
1299 * no overlapping syncs. 1289 * got the block, then there is no hope left.
1300 */ 1290 * If we have, then we want to do a comparison
1301 sector_t sect = r1_bio->sector; 1291 * and skip the write if everything is the same.
1302 int sectors = r1_bio->sectors; 1292 * If any blocks failed to read, then we need to
1303 int idx = 0; 1293 * attempt an over-write
1304 1294 */
1305 while(sectors) { 1295 mddev_t *mddev = r1_bio->mddev;
1306 int s = sectors; 1296 conf_t *conf = mddev->private;
1307 int d = r1_bio->read_disk; 1297 int primary;
1308 int success = 0; 1298 int i;
1309 mdk_rdev_t *rdev; 1299
1310 1300 for (primary = 0; primary < conf->raid_disks; primary++)
1311 if (s > (PAGE_SIZE>>9)) 1301 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1312 s = PAGE_SIZE >> 9; 1302 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1313 do { 1303 r1_bio->bios[primary]->bi_end_io = NULL;
1314 if (r1_bio->bios[d]->bi_end_io == end_sync_read) { 1304 rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1315 /* No rcu protection needed here devices 1305 break;
1316 * can only be removed when no resync is 1306 }
1317 * active, and resync is currently active 1307 r1_bio->read_disk = primary;
1318 */ 1308 for (i = 0; i < conf->raid_disks; i++) {
1319 rdev = conf->mirrors[d].rdev; 1309 int j;
1320 if (sync_page_io(rdev, 1310 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1321 sect, 1311 struct bio *pbio = r1_bio->bios[primary];
1322 s<<9, 1312 struct bio *sbio = r1_bio->bios[i];
1323 bio->bi_io_vec[idx].bv_page, 1313 int size;
1324 READ, false)) { 1314
1325 success = 1; 1315 if (r1_bio->bios[i]->bi_end_io != end_sync_read)
1326 break; 1316 continue;
1327 } 1317
1328 } 1318 if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
1329 d++; 1319 for (j = vcnt; j-- ; ) {
1330 if (d == conf->raid_disks) 1320 struct page *p, *s;
1331 d = 0; 1321 p = pbio->bi_io_vec[j].bv_page;
1332 } while (!success && d != r1_bio->read_disk); 1322 s = sbio->bi_io_vec[j].bv_page;
1333 1323 if (memcmp(page_address(p),
1334 if (success) { 1324 page_address(s),
1335 int start = d; 1325 PAGE_SIZE))
1336 /* write it back and re-read */ 1326 break;
1337 set_bit(R1BIO_Uptodate, &r1_bio->state);
1338 while (d != r1_bio->read_disk) {
1339 if (d == 0)
1340 d = conf->raid_disks;
1341 d--;
1342 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1343 continue;
1344 rdev = conf->mirrors[d].rdev;
1345 atomic_add(s, &rdev->corrected_errors);
1346 if (sync_page_io(rdev,
1347 sect,
1348 s<<9,
1349 bio->bi_io_vec[idx].bv_page,
1350 WRITE, false) == 0)
1351 md_error(mddev, rdev);
1352 }
1353 d = start;
1354 while (d != r1_bio->read_disk) {
1355 if (d == 0)
1356 d = conf->raid_disks;
1357 d--;
1358 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1359 continue;
1360 rdev = conf->mirrors[d].rdev;
1361 if (sync_page_io(rdev,
1362 sect,
1363 s<<9,
1364 bio->bi_io_vec[idx].bv_page,
1365 READ, false) == 0)
1366 md_error(mddev, rdev);
1367 }
1368 } else {
1369 char b[BDEVNAME_SIZE];
1370 /* Cannot read from anywhere, array is toast */
1371 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1372 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1373 " for block %llu\n",
1374 mdname(mddev),
1375 bdevname(bio->bi_bdev, b),
1376 (unsigned long long)r1_bio->sector);
1377 md_done_sync(mddev, r1_bio->sectors, 0);
1378 put_buf(r1_bio);
1379 return;
1380 } 1327 }
1381 sectors -= s; 1328 } else
1382 sect += s; 1329 j = 0;
1383 idx ++; 1330 if (j >= 0)
1331 mddev->resync_mismatches += r1_bio->sectors;
1332 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
1333 && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
1334 /* No need to write to this device. */
1335 sbio->bi_end_io = NULL;
1336 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1337 continue;
1338 }
1339 /* fixup the bio for reuse */
1340 sbio->bi_vcnt = vcnt;
1341 sbio->bi_size = r1_bio->sectors << 9;
1342 sbio->bi_idx = 0;
1343 sbio->bi_phys_segments = 0;
1344 sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1345 sbio->bi_flags |= 1 << BIO_UPTODATE;
1346 sbio->bi_next = NULL;
1347 sbio->bi_sector = r1_bio->sector +
1348 conf->mirrors[i].rdev->data_offset;
1349 sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1350 size = sbio->bi_size;
1351 for (j = 0; j < vcnt ; j++) {
1352 struct bio_vec *bi;
1353 bi = &sbio->bi_io_vec[j];
1354 bi->bv_offset = 0;
1355 if (size > PAGE_SIZE)
1356 bi->bv_len = PAGE_SIZE;
1357 else
1358 bi->bv_len = size;
1359 size -= PAGE_SIZE;
1360 memcpy(page_address(bi->bv_page),
1361 page_address(pbio->bi_io_vec[j].bv_page),
1362 PAGE_SIZE);
1384 } 1363 }
1385 } 1364 }
1365 return 0;
1366}
1386 1367
1368static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1369{
1370 conf_t *conf = mddev->private;
1371 int i;
1372 int disks = conf->raid_disks;
1373 struct bio *bio, *wbio;
1374
1375 bio = r1_bio->bios[r1_bio->read_disk];
1376
1377 if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
1378 /* ouch - failed to read all of that. */
1379 if (!fix_sync_read_error(r1_bio))
1380 return;
1381
1382 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
1383 if (process_checks(r1_bio) < 0)
1384 return;
1387 /* 1385 /*
1388 * schedule writes 1386 * schedule writes
1389 */ 1387 */
@@ -2063,7 +2061,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors &&
-	    mddev->recovery_cp == MaxSector) {
+	    mddev->recovery_cp > mddev->dev_sectors) {
 		mddev->recovery_cp = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index cbfdf1a6acd9..5fc4ca1af863 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -94,7 +94,9 @@ struct r1bio_s {
 	int			read_disk;
 
 	struct list_head	retry_list;
-	struct bitmap_update	*bitmap_update;
+	/* Next two are only valid when R1BIO_BehindIO is set */
+	struct page		**behind_pages;
+	int			behind_page_count;
 	/*
 	 * if the IO is in WRITE direction, then multiple bios are used.
 	 * We choose the number when they are allocated.
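The raid1.h hunk above replaces the bio_vec array that alloc_behind_pages() used to return with a plain page-pointer array plus a count stored in the r1bio itself, so the write-behind copies can be freed from the r1bio when the last mirrored write completes. A rough sketch of that ownership pattern in plain C (hypothetical names, not the kernel API):

#include <stdlib.h>
#include <string.h>

#define PAGE_SZ 4096  /* stand-in for one page; the kernel uses struct page */

struct behind_req {
	unsigned char **pages;  /* private copies of the data for write-behind */
	int page_count;         /* only valid when pages != NULL */
};

/* Duplicate the payload so the caller's buffers can be released early.
 * On allocation failure everything is undone and the caller simply falls
 * back to a normal (non-behind) write. */
static int alloc_behind_copies(struct behind_req *r, unsigned char **src, int n)
{
	int i;

	r->pages = calloc(n, sizeof(*r->pages));
	if (!r->pages)
		return -1;
	for (i = 0; i < n; i++) {
		r->pages[i] = malloc(PAGE_SZ);
		if (!r->pages[i])
			goto undo;
		memcpy(r->pages[i], src[i], PAGE_SZ);
	}
	r->page_count = n;
	return 0;
undo:
	while (i--)
		free(r->pages[i]);
	free(r->pages);
	r->pages = NULL;
	return -1;
}

/* Called once, when the last mirrored write has completed. */
static void free_behind_copies(struct behind_req *r)
{
	int i;

	for (i = 0; i < r->page_count; i++)
		free(r->pages[i]);
	free(r->pages);
	r->pages = NULL;
	r->page_count = 0;
}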
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8e9462626ec5..6e846688962f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -271,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 */
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 		raid_end_bio_io(r10_bio);
+		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 	} else {
 		/*
-		 * oops, read error:
+		 * oops, read error - keep the refcount on the rdev
 		 */
 		char b[BDEVNAME_SIZE];
 		if (printk_ratelimit())
@@ -282,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error)
 			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
 		reschedule_retry(r10_bio);
 	}
-
-	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 }
 
 static void raid10_end_write_request(struct bio *bio, int error)
@@ -488,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 {
 	const sector_t this_sector = r10_bio->sector;
-	int disk, slot, nslot;
+	int disk, slot;
 	const int sectors = r10_bio->sectors;
-	sector_t new_distance, current_distance;
+	sector_t new_distance, best_dist;
 	mdk_rdev_t *rdev;
+	int do_balance;
+	int best_slot;
 
 	raid10_find_phys(conf, r10_bio);
 	rcu_read_lock();
+retry:
+	best_slot = -1;
+	best_dist = MaxSector;
+	do_balance = 1;
 	/*
 	 * Check if we can balance. We can balance on the whole
 	 * device if no resync is going on (recovery is ok), or below
@@ -502,86 +507,58 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 	 * above the resync window.
 	 */
 	if (conf->mddev->recovery_cp < MaxSector
-	    && (this_sector + sectors >= conf->next_resync)) {
-		/* make sure that disk is operational */
-		slot = 0;
-		disk = r10_bio->devs[slot].devnum;
-
-		while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-		       r10_bio->devs[slot].bio == IO_BLOCKED ||
-		       !test_bit(In_sync, &rdev->flags)) {
-			slot++;
-			if (slot == conf->copies) {
-				slot = 0;
-				disk = -1;
-				break;
-			}
-			disk = r10_bio->devs[slot].devnum;
-		}
-		goto rb_out;
-	}
-
+	    && (this_sector + sectors >= conf->next_resync))
+		do_balance = 0;
 
-	/* make sure the disk is operational */
-	slot = 0;
-	disk = r10_bio->devs[slot].devnum;
-	while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-	       r10_bio->devs[slot].bio == IO_BLOCKED ||
-	       !test_bit(In_sync, &rdev->flags)) {
-		slot ++;
-		if (slot == conf->copies) {
-			disk = -1;
-			goto rb_out;
-		}
+	for (slot = 0; slot < conf->copies ; slot++) {
+		if (r10_bio->devs[slot].bio == IO_BLOCKED)
+			continue;
 		disk = r10_bio->devs[slot].devnum;
-	}
-
-
-	current_distance = abs(r10_bio->devs[slot].addr -
-			       conf->mirrors[disk].head_position);
-
-	/* Find the disk whose head is closest,
-	 * or - for far > 1 - find the closest to partition beginning */
-
-	for (nslot = slot; nslot < conf->copies; nslot++) {
-		int ndisk = r10_bio->devs[nslot].devnum;
-
-
-		if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
-		    r10_bio->devs[nslot].bio == IO_BLOCKED ||
-		    !test_bit(In_sync, &rdev->flags))
+		rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		if (rdev == NULL)
 			continue;
+		if (!test_bit(In_sync, &rdev->flags))
+			continue;
+
+		if (!do_balance)
+			break;
 
 		/* This optimisation is debatable, and completely destroys
 		 * sequential read speed for 'far copies' arrays.  So only
 		 * keep it for 'near' arrays, and review those later.
 		 */
-		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
-			disk = ndisk;
-			slot = nslot;
+		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
 			break;
-		}
 
 		/* for far > 1 always use the lowest address */
 		if (conf->far_copies > 1)
-			new_distance = r10_bio->devs[nslot].addr;
+			new_distance = r10_bio->devs[slot].addr;
 		else
-			new_distance = abs(r10_bio->devs[nslot].addr -
-					   conf->mirrors[ndisk].head_position);
-		if (new_distance < current_distance) {
-			current_distance = new_distance;
-			disk = ndisk;
-			slot = nslot;
+			new_distance = abs(r10_bio->devs[slot].addr -
+					   conf->mirrors[disk].head_position);
+		if (new_distance < best_dist) {
+			best_dist = new_distance;
+			best_slot = slot;
 		}
 	}
+	if (slot == conf->copies)
+		slot = best_slot;
 
-rb_out:
-	r10_bio->read_slot = slot;
-/*	conf->next_seq_sect = this_sector + sectors;*/
-
-	if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
-		atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-	else
+	if (slot >= 0) {
+		disk = r10_bio->devs[slot].devnum;
+		rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		if (!rdev)
+			goto retry;
+		atomic_inc(&rdev->nr_pending);
+		if (test_bit(Faulty, &rdev->flags)) {
+			/* Cannot risk returning a device that failed
+			 * before we inc'ed nr_pending
+			 */
+			rdev_dec_pending(rdev, conf->mddev);
+			goto retry;
+		}
+		r10_bio->read_slot = slot;
+	} else
 		disk = -1;
 	rcu_read_unlock();
 
@@ -1460,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
 	int d = r10_bio->devs[r10_bio->read_slot].devnum;
 
-	rcu_read_lock();
-	rdev = rcu_dereference(conf->mirrors[d].rdev);
-	if (rdev) { /* If rdev is not NULL */
-		char b[BDEVNAME_SIZE];
-		int cur_read_error_count = 0;
+	/* still own a reference to this rdev, so it cannot
+	 * have been cleared recently.
+	 */
+	rdev = conf->mirrors[d].rdev;
 
-		bdevname(rdev->bdev, b);
+	if (test_bit(Faulty, &rdev->flags))
+		/* drive has already been failed, just ignore any
+		   more fix_read_error() attempts */
+		return;
 
-		if (test_bit(Faulty, &rdev->flags)) {
-			rcu_read_unlock();
-			/* drive has already been failed, just ignore any
-			   more fix_read_error() attempts */
-			return;
-		}
-
-		check_decay_read_errors(mddev, rdev);
-		atomic_inc(&rdev->read_errors);
-		cur_read_error_count = atomic_read(&rdev->read_errors);
-		if (cur_read_error_count > max_read_errors) {
-			rcu_read_unlock();
-			printk(KERN_NOTICE
-			       "md/raid10:%s: %s: Raid device exceeded "
-			       "read_error threshold "
-			       "[cur %d:max %d]\n",
-			       mdname(mddev),
-			       b, cur_read_error_count, max_read_errors);
-			printk(KERN_NOTICE
-			       "md/raid10:%s: %s: Failing raid "
-			       "device\n", mdname(mddev), b);
-			md_error(mddev, conf->mirrors[d].rdev);
-			return;
-		}
+	check_decay_read_errors(mddev, rdev);
+	atomic_inc(&rdev->read_errors);
+	if (atomic_read(&rdev->read_errors) > max_read_errors) {
+		char b[BDEVNAME_SIZE];
+		bdevname(rdev->bdev, b);
+
+		printk(KERN_NOTICE
+		       "md/raid10:%s: %s: Raid device exceeded "
+		       "read_error threshold [cur %d:max %d]\n",
+		       mdname(mddev), b,
+		       atomic_read(&rdev->read_errors), max_read_errors);
+		printk(KERN_NOTICE
+		       "md/raid10:%s: %s: Failing raid device\n",
+		       mdname(mddev), b);
+		md_error(mddev, conf->mirrors[d].rdev);
+		return;
 	}
-	rcu_read_unlock();
 
 	while(sectors) {
 		int s = sectors;
@@ -1562,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1562 "write failed" 1532 "write failed"
1563 " (%d sectors at %llu on %s)\n", 1533 " (%d sectors at %llu on %s)\n",
1564 mdname(mddev), s, 1534 mdname(mddev), s,
1565 (unsigned long long)(sect+ 1535 (unsigned long long)(
1566 rdev->data_offset), 1536 sect + rdev->data_offset),
1567 bdevname(rdev->bdev, b)); 1537 bdevname(rdev->bdev, b));
1568 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 1538 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1569 "drive\n", 1539 "drive\n",
@@ -1599,8 +1569,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1599 "corrected sectors" 1569 "corrected sectors"
1600 " (%d sectors at %llu on %s)\n", 1570 " (%d sectors at %llu on %s)\n",
1601 mdname(mddev), s, 1571 mdname(mddev), s,
1602 (unsigned long long)(sect+ 1572 (unsigned long long)(
1603 rdev->data_offset), 1573 sect + rdev->data_offset),
1604 bdevname(rdev->bdev, b)); 1574 bdevname(rdev->bdev, b));
1605 printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", 1575 printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
1606 mdname(mddev), 1576 mdname(mddev),
@@ -1612,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1612 "md/raid10:%s: read error corrected" 1582 "md/raid10:%s: read error corrected"
1613 " (%d sectors at %llu on %s)\n", 1583 " (%d sectors at %llu on %s)\n",
1614 mdname(mddev), s, 1584 mdname(mddev), s,
1615 (unsigned long long)(sect+ 1585 (unsigned long long)(
1616 rdev->data_offset), 1586 sect + rdev->data_offset),
1617 bdevname(rdev->bdev, b)); 1587 bdevname(rdev->bdev, b));
1618 } 1588 }
1619 1589
@@ -1663,7 +1633,8 @@ static void raid10d(mddev_t *mddev)
 		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
 			recovery_request_write(mddev, r10_bio);
 		else {
-			int mirror;
+			int slot = r10_bio->read_slot;
+			int mirror = r10_bio->devs[slot].devnum;
 			/* we got a read error. Maybe the drive is bad.  Maybe just
 			 * the block and we can fix it.
 			 * We freeze all other IO, and try reading the block from
@@ -1677,9 +1648,10 @@ static void raid10d(mddev_t *mddev)
 				fix_read_error(conf, mddev, r10_bio);
 				unfreeze_array(conf);
 			}
+			rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
 
-			bio = r10_bio->devs[r10_bio->read_slot].bio;
-			r10_bio->devs[r10_bio->read_slot].bio =
+			bio = r10_bio->devs[slot].bio;
+			r10_bio->devs[slot].bio =
 				mddev->ro ? IO_BLOCKED : NULL;
 			mirror = read_balance(conf, r10_bio);
 			if (mirror == -1) {
@@ -1693,6 +1665,7 @@ static void raid10d(mddev_t *mddev)
 			} else {
 				const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
 				bio_put(bio);
+				slot = r10_bio->read_slot;
 				rdev = conf->mirrors[mirror].rdev;
 				if (printk_ratelimit())
 					printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
@@ -1702,8 +1675,8 @@ static void raid10d(mddev_t *mddev)
 					       (unsigned long long)r10_bio->sector);
 				bio = bio_clone_mddev(r10_bio->master_bio,
 						      GFP_NOIO, mddev);
-				r10_bio->devs[r10_bio->read_slot].bio = bio;
-				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
+				r10_bio->devs[slot].bio = bio;
+				bio->bi_sector = r10_bio->devs[slot].addr
 					+ rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
 				bio->bi_rw = READ | do_sync;
@@ -1763,13 +1736,13 @@ static int init_resync(conf_t *conf)
  *
  */
 
-static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
+			     int *skipped, int go_faster)
 {
 	conf_t *conf = mddev->private;
 	r10bio_t *r10_bio;
 	struct bio *biolist = NULL, *bio;
 	sector_t max_sector, nr_sectors;
-	int disk;
 	int i;
 	int max_sync;
 	sector_t sync_blocks;
@@ -1858,108 +1831,114 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1858 int j, k; 1831 int j, k;
1859 r10_bio = NULL; 1832 r10_bio = NULL;
1860 1833
-		for (i=0 ; i<conf->raid_disks; i++)
-			if (conf->mirrors[i].rdev &&
-			    !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
-				int still_degraded = 0;
-				/* want to reconstruct this device */
-				r10bio_t *rb2 = r10_bio;
-				sector_t sect = raid10_find_virt(conf, sector_nr, i);
-				int must_sync;
-				/* Unless we are doing a full sync, we only need
-				 * to recover the block if it is set in the bitmap
-				 */
-				must_sync = bitmap_start_sync(mddev->bitmap, sect,
-							      &sync_blocks, 1);
-				if (sync_blocks < max_sync)
-					max_sync = sync_blocks;
-				if (!must_sync &&
-				    !conf->fullsync) {
-					/* yep, skip the sync_blocks here, but don't assume
-					 * that there will never be anything to do here
-					 */
-					chunks_skipped = -1;
-					continue;
-				}
+		for (i=0 ; i<conf->raid_disks; i++) {
+			int still_degraded;
+			r10bio_t *rb2;
+			sector_t sect;
+			int must_sync;
+
+			if (conf->mirrors[i].rdev == NULL ||
+			    test_bit(In_sync, &conf->mirrors[i].rdev->flags))
+				continue;
+
+			still_degraded = 0;
+			/* want to reconstruct this device */
+			rb2 = r10_bio;
+			sect = raid10_find_virt(conf, sector_nr, i);
+			/* Unless we are doing a full sync, we only need
+			 * to recover the block if it is set in the bitmap
+			 */
+			must_sync = bitmap_start_sync(mddev->bitmap, sect,
+						      &sync_blocks, 1);
+			if (sync_blocks < max_sync)
+				max_sync = sync_blocks;
+			if (!must_sync &&
+			    !conf->fullsync) {
+				/* yep, skip the sync_blocks here, but don't assume
+				 * that there will never be anything to do here
+				 */
+				chunks_skipped = -1;
+				continue;
+			}
 
-				r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
-				raise_barrier(conf, rb2 != NULL);
-				atomic_set(&r10_bio->remaining, 0);
+			r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+			raise_barrier(conf, rb2 != NULL);
+			atomic_set(&r10_bio->remaining, 0);
 
-				r10_bio->master_bio = (struct bio*)rb2;
-				if (rb2)
-					atomic_inc(&rb2->remaining);
-				r10_bio->mddev = mddev;
-				set_bit(R10BIO_IsRecover, &r10_bio->state);
-				r10_bio->sector = sect;
+			r10_bio->master_bio = (struct bio*)rb2;
+			if (rb2)
+				atomic_inc(&rb2->remaining);
+			r10_bio->mddev = mddev;
+			set_bit(R10BIO_IsRecover, &r10_bio->state);
+			r10_bio->sector = sect;
 
-				raid10_find_phys(conf, r10_bio);
+			raid10_find_phys(conf, r10_bio);
 
-				/* Need to check if the array will still be
-				 * degraded
-				 */
-				for (j=0; j<conf->raid_disks; j++)
-					if (conf->mirrors[j].rdev == NULL ||
-					    test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
-						still_degraded = 1;
-						break;
-					}
+			/* Need to check if the array will still be
+			 * degraded
+			 */
+			for (j=0; j<conf->raid_disks; j++)
+				if (conf->mirrors[j].rdev == NULL ||
+				    test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
+					still_degraded = 1;
+					break;
+				}
 
-				must_sync = bitmap_start_sync(mddev->bitmap, sect,
-							      &sync_blocks, still_degraded);
+			must_sync = bitmap_start_sync(mddev->bitmap, sect,
+						      &sync_blocks, still_degraded);
 
-				for (j=0; j<conf->copies;j++) {
-					int d = r10_bio->devs[j].devnum;
-					if (conf->mirrors[d].rdev &&
-					    test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
-						/* This is where we read from */
-						bio = r10_bio->devs[0].bio;
-						bio->bi_next = biolist;
-						biolist = bio;
-						bio->bi_private = r10_bio;
-						bio->bi_end_io = end_sync_read;
-						bio->bi_rw = READ;
-						bio->bi_sector = r10_bio->devs[j].addr +
-							conf->mirrors[d].rdev->data_offset;
-						bio->bi_bdev = conf->mirrors[d].rdev->bdev;
-						atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-						atomic_inc(&r10_bio->remaining);
-						/* and we write to 'i' */
+			for (j=0; j<conf->copies;j++) {
+				int d = r10_bio->devs[j].devnum;
+				if (!conf->mirrors[d].rdev ||
+				    !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
+					continue;
+				/* This is where we read from */
+				bio = r10_bio->devs[0].bio;
+				bio->bi_next = biolist;
+				biolist = bio;
+				bio->bi_private = r10_bio;
+				bio->bi_end_io = end_sync_read;
+				bio->bi_rw = READ;
+				bio->bi_sector = r10_bio->devs[j].addr +
+					conf->mirrors[d].rdev->data_offset;
+				bio->bi_bdev = conf->mirrors[d].rdev->bdev;
+				atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+				atomic_inc(&r10_bio->remaining);
+				/* and we write to 'i' */
 
-						for (k=0; k<conf->copies; k++)
-							if (r10_bio->devs[k].devnum == i)
-								break;
-						BUG_ON(k == conf->copies);
-						bio = r10_bio->devs[1].bio;
-						bio->bi_next = biolist;
-						biolist = bio;
-						bio->bi_private = r10_bio;
-						bio->bi_end_io = end_sync_write;
-						bio->bi_rw = WRITE;
-						bio->bi_sector = r10_bio->devs[k].addr +
-							conf->mirrors[i].rdev->data_offset;
-						bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+				for (k=0; k<conf->copies; k++)
+					if (r10_bio->devs[k].devnum == i)
+						break;
+				BUG_ON(k == conf->copies);
+				bio = r10_bio->devs[1].bio;
+				bio->bi_next = biolist;
+				biolist = bio;
+				bio->bi_private = r10_bio;
+				bio->bi_end_io = end_sync_write;
+				bio->bi_rw = WRITE;
+				bio->bi_sector = r10_bio->devs[k].addr +
+					conf->mirrors[i].rdev->data_offset;
+				bio->bi_bdev = conf->mirrors[i].rdev->bdev;
 
-						r10_bio->devs[0].devnum = d;
-						r10_bio->devs[1].devnum = i;
+				r10_bio->devs[0].devnum = d;
+				r10_bio->devs[1].devnum = i;
 
-						break;
-					}
-				}
-				if (j == conf->copies) {
-					/* Cannot recover, so abort the recovery */
-					put_buf(r10_bio);
-					if (rb2)
-						atomic_dec(&rb2->remaining);
-					r10_bio = rb2;
-					if (!test_and_set_bit(MD_RECOVERY_INTR,
-							      &mddev->recovery))
-						printk(KERN_INFO "md/raid10:%s: insufficient "
-						       "working devices for recovery.\n",
-						       mdname(mddev));
-					break;
-				}
-			}
+				break;
+			}
+			if (j == conf->copies) {
+				/* Cannot recover, so abort the recovery */
+				put_buf(r10_bio);
+				if (rb2)
+					atomic_dec(&rb2->remaining);
+				r10_bio = rb2;
+				if (!test_and_set_bit(MD_RECOVERY_INTR,
+						      &mddev->recovery))
+					printk(KERN_INFO "md/raid10:%s: insufficient "
+					       "working devices for recovery.\n",
+					       mdname(mddev));
+				break;
+			}
+		}
 		if (biolist == NULL) {
 			while (r10_bio) {
 				r10bio_t *rb2 = r10_bio;
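The recovery loop in the hunk above is the same work reorganised: the per-disk test is inverted and handled with an early "continue", so the real body no longer sits several "if" levels deep. A minimal stand-alone sketch of that pattern follows; the mirror_info struct, the disk values and recover_disk() are made-up placeholders, not the md structures.

/*
 * Sketch of the refactoring above: instead of wrapping the whole loop body
 * in "if (usable) { ... }", invert the test and continue early.
 * Hypothetical data only.
 */
#include <stdio.h>
#include <stdbool.h>

struct mirror_info { bool present; bool in_sync; };

static void recover_disk(int i) { printf("recovering disk %d\n", i); }

int main(void)
{
	struct mirror_info mirrors[] = {
		{ true, true }, { true, false }, { false, false }, { true, false },
	};
	int raid_disks = sizeof(mirrors) / sizeof(mirrors[0]);

	for (int i = 0; i < raid_disks; i++) {
		/* skip slots that need no work, keeping the real work unindented */
		if (!mirrors[i].present || mirrors[i].in_sync)
			continue;
		recover_disk(i);
	}
	return 0;
}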
@@ -1977,7 +1956,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 
 		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
 				       &sync_blocks, mddev->degraded) &&
-		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
+						 &mddev->recovery)) {
 			/* We can skip this block */
 			*skipped = 1;
 			return sync_blocks + sectors_skipped;
@@ -2022,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 			for (i=0; i<conf->copies; i++) {
 				int d = r10_bio->devs[i].devnum;
 				if (r10_bio->devs[i].bio->bi_end_io)
-					rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+					rdev_dec_pending(conf->mirrors[d].rdev,
+							 mddev);
 			}
 			put_buf(r10_bio);
 			biolist = NULL;
@@ -2047,26 +2028,27 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	do {
 		struct page *page;
 		int len = PAGE_SIZE;
-		disk = 0;
 		if (sector_nr + (len>>9) > max_sector)
 			len = (max_sector - sector_nr) << 9;
 		if (len == 0)
 			break;
 		for (bio= biolist ; bio ; bio=bio->bi_next) {
+			struct bio *bio2;
 			page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
-			if (bio_add_page(bio, page, len, 0) == 0) {
-				/* stop here */
-				struct bio *bio2;
-				bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
-				for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
-					/* remove last page from this bio */
-					bio2->bi_vcnt--;
-					bio2->bi_size -= len;
-					bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
-				}
-				goto bio_full;
+			if (bio_add_page(bio, page, len, 0))
+				continue;
+
+			/* stop here */
+			bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+			for (bio2 = biolist;
+			     bio2 && bio2 != bio;
+			     bio2 = bio2->bi_next) {
+				/* remove last page from this bio */
+				bio2->bi_vcnt--;
+				bio2->bi_size -= len;
+				bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
 			}
-			disk = i;
+			goto bio_full;
 		}
 		nr_sectors += len>>9;
 		sector_nr += len>>9;
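The page-filling loop above now treats a successful bio_add_page() as the common case and continues, only falling into the roll-back path when one bio refuses the page; the page just added to the earlier bios in the round is then removed. A small stand-alone sketch of the same add-then-back-out idea, using an invented fake_bio type and chunk counts rather than real struct bio:

/*
 * Each "bio" in a list gets one more fixed-size chunk per round; if any of
 * them is already full, the chunk just added to the earlier ones is backed
 * out and the round is abandoned.  struct fake_bio and its limits are
 * hypothetical.
 */
#include <stdio.h>

struct fake_bio {
	int nchunks;		/* chunks currently attached */
	int capacity;		/* chunks this bio can hold  */
	struct fake_bio *next;
};

/* returns nonzero on success, 0 if the bio is already full (like bio_add_page) */
static int fake_add_chunk(struct fake_bio *b)
{
	if (b->nchunks >= b->capacity)
		return 0;
	b->nchunks++;
	return 1;
}

int main(void)
{
	struct fake_bio c = { 0, 2, NULL }, b = { 0, 3, &c }, a = { 0, 3, &b };
	struct fake_bio *biolist = &a, *bio, *bio2;
	int rounds = 0;

	for (;;) {
		for (bio = biolist; bio; bio = bio->next) {
			if (fake_add_chunk(bio))
				continue;
			/* this bio is full: remove the chunk from the bios before it */
			for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->next)
				bio2->nchunks--;
			goto full;
		}
		rounds++;
	}
full:
	printf("completed %d full rounds (limited by the smallest bio)\n", rounds);
	return 0;
}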
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 49bf5f891435..34dd54539f7b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1700,27 +1700,25 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 	raid5_conf_t *conf = mddev->private;
 	pr_debug("raid456: error called\n");
 
-	if (!test_bit(Faulty, &rdev->flags)) {
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
-		if (test_and_clear_bit(In_sync, &rdev->flags)) {
-			unsigned long flags;
-			spin_lock_irqsave(&conf->device_lock, flags);
-			mddev->degraded++;
-			spin_unlock_irqrestore(&conf->device_lock, flags);
-			/*
-			 * if recovery was running, make sure it aborts.
-			 */
-			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-		}
-		set_bit(Faulty, &rdev->flags);
-		printk(KERN_ALERT
-		       "md/raid:%s: Disk failure on %s, disabling device.\n"
-		       "md/raid:%s: Operation continuing on %d devices.\n",
-		       mdname(mddev),
-		       bdevname(rdev->bdev, b),
-		       mdname(mddev),
-		       conf->raid_disks - mddev->degraded);
-	}
+	if (test_and_clear_bit(In_sync, &rdev->flags)) {
+		unsigned long flags;
+		spin_lock_irqsave(&conf->device_lock, flags);
+		mddev->degraded++;
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+		/*
+		 * if recovery was running, make sure it aborts.
+		 */
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+	}
+	set_bit(Faulty, &rdev->flags);
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	printk(KERN_ALERT
+	       "md/raid:%s: Disk failure on %s, disabling device.\n"
+	       "md/raid:%s: Operation continuing on %d devices.\n",
+	       mdname(mddev),
+	       bdevname(rdev->bdev, b),
+	       mdname(mddev),
+	       conf->raid_disks - mddev->degraded);
 }
 
 /*
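The reworked raid5 error handler above marks the device Faulty and sets MD_CHANGE_DEVS unconditionally, and only bumps the degraded count the first time an In_sync device fails. A toy sketch of that ordering; fake_rdev, the flag bits and the degraded counter are illustrative stand-ins for md's types, not its API:

/*
 * The Faulty mark is applied on every call; "degraded" only moves when the
 * device was still In_sync, so repeated errors on the same device are
 * counted once.
 */
#include <stdio.h>

#define IN_SYNC  (1 << 0)
#define FAULTY   (1 << 1)

struct fake_rdev { unsigned long flags; };

static int degraded;

static void fake_error(struct fake_rdev *rdev)
{
	if (rdev->flags & IN_SYNC) {
		rdev->flags &= ~IN_SYNC;
		degraded++;		/* only on the first failure of a working device */
	}
	rdev->flags |= FAULTY;		/* always mark the device failed */
}

int main(void)
{
	struct fake_rdev r = { IN_SYNC };

	fake_error(&r);
	fake_error(&r);			/* second call must not bump degraded again */
	printf("degraded = %d, faulty = %d\n", degraded, !!(r.flags & FAULTY));
	return 0;
}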
@@ -5391,7 +5389,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 		return -EINVAL;
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	revalidate_disk(mddev->gendisk);
-	if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
+	if (sectors > mddev->dev_sectors &&
+	    mddev->recovery_cp > mddev->dev_sectors) {
 		mddev->recovery_cp = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
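The resize hunk above restarts resync whenever the recorded checkpoint lies beyond the old per-device size, rather than only when the array was completely clean. A tiny sketch of that check, with made-up sector counts and a simplified MaxSector:

/* The sector values below are illustrative, not taken from md. */
#include <stdio.h>

#define MaxSector (~0ULL)

int main(void)
{
	unsigned long long dev_sectors = 1000000;   /* size before the resize */
	unsigned long long new_sectors = 1500000;   /* size after the resize  */
	unsigned long long recovery_cp = MaxSector; /* array was clean        */

	if (new_sectors > dev_sectors && recovery_cp > dev_sectors) {
		recovery_cp = dev_sectors;          /* resync the added space */
		printf("resync scheduled from sector %llu\n", recovery_cp);
	}
	return 0;
}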