path: root/drivers/md/raid1.c
author	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit	c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree	ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /drivers/md/raid1.c
parent	ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent	6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts: litmus/sched_cedf.c
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--	drivers/md/raid1.c	920
1 file changed, 395 insertions, 525 deletions
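
The largest behavioural change in this diff is how RAID1 write completion is tracked: the per-request counter ->remaining now starts at 1 instead of 0, and the new helper r1_bio_write_done() is called once by each mirror bio's completion handler and once by the submitter after all mirror bios have been queued, so the request cannot be finished while bios are still being issued. The user-space sketch below is only an illustration of that reference-count bias pattern, with simplified stand-in names and C11 atomics in place of the kernel's atomic_t; it is not the kernel code itself.

/*
 * Minimal sketch of the "start the refcount at 1" completion pattern
 * adopted for RAID1 writes in this patch.  All names are stand-ins.
 */
#include <stdatomic.h>
#include <stdio.h>

struct fake_r1bio {
	atomic_int remaining;		/* stands in for the kernel's atomic_t */
};

static void r1_bio_write_done(struct fake_r1bio *r1)
{
	/* atomic_fetch_sub returns the old value; old == 1 means we dropped
	 * the last reference, i.e. the composite write is finished. */
	if (atomic_fetch_sub(&r1->remaining, 1) == 1)
		printf("master bio completed\n");
}

int main(void)
{
	struct fake_r1bio r1;
	int mirrors = 3;

	atomic_init(&r1.remaining, 1);		/* submitter's own reference */

	for (int i = 0; i < mirrors; i++) {
		atomic_fetch_add(&r1.remaining, 1);	/* one per mirror bio */
		/* ...queue the per-mirror bio here... */
	}

	/* each mirror bio's end_io handler drops one reference */
	for (int i = 0; i < mirrors; i++)
		r1_bio_write_done(&r1);

	/* submitter drops its bias last; only now can the request complete */
	r1_bio_write_done(&r1);
	return 0;
}
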
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0b830bbe1d8b..f7431b6d8447 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -52,23 +52,16 @@
52#define NR_RAID1_BIOS 256 52#define NR_RAID1_BIOS 256
53 53
54 54
55static void unplug_slaves(mddev_t *mddev);
56
57static void allow_barrier(conf_t *conf); 55static void allow_barrier(conf_t *conf);
58static void lower_barrier(conf_t *conf); 56static void lower_barrier(conf_t *conf);
59 57
60static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 58static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
61{ 59{
62 struct pool_info *pi = data; 60 struct pool_info *pi = data;
63 r1bio_t *r1_bio;
64 int size = offsetof(r1bio_t, bios[pi->raid_disks]); 61 int size = offsetof(r1bio_t, bios[pi->raid_disks]);
65 62
66 /* allocate a r1bio with room for raid_disks entries in the bios array */ 63 /* allocate a r1bio with room for raid_disks entries in the bios array */
67 r1_bio = kzalloc(size, gfp_flags); 64 return kzalloc(size, gfp_flags);
68 if (!r1_bio && pi->mddev)
69 unplug_slaves(pi->mddev);
70
71 return r1_bio;
72} 65}
73 66
74static void r1bio_pool_free(void *r1_bio, void *data) 67static void r1bio_pool_free(void *r1_bio, void *data)
@@ -91,16 +84,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
91 int i, j; 84 int i, j;
92 85
93 r1_bio = r1bio_pool_alloc(gfp_flags, pi); 86 r1_bio = r1bio_pool_alloc(gfp_flags, pi);
94 if (!r1_bio) { 87 if (!r1_bio)
95 unplug_slaves(pi->mddev);
96 return NULL; 88 return NULL;
97 }
98 89
99 /* 90 /*
100 * Allocate bios : 1 for reading, n-1 for writing 91 * Allocate bios : 1 for reading, n-1 for writing
101 */ 92 */
102 for (j = pi->raid_disks ; j-- ; ) { 93 for (j = pi->raid_disks ; j-- ; ) {
103 bio = bio_alloc(gfp_flags, RESYNC_PAGES); 94 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
104 if (!bio) 95 if (!bio)
105 goto out_free_bio; 96 goto out_free_bio;
106 r1_bio->bios[j] = bio; 97 r1_bio->bios[j] = bio;
@@ -306,6 +297,29 @@ static void raid1_end_read_request(struct bio *bio, int error)
306 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 297 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
307} 298}
308 299
300static void r1_bio_write_done(r1bio_t *r1_bio)
301{
302 if (atomic_dec_and_test(&r1_bio->remaining))
303 {
304 /* it really is the end of this request */
305 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
306 /* free extra copy of the data pages */
307 int i = r1_bio->behind_page_count;
308 while (i--)
309 safe_put_page(r1_bio->behind_pages[i]);
310 kfree(r1_bio->behind_pages);
311 r1_bio->behind_pages = NULL;
312 }
313 /* clear the bitmap if all writes complete successfully */
314 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
315 r1_bio->sectors,
316 !test_bit(R1BIO_Degraded, &r1_bio->state),
317 test_bit(R1BIO_BehindIO, &r1_bio->state));
318 md_write_end(r1_bio->mddev);
319 raid_end_bio_io(r1_bio);
320 }
321}
322
309static void raid1_end_write_request(struct bio *bio, int error) 323static void raid1_end_write_request(struct bio *bio, int error)
310{ 324{
311 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 325 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -319,84 +333,61 @@ static void raid1_end_write_request(struct bio *bio, int error)
319 if (r1_bio->bios[mirror] == bio) 333 if (r1_bio->bios[mirror] == bio)
320 break; 334 break;
321 335
322 if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { 336 /*
323 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); 337 * 'one mirror IO has finished' event handler:
324 set_bit(R1BIO_BarrierRetry, &r1_bio->state); 338 */
325 r1_bio->mddev->barriers_work = 0; 339 r1_bio->bios[mirror] = NULL;
326 /* Don't rdev_dec_pending in this branch - keep it for the retry */ 340 to_put = bio;
327 } else { 341 if (!uptodate) {
342 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
343 /* an I/O failed, we can't clear the bitmap */
344 set_bit(R1BIO_Degraded, &r1_bio->state);
345 } else
328 /* 346 /*
329 * this branch is our 'one mirror IO has finished' event handler: 347 * Set R1BIO_Uptodate in our master bio, so that we
348 * will return a good error code for to the higher
349 * levels even if IO on some other mirrored buffer
350 * fails.
351 *
352 * The 'master' represents the composite IO operation
353 * to user-side. So if something waits for IO, then it
354 * will wait for the 'master' bio.
330 */ 355 */
331 r1_bio->bios[mirror] = NULL; 356 set_bit(R1BIO_Uptodate, &r1_bio->state);
332 to_put = bio; 357
333 if (!uptodate) { 358 update_head_pos(mirror, r1_bio);
334 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 359
335 /* an I/O failed, we can't clear the bitmap */ 360 if (behind) {
336 set_bit(R1BIO_Degraded, &r1_bio->state); 361 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
337 } else 362 atomic_dec(&r1_bio->behind_remaining);
338 /* 363
339 * Set R1BIO_Uptodate in our master bio, so that 364 /*
340 * we will return a good error code for to the higher 365 * In behind mode, we ACK the master bio once the I/O
341 * levels even if IO on some other mirrored buffer fails. 366 * has safely reached all non-writemostly
342 * 367 * disks. Setting the Returned bit ensures that this
343 * The 'master' represents the composite IO operation to 368 * gets done only once -- we don't ever want to return
344 * user-side. So if something waits for IO, then it will 369 * -EIO here, instead we'll wait
345 * wait for the 'master' bio. 370 */
346 */ 371 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
347 set_bit(R1BIO_Uptodate, &r1_bio->state); 372 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
348 373 /* Maybe we can return now */
349 update_head_pos(mirror, r1_bio); 374 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
350 375 struct bio *mbio = r1_bio->master_bio;
351 if (behind) { 376 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
352 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) 377 (unsigned long long) mbio->bi_sector,
353 atomic_dec(&r1_bio->behind_remaining); 378 (unsigned long long) mbio->bi_sector +
354 379 (mbio->bi_size >> 9) - 1);
355 /* In behind mode, we ACK the master bio once the I/O has safely 380 bio_endio(mbio, 0);
356 * reached all non-writemostly disks. Setting the Returned bit
357 * ensures that this gets done only once -- we don't ever want to
358 * return -EIO here, instead we'll wait */
359
360 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
361 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
362 /* Maybe we can return now */
363 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
364 struct bio *mbio = r1_bio->master_bio;
365 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
366 (unsigned long long) mbio->bi_sector,
367 (unsigned long long) mbio->bi_sector +
368 (mbio->bi_size >> 9) - 1);
369 bio_endio(mbio, 0);
370 }
371 } 381 }
372 } 382 }
373 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
374 } 383 }
384 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
385
375 /* 386 /*
376 *
377 * Let's see if all mirrored write operations have finished 387 * Let's see if all mirrored write operations have finished
378 * already. 388 * already.
379 */ 389 */
380 if (atomic_dec_and_test(&r1_bio->remaining)) { 390 r1_bio_write_done(r1_bio);
381 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
382 reschedule_retry(r1_bio);
383 else {
384 /* it really is the end of this request */
385 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
386 /* free extra copy of the data pages */
387 int i = bio->bi_vcnt;
388 while (i--)
389 safe_put_page(bio->bi_io_vec[i].bv_page);
390 }
391 /* clear the bitmap if all writes complete successfully */
392 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
393 r1_bio->sectors,
394 !test_bit(R1BIO_Degraded, &r1_bio->state),
395 behind);
396 md_write_end(r1_bio->mddev);
397 raid_end_bio_io(r1_bio);
398 }
399 }
400 391
401 if (to_put) 392 if (to_put)
402 bio_put(to_put); 393 bio_put(to_put);
@@ -420,11 +411,13 @@ static void raid1_end_write_request(struct bio *bio, int error)
420static int read_balance(conf_t *conf, r1bio_t *r1_bio) 411static int read_balance(conf_t *conf, r1bio_t *r1_bio)
421{ 412{
422 const sector_t this_sector = r1_bio->sector; 413 const sector_t this_sector = r1_bio->sector;
423 int new_disk = conf->last_used, disk = new_disk;
424 int wonly_disk = -1;
425 const int sectors = r1_bio->sectors; 414 const int sectors = r1_bio->sectors;
426 sector_t new_distance, current_distance; 415 int start_disk;
416 int best_disk;
417 int i;
418 sector_t best_dist;
427 mdk_rdev_t *rdev; 419 mdk_rdev_t *rdev;
420 int choose_first;
428 421
429 rcu_read_lock(); 422 rcu_read_lock();
430 /* 423 /*
@@ -433,100 +426,63 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
433 * We take the first readable disk when above the resync window. 426 * We take the first readable disk when above the resync window.
434 */ 427 */
435 retry: 428 retry:
429 best_disk = -1;
430 best_dist = MaxSector;
436 if (conf->mddev->recovery_cp < MaxSector && 431 if (conf->mddev->recovery_cp < MaxSector &&
437 (this_sector + sectors >= conf->next_resync)) { 432 (this_sector + sectors >= conf->next_resync)) {
438 /* Choose the first operational device, for consistancy */ 433 choose_first = 1;
439 new_disk = 0; 434 start_disk = 0;
440 435 } else {
441 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 436 choose_first = 0;
442 r1_bio->bios[new_disk] == IO_BLOCKED || 437 start_disk = conf->last_used;
443 !rdev || !test_bit(In_sync, &rdev->flags)
444 || test_bit(WriteMostly, &rdev->flags);
445 rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
446
447 if (rdev && test_bit(In_sync, &rdev->flags) &&
448 r1_bio->bios[new_disk] != IO_BLOCKED)
449 wonly_disk = new_disk;
450
451 if (new_disk == conf->raid_disks - 1) {
452 new_disk = wonly_disk;
453 break;
454 }
455 }
456 goto rb_out;
457 }
458
459
460 /* make sure the disk is operational */
461 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
462 r1_bio->bios[new_disk] == IO_BLOCKED ||
463 !rdev || !test_bit(In_sync, &rdev->flags) ||
464 test_bit(WriteMostly, &rdev->flags);
465 rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
466
467 if (rdev && test_bit(In_sync, &rdev->flags) &&
468 r1_bio->bios[new_disk] != IO_BLOCKED)
469 wonly_disk = new_disk;
470
471 if (new_disk <= 0)
472 new_disk = conf->raid_disks;
473 new_disk--;
474 if (new_disk == disk) {
475 new_disk = wonly_disk;
476 break;
477 }
478 } 438 }
479 439
480 if (new_disk < 0) 440 for (i = 0 ; i < conf->raid_disks ; i++) {
481 goto rb_out; 441 sector_t dist;
482 442 int disk = start_disk + i;
483 disk = new_disk; 443 if (disk >= conf->raid_disks)
484 /* now disk == new_disk == starting point for search */ 444 disk -= conf->raid_disks;
485
486 /*
487 * Don't change to another disk for sequential reads:
488 */
489 if (conf->next_seq_sect == this_sector)
490 goto rb_out;
491 if (this_sector == conf->mirrors[new_disk].head_position)
492 goto rb_out;
493
494 current_distance = abs(this_sector - conf->mirrors[disk].head_position);
495
496 /* Find the disk whose head is closest */
497
498 do {
499 if (disk <= 0)
500 disk = conf->raid_disks;
501 disk--;
502 445
503 rdev = rcu_dereference(conf->mirrors[disk].rdev); 446 rdev = rcu_dereference(conf->mirrors[disk].rdev);
504 447 if (r1_bio->bios[disk] == IO_BLOCKED
505 if (!rdev || r1_bio->bios[disk] == IO_BLOCKED || 448 || rdev == NULL
506 !test_bit(In_sync, &rdev->flags) || 449 || test_bit(Faulty, &rdev->flags))
507 test_bit(WriteMostly, &rdev->flags))
508 continue; 450 continue;
509 451 if (!test_bit(In_sync, &rdev->flags) &&
510 if (!atomic_read(&rdev->nr_pending)) { 452 rdev->recovery_offset < this_sector + sectors)
511 new_disk = disk; 453 continue;
454 if (test_bit(WriteMostly, &rdev->flags)) {
455 /* Don't balance among write-mostly, just
456 * use the first as a last resort */
457 if (best_disk < 0)
458 best_disk = disk;
459 continue;
460 }
461 /* This is a reasonable device to use. It might
462 * even be best.
463 */
464 dist = abs(this_sector - conf->mirrors[disk].head_position);
465 if (choose_first
466 /* Don't change to another disk for sequential reads */
467 || conf->next_seq_sect == this_sector
468 || dist == 0
469 /* If device is idle, use it */
470 || atomic_read(&rdev->nr_pending) == 0) {
471 best_disk = disk;
512 break; 472 break;
513 } 473 }
514 new_distance = abs(this_sector - conf->mirrors[disk].head_position); 474 if (dist < best_dist) {
515 if (new_distance < current_distance) { 475 best_dist = dist;
516 current_distance = new_distance; 476 best_disk = disk;
517 new_disk = disk;
518 } 477 }
519 } while (disk != conf->last_used); 478 }
520
521 rb_out:
522
523 479
524 if (new_disk >= 0) { 480 if (best_disk >= 0) {
525 rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 481 rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
526 if (!rdev) 482 if (!rdev)
527 goto retry; 483 goto retry;
528 atomic_inc(&rdev->nr_pending); 484 atomic_inc(&rdev->nr_pending);
529 if (!test_bit(In_sync, &rdev->flags)) { 485 if (test_bit(Faulty, &rdev->flags)) {
530 /* cannot risk returning a device that failed 486 /* cannot risk returning a device that failed
531 * before we inc'ed nr_pending 487 * before we inc'ed nr_pending
532 */ 488 */
@@ -534,59 +490,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
534 goto retry; 490 goto retry;
535 } 491 }
536 conf->next_seq_sect = this_sector + sectors; 492 conf->next_seq_sect = this_sector + sectors;
537 conf->last_used = new_disk; 493 conf->last_used = best_disk;
538 } 494 }
539 rcu_read_unlock(); 495 rcu_read_unlock();
540 496
541 return new_disk; 497 return best_disk;
542} 498}
543 499
544static void unplug_slaves(mddev_t *mddev) 500int md_raid1_congested(mddev_t *mddev, int bits)
545{ 501{
546 conf_t *conf = mddev->private; 502 conf_t *conf = mddev->private;
547 int i;
548
549 rcu_read_lock();
550 for (i=0; i<mddev->raid_disks; i++) {
551 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
552 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
553 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
554
555 atomic_inc(&rdev->nr_pending);
556 rcu_read_unlock();
557
558 blk_unplug(r_queue);
559
560 rdev_dec_pending(rdev, mddev);
561 rcu_read_lock();
562 }
563 }
564 rcu_read_unlock();
565}
566
567static void raid1_unplug(struct request_queue *q)
568{
569 mddev_t *mddev = q->queuedata;
570
571 unplug_slaves(mddev);
572 md_wakeup_thread(mddev->thread);
573}
574
575static int raid1_congested(void *data, int bits)
576{
577 mddev_t *mddev = data;
578 conf_t *conf = mddev->private;
579 int i, ret = 0; 503 int i, ret = 0;
580 504
581 if (mddev_congested(mddev, bits))
582 return 1;
583
584 rcu_read_lock(); 505 rcu_read_lock();
585 for (i = 0; i < mddev->raid_disks; i++) { 506 for (i = 0; i < mddev->raid_disks; i++) {
586 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 507 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
587 if (rdev && !test_bit(Faulty, &rdev->flags)) { 508 if (rdev && !test_bit(Faulty, &rdev->flags)) {
588 struct request_queue *q = bdev_get_queue(rdev->bdev); 509 struct request_queue *q = bdev_get_queue(rdev->bdev);
589 510
511 BUG_ON(!q);
512
590 /* Note the '|| 1' - when read_balance prefers 513 /* Note the '|| 1' - when read_balance prefers
591 * non-congested targets, it can be removed 514 * non-congested targets, it can be removed
592 */ 515 */
@@ -599,22 +522,26 @@ static int raid1_congested(void *data, int bits)
599 rcu_read_unlock(); 522 rcu_read_unlock();
600 return ret; 523 return ret;
601} 524}
525EXPORT_SYMBOL_GPL(md_raid1_congested);
602 526
527static int raid1_congested(void *data, int bits)
528{
529 mddev_t *mddev = data;
603 530
604static int flush_pending_writes(conf_t *conf) 531 return mddev_congested(mddev, bits) ||
532 md_raid1_congested(mddev, bits);
533}
534
535static void flush_pending_writes(conf_t *conf)
605{ 536{
606 /* Any writes that have been queued but are awaiting 537 /* Any writes that have been queued but are awaiting
607 * bitmap updates get flushed here. 538 * bitmap updates get flushed here.
608 * We return 1 if any requests were actually submitted.
609 */ 539 */
610 int rv = 0;
611
612 spin_lock_irq(&conf->device_lock); 540 spin_lock_irq(&conf->device_lock);
613 541
614 if (conf->pending_bio_list.head) { 542 if (conf->pending_bio_list.head) {
615 struct bio *bio; 543 struct bio *bio;
616 bio = bio_list_get(&conf->pending_bio_list); 544 bio = bio_list_get(&conf->pending_bio_list);
617 blk_remove_plug(conf->mddev->queue);
618 spin_unlock_irq(&conf->device_lock); 545 spin_unlock_irq(&conf->device_lock);
619 /* flush any pending bitmap writes to 546 /* flush any pending bitmap writes to
620 * disk before proceeding w/ I/O */ 547 * disk before proceeding w/ I/O */
@@ -626,10 +553,8 @@ static int flush_pending_writes(conf_t *conf)
626 generic_make_request(bio); 553 generic_make_request(bio);
627 bio = next; 554 bio = next;
628 } 555 }
629 rv = 1;
630 } else 556 } else
631 spin_unlock_irq(&conf->device_lock); 557 spin_unlock_irq(&conf->device_lock);
632 return rv;
633} 558}
634 559
635/* Barriers.... 560/* Barriers....
@@ -661,17 +586,15 @@ static void raise_barrier(conf_t *conf)
661 586
662 /* Wait until no block IO is waiting */ 587 /* Wait until no block IO is waiting */
663 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, 588 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
664 conf->resync_lock, 589 conf->resync_lock, );
665 raid1_unplug(conf->mddev->queue));
666 590
667 /* block any new IO from starting */ 591 /* block any new IO from starting */
668 conf->barrier++; 592 conf->barrier++;
669 593
670 /* No wait for all pending IO to complete */ 594 /* Now wait for all pending IO to complete */
671 wait_event_lock_irq(conf->wait_barrier, 595 wait_event_lock_irq(conf->wait_barrier,
672 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 596 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
673 conf->resync_lock, 597 conf->resync_lock, );
674 raid1_unplug(conf->mddev->queue));
675 598
676 spin_unlock_irq(&conf->resync_lock); 599 spin_unlock_irq(&conf->resync_lock);
677} 600}
@@ -693,7 +616,7 @@ static void wait_barrier(conf_t *conf)
693 conf->nr_waiting++; 616 conf->nr_waiting++;
694 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 617 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
695 conf->resync_lock, 618 conf->resync_lock,
696 raid1_unplug(conf->mddev->queue)); 619 );
697 conf->nr_waiting--; 620 conf->nr_waiting--;
698 } 621 }
699 conf->nr_pending++; 622 conf->nr_pending++;
@@ -729,8 +652,7 @@ static void freeze_array(conf_t *conf)
729 wait_event_lock_irq(conf->wait_barrier, 652 wait_event_lock_irq(conf->wait_barrier,
730 conf->nr_pending == conf->nr_queued+1, 653 conf->nr_pending == conf->nr_queued+1,
731 conf->resync_lock, 654 conf->resync_lock,
732 ({ flush_pending_writes(conf); 655 flush_pending_writes(conf));
733 raid1_unplug(conf->mddev->queue); }));
734 spin_unlock_irq(&conf->resync_lock); 656 spin_unlock_irq(&conf->resync_lock);
735} 657}
736static void unfreeze_array(conf_t *conf) 658static void unfreeze_array(conf_t *conf)
@@ -744,15 +666,16 @@ static void unfreeze_array(conf_t *conf)
744} 666}
745 667
746 668
747/* duplicate the data pages for behind I/O */ 669/* duplicate the data pages for behind I/O
748static struct page **alloc_behind_pages(struct bio *bio) 670 */
671static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
749{ 672{
750 int i; 673 int i;
751 struct bio_vec *bvec; 674 struct bio_vec *bvec;
752 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *), 675 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
753 GFP_NOIO); 676 GFP_NOIO);
754 if (unlikely(!pages)) 677 if (unlikely(!pages))
755 goto do_sync_io; 678 return;
756 679
757 bio_for_each_segment(bvec, bio, i) { 680 bio_for_each_segment(bvec, bio, i) {
758 pages[i] = alloc_page(GFP_NOIO); 681 pages[i] = alloc_page(GFP_NOIO);
@@ -763,16 +686,17 @@ static struct page **alloc_behind_pages(struct bio *bio)
763 kunmap(pages[i]); 686 kunmap(pages[i]);
764 kunmap(bvec->bv_page); 687 kunmap(bvec->bv_page);
765 } 688 }
766 689 r1_bio->behind_pages = pages;
767 return pages; 690 r1_bio->behind_page_count = bio->bi_vcnt;
691 set_bit(R1BIO_BehindIO, &r1_bio->state);
692 return;
768 693
769do_sync_io: 694do_sync_io:
770 if (pages) 695 for (i = 0; i < bio->bi_vcnt; i++)
771 for (i = 0; i < bio->bi_vcnt && pages[i]; i++) 696 if (pages[i])
772 put_page(pages[i]); 697 put_page(pages[i]);
773 kfree(pages); 698 kfree(pages);
774 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 699 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
775 return NULL;
776} 700}
777 701
778static int make_request(mddev_t *mddev, struct bio * bio) 702static int make_request(mddev_t *mddev, struct bio * bio)
@@ -784,20 +708,16 @@ static int make_request(mddev_t *mddev, struct bio * bio)
784 int i, targets = 0, disks; 708 int i, targets = 0, disks;
785 struct bitmap *bitmap; 709 struct bitmap *bitmap;
786 unsigned long flags; 710 unsigned long flags;
787 struct bio_list bl;
788 struct page **behind_pages = NULL;
789 const int rw = bio_data_dir(bio); 711 const int rw = bio_data_dir(bio);
790 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 712 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
791 unsigned long do_barriers; 713 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
792 mdk_rdev_t *blocked_rdev; 714 mdk_rdev_t *blocked_rdev;
715 int plugged;
793 716
794 /* 717 /*
795 * Register the new request and wait if the reconstruction 718 * Register the new request and wait if the reconstruction
796 * thread has put up a bar for new requests. 719 * thread has put up a bar for new requests.
797 * Continue immediately if no resync is active currently. 720 * Continue immediately if no resync is active currently.
798 * We test barriers_work *after* md_write_start as md_write_start
799 * may cause the first superblock write, and that will check out
800 * if barriers work.
801 */ 721 */
802 722
803 md_write_start(mddev, bio); /* wait on superblock update early */ 723 md_write_start(mddev, bio); /* wait on superblock update early */
@@ -821,13 +741,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
821 } 741 }
822 finish_wait(&conf->wait_barrier, &w); 742 finish_wait(&conf->wait_barrier, &w);
823 } 743 }
824 if (unlikely(!mddev->barriers_work &&
825 (bio->bi_rw & REQ_HARDBARRIER))) {
826 if (rw == WRITE)
827 md_write_end(mddev);
828 bio_endio(bio, -EOPNOTSUPP);
829 return 0;
830 }
831 744
832 wait_barrier(conf); 745 wait_barrier(conf);
833 746
@@ -870,7 +783,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
870 } 783 }
871 r1_bio->read_disk = rdisk; 784 r1_bio->read_disk = rdisk;
872 785
873 read_bio = bio_clone(bio, GFP_NOIO); 786 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
874 787
875 r1_bio->bios[rdisk] = read_bio; 788 r1_bio->bios[rdisk] = read_bio;
876 789
@@ -891,14 +804,9 @@ static int make_request(mddev_t *mddev, struct bio * bio)
891 * inc refcount on their rdev. Record them by setting 804 * inc refcount on their rdev. Record them by setting
892 * bios[x] to bio 805 * bios[x] to bio
893 */ 806 */
807 plugged = mddev_check_plugged(mddev);
808
894 disks = conf->raid_disks; 809 disks = conf->raid_disks;
895#if 0
896 { static int first=1;
897 if (first) printk("First Write sector %llu disks %d\n",
898 (unsigned long long)r1_bio->sector, disks);
899 first = 0;
900 }
901#endif
902 retry_write: 810 retry_write:
903 blocked_rdev = NULL; 811 blocked_rdev = NULL;
904 rcu_read_lock(); 812 rcu_read_lock();
@@ -952,33 +860,29 @@ static int make_request(mddev_t *mddev, struct bio * bio)
952 if (bitmap && 860 if (bitmap &&
953 (atomic_read(&bitmap->behind_writes) 861 (atomic_read(&bitmap->behind_writes)
954 < mddev->bitmap_info.max_write_behind) && 862 < mddev->bitmap_info.max_write_behind) &&
955 !waitqueue_active(&bitmap->behind_wait) && 863 !waitqueue_active(&bitmap->behind_wait))
956 (behind_pages = alloc_behind_pages(bio)) != NULL) 864 alloc_behind_pages(bio, r1_bio);
957 set_bit(R1BIO_BehindIO, &r1_bio->state);
958 865
959 atomic_set(&r1_bio->remaining, 0); 866 atomic_set(&r1_bio->remaining, 1);
960 atomic_set(&r1_bio->behind_remaining, 0); 867 atomic_set(&r1_bio->behind_remaining, 0);
961 868
962 do_barriers = bio->bi_rw & REQ_HARDBARRIER; 869 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
963 if (do_barriers) 870 test_bit(R1BIO_BehindIO, &r1_bio->state));
964 set_bit(R1BIO_Barrier, &r1_bio->state);
965
966 bio_list_init(&bl);
967 for (i = 0; i < disks; i++) { 871 for (i = 0; i < disks; i++) {
968 struct bio *mbio; 872 struct bio *mbio;
969 if (!r1_bio->bios[i]) 873 if (!r1_bio->bios[i])
970 continue; 874 continue;
971 875
972 mbio = bio_clone(bio, GFP_NOIO); 876 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
973 r1_bio->bios[i] = mbio; 877 r1_bio->bios[i] = mbio;
974 878
975 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 879 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
976 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 880 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
977 mbio->bi_end_io = raid1_end_write_request; 881 mbio->bi_end_io = raid1_end_write_request;
978 mbio->bi_rw = WRITE | do_barriers | do_sync; 882 mbio->bi_rw = WRITE | do_flush_fua | do_sync;
979 mbio->bi_private = r1_bio; 883 mbio->bi_private = r1_bio;
980 884
981 if (behind_pages) { 885 if (r1_bio->behind_pages) {
982 struct bio_vec *bvec; 886 struct bio_vec *bvec;
983 int j; 887 int j;
984 888
@@ -986,39 +890,27 @@ static int make_request(mddev_t *mddev, struct bio * bio)
986 * we clear any unused pointer in the io_vec, rather 890 * we clear any unused pointer in the io_vec, rather
987 * than leave them unchanged. This is important 891 * than leave them unchanged. This is important
988 * because when we come to free the pages, we won't 892 * because when we come to free the pages, we won't
989 * know the originial bi_idx, so we just free 893 * know the original bi_idx, so we just free
990 * them all 894 * them all
991 */ 895 */
992 __bio_for_each_segment(bvec, mbio, j, 0) 896 __bio_for_each_segment(bvec, mbio, j, 0)
993 bvec->bv_page = behind_pages[j]; 897 bvec->bv_page = r1_bio->behind_pages[j];
994 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) 898 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
995 atomic_inc(&r1_bio->behind_remaining); 899 atomic_inc(&r1_bio->behind_remaining);
996 } 900 }
997 901
998 atomic_inc(&r1_bio->remaining); 902 atomic_inc(&r1_bio->remaining);
999 903 spin_lock_irqsave(&conf->device_lock, flags);
1000 bio_list_add(&bl, mbio); 904 bio_list_add(&conf->pending_bio_list, mbio);
905 spin_unlock_irqrestore(&conf->device_lock, flags);
1001 } 906 }
1002 kfree(behind_pages); /* the behind pages are attached to the bios now */ 907 r1_bio_write_done(r1_bio);
1003
1004 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
1005 test_bit(R1BIO_BehindIO, &r1_bio->state));
1006 spin_lock_irqsave(&conf->device_lock, flags);
1007 bio_list_merge(&conf->pending_bio_list, &bl);
1008 bio_list_init(&bl);
1009 908
1010 blk_plug_device(mddev->queue); 909 /* In case raid1d snuck in to freeze_array */
1011 spin_unlock_irqrestore(&conf->device_lock, flags);
1012
1013 /* In case raid1d snuck into freeze_array */
1014 wake_up(&conf->wait_barrier); 910 wake_up(&conf->wait_barrier);
1015 911
1016 if (do_sync) 912 if (do_sync || !bitmap || !plugged)
1017 md_wakeup_thread(mddev->thread); 913 md_wakeup_thread(mddev->thread);
1018#if 0
1019 while ((bio = bio_list_pop(&bl)) != NULL)
1020 generic_make_request(bio);
1021#endif
1022 914
1023 return 0; 915 return 0;
1024} 916}
@@ -1076,8 +968,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1076 } else 968 } else
1077 set_bit(Faulty, &rdev->flags); 969 set_bit(Faulty, &rdev->flags);
1078 set_bit(MD_CHANGE_DEVS, &mddev->flags); 970 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1079 printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" 971 printk(KERN_ALERT
1080 KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", 972 "md/raid1:%s: Disk failure on %s, disabling device.\n"
973 "md/raid1:%s: Operation continuing on %d devices.\n",
1081 mdname(mddev), bdevname(rdev->bdev, b), 974 mdname(mddev), bdevname(rdev->bdev, b),
1082 mdname(mddev), conf->raid_disks - mddev->degraded); 975 mdname(mddev), conf->raid_disks - mddev->degraded);
1083} 976}
@@ -1206,10 +1099,11 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
1206 err = -EBUSY; 1099 err = -EBUSY;
1207 goto abort; 1100 goto abort;
1208 } 1101 }
1209 /* Only remove non-faulty devices is recovery 1102 /* Only remove non-faulty devices if recovery
1210 * is not possible. 1103 * is not possible.
1211 */ 1104 */
1212 if (!test_bit(Faulty, &rdev->flags) && 1105 if (!test_bit(Faulty, &rdev->flags) &&
1106 !mddev->recovery_disabled &&
1213 mddev->degraded < conf->raid_disks) { 1107 mddev->degraded < conf->raid_disks) {
1214 err = -EBUSY; 1108 err = -EBUSY;
1215 goto abort; 1109 goto abort;
@@ -1222,7 +1116,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
1222 p->rdev = rdev; 1116 p->rdev = rdev;
1223 goto abort; 1117 goto abort;
1224 } 1118 }
1225 md_integrity_register(mddev); 1119 err = md_integrity_register(mddev);
1226 } 1120 }
1227abort: 1121abort:
1228 1122
@@ -1268,7 +1162,7 @@ static void end_sync_write(struct bio *bio, int error)
1268 break; 1162 break;
1269 } 1163 }
1270 if (!uptodate) { 1164 if (!uptodate) {
1271 int sync_blocks = 0; 1165 sector_t sync_blocks = 0;
1272 sector_t s = r1_bio->sector; 1166 sector_t s = r1_bio->sector;
1273 long sectors_to_go = r1_bio->sectors; 1167 long sectors_to_go = r1_bio->sectors;
1274 /* make sure these bits doesn't get cleared. */ 1168 /* make sure these bits doesn't get cleared. */
@@ -1290,194 +1184,210 @@ static void end_sync_write(struct bio *bio, int error)
1290 } 1184 }
1291} 1185}
1292 1186
1293static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) 1187static int fix_sync_read_error(r1bio_t *r1_bio)
1294{ 1188{
1189 /* Try some synchronous reads of other devices to get
1190 * good data, much like with normal read errors. Only
1191 * read into the pages we already have so we don't
1192 * need to re-issue the read request.
1193 * We don't need to freeze the array, because being in an
1194 * active sync request, there is no normal IO, and
1195 * no overlapping syncs.
1196 */
1197 mddev_t *mddev = r1_bio->mddev;
1295 conf_t *conf = mddev->private; 1198 conf_t *conf = mddev->private;
1296 int i; 1199 struct bio *bio = r1_bio->bios[r1_bio->read_disk];
1297 int disks = conf->raid_disks; 1200 sector_t sect = r1_bio->sector;
1298 struct bio *bio, *wbio; 1201 int sectors = r1_bio->sectors;
1299 1202 int idx = 0;
1300 bio = r1_bio->bios[r1_bio->read_disk];
1301 1203
1204 while(sectors) {
1205 int s = sectors;
1206 int d = r1_bio->read_disk;
1207 int success = 0;
1208 mdk_rdev_t *rdev;
1209 int start;
1302 1210
1303 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 1211 if (s > (PAGE_SIZE>>9))
1304 /* We have read all readable devices. If we haven't 1212 s = PAGE_SIZE >> 9;
1305 * got the block, then there is no hope left. 1213 do {
1306 * If we have, then we want to do a comparison 1214 if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
1307 * and skip the write if everything is the same. 1215 /* No rcu protection needed here devices
1308 * If any blocks failed to read, then we need to 1216 * can only be removed when no resync is
1309 * attempt an over-write 1217 * active, and resync is currently active
1310 */ 1218 */
1311 int primary; 1219 rdev = conf->mirrors[d].rdev;
1312 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { 1220 if (sync_page_io(rdev,
1313 for (i=0; i<mddev->raid_disks; i++) 1221 sect,
1314 if (r1_bio->bios[i]->bi_end_io == end_sync_read) 1222 s<<9,
1315 md_error(mddev, conf->mirrors[i].rdev); 1223 bio->bi_io_vec[idx].bv_page,
1224 READ, false)) {
1225 success = 1;
1226 break;
1227 }
1228 }
1229 d++;
1230 if (d == conf->raid_disks)
1231 d = 0;
1232 } while (!success && d != r1_bio->read_disk);
1316 1233
1317 md_done_sync(mddev, r1_bio->sectors, 1); 1234 if (!success) {
1235 char b[BDEVNAME_SIZE];
1236 /* Cannot read from anywhere, array is toast */
1237 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1238 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1239 " for block %llu\n",
1240 mdname(mddev),
1241 bdevname(bio->bi_bdev, b),
1242 (unsigned long long)r1_bio->sector);
1243 md_done_sync(mddev, r1_bio->sectors, 0);
1318 put_buf(r1_bio); 1244 put_buf(r1_bio);
1319 return; 1245 return 0;
1320 } 1246 }
1321 for (primary=0; primary<mddev->raid_disks; primary++)
1322 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1323 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1324 r1_bio->bios[primary]->bi_end_io = NULL;
1325 rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1326 break;
1327 }
1328 r1_bio->read_disk = primary;
1329 for (i=0; i<mddev->raid_disks; i++)
1330 if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
1331 int j;
1332 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1333 struct bio *pbio = r1_bio->bios[primary];
1334 struct bio *sbio = r1_bio->bios[i];
1335
1336 if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
1337 for (j = vcnt; j-- ; ) {
1338 struct page *p, *s;
1339 p = pbio->bi_io_vec[j].bv_page;
1340 s = sbio->bi_io_vec[j].bv_page;
1341 if (memcmp(page_address(p),
1342 page_address(s),
1343 PAGE_SIZE))
1344 break;
1345 }
1346 } else
1347 j = 0;
1348 if (j >= 0)
1349 mddev->resync_mismatches += r1_bio->sectors;
1350 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
1351 && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
1352 sbio->bi_end_io = NULL;
1353 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1354 } else {
1355 /* fixup the bio for reuse */
1356 int size;
1357 sbio->bi_vcnt = vcnt;
1358 sbio->bi_size = r1_bio->sectors << 9;
1359 sbio->bi_idx = 0;
1360 sbio->bi_phys_segments = 0;
1361 sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1362 sbio->bi_flags |= 1 << BIO_UPTODATE;
1363 sbio->bi_next = NULL;
1364 sbio->bi_sector = r1_bio->sector +
1365 conf->mirrors[i].rdev->data_offset;
1366 sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1367 size = sbio->bi_size;
1368 for (j = 0; j < vcnt ; j++) {
1369 struct bio_vec *bi;
1370 bi = &sbio->bi_io_vec[j];
1371 bi->bv_offset = 0;
1372 if (size > PAGE_SIZE)
1373 bi->bv_len = PAGE_SIZE;
1374 else
1375 bi->bv_len = size;
1376 size -= PAGE_SIZE;
1377 memcpy(page_address(bi->bv_page),
1378 page_address(pbio->bi_io_vec[j].bv_page),
1379 PAGE_SIZE);
1380 }
1381 1247
1382 } 1248 start = d;
1383 } 1249 /* write it back and re-read */
1250 while (d != r1_bio->read_disk) {
1251 if (d == 0)
1252 d = conf->raid_disks;
1253 d--;
1254 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1255 continue;
1256 rdev = conf->mirrors[d].rdev;
1257 if (sync_page_io(rdev,
1258 sect,
1259 s<<9,
1260 bio->bi_io_vec[idx].bv_page,
1261 WRITE, false) == 0) {
1262 r1_bio->bios[d]->bi_end_io = NULL;
1263 rdev_dec_pending(rdev, mddev);
1264 md_error(mddev, rdev);
1265 } else
1266 atomic_add(s, &rdev->corrected_errors);
1267 }
1268 d = start;
1269 while (d != r1_bio->read_disk) {
1270 if (d == 0)
1271 d = conf->raid_disks;
1272 d--;
1273 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1274 continue;
1275 rdev = conf->mirrors[d].rdev;
1276 if (sync_page_io(rdev,
1277 sect,
1278 s<<9,
1279 bio->bi_io_vec[idx].bv_page,
1280 READ, false) == 0)
1281 md_error(mddev, rdev);
1282 }
1283 sectors -= s;
1284 sect += s;
1285 idx ++;
1384 } 1286 }
1385 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { 1287 set_bit(R1BIO_Uptodate, &r1_bio->state);
1386 /* ouch - failed to read all of that. 1288 set_bit(BIO_UPTODATE, &bio->bi_flags);
1387 * Try some synchronous reads of other devices to get 1289 return 1;
1388 * good data, much like with normal read errors. Only 1290}
1389 * read into the pages we already have so we don't 1291
1390 * need to re-issue the read request. 1292static int process_checks(r1bio_t *r1_bio)
1391 * We don't need to freeze the array, because being in an 1293{
1392 * active sync request, there is no normal IO, and 1294 /* We have read all readable devices. If we haven't
1393 * no overlapping syncs. 1295 * got the block, then there is no hope left.
1394 */ 1296 * If we have, then we want to do a comparison
1395 sector_t sect = r1_bio->sector; 1297 * and skip the write if everything is the same.
1396 int sectors = r1_bio->sectors; 1298 * If any blocks failed to read, then we need to
1397 int idx = 0; 1299 * attempt an over-write
1398 1300 */
1399 while(sectors) { 1301 mddev_t *mddev = r1_bio->mddev;
1400 int s = sectors; 1302 conf_t *conf = mddev->private;
1401 int d = r1_bio->read_disk; 1303 int primary;
1402 int success = 0; 1304 int i;
1403 mdk_rdev_t *rdev; 1305
1404 1306 for (primary = 0; primary < conf->raid_disks; primary++)
1405 if (s > (PAGE_SIZE>>9)) 1307 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1406 s = PAGE_SIZE >> 9; 1308 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1407 do { 1309 r1_bio->bios[primary]->bi_end_io = NULL;
1408 if (r1_bio->bios[d]->bi_end_io == end_sync_read) { 1310 rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1409 /* No rcu protection needed here devices 1311 break;
1410 * can only be removed when no resync is 1312 }
1411 * active, and resync is currently active 1313 r1_bio->read_disk = primary;
1412 */ 1314 for (i = 0; i < conf->raid_disks; i++) {
1413 rdev = conf->mirrors[d].rdev; 1315 int j;
1414 if (sync_page_io(rdev->bdev, 1316 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1415 sect + rdev->data_offset, 1317 struct bio *pbio = r1_bio->bios[primary];
1416 s<<9, 1318 struct bio *sbio = r1_bio->bios[i];
1417 bio->bi_io_vec[idx].bv_page, 1319 int size;
1418 READ)) { 1320
1419 success = 1; 1321 if (r1_bio->bios[i]->bi_end_io != end_sync_read)
1420 break; 1322 continue;
1421 } 1323
1422 } 1324 if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
1423 d++; 1325 for (j = vcnt; j-- ; ) {
1424 if (d == conf->raid_disks) 1326 struct page *p, *s;
1425 d = 0; 1327 p = pbio->bi_io_vec[j].bv_page;
1426 } while (!success && d != r1_bio->read_disk); 1328 s = sbio->bi_io_vec[j].bv_page;
1427 1329 if (memcmp(page_address(p),
1428 if (success) { 1330 page_address(s),
1429 int start = d; 1331 PAGE_SIZE))
1430 /* write it back and re-read */ 1332 break;
1431 set_bit(R1BIO_Uptodate, &r1_bio->state);
1432 while (d != r1_bio->read_disk) {
1433 if (d == 0)
1434 d = conf->raid_disks;
1435 d--;
1436 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1437 continue;
1438 rdev = conf->mirrors[d].rdev;
1439 atomic_add(s, &rdev->corrected_errors);
1440 if (sync_page_io(rdev->bdev,
1441 sect + rdev->data_offset,
1442 s<<9,
1443 bio->bi_io_vec[idx].bv_page,
1444 WRITE) == 0)
1445 md_error(mddev, rdev);
1446 }
1447 d = start;
1448 while (d != r1_bio->read_disk) {
1449 if (d == 0)
1450 d = conf->raid_disks;
1451 d--;
1452 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1453 continue;
1454 rdev = conf->mirrors[d].rdev;
1455 if (sync_page_io(rdev->bdev,
1456 sect + rdev->data_offset,
1457 s<<9,
1458 bio->bi_io_vec[idx].bv_page,
1459 READ) == 0)
1460 md_error(mddev, rdev);
1461 }
1462 } else {
1463 char b[BDEVNAME_SIZE];
1464 /* Cannot read from anywhere, array is toast */
1465 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1466 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1467 " for block %llu\n",
1468 mdname(mddev),
1469 bdevname(bio->bi_bdev, b),
1470 (unsigned long long)r1_bio->sector);
1471 md_done_sync(mddev, r1_bio->sectors, 0);
1472 put_buf(r1_bio);
1473 return;
1474 } 1333 }
1475 sectors -= s; 1334 } else
1476 sect += s; 1335 j = 0;
1477 idx ++; 1336 if (j >= 0)
1337 mddev->resync_mismatches += r1_bio->sectors;
1338 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
1339 && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
1340 /* No need to write to this device. */
1341 sbio->bi_end_io = NULL;
1342 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1343 continue;
1344 }
1345 /* fixup the bio for reuse */
1346 sbio->bi_vcnt = vcnt;
1347 sbio->bi_size = r1_bio->sectors << 9;
1348 sbio->bi_idx = 0;
1349 sbio->bi_phys_segments = 0;
1350 sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1351 sbio->bi_flags |= 1 << BIO_UPTODATE;
1352 sbio->bi_next = NULL;
1353 sbio->bi_sector = r1_bio->sector +
1354 conf->mirrors[i].rdev->data_offset;
1355 sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1356 size = sbio->bi_size;
1357 for (j = 0; j < vcnt ; j++) {
1358 struct bio_vec *bi;
1359 bi = &sbio->bi_io_vec[j];
1360 bi->bv_offset = 0;
1361 if (size > PAGE_SIZE)
1362 bi->bv_len = PAGE_SIZE;
1363 else
1364 bi->bv_len = size;
1365 size -= PAGE_SIZE;
1366 memcpy(page_address(bi->bv_page),
1367 page_address(pbio->bi_io_vec[j].bv_page),
1368 PAGE_SIZE);
1478 } 1369 }
1479 } 1370 }
1371 return 0;
1372}
1373
1374static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1375{
1376 conf_t *conf = mddev->private;
1377 int i;
1378 int disks = conf->raid_disks;
1379 struct bio *bio, *wbio;
1380
1381 bio = r1_bio->bios[r1_bio->read_disk];
1480 1382
1383 if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
1384 /* ouch - failed to read all of that. */
1385 if (!fix_sync_read_error(r1_bio))
1386 return;
1387
1388 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
1389 if (process_checks(r1_bio) < 0)
1390 return;
1481 /* 1391 /*
1482 * schedule writes 1392 * schedule writes
1483 */ 1393 */
@@ -1536,10 +1446,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1536 rdev = conf->mirrors[d].rdev; 1446 rdev = conf->mirrors[d].rdev;
1537 if (rdev && 1447 if (rdev &&
1538 test_bit(In_sync, &rdev->flags) && 1448 test_bit(In_sync, &rdev->flags) &&
1539 sync_page_io(rdev->bdev, 1449 sync_page_io(rdev, sect, s<<9,
1540 sect + rdev->data_offset, 1450 conf->tmppage, READ, false))
1541 s<<9,
1542 conf->tmppage, READ))
1543 success = 1; 1451 success = 1;
1544 else { 1452 else {
1545 d++; 1453 d++;
@@ -1562,9 +1470,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1562 rdev = conf->mirrors[d].rdev; 1470 rdev = conf->mirrors[d].rdev;
1563 if (rdev && 1471 if (rdev &&
1564 test_bit(In_sync, &rdev->flags)) { 1472 test_bit(In_sync, &rdev->flags)) {
1565 if (sync_page_io(rdev->bdev, 1473 if (sync_page_io(rdev, sect, s<<9,
1566 sect + rdev->data_offset, 1474 conf->tmppage, WRITE, false)
1567 s<<9, conf->tmppage, WRITE)
1568 == 0) 1475 == 0)
1569 /* Well, this device is dead */ 1476 /* Well, this device is dead */
1570 md_error(mddev, rdev); 1477 md_error(mddev, rdev);
@@ -1579,9 +1486,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1579 rdev = conf->mirrors[d].rdev; 1486 rdev = conf->mirrors[d].rdev;
1580 if (rdev && 1487 if (rdev &&
1581 test_bit(In_sync, &rdev->flags)) { 1488 test_bit(In_sync, &rdev->flags)) {
1582 if (sync_page_io(rdev->bdev, 1489 if (sync_page_io(rdev, sect, s<<9,
1583 sect + rdev->data_offset, 1490 conf->tmppage, READ, false)
1584 s<<9, conf->tmppage, READ)
1585 == 0) 1491 == 0)
1586 /* Well, this device is dead */ 1492 /* Well, this device is dead */
1587 md_error(mddev, rdev); 1493 md_error(mddev, rdev);
@@ -1609,15 +1515,17 @@ static void raid1d(mddev_t *mddev)
1609 unsigned long flags; 1515 unsigned long flags;
1610 conf_t *conf = mddev->private; 1516 conf_t *conf = mddev->private;
1611 struct list_head *head = &conf->retry_list; 1517 struct list_head *head = &conf->retry_list;
1612 int unplug=0;
1613 mdk_rdev_t *rdev; 1518 mdk_rdev_t *rdev;
1519 struct blk_plug plug;
1614 1520
1615 md_check_recovery(mddev); 1521 md_check_recovery(mddev);
1616 1522
1523 blk_start_plug(&plug);
1617 for (;;) { 1524 for (;;) {
1618 char b[BDEVNAME_SIZE]; 1525 char b[BDEVNAME_SIZE];
1619 1526
1620 unplug += flush_pending_writes(conf); 1527 if (atomic_read(&mddev->plug_cnt) == 0)
1528 flush_pending_writes(conf);
1621 1529
1622 spin_lock_irqsave(&conf->device_lock, flags); 1530 spin_lock_irqsave(&conf->device_lock, flags);
1623 if (list_empty(head)) { 1531 if (list_empty(head)) {
@@ -1631,45 +1539,9 @@ static void raid1d(mddev_t *mddev)
1631 1539
1632 mddev = r1_bio->mddev; 1540 mddev = r1_bio->mddev;
1633 conf = mddev->private; 1541 conf = mddev->private;
1634 if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 1542 if (test_bit(R1BIO_IsSync, &r1_bio->state))
1635 sync_request_write(mddev, r1_bio); 1543 sync_request_write(mddev, r1_bio);
1636 unplug = 1; 1544 else {
1637 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
1638 /* some requests in the r1bio were REQ_HARDBARRIER
1639 * requests which failed with -EOPNOTSUPP. Hohumm..
1640 * Better resubmit without the barrier.
1641 * We know which devices to resubmit for, because
1642 * all others have had their bios[] entry cleared.
1643 * We already have a nr_pending reference on these rdevs.
1644 */
1645 int i;
1646 const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
1647 clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1648 clear_bit(R1BIO_Barrier, &r1_bio->state);
1649 for (i=0; i < conf->raid_disks; i++)
1650 if (r1_bio->bios[i])
1651 atomic_inc(&r1_bio->remaining);
1652 for (i=0; i < conf->raid_disks; i++)
1653 if (r1_bio->bios[i]) {
1654 struct bio_vec *bvec;
1655 int j;
1656
1657 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1658 /* copy pages from the failed bio, as
1659 * this might be a write-behind device */
1660 __bio_for_each_segment(bvec, bio, j, 0)
1661 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
1662 bio_put(r1_bio->bios[i]);
1663 bio->bi_sector = r1_bio->sector +
1664 conf->mirrors[i].rdev->data_offset;
1665 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1666 bio->bi_end_io = raid1_end_write_request;
1667 bio->bi_rw = WRITE | do_sync;
1668 bio->bi_private = r1_bio;
1669 r1_bio->bios[i] = bio;
1670 generic_make_request(bio);
1671 }
1672 } else {
1673 int disk; 1545 int disk;
1674 1546
1675 /* we got a read error. Maybe the drive is bad. Maybe just 1547 /* we got a read error. Maybe the drive is bad. Maybe just
@@ -1704,7 +1576,8 @@ static void raid1d(mddev_t *mddev)
1704 mddev->ro ? IO_BLOCKED : NULL; 1576 mddev->ro ? IO_BLOCKED : NULL;
1705 r1_bio->read_disk = disk; 1577 r1_bio->read_disk = disk;
1706 bio_put(bio); 1578 bio_put(bio);
1707 bio = bio_clone(r1_bio->master_bio, GFP_NOIO); 1579 bio = bio_clone_mddev(r1_bio->master_bio,
1580 GFP_NOIO, mddev);
1708 r1_bio->bios[r1_bio->read_disk] = bio; 1581 r1_bio->bios[r1_bio->read_disk] = bio;
1709 rdev = conf->mirrors[disk].rdev; 1582 rdev = conf->mirrors[disk].rdev;
1710 if (printk_ratelimit()) 1583 if (printk_ratelimit())
@@ -1718,14 +1591,12 @@ static void raid1d(mddev_t *mddev)
1718 bio->bi_end_io = raid1_end_read_request; 1591 bio->bi_end_io = raid1_end_read_request;
1719 bio->bi_rw = READ | do_sync; 1592 bio->bi_rw = READ | do_sync;
1720 bio->bi_private = r1_bio; 1593 bio->bi_private = r1_bio;
1721 unplug = 1;
1722 generic_make_request(bio); 1594 generic_make_request(bio);
1723 } 1595 }
1724 } 1596 }
1725 cond_resched(); 1597 cond_resched();
1726 } 1598 }
1727 if (unplug) 1599 blk_finish_plug(&plug);
1728 unplug_slaves(mddev);
1729} 1600}
1730 1601
1731 1602
@@ -1763,7 +1634,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1763 int i; 1634 int i;
1764 int wonly = -1; 1635 int wonly = -1;
1765 int write_targets = 0, read_targets = 0; 1636 int write_targets = 0, read_targets = 0;
1766 int sync_blocks; 1637 sector_t sync_blocks;
1767 int still_degraded = 0; 1638 int still_degraded = 0;
1768 1639
1769 if (!conf->r1buf_pool) 1640 if (!conf->r1buf_pool)
@@ -1813,11 +1684,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1813 msleep_interruptible(1000); 1684 msleep_interruptible(1000);
1814 1685
1815 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 1686 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
1687 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
1816 raise_barrier(conf); 1688 raise_barrier(conf);
1817 1689
1818 conf->next_resync = sector_nr; 1690 conf->next_resync = sector_nr;
1819 1691
1820 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
1821 rcu_read_lock(); 1692 rcu_read_lock();
1822 /* 1693 /*
1823 * If we get a correctably read error during resync or recovery, 1694 * If we get a correctably read error during resync or recovery,
@@ -2029,7 +1900,6 @@ static conf_t *setup_conf(mddev_t *mddev)
2029 init_waitqueue_head(&conf->wait_barrier); 1900 init_waitqueue_head(&conf->wait_barrier);
2030 1901
2031 bio_list_init(&conf->pending_bio_list); 1902 bio_list_init(&conf->pending_bio_list);
2032 bio_list_init(&conf->flushing_bio_list);
2033 1903
2034 conf->last_used = -1; 1904 conf->last_used = -1;
2035 for (i = 0; i < conf->raid_disks; i++) { 1905 for (i = 0; i < conf->raid_disks; i++) {
@@ -2107,8 +1977,9 @@ static int run(mddev_t *mddev)
2107 if (IS_ERR(conf)) 1977 if (IS_ERR(conf))
2108 return PTR_ERR(conf); 1978 return PTR_ERR(conf);
2109 1979
2110 mddev->queue->queue_lock = &conf->device_lock;
2111 list_for_each_entry(rdev, &mddev->disks, same_set) { 1980 list_for_each_entry(rdev, &mddev->disks, same_set) {
1981 if (!mddev->gendisk)
1982 continue;
2112 disk_stack_limits(mddev->gendisk, rdev->bdev, 1983 disk_stack_limits(mddev->gendisk, rdev->bdev,
2113 rdev->data_offset << 9); 1984 rdev->data_offset << 9);
2114 /* as we don't honour merge_bvec_fn, we must never risk 1985 /* as we don't honour merge_bvec_fn, we must never risk
@@ -2150,11 +2021,11 @@ static int run(mddev_t *mddev)
2150 2021
2151 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); 2022 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2152 2023
2153 mddev->queue->unplug_fn = raid1_unplug; 2024 if (mddev->queue) {
2154 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2025 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2155 mddev->queue->backing_dev_info.congested_data = mddev; 2026 mddev->queue->backing_dev_info.congested_data = mddev;
2156 md_integrity_register(mddev); 2027 }
2157 return 0; 2028 return md_integrity_register(mddev);
2158} 2029}
2159 2030
2160static int stop(mddev_t *mddev) 2031static int stop(mddev_t *mddev)
@@ -2176,7 +2047,6 @@ static int stop(mddev_t *mddev)
2176 2047
2177 md_unregister_thread(mddev->thread); 2048 md_unregister_thread(mddev->thread);
2178 mddev->thread = NULL; 2049 mddev->thread = NULL;
2179 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2180 if (conf->r1bio_pool) 2050 if (conf->r1bio_pool)
2181 mempool_destroy(conf->r1bio_pool); 2051 mempool_destroy(conf->r1bio_pool);
2182 kfree(conf->mirrors); 2052 kfree(conf->mirrors);
@@ -2201,7 +2071,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
2201 set_capacity(mddev->gendisk, mddev->array_sectors); 2071 set_capacity(mddev->gendisk, mddev->array_sectors);
2202 revalidate_disk(mddev->gendisk); 2072 revalidate_disk(mddev->gendisk);
2203 if (sectors > mddev->dev_sectors && 2073 if (sectors > mddev->dev_sectors &&
2204 mddev->recovery_cp == MaxSector) { 2074 mddev->recovery_cp > mddev->dev_sectors) {
2205 mddev->recovery_cp = mddev->dev_sectors; 2075 mddev->recovery_cp = mddev->dev_sectors;
2206 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2076 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2207 } 2077 }
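
For reference, the rewritten read_balance() in this diff replaces the old head-position walk with a single pass over all mirrors. The sketch below is a simplified, user-space restatement of that policy under stated assumptions (struct and field names are stand-ins, and the resync-window "choose_first" case is omitted): skip faulty disks, keep a write-mostly disk only as a last resort, take an idle disk or an exact head-position match immediately, and otherwise remember the smallest head distance.

/*
 * Illustrative sketch of the post-merge read-balance policy.
 * Not the kernel code; types and names are simplified stand-ins.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

struct mirror {
	bool faulty;
	bool write_mostly;
	long long head_position;
	int nr_pending;			/* outstanding I/O on this disk */
};

static int read_balance_sketch(struct mirror *m, int raid_disks,
			       int start_disk, long long this_sector)
{
	int best_disk = -1;
	long long best_dist = -1;

	for (int i = 0; i < raid_disks; i++) {
		int disk = (start_disk + i) % raid_disks;
		long long dist;

		if (m[disk].faulty)
			continue;
		if (m[disk].write_mostly) {
			if (best_disk < 0)	/* last resort only */
				best_disk = disk;
			continue;
		}
		dist = llabs(this_sector - m[disk].head_position);
		if (dist == 0 || m[disk].nr_pending == 0)
			return disk;	/* sequential or idle: take it now */
		if (best_dist < 0 || dist < best_dist) {
			best_dist = dist;
			best_disk = disk;
		}
	}
	return best_disk;	/* -1 if nothing was usable */
}

int main(void)
{
	struct mirror m[3] = {
		{ .head_position = 1000, .nr_pending = 2 },
		{ .head_position = 5000, .nr_pending = 3 },
		{ .faulty = true },
	};
	printf("chose disk %d\n", read_balance_sketch(m, 3, 0, 4800));
	return 0;
}
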