Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--  drivers/md/raid1.c  234
1 file changed, 203 insertions(+), 31 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 51d9645ed09c..a93ca478142a 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
 {
 	struct bio *bio = r1_bio->master_bio;
 
-	bio_endio(bio, bio->bi_size,
-		test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
+	/* if nobody has done the final endio yet, do it now */
+	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+		PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
+			(bio_data_dir(bio) == WRITE) ? "write" : "read",
+			(unsigned long long) bio->bi_sector,
+			(unsigned long long) bio->bi_sector +
+				(bio->bi_size >> 9) - 1);
+
+		bio_endio(bio, bio->bi_size,
+			test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
+	}
 	free_r1bio(r1_bio);
 }
 
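With write-behind in the picture, the master bio may already have been completed early from raid1_end_write_request(), so raid_end_bio_io() now guards the final bio_endio() with the new R1BIO_Returned bit: test_and_set_bit() atomically sets the bit and returns its previous value, so exactly one completion path ever runs. A minimal standalone model of that once-only idiom (the stand-in helper below uses a GCC atomic builtin and is not the kernel's implementation):

#include <stdio.h>

static unsigned long state;

/* Stand-in for the kernel's test_and_set_bit(): atomically ORs the
 * bit in and reports whether it was already set. */
static int test_and_set_bit(int nr, unsigned long *addr)
{
	unsigned long mask = 1UL << nr;
	return (__atomic_fetch_or(addr, mask, __ATOMIC_SEQ_CST) & mask) != 0;
}

int main(void)
{
	/* However many completion paths race here, only the first one
	 * observes the bit clear and performs the final endio. */
	for (int i = 0; i < 3; i++)
		if (!test_and_set_bit(0, &state))
			printf("final endio performed by caller %d\n", i);
	return 0;	/* prints exactly once */
}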
@@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
-	int mirror;
+	int mirror, behind;
 	conf_t *conf = mddev_to_conf(r1_bio->mddev);
 
 	if (bio->bi_size)
@@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
 
 	update_head_pos(mirror, r1_bio);
 
+	behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
+	if (behind) {
+		if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+			atomic_dec(&r1_bio->behind_remaining);
+
+		/* In behind mode, we ACK the master bio once the I/O has safely
+		 * reached all non-writemostly disks. Setting the Returned bit
+		 * ensures that this gets done only once -- we don't ever want to
+		 * return -EIO here, instead we'll wait */
+
+		if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+		    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+			/* Maybe we can return now */
+			if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+				struct bio *mbio = r1_bio->master_bio;
+				PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+				       (unsigned long long) mbio->bi_sector,
+				       (unsigned long long) mbio->bi_sector +
+				       (mbio->bi_size >> 9) - 1);
+				bio_endio(mbio, mbio->bi_size, 0);
+			}
+		}
+	}
 	/*
 	 *
 	 * Let's see if all mirrored write operations have finished
 	 * already.
 	 */
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
+		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+			/* free extra copy of the data pages */
+			int i = bio->bi_vcnt;
+			while (i--)
+				__free_page(bio->bi_io_vec[i].bv_page);
+		}
 		/* clear the bitmap if all writes complete successfully */
 		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
 				r1_bio->sectors,
-				!test_bit(R1BIO_Degraded, &r1_bio->state));
+				!test_bit(R1BIO_Degraded, &r1_bio->state),
+				behind);
 		md_write_end(r1_bio->mddev);
 		raid_end_bio_io(r1_bio);
 	}
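The early-ACK condition deserves a close read: behind_remaining counts writes still outstanding to WriteMostly devices, while remaining counts all outstanding writes including the one completing here (it is decremented just below). So behind_remaining >= remaining - 1 means every write still in flight, other than this one, targets a write-mostly device, i.e. the data has reached all fast mirrors and the master bio can be acknowledged (the R1BIO_Uptodate check additionally blocks the early ACK after a failed write). A toy model of that bookkeeping, with hypothetical names throughout:

#include <stdio.h>

struct r1_model {
	int remaining;		/* writes still in flight, all mirrors */
	int behind_remaining;	/* of those, writes to WriteMostly mirrors */
};

/* Called as one per-mirror write completes, before 'remaining' is
 * decremented (matching the kernel's ordering).  Returns 1 when it is
 * safe to acknowledge the master bio early. */
static int write_done(struct r1_model *r, int write_mostly)
{
	if (write_mostly)
		r->behind_remaining--;
	/* everything still pending, minus this write, is write-mostly */
	return r->behind_remaining >= r->remaining - 1;
}

int main(void)
{
	/* one fast mirror and one write-mostly mirror, both in flight */
	struct r1_model r = { .remaining = 2, .behind_remaining = 1 };

	printf("ack after fast mirror completes? %d\n", write_done(&r, 0));
	r.remaining--;	/* the kernel's atomic_dec_and_test() step */
	return 0;
}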
@@ -360,13 +399,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 {
 	const unsigned long this_sector = r1_bio->sector;
 	int new_disk = conf->last_used, disk = new_disk;
+	int wonly_disk = -1;
 	const int sectors = r1_bio->sectors;
 	sector_t new_distance, current_distance;
-	mdk_rdev_t *new_rdev, *rdev;
+	mdk_rdev_t *rdev;
 
 	rcu_read_lock();
 	/*
-	 * Check if it if we can balance. We can balance on the whole
+	 * Check if we can balance. We can balance on the whole
 	 * device if no resync is going on, or below the resync window.
 	 * We take the first readable disk when above the resync window.
 	 */
@@ -376,11 +416,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 		/* Choose the first operation device, for consistancy */
 		new_disk = 0;
 
-		while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL ||
-		       !new_rdev->in_sync) {
-			new_disk++;
-			if (new_disk == conf->raid_disks) {
-				new_disk = -1;
+		for (rdev = conf->mirrors[new_disk].rdev;
+		     !rdev || !rdev->in_sync
+			     || test_bit(WriteMostly, &rdev->flags);
+		     rdev = conf->mirrors[++new_disk].rdev) {
+
+			if (rdev && rdev->in_sync)
+				wonly_disk = new_disk;
+
+			if (new_disk == conf->raid_disks - 1) {
+				new_disk = wonly_disk;
 				break;
 			}
 		}
@@ -389,16 +434,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 
 
 	/* make sure the disk is operational */
-	while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL ||
-	       !new_rdev->in_sync) {
+	for (rdev = conf->mirrors[new_disk].rdev;
+	     !rdev || !rdev->in_sync ||
+		     test_bit(WriteMostly, &rdev->flags);
+	     rdev = conf->mirrors[new_disk].rdev) {
+
+		if (rdev && rdev->in_sync)
+			wonly_disk = new_disk;
+
 		if (new_disk <= 0)
 			new_disk = conf->raid_disks;
 		new_disk--;
 		if (new_disk == disk) {
-			new_disk = -1;
-			goto rb_out;
+			new_disk = wonly_disk;
+			break;
 		}
 	}
+
+	if (new_disk < 0)
+		goto rb_out;
+
 	disk = new_disk;
 	/* now disk == new_disk == starting point for search */
 
@@ -419,37 +474,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 			disk = conf->raid_disks;
 		disk--;
 
-		if ((rdev=conf->mirrors[disk].rdev) == NULL ||
-		    !rdev->in_sync)
+		rdev = conf->mirrors[disk].rdev;
+
+		if (!rdev ||
+		    !rdev->in_sync ||
+		    test_bit(WriteMostly, &rdev->flags))
 			continue;
 
 		if (!atomic_read(&rdev->nr_pending)) {
 			new_disk = disk;
-			new_rdev = rdev;
 			break;
 		}
 		new_distance = abs(this_sector - conf->mirrors[disk].head_position);
 		if (new_distance < current_distance) {
 			current_distance = new_distance;
 			new_disk = disk;
-			new_rdev = rdev;
 		}
 	} while (disk != conf->last_used);
 
-rb_out:
+ rb_out:
 
 
 	if (new_disk >= 0) {
-		conf->next_seq_sect = this_sector + sectors;
-		conf->last_used = new_disk;
-		atomic_inc(&new_rdev->nr_pending);
-		if (!new_rdev->in_sync) {
+		rdev = conf->mirrors[new_disk].rdev;
+		if (!rdev)
+			goto retry;
+		atomic_inc(&rdev->nr_pending);
+		if (!rdev->in_sync) {
 			/* cannot risk returning a device that failed
 			 * before we inc'ed nr_pending
 			 */
-			atomic_dec(&new_rdev->nr_pending);
+			atomic_dec(&rdev->nr_pending);
 			goto retry;
 		}
+		conf->next_seq_sect = this_sector + sectors;
+		conf->last_used = new_disk;
 	}
 	rcu_read_unlock();
 
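All three search paths in read_balance() now skip WriteMostly devices, while each loop remembers the last in-sync disk it saw in wonly_disk so a read can still fall back to a write-mostly device when nothing else is operational. The selection rule, boiled down to a standalone sketch (struct mirror and its fields are illustrative, not the md types):

#include <stdio.h>

#define NDISKS 3

struct mirror {
	int present;		/* rdev != NULL */
	int in_sync;
	int write_mostly;	/* WriteMostly flag */
};

/* Pick a disk for a read: prefer an in-sync disk without WriteMostly;
 * fall back to an in-sync write-mostly disk; else return -1. */
static int pick_read_disk(const struct mirror *m, int n)
{
	int wonly = -1;

	for (int d = 0; d < n; d++) {
		if (!m[d].present || !m[d].in_sync)
			continue;
		if (!m[d].write_mostly)
			return d;	/* best case: a normal mirror */
		wonly = d;		/* remember as a last resort */
	}
	return wonly;
}

int main(void)
{
	struct mirror m[NDISKS] = {
		{ 1, 1, 1 },	/* disk 0: in sync, but write-mostly */
		{ 0, 0, 0 },	/* disk 1: missing */
		{ 1, 1, 0 },	/* disk 2: normal in-sync mirror */
	};
	printf("chosen disk: %d\n", pick_read_disk(m, NDISKS)); /* 2 */
	return 0;
}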
@@ -542,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect)
 	spin_unlock_irq(&conf->resync_lock);
 }
 
+/* duplicate the data pages for behind I/O */
+static struct page **alloc_behind_pages(struct bio *bio)
+{
+	int i;
+	struct bio_vec *bvec;
+	struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *),
+				      GFP_NOIO);
+	if (unlikely(!pages))
+		goto do_sync_io;
+
+	memset(pages, 0, bio->bi_vcnt * sizeof(struct page *));
+
+	bio_for_each_segment(bvec, bio, i) {
+		pages[i] = alloc_page(GFP_NOIO);
+		if (unlikely(!pages[i]))
+			goto do_sync_io;
+		memcpy(kmap(pages[i]) + bvec->bv_offset,
+		       kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
+		kunmap(pages[i]);
+		kunmap(bvec->bv_page);
+	}
+
+	return pages;
+
+do_sync_io:
+	if (pages)
+		for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
+			__free_page(pages[i]);
+	kfree(pages);
+	PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
+	return NULL;
+}
+
 static int make_request(request_queue_t *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
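alloc_behind_pages() is what makes the early ACK safe: once the master bio is returned, its pages belong to the caller again, so each segment must first be snapshotted into privately allocated pages (under kmap()/kunmap(), since bio pages may live in highmem). If any allocation fails, everything obtained so far is released and the request silently degrades to ordinary synchronous mirroring. A userspace sketch of the same copy-or-fall-back shape, with stand-in types:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* struct seg stands in for struct bio_vec; names are illustrative. */
struct seg { const void *base; size_t len; };

static void **snapshot_segments(const struct seg *v, int cnt)
{
	void **copies = calloc(cnt, sizeof(*copies));
	int i;

	if (!copies)
		goto fail;
	for (i = 0; i < cnt; i++) {
		copies[i] = malloc(v[i].len);
		if (!copies[i])
			goto fail;
		memcpy(copies[i], v[i].base, v[i].len);
	}
	return copies;

fail:	/* free whatever was copied; the caller falls back to sync I/O */
	if (copies)
		for (i = 0; i < cnt && copies[i]; i++)
			free(copies[i]);
	free(copies);
	return NULL;
}

int main(void)
{
	struct seg segs[2] = { { "hello", 6 }, { "world", 6 } };
	void **c = snapshot_segments(segs, 2);

	printf("snapshot %s\n", c ? "ok" : "failed, using sync I/O");
	return 0;
}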
@@ -554,7 +646,12 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	struct bitmap *bitmap = mddev->bitmap;
 	unsigned long flags;
 	struct bio_list bl;
+	struct page **behind_pages = NULL;
 
+	if (unlikely(bio_barrier(bio))) {
+		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+		return 0;
+	}
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -589,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	r1_bio->mddev = mddev;
 	r1_bio->sector = bio->bi_sector;
 
-	r1_bio->state = 0;
-
 	if (bio_data_dir(bio) == READ) {
 		/*
 		 * read balancing logic:
@@ -651,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	}
 	rcu_read_unlock();
 
+	BUG_ON(targets == 0); /* we never fail the last device */
+
 	if (targets < conf->raid_disks) {
 		/* array is degraded, we will not clear the bitmap
 		 * on I/O completion (see raid1_end_write_request) */
 		set_bit(R1BIO_Degraded, &r1_bio->state);
 	}
 
+	/* do behind I/O ? */
+	if (bitmap &&
+	    atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
+	    (behind_pages = alloc_behind_pages(bio)) != NULL)
+		set_bit(R1BIO_BehindIO, &r1_bio->state);
+
 	atomic_set(&r1_bio->remaining, 0);
+	atomic_set(&r1_bio->behind_remaining, 0);
 
 	bio_list_init(&bl);
 	for (i = 0; i < disks; i++) {
@@ -674,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		mbio->bi_rw = WRITE;
 		mbio->bi_private = r1_bio;
 
+		if (behind_pages) {
+			struct bio_vec *bvec;
+			int j;
+
+			/* Yes, I really want the '__' version so that
+			 * we clear any unused pointer in the io_vec, rather
+			 * than leave them unchanged. This is important
+			 * because when we come to free the pages, we won't
+			 * know the originial bi_idx, so we just free
+			 * them all
+			 */
+			__bio_for_each_segment(bvec, mbio, j, 0)
+				bvec->bv_page = behind_pages[j];
+			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
+				atomic_inc(&r1_bio->behind_remaining);
+		}
+
 		atomic_inc(&r1_bio->remaining);
 
 		bio_list_add(&bl, mbio);
 	}
+	kfree(behind_pages); /* the behind pages are attached to the bios now */
 
-	bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors);
+	bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
+			  test_bit(R1BIO_BehindIO, &r1_bio->state));
 	spin_lock_irqsave(&conf->device_lock, flags);
 	bio_list_merge(&conf->pending_bio_list, &bl);
 	bio_list_init(&bl);
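Two details here are easy to miss. First, the swap loop uses __bio_for_each_segment(..., 0) so every io_vec slot is overwritten from index 0, because the completion path frees all bi_vcnt pages without knowing the clone's original bi_idx. Second, kfree(behind_pages) only releases the pointer array; the pages themselves now live in the bios and are freed in raid1_end_write_request(). A toy demonstration of the resulting "attach every slot, then blindly free every slot" contract (names illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SLOTS 3

int main(void)
{
	char *io_vec[SLOTS];

	/* attach: from slot 0, like __bio_for_each_segment(..., 0);
	 * if any slot kept the caller's pointer, the free below would
	 * release memory we do not own */
	for (int j = 0; j < SLOTS; j++)
		io_vec[j] = strdup("private behind copy");

	/* complete: free them all without knowing the original index */
	for (int j = 0; j < SLOTS; j++)
		free(io_vec[j]);

	puts("all slots freed safely");
	return 0;
}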
@@ -1105,6 +1228,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	sector_t max_sector, nr_sectors;
 	int disk;
 	int i;
+	int wonly;
 	int write_targets = 0;
 	int sync_blocks;
 	int still_degraded = 0;
@@ -1160,14 +1284,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	 */
 	disk = conf->last_used;
 	/* make sure disk is operational */
-
+	wonly = disk;
 	while (conf->mirrors[disk].rdev == NULL ||
-	       !conf->mirrors[disk].rdev->in_sync) {
+	       !conf->mirrors[disk].rdev->in_sync ||
+	       test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
+		) {
+		if (conf->mirrors[disk].rdev &&
+		    conf->mirrors[disk].rdev->in_sync)
+			wonly = disk;
 		if (disk <= 0)
 			disk = conf->raid_disks;
 		disk--;
-		if (disk == conf->last_used)
+		if (disk == conf->last_used) {
+			disk = wonly;
 			break;
+		}
 	}
 	conf->last_used = disk;
 	atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
@@ -1439,6 +1570,17 @@ out:
 static int stop(mddev_t *mddev)
 {
 	conf_t *conf = mddev_to_conf(mddev);
+	struct bitmap *bitmap = mddev->bitmap;
+	int behind_wait = 0;
+
+	/* wait for behind writes to complete */
+	while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+		behind_wait++;
+		printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(HZ); /* wait a second */
+		/* need to kick something here to make sure I/O goes? */
+	}
 
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
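stop() must not tear down the conf while behind writes are still in flight, since their completion handlers would touch freed structures; it therefore polls bitmap->behind_writes once per second in TASK_UNINTERRUPTIBLE until the counter drains, and the in-code comment leaves open whether something should be kicked to push that I/O along. The same poll-until-idle shape in a runnable userspace model (pthreads and sleep() stand in for kernel scheduling; the counter and worker are stand-ins, not md structures):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static _Atomic int behind_writes = 2;

static void *worker(void *arg)
{
	(void)arg;
	sleep(1);
	behind_writes--;	/* one behind write completes */
	sleep(1);
	behind_writes--;	/* ...and the other */
	return NULL;
}

int main(void)
{
	pthread_t t;
	int waited = 0;

	pthread_create(&t, NULL, worker, NULL);
	while (behind_writes > 0) {
		waited++;
		printf("behind writes in progress, waiting to stop (%d)\n",
		       waited);
		sleep(1);	/* wait a second, then re-check */
	}
	pthread_join(t, NULL);
	puts("safe to stop");
	return 0;
}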
@@ -1561,6 +1703,35 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
 	return 0;
 }
 
+static void raid1_quiesce(mddev_t *mddev, int state)
+{
+	conf_t *conf = mddev_to_conf(mddev);
+
+	switch(state) {
+	case 1:
+		spin_lock_irq(&conf->resync_lock);
+		conf->barrier++;
+		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
+				    conf->resync_lock, raid1_unplug(mddev->queue));
+		spin_unlock_irq(&conf->resync_lock);
+		break;
+	case 0:
+		spin_lock_irq(&conf->resync_lock);
+		conf->barrier--;
+		spin_unlock_irq(&conf->resync_lock);
+		wake_up(&conf->wait_resume);
+		wake_up(&conf->wait_idle);
+		break;
+	}
+	if (mddev->thread) {
+		if (mddev->bitmap)
+			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+		else
+			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+		md_wakeup_thread(mddev->thread);
+	}
+}
+
 
 static mdk_personality_t raid1_personality =
 {
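raid1_quiesce() gives the bitmap code a way to stop and restart regular I/O: state 1 raises conf->barrier and sleeps until conf->nr_pending drains, state 0 lowers the barrier and wakes both wait queues, and in either case the md thread timeout is retuned to the bitmap daemon's sleep interval. The core is a classic barrier/drain handshake, modeled below with pthreads (shape only; the kernel version uses resync_lock plus wait_event_lock_irq):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t idle = PTHREAD_COND_INITIALIZER;
static pthread_cond_t resume = PTHREAD_COND_INITIALIZER;
static int barrier, nr_pending;

static void quiesce(int state)
{
	pthread_mutex_lock(&lock);
	if (state) {			/* case 1: suspend */
		barrier++;
		while (nr_pending)	/* drain in-flight I/O */
			pthread_cond_wait(&idle, &lock);
	} else {			/* case 0: resume */
		barrier--;
		pthread_cond_broadcast(&resume);
	}
	pthread_mutex_unlock(&lock);
}

static void io_start(void)
{
	pthread_mutex_lock(&lock);
	while (barrier)			/* new I/O waits out the barrier */
		pthread_cond_wait(&resume, &lock);
	nr_pending++;
	pthread_mutex_unlock(&lock);
}

static void io_end(void)
{
	pthread_mutex_lock(&lock);
	if (--nr_pending == 0)
		pthread_cond_signal(&idle);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	io_start(); io_end();		/* normal I/O */
	quiesce(1);			/* drain and block new I/O */
	puts("array quiesced");
	quiesce(0);			/* let I/O flow again */
	io_start(); io_end();
	puts("resumed");
	return 0;
}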
@@ -1577,6 +1748,7 @@ static mdk_personality_t raid1_personality =
 	.sync_request	= sync_request,
 	.resize		= raid1_resize,
 	.reshape	= raid1_reshape,
+	.quiesce	= raid1_quiesce,
 };
 
 static int __init raid_init(void)