aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2008-07-21 13:29:12 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2008-07-21 13:29:12 -0400
commit: 8a392625b665c676a77c62f8608d10ff430bcb83 (patch)
tree: 4000a65d61baed73200e47f91dea5263ed16edd0 /drivers/md/raid5.c
parent: 519f0141f1c42e2b8b59c7dea005cbf6095358e8 (diff)
parent: 4b80991c6cb9efa607bc4fd6f3ecdf5511c31bb0 (diff)
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (52 commits)
  md: Protect access to mddev->disks list using RCU
  md: only count actual openers as access which prevent a 'stop'
  md: linear: Make array_size sector-based and rename it to array_sectors.
  md: Make mddev->array_size sector-based.
  md: Make super_type->rdev_size_change() take sector-based sizes.
  md: Fix check for overlapping devices.
  md: Tidy up rdev_size_store a bit:
  md: Remove some unused macros.
  md: Turn rdev->sb_offset into a sector-based quantity.
  md: Make calc_dev_sboffset() return a sector count.
  md: Replace calc_dev_size() by calc_num_sectors().
  md: Make update_size() take the number of sectors.
  md: Better control of when do_md_stop is allowed to stop the array.
  md: get_disk_info(): Don't convert between signed and unsigned and back.
  md: Simplify restart_array().
  md: alloc_disk_sb(): Return proper error value.
  md: Simplify sb_equal().
  md: Simplify uuid_equal().
  md: sb_equal(): Fix misleading printk.
  md: Fix a typo in the comment to cmd_match().
  ...
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- drivers/md/raid5.c 745
1 files changed, 255 insertions, 490 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9ce7154845c6..55e7c56045a0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -115,15 +115,20 @@ static void return_io(struct bio *return_bi)
115 return_bi = bi->bi_next; 115 return_bi = bi->bi_next;
116 bi->bi_next = NULL; 116 bi->bi_next = NULL;
117 bi->bi_size = 0; 117 bi->bi_size = 0;
118 bi->bi_end_io(bi, 118 bio_endio(bi, 0);
119 test_bit(BIO_UPTODATE, &bi->bi_flags)
120 ? 0 : -EIO);
121 bi = return_bi; 119 bi = return_bi;
122 } 120 }
123} 121}
124 122
125static void print_raid5_conf (raid5_conf_t *conf); 123static void print_raid5_conf (raid5_conf_t *conf);
126 124
125static int stripe_operations_active(struct stripe_head *sh)
126{
127 return sh->check_state || sh->reconstruct_state ||
128 test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
129 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
130}
131
127static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 132static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
128{ 133{
129 if (atomic_dec_and_test(&sh->count)) { 134 if (atomic_dec_and_test(&sh->count)) {
@@ -143,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
143 } 148 }
144 md_wakeup_thread(conf->mddev->thread); 149 md_wakeup_thread(conf->mddev->thread);
145 } else { 150 } else {
146 BUG_ON(sh->ops.pending); 151 BUG_ON(stripe_operations_active(sh));
147 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 152 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
148 atomic_dec(&conf->preread_active_stripes); 153 atomic_dec(&conf->preread_active_stripes);
149 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 154 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -245,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
245 250
246 BUG_ON(atomic_read(&sh->count) != 0); 251 BUG_ON(atomic_read(&sh->count) != 0);
247 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 252 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
248 BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); 253 BUG_ON(stripe_operations_active(sh));
249 254
250 CHECK_DEVLOCK(); 255 CHECK_DEVLOCK();
251 pr_debug("init_stripe called, stripe %llu\n", 256 pr_debug("init_stripe called, stripe %llu\n",
@@ -346,62 +351,18 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
346 return sh; 351 return sh;
347} 352}
348 353
349/* test_and_ack_op() ensures that we only dequeue an operation once */
350#define test_and_ack_op(op, pend) \
351do { \
352 if (test_bit(op, &sh->ops.pending) && \
353 !test_bit(op, &sh->ops.complete)) { \
354 if (test_and_set_bit(op, &sh->ops.ack)) \
355 clear_bit(op, &pend); \
356 else \
357 ack++; \
358 } else \
359 clear_bit(op, &pend); \
360} while (0)
361
362/* find new work to run, do not resubmit work that is already
363 * in flight
364 */
365static unsigned long get_stripe_work(struct stripe_head *sh)
366{
367 unsigned long pending;
368 int ack = 0;
369
370 pending = sh->ops.pending;
371
372 test_and_ack_op(STRIPE_OP_BIOFILL, pending);
373 test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending);
374 test_and_ack_op(STRIPE_OP_PREXOR, pending);
375 test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
376 test_and_ack_op(STRIPE_OP_POSTXOR, pending);
377 test_and_ack_op(STRIPE_OP_CHECK, pending);
378 if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
379 ack++;
380
381 sh->ops.count -= ack;
382 if (unlikely(sh->ops.count < 0)) {
383 printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
384 "ops.complete: %#lx\n", pending, sh->ops.pending,
385 sh->ops.ack, sh->ops.complete);
386 BUG();
387 }
388
389 return pending;
390}
391
392static void 354static void
393raid5_end_read_request(struct bio *bi, int error); 355raid5_end_read_request(struct bio *bi, int error);
394static void 356static void
395raid5_end_write_request(struct bio *bi, int error); 357raid5_end_write_request(struct bio *bi, int error);
396 358
397static void ops_run_io(struct stripe_head *sh) 359static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
398{ 360{
399 raid5_conf_t *conf = sh->raid_conf; 361 raid5_conf_t *conf = sh->raid_conf;
400 int i, disks = sh->disks; 362 int i, disks = sh->disks;
401 363
402 might_sleep(); 364 might_sleep();
403 365
404 set_bit(STRIPE_IO_STARTED, &sh->state);
405 for (i = disks; i--; ) { 366 for (i = disks; i--; ) {
406 int rw; 367 int rw;
407 struct bio *bi; 368 struct bio *bi;
@@ -430,11 +391,11 @@ static void ops_run_io(struct stripe_head *sh)
430 rcu_read_unlock(); 391 rcu_read_unlock();
431 392
432 if (rdev) { 393 if (rdev) {
433 if (test_bit(STRIPE_SYNCING, &sh->state) || 394 if (s->syncing || s->expanding || s->expanded)
434 test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
435 test_bit(STRIPE_EXPAND_READY, &sh->state))
436 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 395 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
437 396
397 set_bit(STRIPE_IO_STARTED, &sh->state);
398
438 bi->bi_bdev = rdev->bdev; 399 bi->bi_bdev = rdev->bdev;
439 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 400 pr_debug("%s: for %llu schedule op %ld on disc %d\n",
440 __func__, (unsigned long long)sh->sector, 401 __func__, (unsigned long long)sh->sector,
@@ -528,38 +489,34 @@ static void ops_complete_biofill(void *stripe_head_ref)
528 (unsigned long long)sh->sector); 489 (unsigned long long)sh->sector);
529 490
530 /* clear completed biofills */ 491 /* clear completed biofills */
492 spin_lock_irq(&conf->device_lock);
531 for (i = sh->disks; i--; ) { 493 for (i = sh->disks; i--; ) {
532 struct r5dev *dev = &sh->dev[i]; 494 struct r5dev *dev = &sh->dev[i];
533 495
534 /* acknowledge completion of a biofill operation */ 496 /* acknowledge completion of a biofill operation */
535 /* and check if we need to reply to a read request, 497 /* and check if we need to reply to a read request,
536 * new R5_Wantfill requests are held off until 498 * new R5_Wantfill requests are held off until
537 * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) 499 * !STRIPE_BIOFILL_RUN
538 */ 500 */
539 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 501 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
540 struct bio *rbi, *rbi2; 502 struct bio *rbi, *rbi2;
541 503
542 /* The access to dev->read is outside of the
543 * spin_lock_irq(&conf->device_lock), but is protected
544 * by the STRIPE_OP_BIOFILL pending bit
545 */
546 BUG_ON(!dev->read); 504 BUG_ON(!dev->read);
547 rbi = dev->read; 505 rbi = dev->read;
548 dev->read = NULL; 506 dev->read = NULL;
549 while (rbi && rbi->bi_sector < 507 while (rbi && rbi->bi_sector <
550 dev->sector + STRIPE_SECTORS) { 508 dev->sector + STRIPE_SECTORS) {
551 rbi2 = r5_next_bio(rbi, dev->sector); 509 rbi2 = r5_next_bio(rbi, dev->sector);
552 spin_lock_irq(&conf->device_lock);
553 if (--rbi->bi_phys_segments == 0) { 510 if (--rbi->bi_phys_segments == 0) {
554 rbi->bi_next = return_bi; 511 rbi->bi_next = return_bi;
555 return_bi = rbi; 512 return_bi = rbi;
556 } 513 }
557 spin_unlock_irq(&conf->device_lock);
558 rbi = rbi2; 514 rbi = rbi2;
559 } 515 }
560 } 516 }
561 } 517 }
562 set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); 518 spin_unlock_irq(&conf->device_lock);
519 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
563 520
564 return_io(return_bi); 521 return_io(return_bi);
565 522
@@ -610,13 +567,14 @@ static void ops_complete_compute5(void *stripe_head_ref)
610 set_bit(R5_UPTODATE, &tgt->flags); 567 set_bit(R5_UPTODATE, &tgt->flags);
611 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 568 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
612 clear_bit(R5_Wantcompute, &tgt->flags); 569 clear_bit(R5_Wantcompute, &tgt->flags);
613 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); 570 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
571 if (sh->check_state == check_state_compute_run)
572 sh->check_state = check_state_compute_result;
614 set_bit(STRIPE_HANDLE, &sh->state); 573 set_bit(STRIPE_HANDLE, &sh->state);
615 release_stripe(sh); 574 release_stripe(sh);
616} 575}
617 576
618static struct dma_async_tx_descriptor * 577static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
619ops_run_compute5(struct stripe_head *sh, unsigned long pending)
620{ 578{
621 /* kernel stack size limits the total number of disks */ 579 /* kernel stack size limits the total number of disks */
622 int disks = sh->disks; 580 int disks = sh->disks;
@@ -646,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending)
646 ASYNC_TX_XOR_ZERO_DST, NULL, 604 ASYNC_TX_XOR_ZERO_DST, NULL,
647 ops_complete_compute5, sh); 605 ops_complete_compute5, sh);
648 606
649 /* ack now if postxor is not set to be run */
650 if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
651 async_tx_ack(tx);
652
653 return tx; 607 return tx;
654} 608}
655 609
@@ -659,8 +613,6 @@ static void ops_complete_prexor(void *stripe_head_ref)
659 613
660 pr_debug("%s: stripe %llu\n", __func__, 614 pr_debug("%s: stripe %llu\n", __func__,
661 (unsigned long long)sh->sector); 615 (unsigned long long)sh->sector);
662
663 set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
664} 616}
665 617
666static struct dma_async_tx_descriptor * 618static struct dma_async_tx_descriptor *
@@ -680,7 +632,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
680 for (i = disks; i--; ) { 632 for (i = disks; i--; ) {
681 struct r5dev *dev = &sh->dev[i]; 633 struct r5dev *dev = &sh->dev[i];
682 /* Only process blocks that are known to be uptodate */ 634 /* Only process blocks that are known to be uptodate */
683 if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) 635 if (test_bit(R5_Wantdrain, &dev->flags))
684 xor_srcs[count++] = dev->page; 636 xor_srcs[count++] = dev->page;
685 } 637 }
686 638
@@ -692,16 +644,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
692} 644}
693 645
694static struct dma_async_tx_descriptor * 646static struct dma_async_tx_descriptor *
695ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, 647ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
696 unsigned long pending)
697{ 648{
698 int disks = sh->disks; 649 int disks = sh->disks;
699 int pd_idx = sh->pd_idx, i; 650 int i;
700
701 /* check if prexor is active which means only process blocks
702 * that are part of a read-modify-write (Wantprexor)
703 */
704 int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
705 651
706 pr_debug("%s: stripe %llu\n", __func__, 652 pr_debug("%s: stripe %llu\n", __func__,
707 (unsigned long long)sh->sector); 653 (unsigned long long)sh->sector);
@@ -709,20 +655,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
709 for (i = disks; i--; ) { 655 for (i = disks; i--; ) {
710 struct r5dev *dev = &sh->dev[i]; 656 struct r5dev *dev = &sh->dev[i];
711 struct bio *chosen; 657 struct bio *chosen;
712 int towrite;
713
714 towrite = 0;
715 if (prexor) { /* rmw */
716 if (dev->towrite &&
717 test_bit(R5_Wantprexor, &dev->flags))
718 towrite = 1;
719 } else { /* rcw */
720 if (i != pd_idx && dev->towrite &&
721 test_bit(R5_LOCKED, &dev->flags))
722 towrite = 1;
723 }
724 658
725 if (towrite) { 659 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
726 struct bio *wbi; 660 struct bio *wbi;
727 661
728 spin_lock(&sh->lock); 662 spin_lock(&sh->lock);
@@ -747,18 +681,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
747static void ops_complete_postxor(void *stripe_head_ref) 681static void ops_complete_postxor(void *stripe_head_ref)
748{ 682{
749 struct stripe_head *sh = stripe_head_ref; 683 struct stripe_head *sh = stripe_head_ref;
750
751 pr_debug("%s: stripe %llu\n", __func__,
752 (unsigned long long)sh->sector);
753
754 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
755 set_bit(STRIPE_HANDLE, &sh->state);
756 release_stripe(sh);
757}
758
759static void ops_complete_write(void *stripe_head_ref)
760{
761 struct stripe_head *sh = stripe_head_ref;
762 int disks = sh->disks, i, pd_idx = sh->pd_idx; 684 int disks = sh->disks, i, pd_idx = sh->pd_idx;
763 685
764 pr_debug("%s: stripe %llu\n", __func__, 686 pr_debug("%s: stripe %llu\n", __func__,
@@ -770,16 +692,21 @@ static void ops_complete_write(void *stripe_head_ref)
770 set_bit(R5_UPTODATE, &dev->flags); 692 set_bit(R5_UPTODATE, &dev->flags);
771 } 693 }
772 694
773 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); 695 if (sh->reconstruct_state == reconstruct_state_drain_run)
774 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); 696 sh->reconstruct_state = reconstruct_state_drain_result;
697 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
698 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
699 else {
700 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
701 sh->reconstruct_state = reconstruct_state_result;
702 }
775 703
776 set_bit(STRIPE_HANDLE, &sh->state); 704 set_bit(STRIPE_HANDLE, &sh->state);
777 release_stripe(sh); 705 release_stripe(sh);
778} 706}
779 707
780static void 708static void
781ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, 709ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
782 unsigned long pending)
783{ 710{
784 /* kernel stack size limits the total number of disks */ 711 /* kernel stack size limits the total number of disks */
785 int disks = sh->disks; 712 int disks = sh->disks;
@@ -787,9 +714,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
787 714
788 int count = 0, pd_idx = sh->pd_idx, i; 715 int count = 0, pd_idx = sh->pd_idx, i;
789 struct page *xor_dest; 716 struct page *xor_dest;
790 int prexor = test_bit(STRIPE_OP_PREXOR, &pending); 717 int prexor = 0;
791 unsigned long flags; 718 unsigned long flags;
792 dma_async_tx_callback callback;
793 719
794 pr_debug("%s: stripe %llu\n", __func__, 720 pr_debug("%s: stripe %llu\n", __func__,
795 (unsigned long long)sh->sector); 721 (unsigned long long)sh->sector);
@@ -797,7 +723,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
797 /* check if prexor is active which means only process blocks 723 /* check if prexor is active which means only process blocks
798 * that are part of a read-modify-write (written) 724 * that are part of a read-modify-write (written)
799 */ 725 */
800 if (prexor) { 726 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
727 prexor = 1;
801 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 728 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
802 for (i = disks; i--; ) { 729 for (i = disks; i--; ) {
803 struct r5dev *dev = &sh->dev[i]; 730 struct r5dev *dev = &sh->dev[i];
@@ -813,10 +740,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
813 } 740 }
814 } 741 }
815 742
816 /* check whether this postxor is part of a write */
817 callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ?
818 ops_complete_write : ops_complete_postxor;
819
820 /* 1/ if we prexor'd then the dest is reused as a source 743 /* 1/ if we prexor'd then the dest is reused as a source
821 * 2/ if we did not prexor then we are redoing the parity 744 * 2/ if we did not prexor then we are redoing the parity
822 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 745 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
@@ -830,25 +753,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
830 if (unlikely(count == 1)) { 753 if (unlikely(count == 1)) {
831 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); 754 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
832 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 755 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
833 flags, tx, callback, sh); 756 flags, tx, ops_complete_postxor, sh);
834 } else 757 } else
835 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 758 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
836 flags, tx, callback, sh); 759 flags, tx, ops_complete_postxor, sh);
837} 760}
838 761
839static void ops_complete_check(void *stripe_head_ref) 762static void ops_complete_check(void *stripe_head_ref)
840{ 763{
841 struct stripe_head *sh = stripe_head_ref; 764 struct stripe_head *sh = stripe_head_ref;
842 int pd_idx = sh->pd_idx;
843 765
844 pr_debug("%s: stripe %llu\n", __func__, 766 pr_debug("%s: stripe %llu\n", __func__,
845 (unsigned long long)sh->sector); 767 (unsigned long long)sh->sector);
846 768
847 if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && 769 sh->check_state = check_state_check_result;
848 sh->ops.zero_sum_result == 0)
849 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
850
851 set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
852 set_bit(STRIPE_HANDLE, &sh->state); 770 set_bit(STRIPE_HANDLE, &sh->state);
853 release_stripe(sh); 771 release_stripe(sh);
854} 772}
@@ -875,46 +793,42 @@ static void ops_run_check(struct stripe_head *sh)
875 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 793 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
876 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); 794 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
877 795
878 if (tx)
879 set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
880 else
881 clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
882
883 atomic_inc(&sh->count); 796 atomic_inc(&sh->count);
884 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 797 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
885 ops_complete_check, sh); 798 ops_complete_check, sh);
886} 799}
887 800
888static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) 801static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
889{ 802{
890 int overlap_clear = 0, i, disks = sh->disks; 803 int overlap_clear = 0, i, disks = sh->disks;
891 struct dma_async_tx_descriptor *tx = NULL; 804 struct dma_async_tx_descriptor *tx = NULL;
892 805
893 if (test_bit(STRIPE_OP_BIOFILL, &pending)) { 806 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
894 ops_run_biofill(sh); 807 ops_run_biofill(sh);
895 overlap_clear++; 808 overlap_clear++;
896 } 809 }
897 810
898 if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) 811 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
899 tx = ops_run_compute5(sh, pending); 812 tx = ops_run_compute5(sh);
813 /* terminate the chain if postxor is not set to be run */
814 if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
815 async_tx_ack(tx);
816 }
900 817
901 if (test_bit(STRIPE_OP_PREXOR, &pending)) 818 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
902 tx = ops_run_prexor(sh, tx); 819 tx = ops_run_prexor(sh, tx);
903 820
904 if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { 821 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
905 tx = ops_run_biodrain(sh, tx, pending); 822 tx = ops_run_biodrain(sh, tx);
906 overlap_clear++; 823 overlap_clear++;
907 } 824 }
908 825
909 if (test_bit(STRIPE_OP_POSTXOR, &pending)) 826 if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
910 ops_run_postxor(sh, tx, pending); 827 ops_run_postxor(sh, tx);
911 828
912 if (test_bit(STRIPE_OP_CHECK, &pending)) 829 if (test_bit(STRIPE_OP_CHECK, &ops_request))
913 ops_run_check(sh); 830 ops_run_check(sh);
914 831
915 if (test_bit(STRIPE_OP_IO, &pending))
916 ops_run_io(sh);
917
918 if (overlap_clear) 832 if (overlap_clear)
919 for (i = disks; i--; ) { 833 for (i = disks; i--; ) {
920 struct r5dev *dev = &sh->dev[i]; 834 struct r5dev *dev = &sh->dev[i];
@@ -997,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
997 struct stripe_head *osh, *nsh; 911 struct stripe_head *osh, *nsh;
998 LIST_HEAD(newstripes); 912 LIST_HEAD(newstripes);
999 struct disk_info *ndisks; 913 struct disk_info *ndisks;
1000 int err = 0; 914 int err;
1001 struct kmem_cache *sc; 915 struct kmem_cache *sc;
1002 int i; 916 int i;
1003 917
1004 if (newsize <= conf->pool_size) 918 if (newsize <= conf->pool_size)
1005 return 0; /* never bother to shrink */ 919 return 0; /* never bother to shrink */
1006 920
1007 md_allow_write(conf->mddev); 921 err = md_allow_write(conf->mddev);
922 if (err)
923 return err;
1008 924
1009 /* Step 1 */ 925 /* Step 1 */
1010 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 926 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -1703,11 +1619,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1703 } 1619 }
1704} 1620}
1705 1621
1706static int 1622static void
1707handle_write_operations5(struct stripe_head *sh, int rcw, int expand) 1623schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1624 int rcw, int expand)
1708{ 1625{
1709 int i, pd_idx = sh->pd_idx, disks = sh->disks; 1626 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1710 int locked = 0;
1711 1627
1712 if (rcw) { 1628 if (rcw) {
1713 /* if we are not expanding this is a proper write request, and 1629 /* if we are not expanding this is a proper write request, and
@@ -1715,53 +1631,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1715 * stripe cache 1631 * stripe cache
1716 */ 1632 */
1717 if (!expand) { 1633 if (!expand) {
1718 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); 1634 sh->reconstruct_state = reconstruct_state_drain_run;
1719 sh->ops.count++; 1635 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1720 } 1636 } else
1637 sh->reconstruct_state = reconstruct_state_run;
1721 1638
1722 set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); 1639 set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
1723 sh->ops.count++;
1724 1640
1725 for (i = disks; i--; ) { 1641 for (i = disks; i--; ) {
1726 struct r5dev *dev = &sh->dev[i]; 1642 struct r5dev *dev = &sh->dev[i];
1727 1643
1728 if (dev->towrite) { 1644 if (dev->towrite) {
1729 set_bit(R5_LOCKED, &dev->flags); 1645 set_bit(R5_LOCKED, &dev->flags);
1646 set_bit(R5_Wantdrain, &dev->flags);
1730 if (!expand) 1647 if (!expand)
1731 clear_bit(R5_UPTODATE, &dev->flags); 1648 clear_bit(R5_UPTODATE, &dev->flags);
1732 locked++; 1649 s->locked++;
1733 } 1650 }
1734 } 1651 }
1735 if (locked + 1 == disks) 1652 if (s->locked + 1 == disks)
1736 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 1653 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1737 atomic_inc(&sh->raid_conf->pending_full_writes); 1654 atomic_inc(&sh->raid_conf->pending_full_writes);
1738 } else { 1655 } else {
1739 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 1656 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1740 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 1657 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1741 1658
1742 set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); 1659 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1743 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); 1660 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1744 set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); 1661 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1745 1662 set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
1746 sh->ops.count += 3;
1747 1663
1748 for (i = disks; i--; ) { 1664 for (i = disks; i--; ) {
1749 struct r5dev *dev = &sh->dev[i]; 1665 struct r5dev *dev = &sh->dev[i];
1750 if (i == pd_idx) 1666 if (i == pd_idx)
1751 continue; 1667 continue;
1752 1668
1753 /* For a read-modify write there may be blocks that are
1754 * locked for reading while others are ready to be
1755 * written so we distinguish these blocks by the
1756 * R5_Wantprexor bit
1757 */
1758 if (dev->towrite && 1669 if (dev->towrite &&
1759 (test_bit(R5_UPTODATE, &dev->flags) || 1670 (test_bit(R5_UPTODATE, &dev->flags) ||
1760 test_bit(R5_Wantcompute, &dev->flags))) { 1671 test_bit(R5_Wantcompute, &dev->flags))) {
1761 set_bit(R5_Wantprexor, &dev->flags); 1672 set_bit(R5_Wantdrain, &dev->flags);
1762 set_bit(R5_LOCKED, &dev->flags); 1673 set_bit(R5_LOCKED, &dev->flags);
1763 clear_bit(R5_UPTODATE, &dev->flags); 1674 clear_bit(R5_UPTODATE, &dev->flags);
1764 locked++; 1675 s->locked++;
1765 } 1676 }
1766 } 1677 }
1767 } 1678 }
@@ -1771,13 +1682,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1771 */ 1682 */
1772 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 1683 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1773 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1684 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1774 locked++; 1685 s->locked++;
1775 1686
1776 pr_debug("%s: stripe %llu locked: %d pending: %lx\n", 1687 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
1777 __func__, (unsigned long long)sh->sector, 1688 __func__, (unsigned long long)sh->sector,
1778 locked, sh->ops.pending); 1689 s->locked, s->ops_request);
1779
1780 return locked;
1781} 1690}
1782 1691
1783/* 1692/*
@@ -1876,7 +1785,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1876} 1785}
1877 1786
1878static void 1787static void
1879handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, 1788handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
1880 struct stripe_head_state *s, int disks, 1789 struct stripe_head_state *s, int disks,
1881 struct bio **return_bi) 1790 struct bio **return_bi)
1882{ 1791{
@@ -1967,48 +1876,38 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
1967 md_wakeup_thread(conf->mddev->thread); 1876 md_wakeup_thread(conf->mddev->thread);
1968} 1877}
1969 1878
1970/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks 1879/* fetch_block5 - checks the given member device to see if its data needs
1971 * to process 1880 * to be read or computed to satisfy a request.
1881 *
1882 * Returns 1 when no more member devices need to be checked, otherwise returns
1883 * 0 to tell the loop in handle_stripe_fill5 to continue
1972 */ 1884 */
1973static int __handle_issuing_new_read_requests5(struct stripe_head *sh, 1885static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
1974 struct stripe_head_state *s, int disk_idx, int disks) 1886 int disk_idx, int disks)
1975{ 1887{
1976 struct r5dev *dev = &sh->dev[disk_idx]; 1888 struct r5dev *dev = &sh->dev[disk_idx];
1977 struct r5dev *failed_dev = &sh->dev[s->failed_num]; 1889 struct r5dev *failed_dev = &sh->dev[s->failed_num];
1978 1890
1979 /* don't schedule compute operations or reads on the parity block while
1980 * a check is in flight
1981 */
1982 if ((disk_idx == sh->pd_idx) &&
1983 test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
1984 return ~0;
1985
1986 /* is the data in this block needed, and can we get it? */ 1891 /* is the data in this block needed, and can we get it? */
1987 if (!test_bit(R5_LOCKED, &dev->flags) && 1892 if (!test_bit(R5_LOCKED, &dev->flags) &&
1988 !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || 1893 !test_bit(R5_UPTODATE, &dev->flags) &&
1989 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 1894 (dev->toread ||
1990 s->syncing || s->expanding || (s->failed && 1895 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1991 (failed_dev->toread || (failed_dev->towrite && 1896 s->syncing || s->expanding ||
1992 !test_bit(R5_OVERWRITE, &failed_dev->flags) 1897 (s->failed &&
1993 ))))) { 1898 (failed_dev->toread ||
1994 /* 1/ We would like to get this block, possibly by computing it, 1899 (failed_dev->towrite &&
1995 * but we might not be able to. 1900 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
1996 * 1901 /* We would like to get this block, possibly by computing it,
1997 * 2/ Since parity check operations potentially make the parity 1902 * otherwise read it if the backing disk is insync
1998 * block !uptodate it will need to be refreshed before any
1999 * compute operations on data disks are scheduled.
2000 *
2001 * 3/ We hold off parity block re-reads until check operations
2002 * have quiesced.
2003 */ 1903 */
2004 if ((s->uptodate == disks - 1) && 1904 if ((s->uptodate == disks - 1) &&
2005 (s->failed && disk_idx == s->failed_num) && 1905 (s->failed && disk_idx == s->failed_num)) {
2006 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { 1906 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2007 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); 1907 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2008 set_bit(R5_Wantcompute, &dev->flags); 1908 set_bit(R5_Wantcompute, &dev->flags);
2009 sh->ops.target = disk_idx; 1909 sh->ops.target = disk_idx;
2010 s->req_compute = 1; 1910 s->req_compute = 1;
2011 sh->ops.count++;
2012 /* Careful: from this point on 'uptodate' is in the eye 1911 /* Careful: from this point on 'uptodate' is in the eye
2013 * of raid5_run_ops which services 'compute' operations 1912 * of raid5_run_ops which services 'compute' operations
2014 * before writes. R5_Wantcompute flags a block that will 1913 * before writes. R5_Wantcompute flags a block that will
@@ -2016,53 +1915,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
2016 * subsequent operation. 1915 * subsequent operation.
2017 */ 1916 */
2018 s->uptodate++; 1917 s->uptodate++;
2019 return 0; /* uptodate + compute == disks */ 1918 return 1; /* uptodate + compute == disks */
2020 } else if (test_bit(R5_Insync, &dev->flags)) { 1919 } else if (test_bit(R5_Insync, &dev->flags)) {
2021 set_bit(R5_LOCKED, &dev->flags); 1920 set_bit(R5_LOCKED, &dev->flags);
2022 set_bit(R5_Wantread, &dev->flags); 1921 set_bit(R5_Wantread, &dev->flags);
2023 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2024 sh->ops.count++;
2025 s->locked++; 1922 s->locked++;
2026 pr_debug("Reading block %d (sync=%d)\n", disk_idx, 1923 pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2027 s->syncing); 1924 s->syncing);
2028 } 1925 }
2029 } 1926 }
2030 1927
2031 return ~0; 1928 return 0;
2032} 1929}
2033 1930
2034static void handle_issuing_new_read_requests5(struct stripe_head *sh, 1931/**
1932 * handle_stripe_fill5 - read or compute data to satisfy pending requests.
1933 */
1934static void handle_stripe_fill5(struct stripe_head *sh,
2035 struct stripe_head_state *s, int disks) 1935 struct stripe_head_state *s, int disks)
2036{ 1936{
2037 int i; 1937 int i;
2038 1938
2039 /* Clear completed compute operations. Parity recovery
2040 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
2041 * later on in this routine
2042 */
2043 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2044 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2045 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2046 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2047 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2048 }
2049
2050 /* look for blocks to read/compute, skip this if a compute 1939 /* look for blocks to read/compute, skip this if a compute
2051 * is already in flight, or if the stripe contents are in the 1940 * is already in flight, or if the stripe contents are in the
2052 * midst of changing due to a write 1941 * midst of changing due to a write
2053 */ 1942 */
2054 if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && 1943 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2055 !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && 1944 !sh->reconstruct_state)
2056 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2057 for (i = disks; i--; ) 1945 for (i = disks; i--; )
2058 if (__handle_issuing_new_read_requests5( 1946 if (fetch_block5(sh, s, i, disks))
2059 sh, s, i, disks) == 0)
2060 break; 1947 break;
2061 }
2062 set_bit(STRIPE_HANDLE, &sh->state); 1948 set_bit(STRIPE_HANDLE, &sh->state);
2063} 1949}
2064 1950
2065static void handle_issuing_new_read_requests6(struct stripe_head *sh, 1951static void handle_stripe_fill6(struct stripe_head *sh,
2066 struct stripe_head_state *s, struct r6_state *r6s, 1952 struct stripe_head_state *s, struct r6_state *r6s,
2067 int disks) 1953 int disks)
2068{ 1954{
@@ -2121,12 +2007,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
2121} 2007}
2122 2008
2123 2009
2124/* handle_completed_write_requests 2010/* handle_stripe_clean_event
2125 * any written block on an uptodate or failed drive can be returned. 2011 * any written block on an uptodate or failed drive can be returned.
2126 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2012 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2127 * never LOCKED, so we don't need to test 'failed' directly. 2013 * never LOCKED, so we don't need to test 'failed' directly.
2128 */ 2014 */
2129static void handle_completed_write_requests(raid5_conf_t *conf, 2015static void handle_stripe_clean_event(raid5_conf_t *conf,
2130 struct stripe_head *sh, int disks, struct bio **return_bi) 2016 struct stripe_head *sh, int disks, struct bio **return_bi)
2131{ 2017{
2132 int i; 2018 int i;
@@ -2171,7 +2057,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
2171 md_wakeup_thread(conf->mddev->thread); 2057 md_wakeup_thread(conf->mddev->thread);
2172} 2058}
2173 2059
2174static void handle_issuing_new_write_requests5(raid5_conf_t *conf, 2060static void handle_stripe_dirtying5(raid5_conf_t *conf,
2175 struct stripe_head *sh, struct stripe_head_state *s, int disks) 2061 struct stripe_head *sh, struct stripe_head_state *s, int disks)
2176{ 2062{
2177 int rmw = 0, rcw = 0, i; 2063 int rmw = 0, rcw = 0, i;
@@ -2215,9 +2101,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2215 "%d for r-m-w\n", i); 2101 "%d for r-m-w\n", i);
2216 set_bit(R5_LOCKED, &dev->flags); 2102 set_bit(R5_LOCKED, &dev->flags);
2217 set_bit(R5_Wantread, &dev->flags); 2103 set_bit(R5_Wantread, &dev->flags);
2218 if (!test_and_set_bit(
2219 STRIPE_OP_IO, &sh->ops.pending))
2220 sh->ops.count++;
2221 s->locked++; 2104 s->locked++;
2222 } else { 2105 } else {
2223 set_bit(STRIPE_DELAYED, &sh->state); 2106 set_bit(STRIPE_DELAYED, &sh->state);
@@ -2241,9 +2124,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2241 "%d for Reconstruct\n", i); 2124 "%d for Reconstruct\n", i);
2242 set_bit(R5_LOCKED, &dev->flags); 2125 set_bit(R5_LOCKED, &dev->flags);
2243 set_bit(R5_Wantread, &dev->flags); 2126 set_bit(R5_Wantread, &dev->flags);
2244 if (!test_and_set_bit(
2245 STRIPE_OP_IO, &sh->ops.pending))
2246 sh->ops.count++;
2247 s->locked++; 2127 s->locked++;
2248 } else { 2128 } else {
2249 set_bit(STRIPE_DELAYED, &sh->state); 2129 set_bit(STRIPE_DELAYED, &sh->state);
@@ -2261,14 +2141,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2261 * simultaneously. If this is not the case then new writes need to be 2141 * simultaneously. If this is not the case then new writes need to be
2262 * held off until the compute completes. 2142 * held off until the compute completes.
2263 */ 2143 */
2264 if ((s->req_compute || 2144 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2265 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && 2145 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2266 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2146 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2267 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2147 schedule_reconstruction5(sh, s, rcw == 0, 0);
2268 s->locked += handle_write_operations5(sh, rcw == 0, 0);
2269} 2148}
2270 2149
2271static void handle_issuing_new_write_requests6(raid5_conf_t *conf, 2150static void handle_stripe_dirtying6(raid5_conf_t *conf,
2272 struct stripe_head *sh, struct stripe_head_state *s, 2151 struct stripe_head *sh, struct stripe_head_state *s,
2273 struct r6_state *r6s, int disks) 2152 struct r6_state *r6s, int disks)
2274{ 2153{
@@ -2371,92 +2250,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
2371static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, 2250static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2372 struct stripe_head_state *s, int disks) 2251 struct stripe_head_state *s, int disks)
2373{ 2252{
2374 int canceled_check = 0; 2253 struct r5dev *dev = NULL;
2375 2254
2376 set_bit(STRIPE_HANDLE, &sh->state); 2255 set_bit(STRIPE_HANDLE, &sh->state);
2377 2256
/* Review note: the left column is the removed flag-driven version, built on
 * sh->ops.pending, sh->ops.ack and sh->ops.complete plus a local
 * canceled_check. The right column replaces it with a switch on
 * sh->check_state. STRIPE_HANDLE is set on entry in both columns.
 */
2378 /* complete a check operation */ 2257 switch (sh->check_state) {
2379 if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { 2258 case check_state_idle:
2380 clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); 2259 /* start a new check operation if there are no failures */
/* check_state_idle (right column): with no failed device, move to
 * check_state_run, request STRIPE_OP_CHECK through s->ops_request, clear
 * R5_UPTODATE on the parity block, decrement s->uptodate and stop.
 * Otherwise dev is pointed at sh->dev[s->failed_num] and control falls
 * through to the write-back code under check_state_compute_result.
 */
2381 clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
2382 if (s->failed == 0) { 2260 if (s->failed == 0) {
2383 if (sh->ops.zero_sum_result == 0)
2384 /* parity is correct (on disc,
2385 * not in buffer any more)
2386 */
2387 set_bit(STRIPE_INSYNC, &sh->state);
2388 else {
2389 conf->mddev->resync_mismatches +=
2390 STRIPE_SECTORS;
2391 if (test_bit(
2392 MD_RECOVERY_CHECK, &conf->mddev->recovery))
2393 /* don't try to repair!! */
2394 set_bit(STRIPE_INSYNC, &sh->state);
2395 else {
2396 set_bit(STRIPE_OP_COMPUTE_BLK,
2397 &sh->ops.pending);
2398 set_bit(STRIPE_OP_MOD_REPAIR_PD,
2399 &sh->ops.pending);
2400 set_bit(R5_Wantcompute,
2401 &sh->dev[sh->pd_idx].flags);
2402 sh->ops.target = sh->pd_idx;
2403 sh->ops.count++;
2404 s->uptodate++;
2405 }
2406 }
2407 } else
2408 canceled_check = 1; /* STRIPE_INSYNC is not set */
2409 }
2410
2411 /* start a new check operation if there are no failures, the stripe is
2412 * not insync, and a repair is not in flight
2413 */
2414 if (s->failed == 0 &&
2415 !test_bit(STRIPE_INSYNC, &sh->state) &&
2416 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2417 if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
2418 BUG_ON(s->uptodate != disks); 2261 BUG_ON(s->uptodate != disks);
2262 sh->check_state = check_state_run;
2263 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2419 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2264 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2420 sh->ops.count++;
2421 s->uptodate--; 2265 s->uptodate--;
2266 break;
2422 } 2267 }
2423 } 2268 dev = &sh->dev[s->failed_num];
2424 2269 /* fall through */
2425 /* check if we can clear a parity disk reconstruct */ 2270 case check_state_compute_result:
/* check_state_compute_result (right column): return to idle. dev is the
 * parity block unless the idle case above already chose the failed device.
 * If STRIPE_INSYNC is already set nothing is written. Otherwise the block
 * must be R5_UPTODATE and s->uptodate must equal disks (both BUG_ON), and
 * the block is locked and flagged R5_Wantwrite, STRIPE_DEGRADED is cleared
 * and STRIPE_INSYNC is set.
 */
2426 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && 2271 sh->check_state = check_state_idle;
2427 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { 2272 if (!dev)
2428 2273 dev = &sh->dev[sh->pd_idx];
2429 clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); 2274
2430 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); 2275 /* check that a write has not made the stripe insync */
2431 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); 2276 if (test_bit(STRIPE_INSYNC, &sh->state))
2432 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); 2277 break;
2433 }
2434
2435 2278
2436 /* Wait for check parity and compute block operations to complete
2437 * before write-back. If a failure occurred while the check operation
2438 * was in flight we need to cycle this stripe through handle_stripe
2439 * since the parity block may not be uptodate
2440 */
2441 if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
2442 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
2443 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
2444 struct r5dev *dev;
2445 /* either failed parity check, or recovery is happening */ 2279 /* either failed parity check, or recovery is happening */
2446 if (s->failed == 0)
2447 s->failed_num = sh->pd_idx;
2448 dev = &sh->dev[s->failed_num];
2449 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2280 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2450 BUG_ON(s->uptodate != disks); 2281 BUG_ON(s->uptodate != disks);
2451 2282
2452 set_bit(R5_LOCKED, &dev->flags); 2283 set_bit(R5_LOCKED, &dev->flags);
2284 s->locked++;
2453 set_bit(R5_Wantwrite, &dev->flags); 2285 set_bit(R5_Wantwrite, &dev->flags);
2454 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2455 sh->ops.count++;
2456 2286
2457 clear_bit(STRIPE_DEGRADED, &sh->state); 2287 clear_bit(STRIPE_DEGRADED, &sh->state);
2458 s->locked++;
2459 set_bit(STRIPE_INSYNC, &sh->state); 2288 set_bit(STRIPE_INSYNC, &sh->state);
2289 break;
2290 case check_state_run:
2291 break; /* we will be called again upon completion */
2292 case check_state_check_result:
/* check_state_check_result (right column): return to idle. If a device has
 * failed, STRIPE_INSYNC is left clear. Otherwise a zero
 * sh->ops.zero_sum_result marks the stripe in sync. A non-zero result adds
 * STRIPE_SECTORS to resync_mismatches and then either marks the stripe in
 * sync (MD_RECOVERY_CHECK set) or starts a parity recompute: check_state
 * becomes check_state_compute_run, STRIPE_COMPUTE_RUN is set,
 * STRIPE_OP_COMPUTE_BLK is requested through s->ops_request, R5_Wantcompute
 * is set on the parity block, sh->ops.target is set to sh->pd_idx and
 * s->uptodate is incremented.
 */
2293 sh->check_state = check_state_idle;
2294
2295 /* if a failure occurred during the check operation, leave
2296 * STRIPE_INSYNC not set and let the stripe be handled again
2297 */
2298 if (s->failed)
2299 break;
2300
2301 /* handle a successful check operation, if parity is correct
2302 * we are done. Otherwise update the mismatch count and repair
2303 * parity if !MD_RECOVERY_CHECK
2304 */
2305 if (sh->ops.zero_sum_result == 0)
2306 /* parity is correct (on disc,
2307 * not in buffer any more)
2308 */
2309 set_bit(STRIPE_INSYNC, &sh->state);
2310 else {
2311 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2312 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2313 /* don't try to repair!! */
2314 set_bit(STRIPE_INSYNC, &sh->state);
2315 else {
2316 sh->check_state = check_state_compute_run;
2317 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2318 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2319 set_bit(R5_Wantcompute,
2320 &sh->dev[sh->pd_idx].flags);
2321 sh->ops.target = sh->pd_idx;
2322 s->uptodate++;
2323 }
2324 }
2325 break;
2326 case check_state_compute_run:
2327 break;
/* Any other check_state value is logged together with the stripe sector and
 * then hits BUG().
 */
2328 default:
2329 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2330 __func__, sh->check_state,
2331 (unsigned long long) sh->sector);
2332 BUG();
2460 } 2333 }
2461} 2334}
2462 2335
@@ -2641,15 +2514,14 @@ static void handle_stripe5(struct stripe_head *sh)
2641 struct bio *return_bi = NULL; 2514 struct bio *return_bi = NULL;
2642 struct stripe_head_state s; 2515 struct stripe_head_state s;
2643 struct r5dev *dev; 2516 struct r5dev *dev;
2644 unsigned long pending = 0;
2645 mdk_rdev_t *blocked_rdev = NULL; 2517 mdk_rdev_t *blocked_rdev = NULL;
2646 int prexor; 2518 int prexor;
2647 2519
2648 memset(&s, 0, sizeof(s)); 2520 memset(&s, 0, sizeof(s));
2649 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " 2521 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
2650 "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, 2522 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
2651 atomic_read(&sh->count), sh->pd_idx, 2523 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
2652 sh->ops.pending, sh->ops.ack, sh->ops.complete); 2524 sh->reconstruct_state);
2653 2525
2654 spin_lock(&sh->lock); 2526 spin_lock(&sh->lock);
2655 clear_bit(STRIPE_HANDLE, &sh->state); 2527 clear_bit(STRIPE_HANDLE, &sh->state);
@@ -2658,15 +2530,8 @@ static void handle_stripe5(struct stripe_head *sh)
2658 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 2530 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
2659 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2531 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2660 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2532 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2661 /* Now to look around and see what can be done */
2662
2663 /* clean-up completed biofill operations */
2664 if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
2665 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
2666 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
2667 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
2668 }
2669 2533
2534 /* Now to look around and see what can be done */
2670 rcu_read_lock(); 2535 rcu_read_lock();
2671 for (i=disks; i--; ) { 2536 for (i=disks; i--; ) {
2672 mdk_rdev_t *rdev; 2537 mdk_rdev_t *rdev;
@@ -2680,10 +2545,10 @@ static void handle_stripe5(struct stripe_head *sh)
2680 /* maybe we can request a biofill operation 2545 /* maybe we can request a biofill operation
2681 * 2546 *
2682 * new wantfill requests are only permitted while 2547 * new wantfill requests are only permitted while
2683 * STRIPE_OP_BIOFILL is clear 2548 * ops_complete_biofill is guaranteed to be inactive
2684 */ 2549 */
2685 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 2550 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
2686 !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) 2551 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
2687 set_bit(R5_Wantfill, &dev->flags); 2552 set_bit(R5_Wantfill, &dev->flags);
2688 2553
2689 /* now count some things */ 2554 /* now count some things */
@@ -2727,8 +2592,10 @@ static void handle_stripe5(struct stripe_head *sh)
2727 goto unlock; 2592 goto unlock;
2728 } 2593 }
2729 2594
2730 if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) 2595 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
2731 sh->ops.count++; 2596 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
2597 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
2598 }
2732 2599
2733 pr_debug("locked=%d uptodate=%d to_read=%d" 2600 pr_debug("locked=%d uptodate=%d to_read=%d"
2734 " to_write=%d failed=%d failed_num=%d\n", 2601 " to_write=%d failed=%d failed_num=%d\n",
@@ -2738,8 +2605,7 @@ static void handle_stripe5(struct stripe_head *sh)
2738 * need to be failed 2605 * need to be failed
2739 */ 2606 */
2740 if (s.failed > 1 && s.to_read+s.to_write+s.written) 2607 if (s.failed > 1 && s.to_read+s.to_write+s.written)
2741 handle_requests_to_failed_array(conf, sh, &s, disks, 2608 handle_failed_stripe(conf, sh, &s, disks, &return_bi);
2742 &return_bi);
2743 if (s.failed > 1 && s.syncing) { 2609 if (s.failed > 1 && s.syncing) {
2744 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2610 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2745 clear_bit(STRIPE_SYNCING, &sh->state); 2611 clear_bit(STRIPE_SYNCING, &sh->state);
@@ -2755,48 +2621,25 @@ static void handle_stripe5(struct stripe_head *sh)
2755 !test_bit(R5_LOCKED, &dev->flags) && 2621 !test_bit(R5_LOCKED, &dev->flags) &&
2756 test_bit(R5_UPTODATE, &dev->flags)) || 2622 test_bit(R5_UPTODATE, &dev->flags)) ||
2757 (s.failed == 1 && s.failed_num == sh->pd_idx))) 2623 (s.failed == 1 && s.failed_num == sh->pd_idx)))
2758 handle_completed_write_requests(conf, sh, disks, &return_bi); 2624 handle_stripe_clean_event(conf, sh, disks, &return_bi);
2759 2625
2760 /* Now we might consider reading some blocks, either to check/generate 2626 /* Now we might consider reading some blocks, either to check/generate
2761 * parity, or to satisfy requests 2627 * parity, or to satisfy requests
2762 * or to load a block that is being partially written. 2628 * or to load a block that is being partially written.
2763 */ 2629 */
2764 if (s.to_read || s.non_overwrite || 2630 if (s.to_read || s.non_overwrite ||
2765 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || 2631 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
2766 test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) 2632 handle_stripe_fill5(sh, &s, disks);
2767 handle_issuing_new_read_requests5(sh, &s, disks);
2768 2633
2769 /* Now we check to see if any write operations have recently 2634 /* Now we check to see if any write operations have recently
2770 * completed 2635 * completed
2771 */ 2636 */
2772
2773 /* leave prexor set until postxor is done, allows us to distinguish
2774 * a rmw from a rcw during biodrain
2775 */
2776 prexor = 0; 2637 prexor = 0;
2777 if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && 2638 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
2778 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
2779
2780 prexor = 1; 2639 prexor = 1;
2781 clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); 2640 if (sh->reconstruct_state == reconstruct_state_drain_result ||
2782 clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); 2641 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
2783 clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); 2642 sh->reconstruct_state = reconstruct_state_idle;
2784
2785 for (i = disks; i--; )
2786 clear_bit(R5_Wantprexor, &sh->dev[i].flags);
2787 }
2788
2789 /* if only POSTXOR is set then this is an 'expand' postxor */
2790 if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
2791 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
2792
2793 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
2794 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
2795 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
2796
2797 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2798 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2799 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2800 2643
2801 /* All the 'written' buffers and the parity block are ready to 2644 /* All the 'written' buffers and the parity block are ready to
2802 * be written back to disk 2645 * be written back to disk
@@ -2808,9 +2651,6 @@ static void handle_stripe5(struct stripe_head *sh)
2808 (i == sh->pd_idx || dev->written)) { 2651 (i == sh->pd_idx || dev->written)) {
2809 pr_debug("Writing block %d\n", i); 2652 pr_debug("Writing block %d\n", i);
2810 set_bit(R5_Wantwrite, &dev->flags); 2653 set_bit(R5_Wantwrite, &dev->flags);
2811 if (!test_and_set_bit(
2812 STRIPE_OP_IO, &sh->ops.pending))
2813 sh->ops.count++;
2814 if (prexor) 2654 if (prexor)
2815 continue; 2655 continue;
2816 if (!test_bit(R5_Insync, &dev->flags) || 2656 if (!test_bit(R5_Insync, &dev->flags) ||
@@ -2832,20 +2672,18 @@ static void handle_stripe5(struct stripe_head *sh)
2832 * 2/ A 'check' operation is in flight, as it may clobber the parity 2672 * 2/ A 'check' operation is in flight, as it may clobber the parity
2833 * block. 2673 * block.
2834 */ 2674 */
2835 if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && 2675 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
2836 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) 2676 handle_stripe_dirtying5(conf, sh, &s, disks);
2837 handle_issuing_new_write_requests5(conf, sh, &s, disks);
2838 2677
2839 /* maybe we need to check and possibly fix the parity for this stripe 2678 /* maybe we need to check and possibly fix the parity for this stripe
2840 * Any reads will already have been scheduled, so we just see if enough 2679 * Any reads will already have been scheduled, so we just see if enough
2841 * data is available. The parity check is held off while parity 2680 * data is available. The parity check is held off while parity
2842 * dependent operations are in flight. 2681 * dependent operations are in flight.
2843 */ 2682 */
2844 if ((s.syncing && s.locked == 0 && 2683 if (sh->check_state ||
2845 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && 2684 (s.syncing && s.locked == 0 &&
2846 !test_bit(STRIPE_INSYNC, &sh->state)) || 2685 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
2847 test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || 2686 !test_bit(STRIPE_INSYNC, &sh->state)))
2848 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
2849 handle_parity_checks5(conf, sh, &s, disks); 2687 handle_parity_checks5(conf, sh, &s, disks);
2850 2688
2851 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 2689 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -2864,52 +2702,35 @@ static void handle_stripe5(struct stripe_head *sh)
/* Review note: throughout this hunk the right column drops the STRIPE_OP_IO
 * and sh->ops.count bookkeeping that followed each R5_Wantwrite or
 * R5_Wantread set_bit in the left column.
 */
2864 dev = &sh->dev[s.failed_num]; 2702 dev = &sh->dev[s.failed_num];
2865 if (!test_bit(R5_ReWrite, &dev->flags)) { 2703 if (!test_bit(R5_ReWrite, &dev->flags)) {
2866 set_bit(R5_Wantwrite, &dev->flags); 2704 set_bit(R5_Wantwrite, &dev->flags);
2867 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2868 sh->ops.count++;
2869 set_bit(R5_ReWrite, &dev->flags); 2705 set_bit(R5_ReWrite, &dev->flags);
2870 set_bit(R5_LOCKED, &dev->flags); 2706 set_bit(R5_LOCKED, &dev->flags);
2871 s.locked++; 2707 s.locked++;
2872 } else { 2708 } else {
2873 /* let's read it back */ 2709 /* let's read it back */
2874 set_bit(R5_Wantread, &dev->flags); 2710 set_bit(R5_Wantread, &dev->flags);
2875 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2876 sh->ops.count++;
2877 set_bit(R5_LOCKED, &dev->flags); 2711 set_bit(R5_LOCKED, &dev->flags);
2878 s.locked++; 2712 s.locked++;
2879 } 2713 }
2880 } 2714 }
2881 2715
2882 /* Finish postxor operations initiated by the expansion 2716 /* Finish reconstruct operations initiated by the expansion process */
2883 * process 2717 if (sh->reconstruct_state == reconstruct_state_result) {
2884 */ 2718 sh->reconstruct_state = reconstruct_state_idle;
2885 if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
2886 !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
2887
2888 clear_bit(STRIPE_EXPANDING, &sh->state); 2719 clear_bit(STRIPE_EXPANDING, &sh->state);
/* NOTE(review): the right column removes the braces from the loop below (the
 * left-column closing brace at old line 2900 is deleted with no right-column
 * replacement). As shown, only set_bit(R5_Wantwrite, &sh->dev[i].flags)
 * repeats per disk on the right; set_bit(R5_LOCKED, &dev->flags) and
 * s.locked++ then run once, after the loop, whereas the left column ran all
 * three on every iteration. Both columns also pass &dev->flags, not
 * &sh->dev[i].flags, to the R5_LOCKED set_bit, so the per-disk blocks are
 * never the ones being locked. This looks unintended -- confirm, and if so
 * restore the braces and lock sh->dev[i].
 */
2889 2720 for (i = conf->raid_disks; i--; )
2890 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2891 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2892 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2893
2894 for (i = conf->raid_disks; i--; ) {
2895 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2721 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2896 set_bit(R5_LOCKED, &dev->flags); 2722 set_bit(R5_LOCKED, &dev->flags);
2897 s.locked++; 2723 s.locked++;
2898 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2899 sh->ops.count++;
2900 }
2901 } 2724 }
2902 2725
2903 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 2726 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
2904 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { 2727 !sh->reconstruct_state) {
2905 /* Need to write out all blocks after computing parity */ 2728 /* Need to write out all blocks after computing parity */
2906 sh->disks = conf->raid_disks; 2729 sh->disks = conf->raid_disks;
2907 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 2730 sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
2908 conf->raid_disks); 2731 conf->raid_disks);
/* Review note: the left column added the result of
 * handle_write_operations5() to s.locked. The right column passes &s to
 * schedule_reconstruction5() instead and does not touch s.locked at this
 * call site.
 */
2909 s.locked += handle_write_operations5(sh, 1, 1); 2732 schedule_reconstruction5(sh, &s, 1, 1);
2910 } else if (s.expanded && 2733 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
2911 s.locked == 0 &&
2912 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2913 clear_bit(STRIPE_EXPAND_READY, &sh->state); 2734 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2914 atomic_dec(&conf->reshape_stripes); 2735 atomic_dec(&conf->reshape_stripes);
2915 wake_up(&conf->wait_for_overlap); 2736 wake_up(&conf->wait_for_overlap);
@@ -2917,12 +2738,9 @@ static void handle_stripe5(struct stripe_head *sh)
2917 } 2738 }
2918 2739
2919 if (s.expanding && s.locked == 0 && 2740 if (s.expanding && s.locked == 0 &&
2920 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) 2741 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
2921 handle_stripe_expansion(conf, sh, NULL); 2742 handle_stripe_expansion(conf, sh, NULL);
2922 2743
2923 if (sh->ops.count)
2924 pending = get_stripe_work(sh);
2925
2926 unlock: 2744 unlock:
2927 spin_unlock(&sh->lock); 2745 spin_unlock(&sh->lock);
2928 2746
@@ -2930,11 +2748,12 @@ static void handle_stripe5(struct stripe_head *sh)
2930 if (unlikely(blocked_rdev)) 2748 if (unlikely(blocked_rdev))
2931 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 2749 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2932 2750
2933 if (pending) 2751 if (s.ops_request)
2934 raid5_run_ops(sh, pending); 2752 raid5_run_ops(sh, s.ops_request);
2935 2753
2936 return_io(return_bi); 2754 ops_run_io(sh, &s);
2937 2755
2756 return_io(return_bi);
2938} 2757}
2939 2758
2940static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 2759static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
@@ -3042,8 +2861,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3042 * might need to be failed 2861 * might need to be failed
3043 */ 2862 */
3044 if (s.failed > 2 && s.to_read+s.to_write+s.written) 2863 if (s.failed > 2 && s.to_read+s.to_write+s.written)
3045 handle_requests_to_failed_array(conf, sh, &s, disks, 2864 handle_failed_stripe(conf, sh, &s, disks, &return_bi);
3046 &return_bi);
3047 if (s.failed > 2 && s.syncing) { 2865 if (s.failed > 2 && s.syncing) {
3048 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2866 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
3049 clear_bit(STRIPE_SYNCING, &sh->state); 2867 clear_bit(STRIPE_SYNCING, &sh->state);
@@ -3068,7 +2886,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3068 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 2886 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3069 && !test_bit(R5_LOCKED, &qdev->flags) 2887 && !test_bit(R5_LOCKED, &qdev->flags)
3070 && test_bit(R5_UPTODATE, &qdev->flags))))) 2888 && test_bit(R5_UPTODATE, &qdev->flags)))))
3071 handle_completed_write_requests(conf, sh, disks, &return_bi); 2889 handle_stripe_clean_event(conf, sh, disks, &return_bi);
3072 2890
3073 /* Now we might consider reading some blocks, either to check/generate 2891 /* Now we might consider reading some blocks, either to check/generate
3074 * parity, or to satisfy requests 2892 * parity, or to satisfy requests
@@ -3076,11 +2894,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3076 */ 2894 */
3077 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 2895 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3078 (s.syncing && (s.uptodate < disks)) || s.expanding) 2896 (s.syncing && (s.uptodate < disks)) || s.expanding)
3079 handle_issuing_new_read_requests6(sh, &s, &r6s, disks); 2897 handle_stripe_fill6(sh, &s, &r6s, disks);
3080 2898
3081 /* now to consider writing and what else, if anything should be read */ 2899 /* now to consider writing and what else, if anything should be read */
3082 if (s.to_write) 2900 if (s.to_write)
3083 handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); 2901 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3084 2902
3085 /* maybe we need to check and possibly fix the parity for this stripe 2903 /* maybe we need to check and possibly fix the parity for this stripe
3086 * Any reads will already have been scheduled, so we just see if enough 2904 * Any reads will already have been scheduled, so we just see if enough
@@ -3136,7 +2954,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3136 } 2954 }
3137 2955
3138 if (s.expanding && s.locked == 0 && 2956 if (s.expanding && s.locked == 0 &&
3139 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) 2957 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3140 handle_stripe_expansion(conf, sh, &r6s); 2958 handle_stripe_expansion(conf, sh, &r6s);
3141 2959
3142 unlock: 2960 unlock:
@@ -3146,68 +2964,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3146 if (unlikely(blocked_rdev)) 2964 if (unlikely(blocked_rdev))
3147 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 2965 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3148 2966
3149 return_io(return_bi); 2967 ops_run_io(sh, &s);
3150
3151 for (i=disks; i-- ;) {
3152 int rw;
3153 struct bio *bi;
3154 mdk_rdev_t *rdev;
3155 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
3156 rw = WRITE;
3157 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
3158 rw = READ;
3159 else
3160 continue;
3161
3162 set_bit(STRIPE_IO_STARTED, &sh->state);
3163
3164 bi = &sh->dev[i].req;
3165
3166 bi->bi_rw = rw;
3167 if (rw == WRITE)
3168 bi->bi_end_io = raid5_end_write_request;
3169 else
3170 bi->bi_end_io = raid5_end_read_request;
3171
3172 rcu_read_lock();
3173 rdev = rcu_dereference(conf->disks[i].rdev);
3174 if (rdev && test_bit(Faulty, &rdev->flags))
3175 rdev = NULL;
3176 if (rdev)
3177 atomic_inc(&rdev->nr_pending);
3178 rcu_read_unlock();
3179 2968
3180 if (rdev) { 2969 return_io(return_bi);
3181 if (s.syncing || s.expanding || s.expanded)
3182 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
3183
3184 bi->bi_bdev = rdev->bdev;
3185 pr_debug("for %llu schedule op %ld on disc %d\n",
3186 (unsigned long long)sh->sector, bi->bi_rw, i);
3187 atomic_inc(&sh->count);
3188 bi->bi_sector = sh->sector + rdev->data_offset;
3189 bi->bi_flags = 1 << BIO_UPTODATE;
3190 bi->bi_vcnt = 1;
3191 bi->bi_max_vecs = 1;
3192 bi->bi_idx = 0;
3193 bi->bi_io_vec = &sh->dev[i].vec;
3194 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
3195 bi->bi_io_vec[0].bv_offset = 0;
3196 bi->bi_size = STRIPE_SIZE;
3197 bi->bi_next = NULL;
3198 if (rw == WRITE &&
3199 test_bit(R5_ReWrite, &sh->dev[i].flags))
3200 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
3201 generic_make_request(bi);
3202 } else {
3203 if (rw == WRITE)
3204 set_bit(STRIPE_DEGRADED, &sh->state);
3205 pr_debug("skip op %ld on disc %d for sector %llu\n",
3206 bi->bi_rw, i, (unsigned long long)sh->sector);
3207 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3208 set_bit(STRIPE_HANDLE, &sh->state);
3209 }
3210 }
3211} 2970}
3212 2971
3213static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) 2972static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
@@ -3697,9 +3456,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
3697 if ( rw == WRITE ) 3456 if ( rw == WRITE )
3698 md_write_end(mddev); 3457 md_write_end(mddev);
3699 3458
3700 bi->bi_end_io(bi, 3459 bio_endio(bi, 0);
3701 test_bit(BIO_UPTODATE, &bi->bi_flags)
3702 ? 0 : -EIO);
3703 } 3460 }
3704 return 0; 3461 return 0;
3705} 3462}
@@ -3785,7 +3542,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3785 j == raid6_next_disk(sh->pd_idx, sh->disks)) 3542 j == raid6_next_disk(sh->pd_idx, sh->disks))
3786 continue; 3543 continue;
3787 s = compute_blocknr(sh, j); 3544 s = compute_blocknr(sh, j);
3788 if (s < (mddev->array_size<<1)) { 3545 if (s < mddev->array_sectors) {
3789 skipped = 1; 3546 skipped = 1;
3790 continue; 3547 continue;
3791 } 3548 }
@@ -4002,12 +3759,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4002 spin_lock_irq(&conf->device_lock); 3759 spin_lock_irq(&conf->device_lock);
4003 remaining = --raid_bio->bi_phys_segments; 3760 remaining = --raid_bio->bi_phys_segments;
4004 spin_unlock_irq(&conf->device_lock); 3761 spin_unlock_irq(&conf->device_lock);
4005 if (remaining == 0) { 3762 if (remaining == 0)
4006 3763 bio_endio(raid_bio, 0);
4007 raid_bio->bi_end_io(raid_bio,
4008 test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
4009 ? 0 : -EIO);
4010 }
4011 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3764 if (atomic_dec_and_test(&conf->active_aligned_reads))
4012 wake_up(&conf->wait_for_stripe); 3765 wake_up(&conf->wait_for_stripe);
4013 return handled; 3766 return handled;
@@ -4094,6 +3847,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
4094{ 3847{
4095 raid5_conf_t *conf = mddev_to_conf(mddev); 3848 raid5_conf_t *conf = mddev_to_conf(mddev);
4096 unsigned long new; 3849 unsigned long new;
3850 int err;
3851
4097 if (len >= PAGE_SIZE) 3852 if (len >= PAGE_SIZE)
4098 return -EINVAL; 3853 return -EINVAL;
4099 if (!conf) 3854 if (!conf)
@@ -4109,7 +3864,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
4109 else 3864 else
4110 break; 3865 break;
4111 } 3866 }
4112 md_allow_write(mddev); 3867 err = md_allow_write(mddev);
3868 if (err)
3869 return err;
4113 while (new > conf->max_nr_stripes) { 3870 while (new > conf->max_nr_stripes) {
4114 if (grow_one_stripe(conf)) 3871 if (grow_one_stripe(conf))
4115 conf->max_nr_stripes++; 3872 conf->max_nr_stripes++;
@@ -4434,7 +4191,7 @@ static int run(mddev_t *mddev)
4434 mddev->queue->backing_dev_info.congested_data = mddev; 4191 mddev->queue->backing_dev_info.congested_data = mddev;
4435 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 4192 mddev->queue->backing_dev_info.congested_fn = raid5_congested;
4436 4193
4437 mddev->array_size = mddev->size * (conf->previous_raid_disks - 4194 mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
4438 conf->max_degraded); 4195 conf->max_degraded);
4439 4196
4440 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 4197 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
@@ -4609,35 +4366,41 @@ abort:
4609static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 4366static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
4610{ 4367{
4611 raid5_conf_t *conf = mddev->private; 4368 raid5_conf_t *conf = mddev->private;
4612 int found = 0; 4369 int err = -EEXIST;
4613 int disk; 4370 int disk;
4614 struct disk_info *p; 4371 struct disk_info *p;
4372 int first = 0;
4373 int last = conf->raid_disks - 1;
4615 4374
4616 if (mddev->degraded > conf->max_degraded) 4375 if (mddev->degraded > conf->max_degraded)
4617 /* no point adding a device */ 4376 /* no point adding a device */
4618 return 0; 4377 return -EINVAL;
4378
4379 if (rdev->raid_disk >= 0)
4380 first = last = rdev->raid_disk;
4619 4381
4620 /* 4382 /*
4621 * find the disk ... but prefer rdev->saved_raid_disk 4383 * find the disk ... but prefer rdev->saved_raid_disk
4622 * if possible. 4384 * if possible.
4623 */ 4385 */
4624 if (rdev->saved_raid_disk >= 0 && 4386 if (rdev->saved_raid_disk >= 0 &&
4387 rdev->saved_raid_disk >= first &&
4625 conf->disks[rdev->saved_raid_disk].rdev == NULL) 4388 conf->disks[rdev->saved_raid_disk].rdev == NULL)
4626 disk = rdev->saved_raid_disk; 4389 disk = rdev->saved_raid_disk;
4627 else 4390 else
4628 disk = 0; 4391 disk = first;
4629 for ( ; disk < conf->raid_disks; disk++) 4392 for ( ; disk <= last ; disk++)
4630 if ((p=conf->disks + disk)->rdev == NULL) { 4393 if ((p=conf->disks + disk)->rdev == NULL) {
4631 clear_bit(In_sync, &rdev->flags); 4394 clear_bit(In_sync, &rdev->flags);
4632 rdev->raid_disk = disk; 4395 rdev->raid_disk = disk;
4633 found = 1; 4396 err = 0;
4634 if (rdev->saved_raid_disk != disk) 4397 if (rdev->saved_raid_disk != disk)
4635 conf->fullsync = 1; 4398 conf->fullsync = 1;
4636 rcu_assign_pointer(p->rdev, rdev); 4399 rcu_assign_pointer(p->rdev, rdev);
4637 break; 4400 break;
4638 } 4401 }
4639 print_raid5_conf(conf); 4402 print_raid5_conf(conf);
4640 return found; 4403 return err;
4641} 4404}
4642 4405
4643static int raid5_resize(mddev_t *mddev, sector_t sectors) 4406static int raid5_resize(mddev_t *mddev, sector_t sectors)
@@ -4652,8 +4415,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
4652 raid5_conf_t *conf = mddev_to_conf(mddev); 4415 raid5_conf_t *conf = mddev_to_conf(mddev);
4653 4416
4654 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 4417 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
4655 mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1; 4418 mddev->array_sectors = sectors * (mddev->raid_disks
4656 set_capacity(mddev->gendisk, mddev->array_size << 1); 4419 - conf->max_degraded);
4420 set_capacity(mddev->gendisk, mddev->array_sectors);
4657 mddev->changed = 1; 4421 mddev->changed = 1;
4658 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { 4422 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
4659 mddev->recovery_cp = mddev->size << 1; 4423 mddev->recovery_cp = mddev->size << 1;
@@ -4738,7 +4502,7 @@ static int raid5_start_reshape(mddev_t *mddev)
4738 rdev_for_each(rdev, rtmp, mddev) 4502 rdev_for_each(rdev, rtmp, mddev)
4739 if (rdev->raid_disk < 0 && 4503 if (rdev->raid_disk < 0 &&
4740 !test_bit(Faulty, &rdev->flags)) { 4504 !test_bit(Faulty, &rdev->flags)) {
4741 if (raid5_add_disk(mddev, rdev)) { 4505 if (raid5_add_disk(mddev, rdev) == 0) {
4742 char nm[20]; 4506 char nm[20];
4743 set_bit(In_sync, &rdev->flags); 4507 set_bit(In_sync, &rdev->flags);
4744 added_devices++; 4508 added_devices++;
@@ -4786,15 +4550,16 @@ static void end_reshape(raid5_conf_t *conf)
4786 struct block_device *bdev; 4550 struct block_device *bdev;
4787 4551
4788 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 4552 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
4789 conf->mddev->array_size = conf->mddev->size * 4553 conf->mddev->array_sectors = 2 * conf->mddev->size *
4790 (conf->raid_disks - conf->max_degraded); 4554 (conf->raid_disks - conf->max_degraded);
4791 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); 4555 set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
4792 conf->mddev->changed = 1; 4556 conf->mddev->changed = 1;
4793 4557
4794 bdev = bdget_disk(conf->mddev->gendisk, 0); 4558 bdev = bdget_disk(conf->mddev->gendisk, 0);
4795 if (bdev) { 4559 if (bdev) {
4796 mutex_lock(&bdev->bd_inode->i_mutex); 4560 mutex_lock(&bdev->bd_inode->i_mutex);
4797 i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10); 4561 i_size_write(bdev->bd_inode,
4562 (loff_t)conf->mddev->array_sectors << 9);
4798 mutex_unlock(&bdev->bd_inode->i_mutex); 4563 mutex_unlock(&bdev->bd_inode->i_mutex);
4799 bdput(bdev); 4564 bdput(bdev);
4800 } 4565 }