aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDan Williams <dan.j.williams@intel.com>2007-01-02 15:52:30 -0500
committerDan Williams <dan.j.williams@intel.com>2007-07-13 11:06:15 -0400
commit91c00924846a0034020451c280c76baa4299f9dc (patch)
tree7124ed6706937b793a10c37a861c5fc0f2e5b348
parent45b4233caac05da0118b608a9fc2a40a9fc580cd (diff)
md: raid5_run_ops - run stripe operations outside sh->lock
When the raid acceleration work was proposed, Neil laid out the following attack plan:
1/ move the xor and copy operations outside spin_lock(&sh->lock)
2/ find/implement an asynchronous offload api

The raid5_run_ops routine uses the asynchronous offload api (async_tx) and the stripe_operations member of a stripe_head to carry out xor+copy operations asynchronously, outside the lock.

To perform operations outside the lock a new set of state flags is needed to track new requests, in-flight requests, and completed requests. In this new model handle_stripe is tasked with scanning the stripe_head for work, updating the stripe_operations structure, and finally dropping the lock and calling raid5_run_ops for processing. The following flags outline the requests that handle_stripe can make of raid5_run_ops:
STRIPE_OP_BIOFILL - copy data into request buffers to satisfy a read request
STRIPE_OP_COMPUTE_BLK - generate a missing block in the cache from the other blocks
STRIPE_OP_PREXOR - subtract existing data as part of the read-modify-write process
STRIPE_OP_BIODRAIN - copy data out of request buffers to satisfy a write request
STRIPE_OP_POSTXOR - recalculate parity for new data that has entered the cache
STRIPE_OP_CHECK - verify that the parity is correct
STRIPE_OP_IO - submit i/o to the member disks (note this was already performed outside the stripe lock, but it made sense to add it as an operation type)

The flow is:
1/ handle_stripe sets STRIPE_OP_* in sh->ops.pending
2/ raid5_run_ops reads sh->ops.pending, sets sh->ops.ack, and submits the operation to the async_tx api
3/ async_tx triggers the completion callback routine to set sh->ops.complete and release the stripe
4/ handle_stripe runs again to finish the operation and optionally submit new operations that were previously blocked

Note: this patch just defines raid5_run_ops; subsequent commits (one per major operation type) modify handle_stripe to take advantage of this routine.
Changelog:
* removed ops_complete_biodrain in favor of ops_complete_postxor and ops_complete_write.
* removed the raid5_run_ops workqueue
* call bi_end_io for reads in ops_complete_biofill, saves a call to handle_stripe
* explicitly handle the 2-disk raid5 case (xor becomes memcpy), Neil Brown
* fix race between async engines and bi_end_io call for reads, Neil Brown
* remove unnecessary spin_lock from ops_complete_biofill
* remove test_and_set/test_and_clear BUG_ONs, Neil Brown
* remove explicit interrupt handling for channel switching, this feature was absorbed (i.e. it is now implicit) by the async_tx api
* use return_io in ops_complete_biofill

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
-rw-r--r--drivers/md/raid5.c536
-rw-r--r--include/linux/raid/raid5.h81
2 files changed, 614 insertions, 3 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e372e57687ee..0b7002479655 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -52,6 +52,7 @@
52#include "raid6.h" 52#include "raid6.h"
53 53
54#include <linux/raid/bitmap.h> 54#include <linux/raid/bitmap.h>
55#include <linux/async_tx.h>
55 56
56/* 57/*
57 * Stripe cache 58 * Stripe cache
@@ -341,6 +342,541 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
341 return sh; 342 return sh;
342} 343}
343 344
345static int
346raid5_end_read_request(struct bio *bi, unsigned int bytes_done, int error);
347static int
348raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error);
349
350static void ops_run_io(struct stripe_head *sh)
351{
352 raid5_conf_t *conf = sh->raid_conf;
353 int i, disks = sh->disks;
354
355 might_sleep();
356
357 for (i = disks; i--; ) {
358 int rw;
359 struct bio *bi;
360 mdk_rdev_t *rdev;
361 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
362 rw = WRITE;
363 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
364 rw = READ;
365 else
366 continue;
367
368 bi = &sh->dev[i].req;
369
370 bi->bi_rw = rw;
371 if (rw == WRITE)
372 bi->bi_end_io = raid5_end_write_request;
373 else
374 bi->bi_end_io = raid5_end_read_request;
375
376 rcu_read_lock();
377 rdev = rcu_dereference(conf->disks[i].rdev);
378 if (rdev && test_bit(Faulty, &rdev->flags))
379 rdev = NULL;
380 if (rdev)
381 atomic_inc(&rdev->nr_pending);
382 rcu_read_unlock();
383
384 if (rdev) {
385 if (test_bit(STRIPE_SYNCING, &sh->state) ||
386 test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
387 test_bit(STRIPE_EXPAND_READY, &sh->state))
388 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
389
390 bi->bi_bdev = rdev->bdev;
391 pr_debug("%s: for %llu schedule op %ld on disc %d\n",
392 __FUNCTION__, (unsigned long long)sh->sector,
393 bi->bi_rw, i);
394 atomic_inc(&sh->count);
395 bi->bi_sector = sh->sector + rdev->data_offset;
396 bi->bi_flags = 1 << BIO_UPTODATE;
397 bi->bi_vcnt = 1;
398 bi->bi_max_vecs = 1;
399 bi->bi_idx = 0;
400 bi->bi_io_vec = &sh->dev[i].vec;
401 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
402 bi->bi_io_vec[0].bv_offset = 0;
403 bi->bi_size = STRIPE_SIZE;
404 bi->bi_next = NULL;
405 if (rw == WRITE &&
406 test_bit(R5_ReWrite, &sh->dev[i].flags))
407 atomic_add(STRIPE_SECTORS,
408 &rdev->corrected_errors);
409 generic_make_request(bi);
410 } else {
411 if (rw == WRITE)
412 set_bit(STRIPE_DEGRADED, &sh->state);
413 pr_debug("skip op %ld on disc %d for sector %llu\n",
414 bi->bi_rw, i, (unsigned long long)sh->sector);
415 clear_bit(R5_LOCKED, &sh->dev[i].flags);
416 set_bit(STRIPE_HANDLE, &sh->state);
417 }
418 }
419}
420
421static struct dma_async_tx_descriptor *
422async_copy_data(int frombio, struct bio *bio, struct page *page,
423 sector_t sector, struct dma_async_tx_descriptor *tx)
424{
425 struct bio_vec *bvl;
426 struct page *bio_page;
427 int i;
428 int page_offset;
429
430 if (bio->bi_sector >= sector)
431 page_offset = (signed)(bio->bi_sector - sector) * 512;
432 else
433 page_offset = (signed)(sector - bio->bi_sector) * -512;
434 bio_for_each_segment(bvl, bio, i) {
435 int len = bio_iovec_idx(bio, i)->bv_len;
436 int clen;
437 int b_offset = 0;
438
439 if (page_offset < 0) {
440 b_offset = -page_offset;
441 page_offset += b_offset;
442 len -= b_offset;
443 }
444
445 if (len > 0 && page_offset + len > STRIPE_SIZE)
446 clen = STRIPE_SIZE - page_offset;
447 else
448 clen = len;
449
450 if (clen > 0) {
451 b_offset += bio_iovec_idx(bio, i)->bv_offset;
452 bio_page = bio_iovec_idx(bio, i)->bv_page;
453 if (frombio)
454 tx = async_memcpy(page, bio_page, page_offset,
455 b_offset, clen,
456 ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_SRC,
457 tx, NULL, NULL);
458 else
459 tx = async_memcpy(bio_page, page, b_offset,
460 page_offset, clen,
461 ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_DST,
462 tx, NULL, NULL);
463 }
464 if (clen < len) /* hit end of page */
465 break;
466 page_offset += len;
467 }
468
469 return tx;
470}
471
472static void ops_complete_biofill(void *stripe_head_ref)
473{
474 struct stripe_head *sh = stripe_head_ref;
475 struct bio *return_bi = NULL;
476 raid5_conf_t *conf = sh->raid_conf;
477 int i, more_to_read = 0;
478
479 pr_debug("%s: stripe %llu\n", __FUNCTION__,
480 (unsigned long long)sh->sector);
481
482 /* clear completed biofills */
483 for (i = sh->disks; i--; ) {
484 struct r5dev *dev = &sh->dev[i];
485 /* check if this stripe has new incoming reads */
486 if (dev->toread)
487 more_to_read++;
488
489 /* acknowledge completion of a biofill operation */
490 /* and check if we need to reply to a read request
491 */
492 if (test_bit(R5_Wantfill, &dev->flags) && !dev->toread) {
493 struct bio *rbi, *rbi2;
494 clear_bit(R5_Wantfill, &dev->flags);
495
496 /* The access to dev->read is outside of the
497 * spin_lock_irq(&conf->device_lock), but is protected
498 * by the STRIPE_OP_BIOFILL pending bit
499 */
500 BUG_ON(!dev->read);
501 rbi = dev->read;
502 dev->read = NULL;
503 while (rbi && rbi->bi_sector <
504 dev->sector + STRIPE_SECTORS) {
505 rbi2 = r5_next_bio(rbi, dev->sector);
506 spin_lock_irq(&conf->device_lock);
507 if (--rbi->bi_phys_segments == 0) {
508 rbi->bi_next = return_bi;
509 return_bi = rbi;
510 }
511 spin_unlock_irq(&conf->device_lock);
512 rbi = rbi2;
513 }
514 }
515 }
516 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
517 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
518
519 return_io(return_bi);
520
521 if (more_to_read)
522 set_bit(STRIPE_HANDLE, &sh->state);
523 release_stripe(sh);
524}
525
526static void ops_run_biofill(struct stripe_head *sh)
527{
528 struct dma_async_tx_descriptor *tx = NULL;
529 raid5_conf_t *conf = sh->raid_conf;
530 int i;
531
532 pr_debug("%s: stripe %llu\n", __FUNCTION__,
533 (unsigned long long)sh->sector);
534
535 for (i = sh->disks; i--; ) {
536 struct r5dev *dev = &sh->dev[i];
537 if (test_bit(R5_Wantfill, &dev->flags)) {
538 struct bio *rbi;
539 spin_lock_irq(&conf->device_lock);
540 dev->read = rbi = dev->toread;
541 dev->toread = NULL;
542 spin_unlock_irq(&conf->device_lock);
543 while (rbi && rbi->bi_sector <
544 dev->sector + STRIPE_SECTORS) {
545 tx = async_copy_data(0, rbi, dev->page,
546 dev->sector, tx);
547 rbi = r5_next_bio(rbi, dev->sector);
548 }
549 }
550 }
551
552 atomic_inc(&sh->count);
553 async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
554 ops_complete_biofill, sh);
555}
556
557static void ops_complete_compute5(void *stripe_head_ref)
558{
559 struct stripe_head *sh = stripe_head_ref;
560 int target = sh->ops.target;
561 struct r5dev *tgt = &sh->dev[target];
562
563 pr_debug("%s: stripe %llu\n", __FUNCTION__,
564 (unsigned long long)sh->sector);
565
566 set_bit(R5_UPTODATE, &tgt->flags);
567 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
568 clear_bit(R5_Wantcompute, &tgt->flags);
569 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
570 set_bit(STRIPE_HANDLE, &sh->state);
571 release_stripe(sh);
572}
573
574static struct dma_async_tx_descriptor *
575ops_run_compute5(struct stripe_head *sh, unsigned long pending)
576{
577 /* kernel stack size limits the total number of disks */
578 int disks = sh->disks;
579 struct page *xor_srcs[disks];
580 int target = sh->ops.target;
581 struct r5dev *tgt = &sh->dev[target];
582 struct page *xor_dest = tgt->page;
583 int count = 0;
584 struct dma_async_tx_descriptor *tx;
585 int i;
586
587 pr_debug("%s: stripe %llu block: %d\n",
588 __FUNCTION__, (unsigned long long)sh->sector, target);
589 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
590
591 for (i = disks; i--; )
592 if (i != target)
593 xor_srcs[count++] = sh->dev[i].page;
594
595 atomic_inc(&sh->count);
596
597 if (unlikely(count == 1))
598 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
599 0, NULL, ops_complete_compute5, sh);
600 else
601 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
602 ASYNC_TX_XOR_ZERO_DST, NULL,
603 ops_complete_compute5, sh);
604
605 /* ack now if postxor is not set to be run */
606 if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
607 async_tx_ack(tx);
608
609 return tx;
610}
611
612static void ops_complete_prexor(void *stripe_head_ref)
613{
614 struct stripe_head *sh = stripe_head_ref;
615
616 pr_debug("%s: stripe %llu\n", __FUNCTION__,
617 (unsigned long long)sh->sector);
618
619 set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
620}
621
622static struct dma_async_tx_descriptor *
623ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
624{
625 /* kernel stack size limits the total number of disks */
626 int disks = sh->disks;
627 struct page *xor_srcs[disks];
628 int count = 0, pd_idx = sh->pd_idx, i;
629
630 /* existing parity data subtracted */
631 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
632
633 pr_debug("%s: stripe %llu\n", __FUNCTION__,
634 (unsigned long long)sh->sector);
635
636 for (i = disks; i--; ) {
637 struct r5dev *dev = &sh->dev[i];
638 /* Only process blocks that are known to be uptodate */
639 if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags))
640 xor_srcs[count++] = dev->page;
641 }
642
643 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
644 ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx,
645 ops_complete_prexor, sh);
646
647 return tx;
648}
649
650static struct dma_async_tx_descriptor *
651ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
652{
653 int disks = sh->disks;
654 int pd_idx = sh->pd_idx, i;
655
656 /* check if prexor is active which means only process blocks
657 * that are part of a read-modify-write (Wantprexor)
658 */
659 int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
660
661 pr_debug("%s: stripe %llu\n", __FUNCTION__,
662 (unsigned long long)sh->sector);
663
664 for (i = disks; i--; ) {
665 struct r5dev *dev = &sh->dev[i];
666 struct bio *chosen;
667 int towrite;
668
669 towrite = 0;
670 if (prexor) { /* rmw */
671 if (dev->towrite &&
672 test_bit(R5_Wantprexor, &dev->flags))
673 towrite = 1;
674 } else { /* rcw */
675 if (i != pd_idx && dev->towrite &&
676 test_bit(R5_LOCKED, &dev->flags))
677 towrite = 1;
678 }
679
680 if (towrite) {
681 struct bio *wbi;
682
683 spin_lock(&sh->lock);
684 chosen = dev->towrite;
685 dev->towrite = NULL;
686 BUG_ON(dev->written);
687 wbi = dev->written = chosen;
688 spin_unlock(&sh->lock);
689
690 while (wbi && wbi->bi_sector <
691 dev->sector + STRIPE_SECTORS) {
692 tx = async_copy_data(1, wbi, dev->page,
693 dev->sector, tx);
694 wbi = r5_next_bio(wbi, dev->sector);
695 }
696 }
697 }
698
699 return tx;
700}
701
702static void ops_complete_postxor(void *stripe_head_ref)
703{
704 struct stripe_head *sh = stripe_head_ref;
705
706 pr_debug("%s: stripe %llu\n", __FUNCTION__,
707 (unsigned long long)sh->sector);
708
709 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
710 set_bit(STRIPE_HANDLE, &sh->state);
711 release_stripe(sh);
712}
713
714static void ops_complete_write(void *stripe_head_ref)
715{
716 struct stripe_head *sh = stripe_head_ref;
717 int disks = sh->disks, i, pd_idx = sh->pd_idx;
718
719 pr_debug("%s: stripe %llu\n", __FUNCTION__,
720 (unsigned long long)sh->sector);
721
722 for (i = disks; i--; ) {
723 struct r5dev *dev = &sh->dev[i];
724 if (dev->written || i == pd_idx)
725 set_bit(R5_UPTODATE, &dev->flags);
726 }
727
728 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
729 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
730
731 set_bit(STRIPE_HANDLE, &sh->state);
732 release_stripe(sh);
733}
734
735static void
736ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
737{
738 /* kernel stack size limits the total number of disks */
739 int disks = sh->disks;
740 struct page *xor_srcs[disks];
741
742 int count = 0, pd_idx = sh->pd_idx, i;
743 struct page *xor_dest;
744 int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
745 unsigned long flags;
746 dma_async_tx_callback callback;
747
748 pr_debug("%s: stripe %llu\n", __FUNCTION__,
749 (unsigned long long)sh->sector);
750
751 /* check if prexor is active which means only process blocks
752 * that are part of a read-modify-write (written)
753 */
754 if (prexor) {
755 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
756 for (i = disks; i--; ) {
757 struct r5dev *dev = &sh->dev[i];
758 if (dev->written)
759 xor_srcs[count++] = dev->page;
760 }
761 } else {
762 xor_dest = sh->dev[pd_idx].page;
763 for (i = disks; i--; ) {
764 struct r5dev *dev = &sh->dev[i];
765 if (i != pd_idx)
766 xor_srcs[count++] = dev->page;
767 }
768 }
769
770 /* check whether this postxor is part of a write */
771 callback = test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) ?
772 ops_complete_write : ops_complete_postxor;
773
774 /* 1/ if we prexor'd then the dest is reused as a source
775 * 2/ if we did not prexor then we are redoing the parity
776 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
777 * for the synchronous xor case
778 */
779 flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK |
780 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
781
782 atomic_inc(&sh->count);
783
784 if (unlikely(count == 1)) {
785 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
786 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
787 flags, tx, callback, sh);
788 } else
789 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
790 flags, tx, callback, sh);
791}
792
793static void ops_complete_check(void *stripe_head_ref)
794{
795 struct stripe_head *sh = stripe_head_ref;
796 int pd_idx = sh->pd_idx;
797
798 pr_debug("%s: stripe %llu\n", __FUNCTION__,
799 (unsigned long long)sh->sector);
800
801 if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
802 sh->ops.zero_sum_result == 0)
803 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
804
805 set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
806 set_bit(STRIPE_HANDLE, &sh->state);
807 release_stripe(sh);
808}
809
810static void ops_run_check(struct stripe_head *sh)
811{
812 /* kernel stack size limits the total number of disks */
813 int disks = sh->disks;
814 struct page *xor_srcs[disks];
815 struct dma_async_tx_descriptor *tx;
816
817 int count = 0, pd_idx = sh->pd_idx, i;
818 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
819
820 pr_debug("%s: stripe %llu\n", __FUNCTION__,
821 (unsigned long long)sh->sector);
822
823 for (i = disks; i--; ) {
824 struct r5dev *dev = &sh->dev[i];
825 if (i != pd_idx)
826 xor_srcs[count++] = dev->page;
827 }
828
829 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
830 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
831
832 if (tx)
833 set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
834 else
835 clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
836
837 atomic_inc(&sh->count);
838 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
839 ops_complete_check, sh);
840}
841
842static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
843{
844 int overlap_clear = 0, i, disks = sh->disks;
845 struct dma_async_tx_descriptor *tx = NULL;
846
847 if (test_bit(STRIPE_OP_BIOFILL, &pending)) {
848 ops_run_biofill(sh);
849 overlap_clear++;
850 }
851
852 if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending))
853 tx = ops_run_compute5(sh, pending);
854
855 if (test_bit(STRIPE_OP_PREXOR, &pending))
856 tx = ops_run_prexor(sh, tx);
857
858 if (test_bit(STRIPE_OP_BIODRAIN, &pending)) {
859 tx = ops_run_biodrain(sh, tx);
860 overlap_clear++;
861 }
862
863 if (test_bit(STRIPE_OP_POSTXOR, &pending))
864 ops_run_postxor(sh, tx);
865
866 if (test_bit(STRIPE_OP_CHECK, &pending))
867 ops_run_check(sh);
868
869 if (test_bit(STRIPE_OP_IO, &pending))
870 ops_run_io(sh);
871
872 if (overlap_clear)
873 for (i = disks; i--; ) {
874 struct r5dev *dev = &sh->dev[i];
875 if (test_and_clear_bit(R5_Overlap, &dev->flags))
876 wake_up(&sh->raid_conf->wait_for_overlap);
877 }
878}
879
344static int grow_one_stripe(raid5_conf_t *conf) 880static int grow_one_stripe(raid5_conf_t *conf)
345{ 881{
346 struct stripe_head *sh; 882 struct stripe_head *sh;
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index b99d354f6128..6fb9d94e6f2e 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -116,13 +116,46 @@
116 * attach a request to an active stripe (add_stripe_bh()) 116 * attach a request to an active stripe (add_stripe_bh())
117 * lockdev attach-buffer unlockdev 117 * lockdev attach-buffer unlockdev
118 * handle a stripe (handle_stripe()) 118 * handle a stripe (handle_stripe())
119 * lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io 119 * lockstripe clrSTRIPE_HANDLE ...
120 * (lockdev check-buffers unlockdev) ..
121 * change-state ..
122 * record io/ops needed unlockstripe schedule io/ops
120 * release an active stripe (release_stripe()) 123 * release an active stripe (release_stripe())
121 * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev 124 * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
122 * 125 *
123 * The refcount counts each thread that have activated the stripe, 126 * The refcount counts each thread that have activated the stripe,
124 * plus raid5d if it is handling it, plus one for each active request 127 * plus raid5d if it is handling it, plus one for each active request
125 * on a cached buffer. 128 * on a cached buffer, and plus one if the stripe is undergoing stripe
129 * operations.
130 *
131 * Stripe operations are performed outside the stripe lock,
132 * the stripe operations are:
133 * -copying data between the stripe cache and user application buffers
134 * -computing blocks to save a disk access, or to recover a missing block
135 * -updating the parity on a write operation (reconstruct write and
136 * read-modify-write)
137 * -checking parity correctness
138 * -running i/o to disk
139 * These operations are carried out by raid5_run_ops which uses the async_tx
140 * api to (optionally) offload operations to dedicated hardware engines.
141 * When requesting an operation handle_stripe sets the pending bit for the
142 * operation and increments the count. raid5_run_ops is then run whenever
143 * the count is non-zero.
144 * There are some critical dependencies between the operations that prevent some
145 * from being requested while another is in flight.
146 * 1/ Parity check operations destroy the in cache version of the parity block,
147 * so we prevent parity dependent operations like writes and compute_blocks
148 * from starting while a check is in progress. Some dma engines can perform
149 * the check without damaging the parity block, in these cases the parity
150 * block is re-marked up to date (assuming the check was successful) and is
151 * not re-read from disk.
152 * 2/ When a write operation is requested we immediately lock the affected
153 * blocks, and mark them as not up to date. This causes new read requests
154 * to be held off, as well as parity checks and compute block operations.
155 * 3/ Once a compute block operation has been requested handle_stripe treats
156 * that block as if it is up to date. raid5_run_ops guarantees that any
157 * operation that is dependent on the compute block result is initiated after
158 * the compute block completes.
126 */ 159 */
127 160
128struct stripe_head { 161struct stripe_head {
@@ -136,11 +169,26 @@ struct stripe_head {
136 spinlock_t lock; 169 spinlock_t lock;
137 int bm_seq; /* sequence number for bitmap flushes */ 170 int bm_seq; /* sequence number for bitmap flushes */
138 int disks; /* disks in stripe */ 171 int disks; /* disks in stripe */
172 /* stripe_operations
173 * @pending - pending ops flags (set for request->issue->complete)
174 * @ack - submitted ops flags (set for issue->complete)
175 * @complete - completed ops flags (set for complete)
176 * @target - STRIPE_OP_COMPUTE_BLK target
177 * @count - raid5_run_ops is set to run when this is non-zero
178 */
179 struct stripe_operations {
180 unsigned long pending;
181 unsigned long ack;
182 unsigned long complete;
183 int target;
184 int count;
185 u32 zero_sum_result;
186 } ops;
139 struct r5dev { 187 struct r5dev {
140 struct bio req; 188 struct bio req;
141 struct bio_vec vec; 189 struct bio_vec vec;
142 struct page *page; 190 struct page *page;
143 struct bio *toread, *towrite, *written; 191 struct bio *toread, *read, *towrite, *written;
144 sector_t sector; /* sector of this page */ 192 sector_t sector; /* sector of this page */
145 unsigned long flags; 193 unsigned long flags;
146 } dev[1]; /* allocated with extra space depending of RAID geometry */ 194 } dev[1]; /* allocated with extra space depending of RAID geometry */
@@ -174,6 +222,15 @@ struct r6_state {
174#define R5_ReWrite 9 /* have tried to over-write the readerror */ 222#define R5_ReWrite 9 /* have tried to over-write the readerror */
175 223
176#define R5_Expanded 10 /* This block now has post-expand data */ 224#define R5_Expanded 10 /* This block now has post-expand data */
225#define R5_Wantcompute 11 /* compute_block in progress treat as
226 * uptodate
227 */
228#define R5_Wantfill 12 /* dev->toread contains a bio that needs
229 * filling
230 */
231#define R5_Wantprexor 13 /* distinguish blocks ready for rmw from
232 * other "towrites"
233 */
177/* 234/*
178 * Write method 235 * Write method
179 */ 236 */
@@ -196,6 +253,24 @@ struct r6_state {
196#define STRIPE_EXPAND_SOURCE 10 253#define STRIPE_EXPAND_SOURCE 10
197#define STRIPE_EXPAND_READY 11 254#define STRIPE_EXPAND_READY 11
198/* 255/*
256 * Operations flags (in issue order)
257 */
258#define STRIPE_OP_BIOFILL 0
259#define STRIPE_OP_COMPUTE_BLK 1
260#define STRIPE_OP_PREXOR 2
261#define STRIPE_OP_BIODRAIN 3
262#define STRIPE_OP_POSTXOR 4
263#define STRIPE_OP_CHECK 5
264#define STRIPE_OP_IO 6
265
266/* modifiers to the base operations
267 * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back
268 * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check
269 */
270#define STRIPE_OP_MOD_REPAIR_PD 7
271#define STRIPE_OP_MOD_DMA_CHECK 8
272
273/*
199 * Plugging: 274 * Plugging:
200 * 275 *
201 * To improve write throughput, we need to delay the handling of some 276 * To improve write throughput, we need to delay the handling of some