diff options
author | Dan Williams <dan.j.williams@intel.com> | 2007-01-02 15:52:30 -0500 |
---|---|---|
committer | Dan Williams <dan.j.williams@intel.com> | 2007-07-13 11:06:15 -0400 |
commit | 91c00924846a0034020451c280c76baa4299f9dc (patch) | |
tree | 7124ed6706937b793a10c37a861c5fc0f2e5b348 | |
parent | 45b4233caac05da0118b608a9fc2a40a9fc580cd (diff) |
md: raid5_run_ops - run stripe operations outside sh->lock
When the raid acceleration work was proposed, Neil laid out the following
attack plan:
1/ move the xor and copy operations outside spin_lock(&sh->lock)
2/ find/implement an asynchronous offload api
The raid5_run_ops routine uses the asynchronous offload api (async_tx) and
the stripe_operations member of a stripe_head to carry out xor+copy
operations asynchronously, outside the lock.
To perform operations outside the lock a new set of state flags is needed
to track new requests, in-flight requests, and completed requests. In this
new model handle_stripe is tasked with scanning the stripe_head for work,
updating the stripe_operations structure, and finally dropping the lock and
calling raid5_run_ops for processing. The following flags outline the
requests that handle_stripe can make of raid5_run_ops:
STRIPE_OP_BIOFILL
- copy data into request buffers to satisfy a read request
STRIPE_OP_COMPUTE_BLK
- generate a missing block in the cache from the other blocks
STRIPE_OP_PREXOR
- subtract existing data as part of the read-modify-write process
STRIPE_OP_BIODRAIN
- copy data out of request buffers to satisfy a write request
STRIPE_OP_POSTXOR
- recalculate parity for new data that has entered the cache
STRIPE_OP_CHECK
- verify that the parity is correct
STRIPE_OP_IO
- submit i/o to the member disks (note this was already performed outside
the stripe lock, but it made sense to add it as an operation type)
The flow is:
1/ handle_stripe sets STRIPE_OP_* in sh->ops.pending
2/ raid5_run_ops reads sh->ops.pending, sets sh->ops.ack, and submits the
operation to the async_tx api
3/ async_tx triggers the completion callback routine to set
sh->ops.complete and release the stripe
4/ handle_stripe runs again to finish the operation and optionally submit
new operations that were previously blocked
Note this patch just defines raid5_run_ops; subsequent commits (one per
major operation type) modify handle_stripe to take advantage of this
routine.
Changelog:
* removed ops_complete_biodrain in favor of ops_complete_postxor and
ops_complete_write.
* removed the raid5_run_ops workqueue
* call bi_end_io for reads in ops_complete_biofill, saves a call to
handle_stripe
* explicitly handle the 2-disk raid5 case (xor becomes memcpy), Neil Brown
* fix race between async engines and bi_end_io call for reads, Neil Brown
* remove unnecessary spin_lock from ops_complete_biofill
* remove test_and_set/test_and_clear BUG_ONs, Neil Brown
* remove explicit interrupt handling for channel switching; this feature
was absorbed (i.e. it is now implicit) by the async_tx api
* use return_io in ops_complete_biofill
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
-rw-r--r-- | drivers/md/raid5.c | 536 | ||||
-rw-r--r-- | include/linux/raid/raid5.h | 81 |
2 files changed, 614 insertions, 3 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e372e57687ee..0b7002479655 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -52,6 +52,7 @@ | |||
52 | #include "raid6.h" | 52 | #include "raid6.h" |
53 | 53 | ||
54 | #include <linux/raid/bitmap.h> | 54 | #include <linux/raid/bitmap.h> |
55 | #include <linux/async_tx.h> | ||
55 | 56 | ||
56 | /* | 57 | /* |
57 | * Stripe cache | 58 | * Stripe cache |
@@ -341,6 +342,541 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector | |||
341 | return sh; | 342 | return sh; |
342 | } | 343 | } |
343 | 344 | ||
345 | static int | ||
346 | raid5_end_read_request(struct bio *bi, unsigned int bytes_done, int error); | ||
347 | static int | ||
348 | raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error); | ||
349 | |||
350 | static void ops_run_io(struct stripe_head *sh) | ||
351 | { | ||
352 | raid5_conf_t *conf = sh->raid_conf; | ||
353 | int i, disks = sh->disks; | ||
354 | |||
355 | might_sleep(); | ||
356 | |||
357 | for (i = disks; i--; ) { | ||
358 | int rw; | ||
359 | struct bio *bi; | ||
360 | mdk_rdev_t *rdev; | ||
361 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) | ||
362 | rw = WRITE; | ||
363 | else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | ||
364 | rw = READ; | ||
365 | else | ||
366 | continue; | ||
367 | |||
368 | bi = &sh->dev[i].req; | ||
369 | |||
370 | bi->bi_rw = rw; | ||
371 | if (rw == WRITE) | ||
372 | bi->bi_end_io = raid5_end_write_request; | ||
373 | else | ||
374 | bi->bi_end_io = raid5_end_read_request; | ||
375 | |||
376 | rcu_read_lock(); | ||
377 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
378 | if (rdev && test_bit(Faulty, &rdev->flags)) | ||
379 | rdev = NULL; | ||
380 | if (rdev) | ||
381 | atomic_inc(&rdev->nr_pending); | ||
382 | rcu_read_unlock(); | ||
383 | |||
384 | if (rdev) { | ||
385 | if (test_bit(STRIPE_SYNCING, &sh->state) || | ||
386 | test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || | ||
387 | test_bit(STRIPE_EXPAND_READY, &sh->state)) | ||
388 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | ||
389 | |||
390 | bi->bi_bdev = rdev->bdev; | ||
391 | pr_debug("%s: for %llu schedule op %ld on disc %d\n", | ||
392 | __FUNCTION__, (unsigned long long)sh->sector, | ||
393 | bi->bi_rw, i); | ||
394 | atomic_inc(&sh->count); | ||
395 | bi->bi_sector = sh->sector + rdev->data_offset; | ||
396 | bi->bi_flags = 1 << BIO_UPTODATE; | ||
397 | bi->bi_vcnt = 1; | ||
398 | bi->bi_max_vecs = 1; | ||
399 | bi->bi_idx = 0; | ||
400 | bi->bi_io_vec = &sh->dev[i].vec; | ||
401 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | ||
402 | bi->bi_io_vec[0].bv_offset = 0; | ||
403 | bi->bi_size = STRIPE_SIZE; | ||
404 | bi->bi_next = NULL; | ||
405 | if (rw == WRITE && | ||
406 | test_bit(R5_ReWrite, &sh->dev[i].flags)) | ||
407 | atomic_add(STRIPE_SECTORS, | ||
408 | &rdev->corrected_errors); | ||
409 | generic_make_request(bi); | ||
410 | } else { | ||
411 | if (rw == WRITE) | ||
412 | set_bit(STRIPE_DEGRADED, &sh->state); | ||
413 | pr_debug("skip op %ld on disc %d for sector %llu\n", | ||
414 | bi->bi_rw, i, (unsigned long long)sh->sector); | ||
415 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
416 | set_bit(STRIPE_HANDLE, &sh->state); | ||
417 | } | ||
418 | } | ||
419 | } | ||
420 | |||
421 | static struct dma_async_tx_descriptor * | ||
422 | async_copy_data(int frombio, struct bio *bio, struct page *page, | ||
423 | sector_t sector, struct dma_async_tx_descriptor *tx) | ||
424 | { | ||
425 | struct bio_vec *bvl; | ||
426 | struct page *bio_page; | ||
427 | int i; | ||
428 | int page_offset; | ||
429 | |||
430 | if (bio->bi_sector >= sector) | ||
431 | page_offset = (signed)(bio->bi_sector - sector) * 512; | ||
432 | else | ||
433 | page_offset = (signed)(sector - bio->bi_sector) * -512; | ||
434 | bio_for_each_segment(bvl, bio, i) { | ||
435 | int len = bio_iovec_idx(bio, i)->bv_len; | ||
436 | int clen; | ||
437 | int b_offset = 0; | ||
438 | |||
439 | if (page_offset < 0) { | ||
440 | b_offset = -page_offset; | ||
441 | page_offset += b_offset; | ||
442 | len -= b_offset; | ||
443 | } | ||
444 | |||
445 | if (len > 0 && page_offset + len > STRIPE_SIZE) | ||
446 | clen = STRIPE_SIZE - page_offset; | ||
447 | else | ||
448 | clen = len; | ||
449 | |||
450 | if (clen > 0) { | ||
451 | b_offset += bio_iovec_idx(bio, i)->bv_offset; | ||
452 | bio_page = bio_iovec_idx(bio, i)->bv_page; | ||
453 | if (frombio) | ||
454 | tx = async_memcpy(page, bio_page, page_offset, | ||
455 | b_offset, clen, | ||
456 | ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_SRC, | ||
457 | tx, NULL, NULL); | ||
458 | else | ||
459 | tx = async_memcpy(bio_page, page, b_offset, | ||
460 | page_offset, clen, | ||
461 | ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_DST, | ||
462 | tx, NULL, NULL); | ||
463 | } | ||
464 | if (clen < len) /* hit end of page */ | ||
465 | break; | ||
466 | page_offset += len; | ||
467 | } | ||
468 | |||
469 | return tx; | ||
470 | } | ||
471 | |||
472 | static void ops_complete_biofill(void *stripe_head_ref) | ||
473 | { | ||
474 | struct stripe_head *sh = stripe_head_ref; | ||
475 | struct bio *return_bi = NULL; | ||
476 | raid5_conf_t *conf = sh->raid_conf; | ||
477 | int i, more_to_read = 0; | ||
478 | |||
479 | pr_debug("%s: stripe %llu\n", __FUNCTION__, | ||
480 | (unsigned long long)sh->sector); | ||
481 | |||
482 | /* clear completed biofills */ | ||
483 | for (i = sh->disks; i--; ) { | ||
484 | struct r5dev *dev = &sh->dev[i]; | ||
485 | /* check if this stripe has new incoming reads */ | ||
486 | if (dev->toread) | ||
487 | more_to_read++; | ||
488 | |||
489 | /* acknowledge completion of a biofill operation */ | ||
490 | /* and check if we need to reply to a read request | ||
491 | */ | ||
492 | if (test_bit(R5_Wantfill, &dev->flags) && !dev->toread) { | ||
493 | struct bio *rbi, *rbi2; | ||
494 | clear_bit(R5_Wantfill, &dev->flags); | ||
495 | |||
496 | /* The access to dev->read is outside of the | ||
497 | * spin_lock_irq(&conf->device_lock), but is protected | ||
498 | * by the STRIPE_OP_BIOFILL pending bit | ||
499 | */ | ||
500 | BUG_ON(!dev->read); | ||
501 | rbi = dev->read; | ||
502 | dev->read = NULL; | ||
503 | while (rbi && rbi->bi_sector < | ||
504 | dev->sector + STRIPE_SECTORS) { | ||
505 | rbi2 = r5_next_bio(rbi, dev->sector); | ||
506 | spin_lock_irq(&conf->device_lock); | ||
507 | if (--rbi->bi_phys_segments == 0) { | ||
508 | rbi->bi_next = return_bi; | ||
509 | return_bi = rbi; | ||
510 | } | ||
511 | spin_unlock_irq(&conf->device_lock); | ||
512 | rbi = rbi2; | ||
513 | } | ||
514 | } | ||
515 | } | ||
516 | clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); | ||
517 | clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); | ||
518 | |||
519 | return_io(return_bi); | ||
520 | |||
521 | if (more_to_read) | ||
522 | set_bit(STRIPE_HANDLE, &sh->state); | ||
523 | release_stripe(sh); | ||
524 | } | ||
525 | |||
526 | static void ops_run_biofill(struct stripe_head *sh) | ||
527 | { | ||
528 | struct dma_async_tx_descriptor *tx = NULL; | ||
529 | raid5_conf_t *conf = sh->raid_conf; | ||
530 | int i; | ||
531 | |||
532 | pr_debug("%s: stripe %llu\n", __FUNCTION__, | ||
533 | (unsigned long long)sh->sector); | ||
534 | |||
535 | for (i = sh->disks; i--; ) { | ||
536 | struct r5dev *dev = &sh->dev[i]; | ||
537 | if (test_bit(R5_Wantfill, &dev->flags)) { | ||
538 | struct bio *rbi; | ||
539 | spin_lock_irq(&conf->device_lock); | ||
540 | dev->read = rbi = dev->toread; | ||
541 | dev->toread = NULL; | ||
542 | spin_unlock_irq(&conf->device_lock); | ||
543 | while (rbi && rbi->bi_sector < | ||
544 | dev->sector + STRIPE_SECTORS) { | ||
545 | tx = async_copy_data(0, rbi, dev->page, | ||
546 | dev->sector, tx); | ||
547 | rbi = r5_next_bio(rbi, dev->sector); | ||
548 | } | ||
549 | } | ||
550 | } | ||
551 | |||
552 | atomic_inc(&sh->count); | ||
553 | async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, | ||
554 | ops_complete_biofill, sh); | ||
555 | } | ||
556 | |||
557 | static void ops_complete_compute5(void *stripe_head_ref) | ||
558 | { | ||
559 | struct stripe_head *sh = stripe_head_ref; | ||
560 | int target = sh->ops.target; | ||
561 | struct r5dev *tgt = &sh->dev[target]; | ||
562 | |||
563 | pr_debug("%s: stripe %llu\n", __FUNCTION__, | ||
564 | (unsigned long long)sh->sector); | ||
565 | |||
566 | set_bit(R5_UPTODATE, &tgt->flags); | ||
567 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | ||
568 | clear_bit(R5_Wantcompute, &tgt->flags); | ||
569 | set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); | ||
570 | set_bit(STRIPE_HANDLE, &sh->state); | ||
571 | release_stripe(sh); | ||
572 | } | ||
573 | |||
574 | static struct dma_async_tx_descriptor * | ||
575 | ops_run_compute5(struct stripe_head *sh, unsigned long pending) | ||
576 | { | ||
577 | /* kernel stack size limits the total number of disks */ | ||
578 | int disks = sh->disks; | ||
579 | struct page *xor_srcs[disks]; | ||
580 | int target = sh->ops.target; | ||
581 | struct r5dev *tgt = &sh->dev[target]; | ||
582 | struct page *xor_dest = tgt->page; | ||
583 | int count = 0; | ||
584 | struct dma_async_tx_descriptor *tx; | ||
585 | int i; | ||
586 | |||
587 | pr_debug("%s: stripe %llu block: %d\n", | ||
588 | __FUNCTION__, (unsigned long long)sh->sector, target); | ||
589 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | ||
590 | |||
591 | for (i = disks; i--; ) | ||
592 | if (i != target) | ||
593 | xor_srcs[count++] = sh->dev[i].page; | ||
594 | |||
595 | atomic_inc(&sh->count); | ||
596 | |||
597 | if (unlikely(count == 1)) | ||
598 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, | ||
599 | 0, NULL, ops_complete_compute5, sh); | ||
600 | else | ||
601 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | ||
602 | ASYNC_TX_XOR_ZERO_DST, NULL, | ||
603 | ops_complete_compute5, sh); | ||
604 | |||
605 | /* ack now if postxor is not set to be run */ | ||
606 | if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending)) | ||
607 | async_tx_ack(tx); | ||
608 | |||
609 | return tx; | ||
610 | } | ||
611 | |||
612 | static void ops_complete_prexor(void *stripe_head_ref) | ||
613 | { | ||
614 | struct stripe_head *sh = stripe_head_ref; | ||
615 | |||
616 | pr_debug("%s: stripe %llu\n", __FUNCTION__, | ||
617 | (unsigned long long)sh->sector); | ||
618 | |||
619 | set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); | ||
620 | } | ||
621 | |||
622 | static struct dma_async_tx_descriptor * | ||
623 | ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | ||
624 | { | ||
625 | /* kernel stack size limits the total number of disks */ | ||
626 | int disks = sh->disks; | ||
627 | struct page *xor_srcs[disks]; | ||
628 | int count = 0, pd_idx = sh->pd_idx, i; | ||
629 | |||
630 | /* existing parity data subtracted */ | ||
631 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | ||
632 | |||
633 | pr_debug("%s: stripe %llu\n", __FUNCTION__, | ||
634 | (unsigned long long)sh->sector); | ||
635 | |||
636 | for (i = disks; i--; ) { | ||
637 | struct r5dev *dev = &sh->dev[i]; | ||
638 | /* Only process blocks that are known to be uptodate */ | ||
639 | if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) | ||
640 | xor_srcs[count++] = dev->page; | ||
641 | } | ||
642 | |||
643 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | ||
644 | ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, | ||
645 | ops_complete_prexor, sh); | ||
646 | |||
647 | return tx; | ||
648 | } | ||
649 | |||
650 | static struct dma_async_tx_descriptor * | ||
651 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | ||
652 | { | ||
653 | int disks = sh->disks; | ||
654 | int pd_idx = sh->pd_idx, i; | ||
655 | |||
656 | /* check if prexor is active which means only process blocks | ||
657 | * that are part of a read-modify-write (Wantprexor) | ||
658 | */ | ||
659 | int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending); | ||
660 | |||
661 | pr_debug("%s: stripe %llu\n", __FUNCTION__, | ||
662 | (unsigned long long)sh->sector); | ||
663 | |||
664 | for (i = disks; i--; ) { | ||
665 | struct r5dev *dev = &sh->dev[i]; | ||
666 | struct bio *chosen; | ||
667 | int towrite; | ||
668 | |||
669 | towrite = 0; | ||
670 | if (prexor) { /* rmw */ | ||
671 | if (dev->towrite && | ||
672 | test_bit(R5_Wantprexor, &dev->flags)) | ||
673 | towrite = 1; | ||
674 | } else { /* rcw */ | ||
675 | if (i != pd_idx && dev->towrite && | ||
676 | test_bit(R5_LOCKED, &dev->flags)) | ||
677 | towrite = 1; | ||
678 | } | ||
679 | |||
680 | if (towrite) { | ||
681 | struct bio *wbi; | ||
682 | |||
683 | spin_lock(&sh->lock); | ||
684 | chosen = dev->towrite; | ||
685 | dev->towrite = NULL; | ||
686 | BUG_ON(dev->written); | ||
687 | wbi = dev->written = chosen; | ||
688 | spin_unlock(&sh->lock); | ||
689 | |||
690 | while (wbi && wbi->bi_sector < | ||
691 | dev->sector + STRIPE_SECTORS) { | ||
692 | tx = async_copy_data(1, wbi, dev->page, | ||
693 | dev->sector, tx); | ||
694 | wbi = r5_next_bio(wbi, dev->sector); | ||
695 | } | ||
696 | } | ||
697 | } | ||
698 | |||
699 | return tx; | ||
700 | } | ||
701 | |||
702 | static void ops_complete_postxor(void *stripe_head_ref) | ||
703 | { | ||
704 | struct stripe_head *sh = stripe_head_ref; | ||
705 | |||
706 | pr_debug("%s: stripe %llu\n", __FUNCTION__, | ||
707 | (unsigned long long)sh->sector); | ||
708 | |||
709 | set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); | ||
710 | set_bit(STRIPE_HANDLE, &sh->state); | ||
711 | release_stripe(sh); | ||
712 | } | ||
713 | |||
714 | static void ops_complete_write(void *stripe_head_ref) | ||
715 | { | ||
716 | struct stripe_head *sh = stripe_head_ref; | ||
717 | int disks = sh->disks, i, pd_idx = sh->pd_idx; | ||
718 | |||
719 | pr_debug("%s: stripe %llu\n", __FUNCTION__, | ||
720 | (unsigned long long)sh->sector); | ||
721 | |||
722 | for (i = disks; i--; ) { | ||
723 | struct r5dev *dev = &sh->dev[i]; | ||
724 | if (dev->written || i == pd_idx) | ||
725 | set_bit(R5_UPTODATE, &dev->flags); | ||
726 | } | ||
727 | |||
728 | set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); | ||
729 | set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); | ||
730 | |||
731 | set_bit(STRIPE_HANDLE, &sh->state); | ||
732 | release_stripe(sh); | ||
733 | } | ||
734 | |||
735 | static void | ||
736 | ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | ||
737 | { | ||
738 | /* kernel stack size limits the total number of disks */ | ||
739 | int disks = sh->disks; | ||
740 | struct page *xor_srcs[disks]; | ||
741 | |||
742 | int count = 0, pd_idx = sh->pd_idx, i; | ||
743 | struct page *xor_dest; | ||
744 | int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending); | ||
745 | unsigned long flags; | ||
746 | dma_async_tx_callback callback; | ||
747 | |||
748 | pr_debug("%s: stripe %llu\n", __FUNCTION__, | ||
749 | (unsigned long long)sh->sector); | ||
750 | |||
751 | /* check if prexor is active which means only process blocks | ||
752 | * that are part of a read-modify-write (written) | ||
753 | */ | ||
754 | if (prexor) { | ||
755 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | ||
756 | for (i = disks; i--; ) { | ||
757 | struct r5dev *dev = &sh->dev[i]; | ||
758 | if (dev->written) | ||
759 | xor_srcs[count++] = dev->page; | ||
760 | } | ||
761 | } else { | ||
762 | xor_dest = sh->dev[pd_idx].page; | ||
763 | for (i = disks; i--; ) { | ||
764 | struct r5dev *dev = &sh->dev[i]; | ||
765 | if (i != pd_idx) | ||
766 | xor_srcs[count++] = dev->page; | ||
767 | } | ||
768 | } | ||
769 | |||
770 | /* check whether this postxor is part of a write */ | ||
771 | callback = test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) ? | ||
772 | ops_complete_write : ops_complete_postxor; | ||
773 | |||
774 | /* 1/ if we prexor'd then the dest is reused as a source | ||
775 | * 2/ if we did not prexor then we are redoing the parity | ||
776 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST | ||
777 | * for the synchronous xor case | ||
778 | */ | ||
779 | flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | | ||
780 | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); | ||
781 | |||
782 | atomic_inc(&sh->count); | ||
783 | |||
784 | if (unlikely(count == 1)) { | ||
785 | flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); | ||
786 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, | ||
787 | flags, tx, callback, sh); | ||
788 | } else | ||
789 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | ||
790 | flags, tx, callback, sh); | ||
791 | } | ||
792 | |||
793 | static void ops_complete_check(void *stripe_head_ref) | ||
794 | { | ||
795 | struct stripe_head *sh = stripe_head_ref; | ||
796 | int pd_idx = sh->pd_idx; | ||
797 | |||
798 | pr_debug("%s: stripe %llu\n", __FUNCTION__, | ||
799 | (unsigned long long)sh->sector); | ||
800 | |||
801 | if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && | ||
802 | sh->ops.zero_sum_result == 0) | ||
803 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
804 | |||
805 | set_bit(STRIPE_OP_CHECK, &sh->ops.complete); | ||
806 | set_bit(STRIPE_HANDLE, &sh->state); | ||
807 | release_stripe(sh); | ||
808 | } | ||
809 | |||
810 | static void ops_run_check(struct stripe_head *sh) | ||
811 | { | ||
812 | /* kernel stack size limits the total number of disks */ | ||
813 | int disks = sh->disks; | ||
814 | struct page *xor_srcs[disks]; | ||
815 | struct dma_async_tx_descriptor *tx; | ||
816 | |||
817 | int count = 0, pd_idx = sh->pd_idx, i; | ||
818 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | ||
819 | |||
820 | pr_debug("%s: stripe %llu\n", __FUNCTION__, | ||
821 | (unsigned long long)sh->sector); | ||
822 | |||
823 | for (i = disks; i--; ) { | ||
824 | struct r5dev *dev = &sh->dev[i]; | ||
825 | if (i != pd_idx) | ||
826 | xor_srcs[count++] = dev->page; | ||
827 | } | ||
828 | |||
829 | tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | ||
830 | &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); | ||
831 | |||
832 | if (tx) | ||
833 | set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); | ||
834 | else | ||
835 | clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); | ||
836 | |||
837 | atomic_inc(&sh->count); | ||
838 | tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, | ||
839 | ops_complete_check, sh); | ||
840 | } | ||
841 | |||
842 | static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) | ||
843 | { | ||
844 | int overlap_clear = 0, i, disks = sh->disks; | ||
845 | struct dma_async_tx_descriptor *tx = NULL; | ||
846 | |||
847 | if (test_bit(STRIPE_OP_BIOFILL, &pending)) { | ||
848 | ops_run_biofill(sh); | ||
849 | overlap_clear++; | ||
850 | } | ||
851 | |||
852 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) | ||
853 | tx = ops_run_compute5(sh, pending); | ||
854 | |||
855 | if (test_bit(STRIPE_OP_PREXOR, &pending)) | ||
856 | tx = ops_run_prexor(sh, tx); | ||
857 | |||
858 | if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { | ||
859 | tx = ops_run_biodrain(sh, tx); | ||
860 | overlap_clear++; | ||
861 | } | ||
862 | |||
863 | if (test_bit(STRIPE_OP_POSTXOR, &pending)) | ||
864 | ops_run_postxor(sh, tx); | ||
865 | |||
866 | if (test_bit(STRIPE_OP_CHECK, &pending)) | ||
867 | ops_run_check(sh); | ||
868 | |||
869 | if (test_bit(STRIPE_OP_IO, &pending)) | ||
870 | ops_run_io(sh); | ||
871 | |||
872 | if (overlap_clear) | ||
873 | for (i = disks; i--; ) { | ||
874 | struct r5dev *dev = &sh->dev[i]; | ||
875 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | ||
876 | wake_up(&sh->raid_conf->wait_for_overlap); | ||
877 | } | ||
878 | } | ||
879 | |||
344 | static int grow_one_stripe(raid5_conf_t *conf) | 880 | static int grow_one_stripe(raid5_conf_t *conf) |
345 | { | 881 | { |
346 | struct stripe_head *sh; | 882 | struct stripe_head *sh; |
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index b99d354f6128..6fb9d94e6f2e 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h | |||
@@ -116,13 +116,46 @@ | |||
116 | * attach a request to an active stripe (add_stripe_bh()) | 116 | * attach a request to an active stripe (add_stripe_bh()) |
117 | * lockdev attach-buffer unlockdev | 117 | * lockdev attach-buffer unlockdev |
118 | * handle a stripe (handle_stripe()) | 118 | * handle a stripe (handle_stripe()) |
119 | * lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io | 119 | * lockstripe clrSTRIPE_HANDLE ... |
120 | * (lockdev check-buffers unlockdev) .. | ||
121 | * change-state .. | ||
122 | * record io/ops needed unlockstripe schedule io/ops | ||
120 | * release an active stripe (release_stripe()) | 123 | * release an active stripe (release_stripe()) |
121 | * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev | 124 | * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev |
122 | * | 125 | * |
123 | * The refcount counts each thread that have activated the stripe, | 126 | * The refcount counts each thread that have activated the stripe, |
124 | * plus raid5d if it is handling it, plus one for each active request | 127 | * plus raid5d if it is handling it, plus one for each active request |
125 | * on a cached buffer. | 128 | * on a cached buffer, and plus one if the stripe is undergoing stripe |
129 | * operations. | ||
130 | * | ||
131 | * Stripe operations are performed outside the stripe lock, | ||
132 | * the stripe operations are: | ||
133 | * -copying data between the stripe cache and user application buffers | ||
134 | * -computing blocks to save a disk access, or to recover a missing block | ||
135 | * -updating the parity on a write operation (reconstruct write and | ||
136 | * read-modify-write) | ||
137 | * -checking parity correctness | ||
138 | * -running i/o to disk | ||
139 | * These operations are carried out by raid5_run_ops which uses the async_tx | ||
140 | * api to (optionally) offload operations to dedicated hardware engines. | ||
141 | * When requesting an operation handle_stripe sets the pending bit for the | ||
142 | * operation and increments the count. raid5_run_ops is then run whenever | ||
143 | * the count is non-zero. | ||
144 | * There are some critical dependencies between the operations that prevent some | ||
145 | * from being requested while another is in flight. | ||
146 | * 1/ Parity check operations destroy the in cache version of the parity block, | ||
147 | * so we prevent parity dependent operations like writes and compute_blocks | ||
148 | * from starting while a check is in progress. Some dma engines can perform | ||
149 | * the check without damaging the parity block, in these cases the parity | ||
150 | * block is re-marked up to date (assuming the check was successful) and is | ||
151 | * not re-read from disk. | ||
152 | * 2/ When a write operation is requested we immediately lock the affected | ||
153 | * blocks, and mark them as not up to date. This causes new read requests | ||
154 | * to be held off, as well as parity checks and compute block operations. | ||
155 | * 3/ Once a compute block operation has been requested handle_stripe treats | ||
156 | * that block as if it is up to date. raid5_run_ops guarantees that any | ||
157 | * operation that is dependent on the compute block result is initiated after | ||
158 | * the compute block completes. | ||
126 | */ | 159 | */ |
127 | 160 | ||
128 | struct stripe_head { | 161 | struct stripe_head { |
@@ -136,11 +169,26 @@ struct stripe_head { | |||
136 | spinlock_t lock; | 169 | spinlock_t lock; |
137 | int bm_seq; /* sequence number for bitmap flushes */ | 170 | int bm_seq; /* sequence number for bitmap flushes */ |
138 | int disks; /* disks in stripe */ | 171 | int disks; /* disks in stripe */ |
172 | /* stripe_operations | ||
173 | * @pending - pending ops flags (set for request->issue->complete) | ||
174 | * @ack - submitted ops flags (set for issue->complete) | ||
175 | * @complete - completed ops flags (set for complete) | ||
176 | * @target - STRIPE_OP_COMPUTE_BLK target | ||
177 | * @count - raid5_run_ops is set to run when this is non-zero | ||
178 | */ | ||
179 | struct stripe_operations { | ||
180 | unsigned long pending; | ||
181 | unsigned long ack; | ||
182 | unsigned long complete; | ||
183 | int target; | ||
184 | int count; | ||
185 | u32 zero_sum_result; | ||
186 | } ops; | ||
139 | struct r5dev { | 187 | struct r5dev { |
140 | struct bio req; | 188 | struct bio req; |
141 | struct bio_vec vec; | 189 | struct bio_vec vec; |
142 | struct page *page; | 190 | struct page *page; |
143 | struct bio *toread, *towrite, *written; | 191 | struct bio *toread, *read, *towrite, *written; |
144 | sector_t sector; /* sector of this page */ | 192 | sector_t sector; /* sector of this page */ |
145 | unsigned long flags; | 193 | unsigned long flags; |
146 | } dev[1]; /* allocated with extra space depending of RAID geometry */ | 194 | } dev[1]; /* allocated with extra space depending of RAID geometry */ |
@@ -174,6 +222,15 @@ struct r6_state { | |||
174 | #define R5_ReWrite 9 /* have tried to over-write the readerror */ | 222 | #define R5_ReWrite 9 /* have tried to over-write the readerror */ |
175 | 223 | ||
176 | #define R5_Expanded 10 /* This block now has post-expand data */ | 224 | #define R5_Expanded 10 /* This block now has post-expand data */ |
225 | #define R5_Wantcompute 11 /* compute_block in progress treat as | ||
226 | * uptodate | ||
227 | */ | ||
228 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs | ||
229 | * filling | ||
230 | */ | ||
231 | #define R5_Wantprexor 13 /* distinguish blocks ready for rmw from | ||
232 | * other "towrites" | ||
233 | */ | ||
177 | /* | 234 | /* |
178 | * Write method | 235 | * Write method |
179 | */ | 236 | */ |
@@ -196,6 +253,24 @@ struct r6_state { | |||
196 | #define STRIPE_EXPAND_SOURCE 10 | 253 | #define STRIPE_EXPAND_SOURCE 10 |
197 | #define STRIPE_EXPAND_READY 11 | 254 | #define STRIPE_EXPAND_READY 11 |
198 | /* | 255 | /* |
256 | * Operations flags (in issue order) | ||
257 | */ | ||
258 | #define STRIPE_OP_BIOFILL 0 | ||
259 | #define STRIPE_OP_COMPUTE_BLK 1 | ||
260 | #define STRIPE_OP_PREXOR 2 | ||
261 | #define STRIPE_OP_BIODRAIN 3 | ||
262 | #define STRIPE_OP_POSTXOR 4 | ||
263 | #define STRIPE_OP_CHECK 5 | ||
264 | #define STRIPE_OP_IO 6 | ||
265 | |||
266 | /* modifiers to the base operations | ||
267 | * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back | ||
268 | * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check | ||
269 | */ | ||
270 | #define STRIPE_OP_MOD_REPAIR_PD 7 | ||
271 | #define STRIPE_OP_MOD_DMA_CHECK 8 | ||
272 | |||
273 | /* | ||
199 | * Plugging: | 274 | * Plugging: |
200 | * | 275 | * |
201 | * To improve write throughput, we need to delay the handling of some | 276 | * To improve write throughput, we need to delay the handling of some |