diff options
author | Raz Ben-Jehuda(caro) <raziebe@gmail.com> | 2006-12-10 05:20:47 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.osdl.org> | 2006-12-10 12:57:20 -0500 |
commit | 46031f9a38a9773021f1872abc713d62467ac22e (patch) | |
tree | fe91f661fe0aad5f149447797c5d31544453ca38 /drivers/md | |
parent | f679623f50545bc0577caf2d0f8675b61162f059 (diff) |
[PATCH] md: allow reads that have bypassed the cache to be retried on failure
If a bypass-the-cache read fails, we simply try again through the cache. If
it fails again it will trigger normal recovery procedures.
update 1:
From: NeilBrown <neilb@suse.de>
1/
chunk_aligned_read and retry_aligned_read assume that
data_disks == raid_disks - 1
which is not true for raid6.
So when an aligned read request bypasses the cache, we can get the wrong data.
2/ The cloned bio is being used-after-free in raid5_align_endio
(to test BIO_UPTODATE).
3/ We forgot to add rdev->data_offset when submitting
a bio for aligned-read
4/ clone_bio calls blk_recount_segments and then we change bi_bdev,
so we need to invalidate the segment counts.
5/ We don't de-reference the rdev when the read completes.
This means we need to record the rdev so it is still
available in the end_io routine. Fortunately
bi_next in the original bio is unused at this point so
we can stuff it in there.
6/ We leak a cloned bio if the target rdev is not usable.
From: NeilBrown <neilb@suse.de>
update 2:
1/ When aligned requests fail (read error) they need to be retried
via the normal method (stripe cache). As we cannot be sure that
we can process a single read in one go (we may not be able to
allocate all the stripes needed) we store a bio-being-retried
and a list of bios-that-still-need-to-be-retried.
When we find a bio that needs to be retried, we should add it to
the list, not to single-bio...
2/ We were never incrementing 'scnt' when resubmitting failed
aligned requests.
[akpm@osdl.org: build fix]
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/raid5.c | 165 |
1 files changed, 160 insertions, 5 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 269b7771a30b..2ac2e56a1a40 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -134,6 +134,8 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | |||
134 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) { | 134 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) { |
135 | list_add_tail(&sh->lru, &conf->inactive_list); | 135 | list_add_tail(&sh->lru, &conf->inactive_list); |
136 | wake_up(&conf->wait_for_stripe); | 136 | wake_up(&conf->wait_for_stripe); |
137 | if (conf->retry_read_aligned) | ||
138 | md_wakeup_thread(conf->mddev->thread); | ||
137 | } | 139 | } |
138 | } | 140 | } |
139 | } | 141 | } |
@@ -2645,18 +2647,80 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) | |||
2645 | } | 2647 | } |
2646 | 2648 | ||
2647 | /* | 2649 | /* |
2650 | * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) | ||
2651 | * later sampled by raid5d. | ||
2652 | */ | ||
2653 | static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf) | ||
2654 | { | ||
2655 | unsigned long flags; | ||
2656 | |||
2657 | spin_lock_irqsave(&conf->device_lock, flags); | ||
2658 | |||
2659 | bi->bi_next = conf->retry_read_aligned_list; | ||
2660 | conf->retry_read_aligned_list = bi; | ||
2661 | |||
2662 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
2663 | md_wakeup_thread(conf->mddev->thread); | ||
2664 | } | ||
2665 | |||
2666 | |||
2667 | static struct bio *remove_bio_from_retry(raid5_conf_t *conf) | ||
2668 | { | ||
2669 | struct bio *bi; | ||
2670 | |||
2671 | bi = conf->retry_read_aligned; | ||
2672 | if (bi) { | ||
2673 | conf->retry_read_aligned = NULL; | ||
2674 | return bi; | ||
2675 | } | ||
2676 | bi = conf->retry_read_aligned_list; | ||
2677 | if(bi) { | ||
2678 | conf->retry_read_aligned = bi->bi_next; | ||
2679 | bi->bi_next = NULL; | ||
2680 | bi->bi_phys_segments = 1; /* biased count of active stripes */ | ||
2681 | bi->bi_hw_segments = 0; /* count of processed stripes */ | ||
2682 | } | ||
2683 | |||
2684 | return bi; | ||
2685 | } | ||
2686 | |||
2687 | |||
2688 | /* | ||
2648 | * The "raid5_align_endio" should check if the read succeeded and if it | 2689 | * The "raid5_align_endio" should check if the read succeeded and if it |
2649 | * did, call bio_endio on the original bio (having bio_put the new bio | 2690 | * did, call bio_endio on the original bio (having bio_put the new bio |
2650 | * first). | 2691 | * first). |
2651 | * If the read failed.. | 2692 | * If the read failed.. |
2652 | */ | 2693 | */ |
2653 | int raid5_align_endio(struct bio *bi, unsigned int bytes , int error) | 2694 | static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error) |
2654 | { | 2695 | { |
2655 | struct bio* raid_bi = bi->bi_private; | 2696 | struct bio* raid_bi = bi->bi_private; |
2697 | mddev_t *mddev; | ||
2698 | raid5_conf_t *conf; | ||
2699 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | ||
2700 | mdk_rdev_t *rdev; | ||
2701 | |||
2656 | if (bi->bi_size) | 2702 | if (bi->bi_size) |
2657 | return 1; | 2703 | return 1; |
2658 | bio_put(bi); | 2704 | bio_put(bi); |
2659 | bio_endio(raid_bi, bytes, error); | 2705 | |
2706 | mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; | ||
2707 | conf = mddev_to_conf(mddev); | ||
2708 | rdev = (void*)raid_bi->bi_next; | ||
2709 | raid_bi->bi_next = NULL; | ||
2710 | |||
2711 | rdev_dec_pending(rdev, conf->mddev); | ||
2712 | |||
2713 | if (!error && uptodate) { | ||
2714 | bio_endio(raid_bi, bytes, 0); | ||
2715 | if (atomic_dec_and_test(&conf->active_aligned_reads)) | ||
2716 | wake_up(&conf->wait_for_stripe); | ||
2717 | return 0; | ||
2718 | } | ||
2719 | |||
2720 | |||
2721 | PRINTK("raid5_align_endio : io error...handing IO for a retry\n"); | ||
2722 | |||
2723 | add_bio_to_retry(raid_bi, conf); | ||
2660 | return 0; | 2724 | return 0; |
2661 | } | 2725 | } |
2662 | 2726 | ||
@@ -2665,7 +2729,7 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio) | |||
2665 | mddev_t *mddev = q->queuedata; | 2729 | mddev_t *mddev = q->queuedata; |
2666 | raid5_conf_t *conf = mddev_to_conf(mddev); | 2730 | raid5_conf_t *conf = mddev_to_conf(mddev); |
2667 | const unsigned int raid_disks = conf->raid_disks; | 2731 | const unsigned int raid_disks = conf->raid_disks; |
2668 | const unsigned int data_disks = raid_disks - 1; | 2732 | const unsigned int data_disks = raid_disks - conf->max_degraded; |
2669 | unsigned int dd_idx, pd_idx; | 2733 | unsigned int dd_idx, pd_idx; |
2670 | struct bio* align_bi; | 2734 | struct bio* align_bi; |
2671 | mdk_rdev_t *rdev; | 2735 | mdk_rdev_t *rdev; |
@@ -2699,13 +2763,25 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio) | |||
2699 | rcu_read_lock(); | 2763 | rcu_read_lock(); |
2700 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); | 2764 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); |
2701 | if (rdev && test_bit(In_sync, &rdev->flags)) { | 2765 | if (rdev && test_bit(In_sync, &rdev->flags)) { |
2702 | align_bi->bi_bdev = rdev->bdev; | ||
2703 | atomic_inc(&rdev->nr_pending); | 2766 | atomic_inc(&rdev->nr_pending); |
2704 | rcu_read_unlock(); | 2767 | rcu_read_unlock(); |
2768 | raid_bio->bi_next = (void*)rdev; | ||
2769 | align_bi->bi_bdev = rdev->bdev; | ||
2770 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); | ||
2771 | align_bi->bi_sector += rdev->data_offset; | ||
2772 | |||
2773 | spin_lock_irq(&conf->device_lock); | ||
2774 | wait_event_lock_irq(conf->wait_for_stripe, | ||
2775 | conf->quiesce == 0, | ||
2776 | conf->device_lock, /* nothing */); | ||
2777 | atomic_inc(&conf->active_aligned_reads); | ||
2778 | spin_unlock_irq(&conf->device_lock); | ||
2779 | |||
2705 | generic_make_request(align_bi); | 2780 | generic_make_request(align_bi); |
2706 | return 1; | 2781 | return 1; |
2707 | } else { | 2782 | } else { |
2708 | rcu_read_unlock(); | 2783 | rcu_read_unlock(); |
2784 | bio_put(align_bi); | ||
2709 | return 0; | 2785 | return 0; |
2710 | } | 2786 | } |
2711 | } | 2787 | } |
@@ -3050,6 +3126,72 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
3050 | return STRIPE_SECTORS; | 3126 | return STRIPE_SECTORS; |
3051 | } | 3127 | } |
3052 | 3128 | ||
3129 | static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | ||
3130 | { | ||
3131 | /* We may not be able to submit a whole bio at once as there | ||
3132 | * may not be enough stripe_heads available. | ||
3133 | * We cannot pre-allocate enough stripe_heads as we may need | ||
3134 | * more than exist in the cache (if we allow ever large chunks). | ||
3135 | * So we do one stripe head at a time and record in | ||
3136 | * ->bi_hw_segments how many have been done. | ||
3137 | * | ||
3138 | * We *know* that this entire raid_bio is in one chunk, so | ||
3139 | * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. | ||
3140 | */ | ||
3141 | struct stripe_head *sh; | ||
3142 | int dd_idx, pd_idx; | ||
3143 | sector_t sector, logical_sector, last_sector; | ||
3144 | int scnt = 0; | ||
3145 | int remaining; | ||
3146 | int handled = 0; | ||
3147 | |||
3148 | logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); | ||
3149 | sector = raid5_compute_sector( logical_sector, | ||
3150 | conf->raid_disks, | ||
3151 | conf->raid_disks - conf->max_degraded, | ||
3152 | &dd_idx, | ||
3153 | &pd_idx, | ||
3154 | conf); | ||
3155 | last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); | ||
3156 | |||
3157 | for (; logical_sector < last_sector; | ||
3158 | logical_sector += STRIPE_SECTORS, scnt++) { | ||
3159 | |||
3160 | if (scnt < raid_bio->bi_hw_segments) | ||
3161 | /* already done this stripe */ | ||
3162 | continue; | ||
3163 | |||
3164 | sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1); | ||
3165 | |||
3166 | if (!sh) { | ||
3167 | /* failed to get a stripe - must wait */ | ||
3168 | raid_bio->bi_hw_segments = scnt; | ||
3169 | conf->retry_read_aligned = raid_bio; | ||
3170 | return handled; | ||
3171 | } | ||
3172 | |||
3173 | set_bit(R5_ReadError, &sh->dev[dd_idx].flags); | ||
3174 | add_stripe_bio(sh, raid_bio, dd_idx, 0); | ||
3175 | handle_stripe(sh, NULL); | ||
3176 | release_stripe(sh); | ||
3177 | handled++; | ||
3178 | } | ||
3179 | spin_lock_irq(&conf->device_lock); | ||
3180 | remaining = --raid_bio->bi_phys_segments; | ||
3181 | spin_unlock_irq(&conf->device_lock); | ||
3182 | if (remaining == 0) { | ||
3183 | int bytes = raid_bio->bi_size; | ||
3184 | |||
3185 | raid_bio->bi_size = 0; | ||
3186 | raid_bio->bi_end_io(raid_bio, bytes, 0); | ||
3187 | } | ||
3188 | if (atomic_dec_and_test(&conf->active_aligned_reads)) | ||
3189 | wake_up(&conf->wait_for_stripe); | ||
3190 | return handled; | ||
3191 | } | ||
3192 | |||
3193 | |||
3194 | |||
3053 | /* | 3195 | /* |
3054 | * This is our raid5 kernel thread. | 3196 | * This is our raid5 kernel thread. |
3055 | * | 3197 | * |
@@ -3071,6 +3213,7 @@ static void raid5d (mddev_t *mddev) | |||
3071 | spin_lock_irq(&conf->device_lock); | 3213 | spin_lock_irq(&conf->device_lock); |
3072 | while (1) { | 3214 | while (1) { |
3073 | struct list_head *first; | 3215 | struct list_head *first; |
3216 | struct bio *bio; | ||
3074 | 3217 | ||
3075 | if (conf->seq_flush != conf->seq_write) { | 3218 | if (conf->seq_flush != conf->seq_write) { |
3076 | int seq = conf->seq_flush; | 3219 | int seq = conf->seq_flush; |
@@ -3087,6 +3230,16 @@ static void raid5d (mddev_t *mddev) | |||
3087 | !list_empty(&conf->delayed_list)) | 3230 | !list_empty(&conf->delayed_list)) |
3088 | raid5_activate_delayed(conf); | 3231 | raid5_activate_delayed(conf); |
3089 | 3232 | ||
3233 | while ((bio = remove_bio_from_retry(conf))) { | ||
3234 | int ok; | ||
3235 | spin_unlock_irq(&conf->device_lock); | ||
3236 | ok = retry_aligned_read(conf, bio); | ||
3237 | spin_lock_irq(&conf->device_lock); | ||
3238 | if (!ok) | ||
3239 | break; | ||
3240 | handled++; | ||
3241 | } | ||
3242 | |||
3090 | if (list_empty(&conf->handle_list)) | 3243 | if (list_empty(&conf->handle_list)) |
3091 | break; | 3244 | break; |
3092 | 3245 | ||
@@ -3274,6 +3427,7 @@ static int run(mddev_t *mddev) | |||
3274 | INIT_LIST_HEAD(&conf->inactive_list); | 3427 | INIT_LIST_HEAD(&conf->inactive_list); |
3275 | atomic_set(&conf->active_stripes, 0); | 3428 | atomic_set(&conf->active_stripes, 0); |
3276 | atomic_set(&conf->preread_active_stripes, 0); | 3429 | atomic_set(&conf->preread_active_stripes, 0); |
3430 | atomic_set(&conf->active_aligned_reads, 0); | ||
3277 | 3431 | ||
3278 | PRINTK("raid5: run(%s) called.\n", mdname(mddev)); | 3432 | PRINTK("raid5: run(%s) called.\n", mdname(mddev)); |
3279 | 3433 | ||
@@ -3796,7 +3950,8 @@ static void raid5_quiesce(mddev_t *mddev, int state) | |||
3796 | spin_lock_irq(&conf->device_lock); | 3950 | spin_lock_irq(&conf->device_lock); |
3797 | conf->quiesce = 1; | 3951 | conf->quiesce = 1; |
3798 | wait_event_lock_irq(conf->wait_for_stripe, | 3952 | wait_event_lock_irq(conf->wait_for_stripe, |
3799 | atomic_read(&conf->active_stripes) == 0, | 3953 | atomic_read(&conf->active_stripes) == 0 && |
3954 | atomic_read(&conf->active_aligned_reads) == 0, | ||
3800 | conf->device_lock, /* nothing */); | 3955 | conf->device_lock, /* nothing */); |
3801 | spin_unlock_irq(&conf->device_lock); | 3956 | spin_unlock_irq(&conf->device_lock); |
3802 | break; | 3957 | break; |