aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorRaz Ben-Jehuda(caro) <raziebe@gmail.com>2006-12-10 05:20:47 -0500
committerLinus Torvalds <torvalds@woody.osdl.org>2006-12-10 12:57:20 -0500
commit46031f9a38a9773021f1872abc713d62467ac22e (patch)
treefe91f661fe0aad5f149447797c5d31544453ca38 /drivers/md
parentf679623f50545bc0577caf2d0f8675b61162f059 (diff)
[PATCH] md: allow reads that have bypassed the cache to be retried on failure
If a bypass-the-cache read fails, we simply try again through the cache. If it fails again it will trigger normal recovery procedures. update 1: From: NeilBrown <neilb@suse.de> 1/ chunk_aligned_read and retry_aligned_read assume that data_disks == raid_disks - 1 which is not true for raid6. So when an aligned read request bypasses the cache, we can get the wrong data. 2/ The cloned bio is being used-after-free in raid5_align_endio (to test BIO_UPTODATE). 3/ We forgot to add rdev->data_offset when submitting a bio for aligned-read 4/ clone_bio calls blk_recount_segments and then we change bi_bdev, so we need to invalidate the segment counts. 5/ We don't de-reference the rdev when the read completes. This means we need to record the rdev so it is still available in the end_io routine. Fortunately bi_next in the original bio is unused at this point so we can stuff it in there. 6/ We leak a cloned bio if the target rdev is not usable. From: NeilBrown <neilb@suse.de> update 2: 1/ When aligned requests fail (read error) they need to be retried via the normal method (stripe cache). As we cannot be sure that we can process a single read in one go (we may not be able to allocate all the stripes needed) we store a bio-being-retried and a list of bioes-that-still-need-to-be-retried. When we find a bio that needs to be retried, we should add it to the list, not to single-bio... 2/ We were never incrementing 'scnt' when resubmitting failed aligned requests. [akpm@osdl.org: build fix] Signed-off-by: Neil Brown <neilb@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/raid5.c165
1 files changed, 160 insertions, 5 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 269b7771a30b..2ac2e56a1a40 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -134,6 +134,8 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
134 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 134 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
135 list_add_tail(&sh->lru, &conf->inactive_list); 135 list_add_tail(&sh->lru, &conf->inactive_list);
136 wake_up(&conf->wait_for_stripe); 136 wake_up(&conf->wait_for_stripe);
137 if (conf->retry_read_aligned)
138 md_wakeup_thread(conf->mddev->thread);
137 } 139 }
138 } 140 }
139 } 141 }
@@ -2645,18 +2647,80 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
2645} 2647}
2646 2648
2647/* 2649/*
2650 * add bio to the retry LIFO ( in O(1) ... we are in interrupt )
2651 * later sampled by raid5d.
2652 */
2653static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf)
2654{
2655 unsigned long flags;
2656
2657 spin_lock_irqsave(&conf->device_lock, flags);
2658
2659 bi->bi_next = conf->retry_read_aligned_list;
2660 conf->retry_read_aligned_list = bi;
2661
2662 spin_unlock_irqrestore(&conf->device_lock, flags);
2663 md_wakeup_thread(conf->mddev->thread);
2664}
2665
2666
2667static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
2668{
2669 struct bio *bi;
2670
2671 bi = conf->retry_read_aligned;
2672 if (bi) {
2673 conf->retry_read_aligned = NULL;
2674 return bi;
2675 }
2676 bi = conf->retry_read_aligned_list;
2677 if(bi) {
2678 conf->retry_read_aligned = bi->bi_next;
2679 bi->bi_next = NULL;
2680 bi->bi_phys_segments = 1; /* biased count of active stripes */
2681 bi->bi_hw_segments = 0; /* count of processed stripes */
2682 }
2683
2684 return bi;
2685}
2686
2687
2688/*
2648 * The "raid5_align_endio" should check if the read succeeded and if it 2689 * The "raid5_align_endio" should check if the read succeeded and if it
2649 * did, call bio_endio on the original bio (having bio_put the new bio 2690 * did, call bio_endio on the original bio (having bio_put the new bio
2650 * first). 2691 * first).
2651 * If the read failed.. 2692 * If the read failed..
2652 */ 2693 */
2653int raid5_align_endio(struct bio *bi, unsigned int bytes , int error) 2694static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
2654{ 2695{
2655 struct bio* raid_bi = bi->bi_private; 2696 struct bio* raid_bi = bi->bi_private;
2697 mddev_t *mddev;
2698 raid5_conf_t *conf;
2699 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
2700 mdk_rdev_t *rdev;
2701
2656 if (bi->bi_size) 2702 if (bi->bi_size)
2657 return 1; 2703 return 1;
2658 bio_put(bi); 2704 bio_put(bi);
2659 bio_endio(raid_bi, bytes, error); 2705
2706 mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
2707 conf = mddev_to_conf(mddev);
2708 rdev = (void*)raid_bi->bi_next;
2709 raid_bi->bi_next = NULL;
2710
2711 rdev_dec_pending(rdev, conf->mddev);
2712
2713 if (!error && uptodate) {
2714 bio_endio(raid_bi, bytes, 0);
2715 if (atomic_dec_and_test(&conf->active_aligned_reads))
2716 wake_up(&conf->wait_for_stripe);
2717 return 0;
2718 }
2719
2720
2721 PRINTK("raid5_align_endio : io error...handing IO for a retry\n");
2722
2723 add_bio_to_retry(raid_bi, conf);
2660 return 0; 2724 return 0;
2661} 2725}
2662 2726
@@ -2665,7 +2729,7 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
2665 mddev_t *mddev = q->queuedata; 2729 mddev_t *mddev = q->queuedata;
2666 raid5_conf_t *conf = mddev_to_conf(mddev); 2730 raid5_conf_t *conf = mddev_to_conf(mddev);
2667 const unsigned int raid_disks = conf->raid_disks; 2731 const unsigned int raid_disks = conf->raid_disks;
2668 const unsigned int data_disks = raid_disks - 1; 2732 const unsigned int data_disks = raid_disks - conf->max_degraded;
2669 unsigned int dd_idx, pd_idx; 2733 unsigned int dd_idx, pd_idx;
2670 struct bio* align_bi; 2734 struct bio* align_bi;
2671 mdk_rdev_t *rdev; 2735 mdk_rdev_t *rdev;
@@ -2699,13 +2763,25 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
2699 rcu_read_lock(); 2763 rcu_read_lock();
2700 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 2764 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
2701 if (rdev && test_bit(In_sync, &rdev->flags)) { 2765 if (rdev && test_bit(In_sync, &rdev->flags)) {
2702 align_bi->bi_bdev = rdev->bdev;
2703 atomic_inc(&rdev->nr_pending); 2766 atomic_inc(&rdev->nr_pending);
2704 rcu_read_unlock(); 2767 rcu_read_unlock();
2768 raid_bio->bi_next = (void*)rdev;
2769 align_bi->bi_bdev = rdev->bdev;
2770 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
2771 align_bi->bi_sector += rdev->data_offset;
2772
2773 spin_lock_irq(&conf->device_lock);
2774 wait_event_lock_irq(conf->wait_for_stripe,
2775 conf->quiesce == 0,
2776 conf->device_lock, /* nothing */);
2777 atomic_inc(&conf->active_aligned_reads);
2778 spin_unlock_irq(&conf->device_lock);
2779
2705 generic_make_request(align_bi); 2780 generic_make_request(align_bi);
2706 return 1; 2781 return 1;
2707 } else { 2782 } else {
2708 rcu_read_unlock(); 2783 rcu_read_unlock();
2784 bio_put(align_bi);
2709 return 0; 2785 return 0;
2710 } 2786 }
2711} 2787}
@@ -3050,6 +3126,72 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
3050 return STRIPE_SECTORS; 3126 return STRIPE_SECTORS;
3051} 3127}
3052 3128
3129static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
3130{
3131 /* We may not be able to submit a whole bio at once as there
3132 * may not be enough stripe_heads available.
3133 * We cannot pre-allocate enough stripe_heads as we may need
3134 * more than exist in the cache (if we allow ever large chunks).
3135 * So we do one stripe head at a time and record in
3136 * ->bi_hw_segments how many have been done.
3137 *
3138 * We *know* that this entire raid_bio is in one chunk, so
3139 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
3140 */
3141 struct stripe_head *sh;
3142 int dd_idx, pd_idx;
3143 sector_t sector, logical_sector, last_sector;
3144 int scnt = 0;
3145 int remaining;
3146 int handled = 0;
3147
3148 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3149 sector = raid5_compute_sector( logical_sector,
3150 conf->raid_disks,
3151 conf->raid_disks - conf->max_degraded,
3152 &dd_idx,
3153 &pd_idx,
3154 conf);
3155 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
3156
3157 for (; logical_sector < last_sector;
3158 logical_sector += STRIPE_SECTORS, scnt++) {
3159
3160 if (scnt < raid_bio->bi_hw_segments)
3161 /* already done this stripe */
3162 continue;
3163
3164 sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1);
3165
3166 if (!sh) {
3167 /* failed to get a stripe - must wait */
3168 raid_bio->bi_hw_segments = scnt;
3169 conf->retry_read_aligned = raid_bio;
3170 return handled;
3171 }
3172
3173 set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
3174 add_stripe_bio(sh, raid_bio, dd_idx, 0);
3175 handle_stripe(sh, NULL);
3176 release_stripe(sh);
3177 handled++;
3178 }
3179 spin_lock_irq(&conf->device_lock);
3180 remaining = --raid_bio->bi_phys_segments;
3181 spin_unlock_irq(&conf->device_lock);
3182 if (remaining == 0) {
3183 int bytes = raid_bio->bi_size;
3184
3185 raid_bio->bi_size = 0;
3186 raid_bio->bi_end_io(raid_bio, bytes, 0);
3187 }
3188 if (atomic_dec_and_test(&conf->active_aligned_reads))
3189 wake_up(&conf->wait_for_stripe);
3190 return handled;
3191}
3192
3193
3194
3053/* 3195/*
3054 * This is our raid5 kernel thread. 3196 * This is our raid5 kernel thread.
3055 * 3197 *
@@ -3071,6 +3213,7 @@ static void raid5d (mddev_t *mddev)
3071 spin_lock_irq(&conf->device_lock); 3213 spin_lock_irq(&conf->device_lock);
3072 while (1) { 3214 while (1) {
3073 struct list_head *first; 3215 struct list_head *first;
3216 struct bio *bio;
3074 3217
3075 if (conf->seq_flush != conf->seq_write) { 3218 if (conf->seq_flush != conf->seq_write) {
3076 int seq = conf->seq_flush; 3219 int seq = conf->seq_flush;
@@ -3087,6 +3230,16 @@ static void raid5d (mddev_t *mddev)
3087 !list_empty(&conf->delayed_list)) 3230 !list_empty(&conf->delayed_list))
3088 raid5_activate_delayed(conf); 3231 raid5_activate_delayed(conf);
3089 3232
3233 while ((bio = remove_bio_from_retry(conf))) {
3234 int ok;
3235 spin_unlock_irq(&conf->device_lock);
3236 ok = retry_aligned_read(conf, bio);
3237 spin_lock_irq(&conf->device_lock);
3238 if (!ok)
3239 break;
3240 handled++;
3241 }
3242
3090 if (list_empty(&conf->handle_list)) 3243 if (list_empty(&conf->handle_list))
3091 break; 3244 break;
3092 3245
@@ -3274,6 +3427,7 @@ static int run(mddev_t *mddev)
3274 INIT_LIST_HEAD(&conf->inactive_list); 3427 INIT_LIST_HEAD(&conf->inactive_list);
3275 atomic_set(&conf->active_stripes, 0); 3428 atomic_set(&conf->active_stripes, 0);
3276 atomic_set(&conf->preread_active_stripes, 0); 3429 atomic_set(&conf->preread_active_stripes, 0);
3430 atomic_set(&conf->active_aligned_reads, 0);
3277 3431
3278 PRINTK("raid5: run(%s) called.\n", mdname(mddev)); 3432 PRINTK("raid5: run(%s) called.\n", mdname(mddev));
3279 3433
@@ -3796,7 +3950,8 @@ static void raid5_quiesce(mddev_t *mddev, int state)
3796 spin_lock_irq(&conf->device_lock); 3950 spin_lock_irq(&conf->device_lock);
3797 conf->quiesce = 1; 3951 conf->quiesce = 1;
3798 wait_event_lock_irq(conf->wait_for_stripe, 3952 wait_event_lock_irq(conf->wait_for_stripe,
3799 atomic_read(&conf->active_stripes) == 0, 3953 atomic_read(&conf->active_stripes) == 0 &&
3954 atomic_read(&conf->active_aligned_reads) == 0,
3800 conf->device_lock, /* nothing */); 3955 conf->device_lock, /* nothing */);
3801 spin_unlock_irq(&conf->device_lock); 3956 spin_unlock_irq(&conf->device_lock);
3802 break; 3957 break;