about | summary | refs | log | tree | commit | diff | stats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- drivers/md/raid5.c 94
1 files changed, 61 insertions, 33 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 087eee0cb809..3b27df52456b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -94,6 +94,8 @@
94#define __inline__ 94#define __inline__
95#endif 95#endif
96 96
97#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
98
97#if !RAID6_USE_EMPTY_ZERO_PAGE 99#if !RAID6_USE_EMPTY_ZERO_PAGE
98/* In .bss so it's zeroed */ 100/* In .bss so it's zeroed */
99const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); 101const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
@@ -1143,10 +1145,12 @@ static void raid5_end_read_request(struct bio * bi, int error)
1143 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1145 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1144 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1146 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1145 rdev = conf->disks[i].rdev; 1147 rdev = conf->disks[i].rdev;
1146 printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n", 1148 printk_rl(KERN_INFO "raid5:%s: read error corrected"
1147 mdname(conf->mddev), STRIPE_SECTORS, 1149 " (%lu sectors at %llu on %s)\n",
1148 (unsigned long long)(sh->sector + rdev->data_offset), 1150 mdname(conf->mddev), STRIPE_SECTORS,
1149 bdevname(rdev->bdev, b)); 1151 (unsigned long long)(sh->sector
1152 + rdev->data_offset),
1153 bdevname(rdev->bdev, b));
1150 clear_bit(R5_ReadError, &sh->dev[i].flags); 1154 clear_bit(R5_ReadError, &sh->dev[i].flags);
1151 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1155 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1152 } 1156 }
@@ -1160,16 +1164,22 @@ static void raid5_end_read_request(struct bio * bi, int error)
1160 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1164 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1161 atomic_inc(&rdev->read_errors); 1165 atomic_inc(&rdev->read_errors);
1162 if (conf->mddev->degraded) 1166 if (conf->mddev->degraded)
1163 printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n", 1167 printk_rl(KERN_WARNING
1164 mdname(conf->mddev), 1168 "raid5:%s: read error not correctable "
1165 (unsigned long long)(sh->sector + rdev->data_offset), 1169 "(sector %llu on %s).\n",
1166 bdn); 1170 mdname(conf->mddev),
1171 (unsigned long long)(sh->sector
1172 + rdev->data_offset),
1173 bdn);
1167 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1174 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1168 /* Oh, no!!! */ 1175 /* Oh, no!!! */
1169 printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n", 1176 printk_rl(KERN_WARNING
1170 mdname(conf->mddev), 1177 "raid5:%s: read error NOT corrected!! "
1171 (unsigned long long)(sh->sector + rdev->data_offset), 1178 "(sector %llu on %s).\n",
1172 bdn); 1179 mdname(conf->mddev),
1180 (unsigned long long)(sh->sector
1181 + rdev->data_offset),
1182 bdn);
1173 else if (atomic_read(&rdev->read_errors) 1183 else if (atomic_read(&rdev->read_errors)
1174 > conf->max_nr_stripes) 1184 > conf->max_nr_stripes)
1175 printk(KERN_WARNING 1185 printk(KERN_WARNING
@@ -1258,7 +1268,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1258 /* 1268 /*
1259 * if recovery was running, make sure it aborts. 1269 * if recovery was running, make sure it aborts.
1260 */ 1270 */
1261 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 1271 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1262 } 1272 }
1263 set_bit(Faulty, &rdev->flags); 1273 set_bit(Faulty, &rdev->flags);
1264 printk (KERN_ALERT 1274 printk (KERN_ALERT
@@ -1992,6 +2002,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
1992 * have quiesced. 2002 * have quiesced.
1993 */ 2003 */
1994 if ((s->uptodate == disks - 1) && 2004 if ((s->uptodate == disks - 1) &&
2005 (s->failed && disk_idx == s->failed_num) &&
1995 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { 2006 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
1996 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); 2007 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
1997 set_bit(R5_Wantcompute, &dev->flags); 2008 set_bit(R5_Wantcompute, &dev->flags);
@@ -2006,12 +2017,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
2006 */ 2017 */
2007 s->uptodate++; 2018 s->uptodate++;
2008 return 0; /* uptodate + compute == disks */ 2019 return 0; /* uptodate + compute == disks */
2009 } else if ((s->uptodate < disks - 1) && 2020 } else if (test_bit(R5_Insync, &dev->flags)) {
2010 test_bit(R5_Insync, &dev->flags)) {
2011 /* Note: we hold off compute operations while checks are
2012 * in flight, but we still prefer 'compute' over 'read'
2013 * hence we only read if (uptodate < * disks-1)
2014 */
2015 set_bit(R5_LOCKED, &dev->flags); 2021 set_bit(R5_LOCKED, &dev->flags);
2016 set_bit(R5_Wantread, &dev->flags); 2022 set_bit(R5_Wantread, &dev->flags);
2017 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) 2023 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
@@ -2077,7 +2083,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
2077 /* we would like to get this block, possibly 2083 /* we would like to get this block, possibly
2078 * by computing it, but we might not be able to 2084 * by computing it, but we might not be able to
2079 */ 2085 */
2080 if (s->uptodate == disks-1) { 2086 if ((s->uptodate == disks - 1) &&
2087 (s->failed && (i == r6s->failed_num[0] ||
2088 i == r6s->failed_num[1]))) {
2081 pr_debug("Computing stripe %llu block %d\n", 2089 pr_debug("Computing stripe %llu block %d\n",
2082 (unsigned long long)sh->sector, i); 2090 (unsigned long long)sh->sector, i);
2083 compute_block_1(sh, i, 0); 2091 compute_block_1(sh, i, 0);
@@ -2369,8 +2377,8 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2369 2377
2370 /* complete a check operation */ 2378 /* complete a check operation */
2371 if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { 2379 if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
2372 clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); 2380 clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
2373 clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); 2381 clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
2374 if (s->failed == 0) { 2382 if (s->failed == 0) {
2375 if (sh->ops.zero_sum_result == 0) 2383 if (sh->ops.zero_sum_result == 0)
2376 /* parity is correct (on disc, 2384 /* parity is correct (on disc,
@@ -2400,16 +2408,6 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2400 canceled_check = 1; /* STRIPE_INSYNC is not set */ 2408 canceled_check = 1; /* STRIPE_INSYNC is not set */
2401 } 2409 }
2402 2410
2403 /* check if we can clear a parity disk reconstruct */
2404 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2405 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2406
2407 clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
2408 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2409 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2410 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2411 }
2412
2413 /* start a new check operation if there are no failures, the stripe is 2411 /* start a new check operation if there are no failures, the stripe is
2414 * not insync, and a repair is not in flight 2412 * not insync, and a repair is not in flight
2415 */ 2413 */
@@ -2424,6 +2422,17 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2424 } 2422 }
2425 } 2423 }
2426 2424
2425 /* check if we can clear a parity disk reconstruct */
2426 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2427 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2428
2429 clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
2430 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2431 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2432 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2433 }
2434
2435
2427 /* Wait for check parity and compute block operations to complete 2436 /* Wait for check parity and compute block operations to complete
2428 * before write-back. If a failure occurred while the check operation 2437 * before write-back. If a failure occurred while the check operation
2429 * was in flight we need to cycle this stripe through handle_stripe 2438 * was in flight we need to cycle this stripe through handle_stripe
@@ -2634,6 +2643,7 @@ static void handle_stripe5(struct stripe_head *sh)
2634 struct r5dev *dev; 2643 struct r5dev *dev;
2635 unsigned long pending = 0; 2644 unsigned long pending = 0;
2636 mdk_rdev_t *blocked_rdev = NULL; 2645 mdk_rdev_t *blocked_rdev = NULL;
2646 int prexor;
2637 2647
2638 memset(&s, 0, sizeof(s)); 2648 memset(&s, 0, sizeof(s));
2639 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " 2649 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
@@ -2763,9 +2773,11 @@ static void handle_stripe5(struct stripe_head *sh)
2763 /* leave prexor set until postxor is done, allows us to distinguish 2773 /* leave prexor set until postxor is done, allows us to distinguish
2764 * a rmw from a rcw during biodrain 2774 * a rmw from a rcw during biodrain
2765 */ 2775 */
2776 prexor = 0;
2766 if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && 2777 if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
2767 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { 2778 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
2768 2779
2780 prexor = 1;
2769 clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); 2781 clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
2770 clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); 2782 clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
2771 clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); 2783 clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
@@ -2799,6 +2811,8 @@ static void handle_stripe5(struct stripe_head *sh)
2799 if (!test_and_set_bit( 2811 if (!test_and_set_bit(
2800 STRIPE_OP_IO, &sh->ops.pending)) 2812 STRIPE_OP_IO, &sh->ops.pending))
2801 sh->ops.count++; 2813 sh->ops.count++;
2814 if (prexor)
2815 continue;
2802 if (!test_bit(R5_Insync, &dev->flags) || 2816 if (!test_bit(R5_Insync, &dev->flags) ||
2803 (i == sh->pd_idx && s.failed == 0)) 2817 (i == sh->pd_idx && s.failed == 0))
2804 set_bit(STRIPE_INSYNC, &sh->state); 2818 set_bit(STRIPE_INSYNC, &sh->state);
@@ -2879,6 +2893,8 @@ static void handle_stripe5(struct stripe_head *sh)
2879 2893
2880 for (i = conf->raid_disks; i--; ) { 2894 for (i = conf->raid_disks; i--; ) {
2881 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2895 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2896 set_bit(R5_LOCKED, &dev->flags);
2897 s.locked++;
2882 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) 2898 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2883 sh->ops.count++; 2899 sh->ops.count++;
2884 } 2900 }
@@ -2892,6 +2908,7 @@ static void handle_stripe5(struct stripe_head *sh)
2892 conf->raid_disks); 2908 conf->raid_disks);
2893 s.locked += handle_write_operations5(sh, 1, 1); 2909 s.locked += handle_write_operations5(sh, 1, 1);
2894 } else if (s.expanded && 2910 } else if (s.expanded &&
2911 s.locked == 0 &&
2895 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { 2912 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2896 clear_bit(STRIPE_EXPAND_READY, &sh->state); 2913 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2897 atomic_dec(&conf->reshape_stripes); 2914 atomic_dec(&conf->reshape_stripes);
@@ -4256,6 +4273,7 @@ static int run(mddev_t *mddev)
4256 goto abort; 4273 goto abort;
4257 } 4274 }
4258 spin_lock_init(&conf->device_lock); 4275 spin_lock_init(&conf->device_lock);
4276 mddev->queue->queue_lock = &conf->device_lock;
4259 init_waitqueue_head(&conf->wait_for_stripe); 4277 init_waitqueue_head(&conf->wait_for_stripe);
4260 init_waitqueue_head(&conf->wait_for_overlap); 4278 init_waitqueue_head(&conf->wait_for_overlap);
4261 INIT_LIST_HEAD(&conf->handle_list); 4279 INIT_LIST_HEAD(&conf->handle_list);
@@ -4285,7 +4303,9 @@ static int run(mddev_t *mddev)
4285 " disk %d\n", bdevname(rdev->bdev,b), 4303 " disk %d\n", bdevname(rdev->bdev,b),
4286 raid_disk); 4304 raid_disk);
4287 working_disks++; 4305 working_disks++;
4288 } 4306 } else
4307 /* Cannot rely on bitmap to complete recovery */
4308 conf->fullsync = 1;
4289 } 4309 }
4290 4310
4291 /* 4311 /*
@@ -4562,6 +4582,14 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
4562 err = -EBUSY; 4582 err = -EBUSY;
4563 goto abort; 4583 goto abort;
4564 } 4584 }
4585 /* Only remove non-faulty devices if recovery
4586 * isn't possible.
4587 */
4588 if (!test_bit(Faulty, &rdev->flags) &&
4589 mddev->degraded <= conf->max_degraded) {
4590 err = -EBUSY;
4591 goto abort;
4592 }
4565 p->rdev = NULL; 4593 p->rdev = NULL;
4566 synchronize_rcu(); 4594 synchronize_rcu();
4567 if (atomic_read(&rdev->nr_pending)) { 4595 if (atomic_read(&rdev->nr_pending)) {