aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
author NeilBrown <neilb@suse.com> 2015-08-13 22:47:33 -0400
committer NeilBrown <neilb@suse.com> 2015-08-31 13:43:59 -0400
commit c3cce6cda162eb2b2960a85d9c8992f4f3be85d0 (patch)
tree 076342691b99d0bb1fdca4c57df959665901cd9d /drivers/md/raid5.c
parent 34a6f80e1639b124f24b5fadc1d45d69417cbace (diff)
md/raid5: ensure device failure recorded before write request returns.
When a write to one of the devices of a RAID5/6 fails, the failure is recorded in the metadata of the other devices so that after a restart the data on the failed drive won't be trusted even if that drive seems to be working again (maybe a cable was unplugged). Similarly when we record a bad-block in response to a write failure, we must not let the write complete until the bad-block update is safe. Currently there is no interlock between the write request completing and the metadata update. So it is possible that the write will complete, the app will confirm success in some way, and then the machine will crash before the metadata update completes. This is an extremely small hole for a race to fit in, but it is theoretically possible and so should be closed. So: - set MD_CHANGE_PENDING when requesting a metadata update for a failed device, so we can know with certainty when it completes - queue requests that completed when MD_CHANGE_PENDING is set to only be processed after the metadata update completes - call return_io() on bios in that queue when the time comes. Signed-off-by: NeilBrown <neilb@suse.com>
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- drivers/md/raid5.c | 24
1 files changed, 23 insertions, 1 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 214dcca0d7f8..4195064460d0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2513,6 +2513,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
2513 set_bit(Blocked, &rdev->flags); 2513 set_bit(Blocked, &rdev->flags);
2514 set_bit(Faulty, &rdev->flags); 2514 set_bit(Faulty, &rdev->flags);
2515 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2515 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2516 set_bit(MD_CHANGE_PENDING, &mddev->flags);
2516 printk(KERN_ALERT 2517 printk(KERN_ALERT
2517 "md/raid:%s: Disk failure on %s, disabling device.\n" 2518 "md/raid:%s: Disk failure on %s, disabling device.\n"
2518 "md/raid:%s: Operation continuing on %d devices.\n", 2519 "md/raid:%s: Operation continuing on %d devices.\n",
@@ -4601,7 +4602,15 @@ finish:
4601 md_wakeup_thread(conf->mddev->thread); 4602 md_wakeup_thread(conf->mddev->thread);
4602 } 4603 }
4603 4604
4604 return_io(&s.return_bi); 4605 if (!bio_list_empty(&s.return_bi)) {
4606 if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
4607 spin_lock_irq(&conf->device_lock);
4608 bio_list_merge(&conf->return_bi, &s.return_bi);
4609 spin_unlock_irq(&conf->device_lock);
4610 md_wakeup_thread(conf->mddev->thread);
4611 } else
4612 return_io(&s.return_bi);
4613 }
4605 4614
4606 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4615 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
4607} 4616}
@@ -5817,6 +5826,18 @@ static void raid5d(struct md_thread *thread)
5817 5826
5818 md_check_recovery(mddev); 5827 md_check_recovery(mddev);
5819 5828
5829 if (!bio_list_empty(&conf->return_bi) &&
5830 !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
5831 struct bio_list tmp = BIO_EMPTY_LIST;
5832 spin_lock_irq(&conf->device_lock);
5833 if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
5834 bio_list_merge(&tmp, &conf->return_bi);
5835 bio_list_init(&conf->return_bi);
5836 }
5837 spin_unlock_irq(&conf->device_lock);
5838 return_io(&tmp);
5839 }
5840
5820 blk_start_plug(&plug); 5841 blk_start_plug(&plug);
5821 handled = 0; 5842 handled = 0;
5822 spin_lock_irq(&conf->device_lock); 5843 spin_lock_irq(&conf->device_lock);
@@ -6476,6 +6497,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
6476 INIT_LIST_HEAD(&conf->hold_list); 6497 INIT_LIST_HEAD(&conf->hold_list);
6477 INIT_LIST_HEAD(&conf->delayed_list); 6498 INIT_LIST_HEAD(&conf->delayed_list);
6478 INIT_LIST_HEAD(&conf->bitmap_list); 6499 INIT_LIST_HEAD(&conf->bitmap_list);
6500 bio_list_init(&conf->return_bi);
6479 init_llist_head(&conf->released_stripes); 6501 init_llist_head(&conf->released_stripes);
6480 atomic_set(&conf->active_stripes, 0); 6502 atomic_set(&conf->active_stripes, 0);
6481 atomic_set(&conf->preread_active_stripes, 0); 6503 atomic_set(&conf->preread_active_stripes, 0);