summary | refs | log | tree | commit | diff | stats
path: root/drivers/md
diff options
context:
space:
mode:
author: Song Liu <songliubraving@fb.com> 2017-05-11 18:28:28 -0400
committer: Shaohua Li <shli@fb.com> 2017-05-12 01:11:11 -0400
commit: 70d466f760b351fe30b5f8c956354ddf29aa676b (patch)
tree: 1e9cb8b69fb904173cb9fcf7d900c3be4bc91ab3 /drivers/md
parent: 23b245c04d0ef408087430dd4d1b214a5da1eb78 (diff)
md/r5cache: gracefully handle journal device errors for writeback mode
For raid456 with a writeback cache, when the journal device fails during normal operation, it is still possible to persist all data, because all pending data is still in the stripe cache. However, the journal failure must be handled gracefully.

During a journal failure, the following logic handles the graceful shutdown of the journal:
1. raid5_error() marks the device as Faulty and schedules the async work log->disable_writeback_work;
2. In disable_writeback_work (r5c_disable_writeback_async), the mddev is suspended, set to write-through, and then resumed. mddev_suspend() flushes all cached stripes;
3. All cached stripes need to be flushed carefully to the RAID array.

This patch fixes issues within the process above:
1. In r5c_update_on_rdev_error(), schedule disable_writeback_work for journal failures;
2. In r5c_disable_writeback_async(), wait for MD_SB_CHANGE_PENDING to clear, since raid5_error() updates the superblock;
3. In handle_stripe(), allow stripes with data in the journal (s.injournal > 0) to make progress during log_failed;
4. In delay_towrite(), if the log has failed, only process data in the cache (skip new writes in dev->towrite);
5. In __get_priority_stripe(), process loprio_list during journal device failures;
6. In raid5_remove_disk(), wait until all cached stripes are flushed before calling log_exit().

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/raid5-cache.c 11
-rw-r--r-- drivers/md/raid5-log.h 3
-rw-r--r-- drivers/md/raid5.c 29
3 files changed, 34 insertions, 9 deletions
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index a6a62e212cd3..cc3f8442f11f 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -24,6 +24,7 @@
24#include "md.h" 24#include "md.h"
25#include "raid5.h" 25#include "raid5.h"
26#include "bitmap.h" 26#include "bitmap.h"
27#include "raid5-log.h"
27 28
28/* 29/*
29 * metadata/data stored in disk with 4k size unit (a block) regardless 30 * metadata/data stored in disk with 4k size unit (a block) regardless
@@ -680,6 +681,11 @@ static void r5c_disable_writeback_async(struct work_struct *work)
680 return; 681 return;
681 pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n", 682 pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
682 mdname(mddev)); 683 mdname(mddev));
684
685 /* wait superblock change before suspend */
686 wait_event(mddev->sb_wait,
687 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
688
683 mddev_suspend(mddev); 689 mddev_suspend(mddev);
684 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 690 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
685 mddev_resume(mddev); 691 mddev_resume(mddev);
@@ -2983,7 +2989,7 @@ ioerr:
2983 return ret; 2989 return ret;
2984} 2990}
2985 2991
2986void r5c_update_on_rdev_error(struct mddev *mddev) 2992void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
2987{ 2993{
2988 struct r5conf *conf = mddev->private; 2994 struct r5conf *conf = mddev->private;
2989 struct r5l_log *log = conf->log; 2995 struct r5l_log *log = conf->log;
@@ -2991,7 +2997,8 @@ void r5c_update_on_rdev_error(struct mddev *mddev)
2991 if (!log) 2997 if (!log)
2992 return; 2998 return;
2993 2999
2994 if (raid5_calc_degraded(conf) > 0 && 3000 if ((raid5_calc_degraded(conf) > 0 ||
3001 test_bit(Journal, &rdev->flags)) &&
2995 conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) 3002 conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
2996 schedule_work(&log->disable_writeback_work); 3003 schedule_work(&log->disable_writeback_work);
2997} 3004}
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 27097101ccca..328d67aedda4 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -28,7 +28,8 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
28extern void r5c_check_stripe_cache_usage(struct r5conf *conf); 28extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
29extern void r5c_check_cached_full_stripe(struct r5conf *conf); 29extern void r5c_check_cached_full_stripe(struct r5conf *conf);
30extern struct md_sysfs_entry r5c_journal_mode; 30extern struct md_sysfs_entry r5c_journal_mode;
31extern void r5c_update_on_rdev_error(struct mddev *mddev); 31extern void r5c_update_on_rdev_error(struct mddev *mddev,
32 struct md_rdev *rdev);
32extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); 33extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
33 34
34extern struct dma_async_tx_descriptor * 35extern struct dma_async_tx_descriptor *
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f8055a7abb4b..0ac57a925606 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2689,7 +2689,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2689 bdevname(rdev->bdev, b), 2689 bdevname(rdev->bdev, b),
2690 mdname(mddev), 2690 mdname(mddev),
2691 conf->raid_disks - mddev->degraded); 2691 conf->raid_disks - mddev->degraded);
2692 r5c_update_on_rdev_error(mddev); 2692 r5c_update_on_rdev_error(mddev, rdev);
2693} 2693}
2694 2694
2695/* 2695/*
@@ -3050,6 +3050,11 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
3050 * When LOG_CRITICAL, stripes with injournal == 0 will be sent to 3050 * When LOG_CRITICAL, stripes with injournal == 0 will be sent to
3051 * no_space_stripes list. 3051 * no_space_stripes list.
3052 * 3052 *
3053 * 3. during journal failure
3054 * In journal failure, we try to flush all cached data to raid disks
3055 * based on data in stripe cache. The array is read-only to upper
3056 * layers, so we would skip all pending writes.
3057 *
3053 */ 3058 */
3054static inline bool delay_towrite(struct r5conf *conf, 3059static inline bool delay_towrite(struct r5conf *conf,
3055 struct r5dev *dev, 3060 struct r5dev *dev,
@@ -3063,6 +3068,9 @@ static inline bool delay_towrite(struct r5conf *conf,
3063 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 3068 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3064 s->injournal > 0) 3069 s->injournal > 0)
3065 return true; 3070 return true;
3071 /* case 3 above */
3072 if (s->log_failed && s->injournal)
3073 return true;
3066 return false; 3074 return false;
3067} 3075}
3068 3076
@@ -4696,10 +4704,15 @@ static void handle_stripe(struct stripe_head *sh)
4696 " to_write=%d failed=%d failed_num=%d,%d\n", 4704 " to_write=%d failed=%d failed_num=%d,%d\n",
4697 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 4705 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
4698 s.failed_num[0], s.failed_num[1]); 4706 s.failed_num[0], s.failed_num[1]);
4699 /* check if the array has lost more than max_degraded devices and, 4707 /*
4708 * check if the array has lost more than max_degraded devices and,
4700 * if so, some requests might need to be failed. 4709 * if so, some requests might need to be failed.
4710 *
4711 * When journal device failed (log_failed), we will only process
4712 * the stripe if there is data need write to raid disks
4701 */ 4713 */
4702 if (s.failed > conf->max_degraded || s.log_failed) { 4714 if (s.failed > conf->max_degraded ||
4715 (s.log_failed && s.injournal == 0)) {
4703 sh->check_state = 0; 4716 sh->check_state = 0;
4704 sh->reconstruct_state = 0; 4717 sh->reconstruct_state = 0;
4705 break_stripe_batch_list(sh, 0); 4718 break_stripe_batch_list(sh, 0);
@@ -5272,8 +5285,10 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5272 struct stripe_head *sh, *tmp; 5285 struct stripe_head *sh, *tmp;
5273 struct list_head *handle_list = NULL; 5286 struct list_head *handle_list = NULL;
5274 struct r5worker_group *wg; 5287 struct r5worker_group *wg;
5275 bool second_try = !r5c_is_writeback(conf->log); 5288 bool second_try = !r5c_is_writeback(conf->log) &&
5276 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state); 5289 !r5l_log_disk_error(conf);
5290 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
5291 r5l_log_disk_error(conf);
5277 5292
5278again: 5293again:
5279 wg = NULL; 5294 wg = NULL;
@@ -7521,7 +7536,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
7521 * neilb: there is no locking about new writes here, 7536 * neilb: there is no locking about new writes here,
7522 * so this cannot be safe. 7537 * so this cannot be safe.
7523 */ 7538 */
7524 if (atomic_read(&conf->active_stripes)) { 7539 if (atomic_read(&conf->active_stripes) ||
7540 atomic_read(&conf->r5c_cached_full_stripes) ||
7541 atomic_read(&conf->r5c_cached_partial_stripes)) {
7525 return -EBUSY; 7542 return -EBUSY;
7526 } 7543 }
7527 log_exit(conf); 7544 log_exit(conf);