author	Linus Torvalds <torvalds@linux-foundation.org>	2017-01-28 14:09:04 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-01-28 14:09:04 -0500
commit	dd553962675ab5747e887f89aea1ece90e6a802e (patch)
tree	d999368d0921e139b7b67ea0fc6a4e6ac548d8dc
parent	64a172d265643b345007ddaafcc523f6e5373b69 (diff)
parent	2e38a37f23c98d7fad87ff022670060b8a0e2bf5 (diff)
Merge tag 'md/4.10-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull MD fixes from Shaohua Li:
 "This fixes several corner cases for raid5 cache, which is merged into
  this cycle"

* tag 'md/4.10-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  md/r5cache: disable write back for degraded array
  md/r5cache: shift complex rmw from read path to write path
  md/r5cache: flush data only stripes in r5l_recovery_log()
  md/raid5: move comment of fetch_block to right location
  md/r5cache: read data into orig_page for prexor of cached data
  md/raid5-cache: delete meaningless code
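For context on the first fix in the series: with a write-back journal, dirty data may exist only in the journal device, which is unsafe once the array is degraded. The sketch below is a minimal userspace model of that policy, not kernel code; only the enum values and the degraded > 0 check mirror the patch, the rest (struct array_state, the printf) is illustrative stand-in.

/*
 * Minimal userspace sketch of the policy enforced by
 * "md/r5cache: disable write back for degraded array".
 */
#include <stdio.h>

enum r5c_journal_mode {
	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
	R5C_JOURNAL_MODE_WRITE_BACK    = 1,
};

struct array_state {
	int degraded;			/* number of failed legs */
	enum r5c_journal_mode mode;
};

/* mirrors the check added to r5c_journal_mode_store() */
static int writeback_allowed(const struct array_state *a)
{
	return a->degraded == 0;
}

/* mirrors the effect of r5c_update_on_rdev_error() */
static void on_rdev_error(struct array_state *a)
{
	if (a->degraded > 0 && a->mode == R5C_JOURNAL_MODE_WRITE_BACK) {
		a->mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
		printf("disabling writeback cache for degraded array\n");
	}
}

int main(void)
{
	struct array_state a = { .degraded = 0,
				 .mode = R5C_JOURNAL_MODE_WRITE_BACK };

	a.degraded = 1;		/* a member device just failed */
	on_rdev_error(&a);	/* -> falls back to write-through */
	printf("writeback allowed now: %d\n", writeback_allowed(&a));
	return 0;
}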
-rw-r--r--	drivers/md/md.c	5
-rw-r--r--	drivers/md/raid5-cache.c	106
-rw-r--r--	drivers/md/raid5.c	121
-rw-r--r--	drivers/md/raid5.h	7
4 files changed, 194 insertions(+), 45 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 82821ee0d57f..01175dac0db6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5291,6 +5291,11 @@ int md_run(struct mddev *mddev)
 	if (start_readonly && mddev->ro == 0)
 		mddev->ro = 2; /* read-only, but switch on first write */
 
+	/*
+	 * NOTE: some pers->run(), for example r5l_recovery_log(), wakes
+	 * up mddev->thread. It is important to initialize critical
+	 * resources for mddev->thread BEFORE calling pers->run().
+	 */
 	err = pers->run(mddev);
 	if (err)
 		pr_warn("md: pers->run() failed ...\n");
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 0e8ed2c327b0..302dea3296ba 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -162,6 +162,8 @@ struct r5l_log {
 
 	/* to submit async io_units, to fulfill ordering of flush */
 	struct work_struct deferred_io_work;
+	/* to disable write back in degraded mode */
+	struct work_struct disable_writeback_work;
 };
 
 /*
@@ -611,6 +613,21 @@ static void r5l_submit_io_async(struct work_struct *work)
 		r5l_do_submit_io(log, io);
 }
 
+static void r5c_disable_writeback_async(struct work_struct *work)
+{
+	struct r5l_log *log = container_of(work, struct r5l_log,
+					   disable_writeback_work);
+	struct mddev *mddev = log->rdev->mddev;
+
+	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+		return;
+	pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
+		mdname(mddev));
+	mddev_suspend(mddev);
+	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+	mddev_resume(mddev);
+}
+
 static void r5l_submit_current_io(struct r5l_log *log)
 {
 	struct r5l_io_unit *io = log->current_io;
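The handler above receives only a pointer to the embedded work_struct and recovers its r5l_log with container_of(). Below is a self-contained userspace illustration of that idiom; the two-field structs are hypothetical stand-ins, and this container_of is the standard offsetof construction, equivalent to the kernel's macro.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct { int pending; };

struct r5l_log_model {
	const char *name;
	struct work_struct disable_writeback_work;	/* embedded member */
};

static void disable_writeback_async(struct work_struct *work)
{
	/* same pattern as r5c_disable_writeback_async() in the patch */
	struct r5l_log_model *log =
		container_of(work, struct r5l_log_model,
			     disable_writeback_work);
	printf("handler ran for log '%s'\n", log->name);
}

int main(void)
{
	struct r5l_log_model log = { .name = "md0-journal" };

	/* a workqueue would pass just the member pointer */
	disable_writeback_async(&log.disable_writeback_work);
	return 0;
}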
@@ -1393,8 +1410,6 @@ static void r5l_do_reclaim(struct r5l_log *log)
 	next_checkpoint = r5c_calculate_new_cp(conf);
 	spin_unlock_irq(&log->io_list_lock);
 
-	BUG_ON(reclaimable < 0);
-
 	if (reclaimable == 0 || !write_super)
 		return;
 
@@ -2062,7 +2077,7 @@ static int
 r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 				       struct r5l_recovery_ctx *ctx)
 {
-	struct stripe_head *sh, *next;
+	struct stripe_head *sh;
 	struct mddev *mddev = log->rdev->mddev;
 	struct page *page;
 	sector_t next_checkpoint = MaxSector;
@@ -2076,7 +2091,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 
 	WARN_ON(list_empty(&ctx->cached_list));
 
-	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+	list_for_each_entry(sh, &ctx->cached_list, lru) {
 		struct r5l_meta_block *mb;
 		int i;
 		int offset;
@@ -2126,14 +2141,39 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 		ctx->pos = write_pos;
 		ctx->seq += 1;
 		next_checkpoint = sh->log_start;
-		list_del_init(&sh->lru);
-		raid5_release_stripe(sh);
 	}
 	log->next_checkpoint = next_checkpoint;
 	__free_page(page);
 	return 0;
 }
 
+static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
+						 struct r5l_recovery_ctx *ctx)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
+	struct stripe_head *sh, *next;
+
+	if (ctx->data_only_stripes == 0)
+		return;
+
+	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
+
+	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+		r5c_make_stripe_write_out(sh);
+		set_bit(STRIPE_HANDLE, &sh->state);
+		list_del_init(&sh->lru);
+		raid5_release_stripe(sh);
+	}
+
+	md_wakeup_thread(conf->mddev->thread);
+	/* reuse conf->wait_for_quiescent in recovery */
+	wait_event(conf->wait_for_quiescent,
+		   atomic_read(&conf->active_stripes) == 0);
+
+	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+}
+
 static int r5l_recovery_log(struct r5l_log *log)
 {
 	struct mddev *mddev = log->rdev->mddev;
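Note the iterator choice in this hunk: r5c_recovery_rewrite_data_only_stripes() can now use plain list_for_each_entry() because it no longer unlinks entries, while the new flush helper keeps list_for_each_entry_safe() since each stripe leaves the list mid-walk. A runnable userspace sketch of why the _safe variant matters follows; the list macros are minimal hand-rolled versions, and unlike the kernel's, this list_for_each_entry_safe takes an explicit type argument instead of relying on typeof.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

static void list_del_init(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
	INIT_LIST_HEAD(e);
}

#define list_entry(ptr, type, member) container_of(ptr, type, member)

/* "safe" variant: caches the next node so the current one may be freed */
#define list_for_each_entry_safe(pos, n, head, member, type)		\
	for (pos = list_entry((head)->next, type, member),		\
	     n = list_entry(pos->member.next, type, member);		\
	     &pos->member != (head);					\
	     pos = n, n = list_entry(n->member.next, type, member))

struct stripe { int idx; struct list_head lru; };

int main(void)
{
	struct list_head cached_list;
	struct stripe *sh, *next;
	int i;

	INIT_LIST_HEAD(&cached_list);
	for (i = 0; i < 3; i++) {
		sh = malloc(sizeof(*sh));
		sh->idx = i;
		list_add_tail(&sh->lru, &cached_list);
	}

	/* mirrors the flush loop: each stripe leaves the list mid-walk */
	list_for_each_entry_safe(sh, next, &cached_list, lru, struct stripe) {
		printf("flushing stripe %d\n", sh->idx);
		list_del_init(&sh->lru);
		free(sh);	/* stands in for raid5_release_stripe() */
	}
	return 0;
}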
@@ -2160,32 +2200,31 @@ static int r5l_recovery_log(struct r5l_log *log)
 	pos = ctx.pos;
 	ctx.seq += 10000;
 
-	if (ctx.data_only_stripes == 0) {
-		log->next_checkpoint = ctx.pos;
-		r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
-		ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
-	}
 
 	if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
 		pr_debug("md/raid:%s: starting from clean shutdown\n",
 			 mdname(mddev));
-	else {
+	else
 		pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
 			 mdname(mddev), ctx.data_only_stripes,
 			 ctx.data_parity_stripes);
 
-		if (ctx.data_only_stripes > 0)
-			if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
-				pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
-				       mdname(mddev));
-				return -EIO;
-			}
+	if (ctx.data_only_stripes == 0) {
+		log->next_checkpoint = ctx.pos;
+		r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
+		ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
+	} else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
+		pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
+		       mdname(mddev));
+		return -EIO;
 	}
 
 	log->log_start = ctx.pos;
 	log->seq = ctx.seq;
 	log->last_checkpoint = pos;
 	r5l_write_super(log, pos);
+
+	r5c_recovery_flush_data_only_stripes(log, &ctx);
 	return 0;
 }
 
@@ -2247,6 +2286,10 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
 	    val > R5C_JOURNAL_MODE_WRITE_BACK)
 		return -EINVAL;
 
+	if (raid5_calc_degraded(conf) > 0 &&
+	    val == R5C_JOURNAL_MODE_WRITE_BACK)
+		return -EINVAL;
+
 	mddev_suspend(mddev);
 	conf->log->r5c_journal_mode = val;
 	mddev_resume(mddev);
@@ -2301,6 +2344,16 @@ int r5c_try_caching_write(struct r5conf *conf,
 		set_bit(STRIPE_R5C_CACHING, &sh->state);
 	}
 
+	/*
+	 * When run in degraded mode, the array is set to write-through
+	 * mode. This check helps drain pending writes safely in the
+	 * transition to write-through mode.
+	 */
+	if (s->failed) {
+		r5c_make_stripe_write_out(sh);
+		return -EAGAIN;
+	}
+
 	for (i = disks; i--; ) {
 		dev = &sh->dev[i];
 		/* if non-overwrite, use writing-out phase */
@@ -2351,6 +2404,8 @@ void r5c_release_extra_page(struct stripe_head *sh)
 		struct page *p = sh->dev[i].orig_page;
 
 		sh->dev[i].orig_page = sh->dev[i].page;
+		clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
+
 		if (!using_disk_info_extra_page)
 			put_page(p);
 	}
@@ -2555,6 +2610,19 @@ ioerr:
 	return ret;
 }
 
+void r5c_update_on_rdev_error(struct mddev *mddev)
+{
+	struct r5conf *conf = mddev->private;
+	struct r5l_log *log = conf->log;
+
+	if (!log)
+		return;
+
+	if (raid5_calc_degraded(conf) > 0 &&
+	    conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
+		schedule_work(&log->disable_writeback_work);
+}
+
 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 {
 	struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -2627,6 +2695,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	spin_lock_init(&log->no_space_stripes_lock);
 
 	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
+	INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);
 
 	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
 	INIT_LIST_HEAD(&log->stripe_in_journal_list);
@@ -2659,6 +2728,7 @@ io_kc:
 
 void r5l_exit_log(struct r5l_log *log)
 {
+	flush_work(&log->disable_writeback_work);
 	md_unregister_thread(&log->reclaim_thread);
 	mempool_destroy(log->meta_pool);
 	bioset_free(log->bs);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 36c13e4be9c9..3c7e106c12a2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -556,7 +556,7 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
  * of the two sections, and some non-in_sync devices may
  * be insync in the section most affected by failed devices.
  */
-static int calc_degraded(struct r5conf *conf)
+int raid5_calc_degraded(struct r5conf *conf)
 {
 	int degraded, degraded2;
 	int i;
@@ -619,7 +619,7 @@ static int has_failed(struct r5conf *conf)
 	if (conf->mddev->reshape_position == MaxSector)
 		return conf->mddev->degraded > conf->max_degraded;
 
-	degraded = calc_degraded(conf);
+	degraded = raid5_calc_degraded(conf);
 	if (degraded > conf->max_degraded)
 		return 1;
 	return 0;
@@ -1015,7 +1015,17 @@ again:
 
 		if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
 			WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
-		sh->dev[i].vec.bv_page = sh->dev[i].page;
+
+		if (!op_is_write(op) &&
+		    test_bit(R5_InJournal, &sh->dev[i].flags))
+			/*
+			 * issuing read for a page in journal, this
+			 * must be preparing for prexor in rmw; read
+			 * the data into orig_page
+			 */
+			sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
+		else
+			sh->dev[i].vec.bv_page = sh->dev[i].page;
 		bi->bi_vcnt = 1;
 		bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 		bi->bi_io_vec[0].bv_offset = 0;
@@ -2380,6 +2390,13 @@ static void raid5_end_read_request(struct bio * bi)
 		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
 			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
 
+		if (test_bit(R5_InJournal, &sh->dev[i].flags))
+			/*
+			 * end read for a page in journal, this
+			 * must be preparing for prexor in rmw
+			 */
+			set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
+
 		if (atomic_read(&rdev->read_errors))
 			atomic_set(&rdev->read_errors, 0);
 	} else {
@@ -2538,7 +2555,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2538 2555
2539 spin_lock_irqsave(&conf->device_lock, flags); 2556 spin_lock_irqsave(&conf->device_lock, flags);
2540 clear_bit(In_sync, &rdev->flags); 2557 clear_bit(In_sync, &rdev->flags);
2541 mddev->degraded = calc_degraded(conf); 2558 mddev->degraded = raid5_calc_degraded(conf);
2542 spin_unlock_irqrestore(&conf->device_lock, flags); 2559 spin_unlock_irqrestore(&conf->device_lock, flags);
2543 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2560 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2544 2561
@@ -2552,6 +2569,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 		bdevname(rdev->bdev, b),
 		mdname(mddev),
 		conf->raid_disks - mddev->degraded);
+	r5c_update_on_rdev_error(mddev);
 }
 
 /*
@@ -2880,6 +2898,30 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
 	return r_sector;
 }
 
+/*
+ * There are cases where we want handle_stripe_dirtying() and
+ * schedule_reconstruction() to delay towrite to some dev of a stripe.
+ *
+ * This function checks whether we want to delay the towrite. Specifically,
+ * we delay the towrite when:
+ *
+ * 1. degraded stripe has a non-overwrite to the missing dev, AND this
+ *    stripe has data in journal (for other devices).
+ *
+ *    In this case, when reading data for the non-overwrite dev, it is
+ *    necessary to handle complex rmw of write back cache (prexor with
+ *    orig_page, and xor with page). To keep read path simple, we would
+ *    like to flush data in journal to RAID disks first, so complex rmw
+ *    is handled in the write path (handle_stripe_dirtying).
+ *
+ */
+static inline bool delay_towrite(struct r5dev *dev,
+				 struct stripe_head_state *s)
+{
+	return !test_bit(R5_OVERWRITE, &dev->flags) &&
+	       !test_bit(R5_Insync, &dev->flags) && s->injournal;
+}
+
 static void
 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 			 int rcw, int expand)
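The delay_towrite() predicate above is small enough to model in isolation. Here is a hedged standalone sketch using plain bools in place of test_bit() on dev->flags, printing the full truth table; only the !overwrite && !insync && injournal combination delays the write.

#include <stdbool.h>
#include <stdio.h>

static bool delay_towrite(bool overwrite, bool insync, bool injournal)
{
	/* non-overwrite to a missing (not in-sync) dev, with journal data */
	return !overwrite && !insync && injournal;
}

int main(void)
{
	for (int ow = 0; ow <= 1; ow++)
		for (int is = 0; is <= 1; is++)
			for (int ij = 0; ij <= 1; ij++)
				printf("overwrite=%d insync=%d injournal=%d -> delay=%d\n",
				       ow, is, ij,
				       delay_towrite(ow, is, ij));
	return 0;
}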
@@ -2900,7 +2942,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 
-		if (dev->towrite) {
+		if (dev->towrite && !delay_towrite(dev, s)) {
 			set_bit(R5_LOCKED, &dev->flags);
 			set_bit(R5_Wantdrain, &dev->flags);
 			if (!expand)
@@ -3295,13 +3337,6 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
 	return rv;
 }
 
-/* fetch_block - checks the given member device to see if its data needs
- * to be read or computed to satisfy a request.
- *
- * Returns 1 when no more member devices need to be checked, otherwise returns
- * 0 to tell the loop in handle_stripe_fill to continue
- */
-
 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 			   int disk_idx, int disks)
 {
@@ -3392,6 +3427,12 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 	return 0;
 }
 
+/* fetch_block - checks the given member device to see if its data needs
+ * to be read or computed to satisfy a request.
+ *
+ * Returns 1 when no more member devices need to be checked, otherwise returns
+ * 0 to tell the loop in handle_stripe_fill to continue
+ */
 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
 		       int disk_idx, int disks)
 {
@@ -3478,10 +3519,26 @@ static void handle_stripe_fill(struct stripe_head *sh,
 	 * midst of changing due to a write
 	 */
 	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
-	    !sh->reconstruct_state)
+	    !sh->reconstruct_state) {
+
+		/*
+		 * For degraded stripe with data in journal, do not handle
+		 * read requests yet, instead, flush the stripe to raid
+		 * disks first, this avoids handling complex rmw of write
+		 * back cache (prexor with orig_page, and then xor with
+		 * page) in the read path
+		 */
+		if (s->injournal && s->failed) {
+			if (test_bit(STRIPE_R5C_CACHING, &sh->state))
+				r5c_make_stripe_write_out(sh);
+			goto out;
+		}
+
 		for (i = disks; i--; )
 			if (fetch_block(sh, s, i, disks))
 				break;
+	}
+out:
 	set_bit(STRIPE_HANDLE, &sh->state);
 }
 
@@ -3594,6 +3651,21 @@ unhash:
 	break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
 }
 
+/*
+ * For RMW in write back cache, we need extra page in prexor to store the
+ * old data. This page is stored in dev->orig_page.
+ *
+ * This function checks whether we have data for prexor. The exact logic
+ * is:
+ *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
+ */
+static inline bool uptodate_for_rmw(struct r5dev *dev)
+{
+	return (test_bit(R5_UPTODATE, &dev->flags)) &&
+	       (!test_bit(R5_InJournal, &dev->flags) ||
+		test_bit(R5_OrigPageUPTDODATE, &dev->flags));
+}
+
 static int handle_stripe_dirtying(struct r5conf *conf,
 				  struct stripe_head *sh,
 				  struct stripe_head_state *s,
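Likewise, uptodate_for_rmw() reduces to a three-input boolean: an in-journal page is usable for prexor only after its old data has also landed in orig_page. A minimal model follows, with bools standing in for the R5_* flag bits.

#include <stdbool.h>
#include <stdio.h>

static bool uptodate_for_rmw(bool uptodate, bool in_journal,
			     bool orig_page_uptodate)
{
	return uptodate && (!in_journal || orig_page_uptodate);
}

int main(void)
{
	/* cached page, old data not yet read back: must not prexor yet */
	printf("%d\n", uptodate_for_rmw(true, true, false));	/* 0 */
	/* cached page after the journal read completed: safe to prexor */
	printf("%d\n", uptodate_for_rmw(true, true, true));	/* 1 */
	return 0;
}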
@@ -3622,12 +3694,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+		if (((dev->towrite && !delay_towrite(dev, s)) ||
+		     i == sh->pd_idx || i == sh->qd_idx ||
 		     test_bit(R5_InJournal, &dev->flags)) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !((test_bit(R5_UPTODATE, &dev->flags) &&
-		       (!test_bit(R5_InJournal, &dev->flags) ||
-			dev->page != dev->orig_page)) ||
+		    !(uptodate_for_rmw(dev) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rmw++;
@@ -3639,7 +3710,6 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 		    i != sh->pd_idx && i != sh->qd_idx &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
-		      test_bit(R5_InJournal, &dev->flags) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rcw++;
@@ -3689,13 +3759,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite ||
+		if (((dev->towrite && !delay_towrite(dev, s)) ||
 		     i == sh->pd_idx || i == sh->qd_idx ||
 		     test_bit(R5_InJournal, &dev->flags)) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !((test_bit(R5_UPTODATE, &dev->flags) &&
-		       (!test_bit(R5_InJournal, &dev->flags) ||
-			dev->page != dev->orig_page)) ||
+		    !(uptodate_for_rmw(dev) ||
 		      test_bit(R5_Wantcompute, &dev->flags)) &&
 		    test_bit(R5_Insync, &dev->flags)) {
 			if (test_bit(STRIPE_PREREAD_ACTIVE,
@@ -3722,7 +3790,6 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 		    i != sh->pd_idx && i != sh->qd_idx &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
-		      test_bit(R5_InJournal, &dev->flags) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
 			rcw++;
 			if (test_bit(R5_Insync, &dev->flags) &&
@@ -7025,7 +7092,7 @@ static int raid5_run(struct mddev *mddev)
 	/*
 	 * 0 for a fully functional array, 1 or 2 for a degraded array.
 	 */
-	mddev->degraded = calc_degraded(conf);
+	mddev->degraded = raid5_calc_degraded(conf);
 
 	if (has_failed(conf)) {
 		pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
@@ -7272,7 +7339,7 @@ static int raid5_spare_active(struct mddev *mddev)
 		}
 	}
 	spin_lock_irqsave(&conf->device_lock, flags);
-	mddev->degraded = calc_degraded(conf);
+	mddev->degraded = raid5_calc_degraded(conf);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 	print_raid5_conf(conf);
 	return count;
@@ -7632,7 +7699,7 @@ static int raid5_start_reshape(struct mddev *mddev)
 		 * pre and post number of devices.
 		 */
 		spin_lock_irqsave(&conf->device_lock, flags);
-		mddev->degraded = calc_degraded(conf);
+		mddev->degraded = raid5_calc_degraded(conf);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 	mddev->raid_disks = conf->raid_disks;
@@ -7720,7 +7787,7 @@ static void raid5_finish_reshape(struct mddev *mddev)
 	} else {
 		int d;
 		spin_lock_irq(&conf->device_lock);
-		mddev->degraded = calc_degraded(conf);
+		mddev->degraded = raid5_calc_degraded(conf);
 		spin_unlock_irq(&conf->device_lock);
 		for (d = conf->raid_disks ;
 		     d < conf->raid_disks - mddev->delta_disks;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ed8e1362ab36..1440fa26e296 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -322,6 +322,11 @@ enum r5dev_flags {
 			 * data and parity being written are in the journal
 			 * device
 			 */
+	R5_OrigPageUPTDODATE,	/* with write back cache, we read old data into
+				 * dev->orig_page for prexor. When this flag is
+				 * set, orig_page contains latest data in the
+				 * raid disk.
+				 */
 };
 
 /*
@@ -753,6 +758,7 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
 extern struct stripe_head *
 raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 			int previous, int noblock, int noquiesce);
+extern int raid5_calc_degraded(struct r5conf *conf);
 extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
 extern void r5l_exit_log(struct r5l_log *log);
 extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
@@ -781,4 +787,5 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
 extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
 extern void r5c_check_cached_full_stripe(struct r5conf *conf);
 extern struct md_sysfs_entry r5c_journal_mode;
+extern void r5c_update_on_rdev_error(struct mddev *mddev);
 #endif