diff options
| -rw-r--r-- | drivers/md/md.c | 5 | ||||
| -rw-r--r-- | drivers/md/raid5-cache.c | 106 | ||||
| -rw-r--r-- | drivers/md/raid5.c | 121 | ||||
| -rw-r--r-- | drivers/md/raid5.h | 7 |
4 files changed, 194 insertions, 45 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 82821ee0d57f..01175dac0db6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
| @@ -5291,6 +5291,11 @@ int md_run(struct mddev *mddev) | |||
| 5291 | if (start_readonly && mddev->ro == 0) | 5291 | if (start_readonly && mddev->ro == 0) |
| 5292 | mddev->ro = 2; /* read-only, but switch on first write */ | 5292 | mddev->ro = 2; /* read-only, but switch on first write */ |
| 5293 | 5293 | ||
| 5294 | /* | ||
| 5295 | * NOTE: some pers->run(), for example r5l_recovery_log(), wakes | ||
| 5296 | * up mddev->thread. It is important to initialize critical | ||
| 5297 | * resources for mddev->thread BEFORE calling pers->run(). | ||
| 5298 | */ | ||
| 5294 | err = pers->run(mddev); | 5299 | err = pers->run(mddev); |
| 5295 | if (err) | 5300 | if (err) |
| 5296 | pr_warn("md: pers->run() failed ...\n"); | 5301 | pr_warn("md: pers->run() failed ...\n"); |
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 0e8ed2c327b0..302dea3296ba 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c | |||
| @@ -162,6 +162,8 @@ struct r5l_log { | |||
| 162 | 162 | ||
| 163 | /* to submit async io_units, to fulfill ordering of flush */ | 163 | /* to submit async io_units, to fulfill ordering of flush */ |
| 164 | struct work_struct deferred_io_work; | 164 | struct work_struct deferred_io_work; |
| 165 | /* to disable write back in degraded mode */ | ||
| 166 | struct work_struct disable_writeback_work; | ||
| 165 | }; | 167 | }; |
| 166 | 168 | ||
| 167 | /* | 169 | /* |
| @@ -611,6 +613,21 @@ static void r5l_submit_io_async(struct work_struct *work) | |||
| 611 | r5l_do_submit_io(log, io); | 613 | r5l_do_submit_io(log, io); |
| 612 | } | 614 | } |
| 613 | 615 | ||
| 616 | static void r5c_disable_writeback_async(struct work_struct *work) | ||
| 617 | { | ||
| 618 | struct r5l_log *log = container_of(work, struct r5l_log, | ||
| 619 | disable_writeback_work); | ||
| 620 | struct mddev *mddev = log->rdev->mddev; | ||
| 621 | |||
| 622 | if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) | ||
| 623 | return; | ||
| 624 | pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n", | ||
| 625 | mdname(mddev)); | ||
| 626 | mddev_suspend(mddev); | ||
| 627 | log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; | ||
| 628 | mddev_resume(mddev); | ||
| 629 | } | ||
| 630 | |||
| 614 | static void r5l_submit_current_io(struct r5l_log *log) | 631 | static void r5l_submit_current_io(struct r5l_log *log) |
| 615 | { | 632 | { |
| 616 | struct r5l_io_unit *io = log->current_io; | 633 | struct r5l_io_unit *io = log->current_io; |
| @@ -1393,8 +1410,6 @@ static void r5l_do_reclaim(struct r5l_log *log) | |||
| 1393 | next_checkpoint = r5c_calculate_new_cp(conf); | 1410 | next_checkpoint = r5c_calculate_new_cp(conf); |
| 1394 | spin_unlock_irq(&log->io_list_lock); | 1411 | spin_unlock_irq(&log->io_list_lock); |
| 1395 | 1412 | ||
| 1396 | BUG_ON(reclaimable < 0); | ||
| 1397 | |||
| 1398 | if (reclaimable == 0 || !write_super) | 1413 | if (reclaimable == 0 || !write_super) |
| 1399 | return; | 1414 | return; |
| 1400 | 1415 | ||
| @@ -2062,7 +2077,7 @@ static int | |||
| 2062 | r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, | 2077 | r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, |
| 2063 | struct r5l_recovery_ctx *ctx) | 2078 | struct r5l_recovery_ctx *ctx) |
| 2064 | { | 2079 | { |
| 2065 | struct stripe_head *sh, *next; | 2080 | struct stripe_head *sh; |
| 2066 | struct mddev *mddev = log->rdev->mddev; | 2081 | struct mddev *mddev = log->rdev->mddev; |
| 2067 | struct page *page; | 2082 | struct page *page; |
| 2068 | sector_t next_checkpoint = MaxSector; | 2083 | sector_t next_checkpoint = MaxSector; |
| @@ -2076,7 +2091,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, | |||
| 2076 | 2091 | ||
| 2077 | WARN_ON(list_empty(&ctx->cached_list)); | 2092 | WARN_ON(list_empty(&ctx->cached_list)); |
| 2078 | 2093 | ||
| 2079 | list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { | 2094 | list_for_each_entry(sh, &ctx->cached_list, lru) { |
| 2080 | struct r5l_meta_block *mb; | 2095 | struct r5l_meta_block *mb; |
| 2081 | int i; | 2096 | int i; |
| 2082 | int offset; | 2097 | int offset; |
| @@ -2126,14 +2141,39 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, | |||
| 2126 | ctx->pos = write_pos; | 2141 | ctx->pos = write_pos; |
| 2127 | ctx->seq += 1; | 2142 | ctx->seq += 1; |
| 2128 | next_checkpoint = sh->log_start; | 2143 | next_checkpoint = sh->log_start; |
| 2129 | list_del_init(&sh->lru); | ||
| 2130 | raid5_release_stripe(sh); | ||
| 2131 | } | 2144 | } |
| 2132 | log->next_checkpoint = next_checkpoint; | 2145 | log->next_checkpoint = next_checkpoint; |
| 2133 | __free_page(page); | 2146 | __free_page(page); |
| 2134 | return 0; | 2147 | return 0; |
| 2135 | } | 2148 | } |
| 2136 | 2149 | ||
| 2150 | static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, | ||
| 2151 | struct r5l_recovery_ctx *ctx) | ||
| 2152 | { | ||
| 2153 | struct mddev *mddev = log->rdev->mddev; | ||
| 2154 | struct r5conf *conf = mddev->private; | ||
| 2155 | struct stripe_head *sh, *next; | ||
| 2156 | |||
| 2157 | if (ctx->data_only_stripes == 0) | ||
| 2158 | return; | ||
| 2159 | |||
| 2160 | log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; | ||
| 2161 | |||
| 2162 | list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { | ||
| 2163 | r5c_make_stripe_write_out(sh); | ||
| 2164 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 2165 | list_del_init(&sh->lru); | ||
| 2166 | raid5_release_stripe(sh); | ||
| 2167 | } | ||
| 2168 | |||
| 2169 | md_wakeup_thread(conf->mddev->thread); | ||
| 2170 | /* reuse conf->wait_for_quiescent in recovery */ | ||
| 2171 | wait_event(conf->wait_for_quiescent, | ||
| 2172 | atomic_read(&conf->active_stripes) == 0); | ||
| 2173 | |||
| 2174 | log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; | ||
| 2175 | } | ||
| 2176 | |||
| 2137 | static int r5l_recovery_log(struct r5l_log *log) | 2177 | static int r5l_recovery_log(struct r5l_log *log) |
| 2138 | { | 2178 | { |
| 2139 | struct mddev *mddev = log->rdev->mddev; | 2179 | struct mddev *mddev = log->rdev->mddev; |
| @@ -2160,32 +2200,31 @@ static int r5l_recovery_log(struct r5l_log *log) | |||
| 2160 | pos = ctx.pos; | 2200 | pos = ctx.pos; |
| 2161 | ctx.seq += 10000; | 2201 | ctx.seq += 10000; |
| 2162 | 2202 | ||
| 2163 | if (ctx.data_only_stripes == 0) { | ||
| 2164 | log->next_checkpoint = ctx.pos; | ||
| 2165 | r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); | ||
| 2166 | ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); | ||
| 2167 | } | ||
| 2168 | 2203 | ||
| 2169 | if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) | 2204 | if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) |
| 2170 | pr_debug("md/raid:%s: starting from clean shutdown\n", | 2205 | pr_debug("md/raid:%s: starting from clean shutdown\n", |
| 2171 | mdname(mddev)); | 2206 | mdname(mddev)); |
| 2172 | else { | 2207 | else |
| 2173 | pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", | 2208 | pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", |
| 2174 | mdname(mddev), ctx.data_only_stripes, | 2209 | mdname(mddev), ctx.data_only_stripes, |
| 2175 | ctx.data_parity_stripes); | 2210 | ctx.data_parity_stripes); |
| 2176 | 2211 | ||
| 2177 | if (ctx.data_only_stripes > 0) | 2212 | if (ctx.data_only_stripes == 0) { |
| 2178 | if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { | 2213 | log->next_checkpoint = ctx.pos; |
| 2179 | pr_err("md/raid:%s: failed to rewrite stripes to journal\n", | 2214 | r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); |
| 2180 | mdname(mddev)); | 2215 | ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); |
| 2181 | return -EIO; | 2216 | } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { |
| 2182 | } | 2217 | pr_err("md/raid:%s: failed to rewrite stripes to journal\n", |
| 2218 | mdname(mddev)); | ||
| 2219 | return -EIO; | ||
| 2183 | } | 2220 | } |
| 2184 | 2221 | ||
| 2185 | log->log_start = ctx.pos; | 2222 | log->log_start = ctx.pos; |
| 2186 | log->seq = ctx.seq; | 2223 | log->seq = ctx.seq; |
| 2187 | log->last_checkpoint = pos; | 2224 | log->last_checkpoint = pos; |
| 2188 | r5l_write_super(log, pos); | 2225 | r5l_write_super(log, pos); |
| 2226 | |||
| 2227 | r5c_recovery_flush_data_only_stripes(log, &ctx); | ||
| 2189 | return 0; | 2228 | return 0; |
| 2190 | } | 2229 | } |
| 2191 | 2230 | ||
| @@ -2247,6 +2286,10 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev, | |||
| 2247 | val > R5C_JOURNAL_MODE_WRITE_BACK) | 2286 | val > R5C_JOURNAL_MODE_WRITE_BACK) |
| 2248 | return -EINVAL; | 2287 | return -EINVAL; |
| 2249 | 2288 | ||
| 2289 | if (raid5_calc_degraded(conf) > 0 && | ||
| 2290 | val == R5C_JOURNAL_MODE_WRITE_BACK) | ||
| 2291 | return -EINVAL; | ||
| 2292 | |||
| 2250 | mddev_suspend(mddev); | 2293 | mddev_suspend(mddev); |
| 2251 | conf->log->r5c_journal_mode = val; | 2294 | conf->log->r5c_journal_mode = val; |
| 2252 | mddev_resume(mddev); | 2295 | mddev_resume(mddev); |
| @@ -2301,6 +2344,16 @@ int r5c_try_caching_write(struct r5conf *conf, | |||
| 2301 | set_bit(STRIPE_R5C_CACHING, &sh->state); | 2344 | set_bit(STRIPE_R5C_CACHING, &sh->state); |
| 2302 | } | 2345 | } |
| 2303 | 2346 | ||
| 2347 | /* | ||
| 2348 | * When run in degraded mode, array is set to write-through mode. | ||
| 2349 | * This check helps drain pending writes safely in the transition to | ||
| 2350 | * write-through mode. | ||
| 2351 | */ | ||
| 2352 | if (s->failed) { | ||
| 2353 | r5c_make_stripe_write_out(sh); | ||
| 2354 | return -EAGAIN; | ||
| 2355 | } | ||
| 2356 | |||
| 2304 | for (i = disks; i--; ) { | 2357 | for (i = disks; i--; ) { |
| 2305 | dev = &sh->dev[i]; | 2358 | dev = &sh->dev[i]; |
| 2306 | /* if non-overwrite, use writing-out phase */ | 2359 | /* if non-overwrite, use writing-out phase */ |
| @@ -2351,6 +2404,8 @@ void r5c_release_extra_page(struct stripe_head *sh) | |||
| 2351 | struct page *p = sh->dev[i].orig_page; | 2404 | struct page *p = sh->dev[i].orig_page; |
| 2352 | 2405 | ||
| 2353 | sh->dev[i].orig_page = sh->dev[i].page; | 2406 | sh->dev[i].orig_page = sh->dev[i].page; |
| 2407 | clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); | ||
| 2408 | |||
| 2354 | if (!using_disk_info_extra_page) | 2409 | if (!using_disk_info_extra_page) |
| 2355 | put_page(p); | 2410 | put_page(p); |
| 2356 | } | 2411 | } |
| @@ -2555,6 +2610,19 @@ ioerr: | |||
| 2555 | return ret; | 2610 | return ret; |
| 2556 | } | 2611 | } |
| 2557 | 2612 | ||
| 2613 | void r5c_update_on_rdev_error(struct mddev *mddev) | ||
| 2614 | { | ||
| 2615 | struct r5conf *conf = mddev->private; | ||
| 2616 | struct r5l_log *log = conf->log; | ||
| 2617 | |||
| 2618 | if (!log) | ||
| 2619 | return; | ||
| 2620 | |||
| 2621 | if (raid5_calc_degraded(conf) > 0 && | ||
| 2622 | conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) | ||
| 2623 | schedule_work(&log->disable_writeback_work); | ||
| 2624 | } | ||
| 2625 | |||
| 2558 | int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) | 2626 | int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) |
| 2559 | { | 2627 | { |
| 2560 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 2628 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
| @@ -2627,6 +2695,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) | |||
| 2627 | spin_lock_init(&log->no_space_stripes_lock); | 2695 | spin_lock_init(&log->no_space_stripes_lock); |
| 2628 | 2696 | ||
| 2629 | INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); | 2697 | INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); |
| 2698 | INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async); | ||
| 2630 | 2699 | ||
| 2631 | log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; | 2700 | log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; |
| 2632 | INIT_LIST_HEAD(&log->stripe_in_journal_list); | 2701 | INIT_LIST_HEAD(&log->stripe_in_journal_list); |
| @@ -2659,6 +2728,7 @@ io_kc: | |||
| 2659 | 2728 | ||
| 2660 | void r5l_exit_log(struct r5l_log *log) | 2729 | void r5l_exit_log(struct r5l_log *log) |
| 2661 | { | 2730 | { |
| 2731 | flush_work(&log->disable_writeback_work); | ||
| 2662 | md_unregister_thread(&log->reclaim_thread); | 2732 | md_unregister_thread(&log->reclaim_thread); |
| 2663 | mempool_destroy(log->meta_pool); | 2733 | mempool_destroy(log->meta_pool); |
| 2664 | bioset_free(log->bs); | 2734 | bioset_free(log->bs); |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 36c13e4be9c9..3c7e106c12a2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -556,7 +556,7 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, | |||
| 556 | * of the two sections, and some non-in_sync devices may | 556 | * of the two sections, and some non-in_sync devices may |
| 557 | * be insync in the section most affected by failed devices. | 557 | * be insync in the section most affected by failed devices. |
| 558 | */ | 558 | */ |
| 559 | static int calc_degraded(struct r5conf *conf) | 559 | int raid5_calc_degraded(struct r5conf *conf) |
| 560 | { | 560 | { |
| 561 | int degraded, degraded2; | 561 | int degraded, degraded2; |
| 562 | int i; | 562 | int i; |
| @@ -619,7 +619,7 @@ static int has_failed(struct r5conf *conf) | |||
| 619 | if (conf->mddev->reshape_position == MaxSector) | 619 | if (conf->mddev->reshape_position == MaxSector) |
| 620 | return conf->mddev->degraded > conf->max_degraded; | 620 | return conf->mddev->degraded > conf->max_degraded; |
| 621 | 621 | ||
| 622 | degraded = calc_degraded(conf); | 622 | degraded = raid5_calc_degraded(conf); |
| 623 | if (degraded > conf->max_degraded) | 623 | if (degraded > conf->max_degraded) |
| 624 | return 1; | 624 | return 1; |
| 625 | return 0; | 625 | return 0; |
| @@ -1015,7 +1015,17 @@ again: | |||
| 1015 | 1015 | ||
| 1016 | if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) | 1016 | if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) |
| 1017 | WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); | 1017 | WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); |
| 1018 | sh->dev[i].vec.bv_page = sh->dev[i].page; | 1018 | |
| 1019 | if (!op_is_write(op) && | ||
| 1020 | test_bit(R5_InJournal, &sh->dev[i].flags)) | ||
| 1021 | /* | ||
| 1022 | * issuing read for a page in journal, this | ||
| 1023 | * must be preparing for prexor in rmw; read | ||
| 1024 | * the data into orig_page | ||
| 1025 | */ | ||
| 1026 | sh->dev[i].vec.bv_page = sh->dev[i].orig_page; | ||
| 1027 | else | ||
| 1028 | sh->dev[i].vec.bv_page = sh->dev[i].page; | ||
| 1019 | bi->bi_vcnt = 1; | 1029 | bi->bi_vcnt = 1; |
| 1020 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 1030 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
| 1021 | bi->bi_io_vec[0].bv_offset = 0; | 1031 | bi->bi_io_vec[0].bv_offset = 0; |
| @@ -2380,6 +2390,13 @@ static void raid5_end_read_request(struct bio * bi) | |||
| 2380 | } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | 2390 | } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) |
| 2381 | clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); | 2391 | clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); |
| 2382 | 2392 | ||
| 2393 | if (test_bit(R5_InJournal, &sh->dev[i].flags)) | ||
| 2394 | /* | ||
| 2395 | * end read for a page in journal, this | ||
| 2396 | * must be preparing for prexor in rmw | ||
| 2397 | */ | ||
| 2398 | set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); | ||
| 2399 | |||
| 2383 | if (atomic_read(&rdev->read_errors)) | 2400 | if (atomic_read(&rdev->read_errors)) |
| 2384 | atomic_set(&rdev->read_errors, 0); | 2401 | atomic_set(&rdev->read_errors, 0); |
| 2385 | } else { | 2402 | } else { |
| @@ -2538,7 +2555,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) | |||
| 2538 | 2555 | ||
| 2539 | spin_lock_irqsave(&conf->device_lock, flags); | 2556 | spin_lock_irqsave(&conf->device_lock, flags); |
| 2540 | clear_bit(In_sync, &rdev->flags); | 2557 | clear_bit(In_sync, &rdev->flags); |
| 2541 | mddev->degraded = calc_degraded(conf); | 2558 | mddev->degraded = raid5_calc_degraded(conf); |
| 2542 | spin_unlock_irqrestore(&conf->device_lock, flags); | 2559 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 2543 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 2560 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
| 2544 | 2561 | ||
| @@ -2552,6 +2569,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) | |||
| 2552 | bdevname(rdev->bdev, b), | 2569 | bdevname(rdev->bdev, b), |
| 2553 | mdname(mddev), | 2570 | mdname(mddev), |
| 2554 | conf->raid_disks - mddev->degraded); | 2571 | conf->raid_disks - mddev->degraded); |
| 2572 | r5c_update_on_rdev_error(mddev); | ||
| 2555 | } | 2573 | } |
| 2556 | 2574 | ||
| 2557 | /* | 2575 | /* |
| @@ -2880,6 +2898,30 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
| 2880 | return r_sector; | 2898 | return r_sector; |
| 2881 | } | 2899 | } |
| 2882 | 2900 | ||
| 2901 | /* | ||
| 2902 | * There are cases where we want handle_stripe_dirtying() and | ||
| 2903 | * schedule_reconstruction() to delay towrite to some dev of a stripe. | ||
| 2904 | * | ||
| 2905 | * This function checks whether we want to delay the towrite. Specifically, | ||
| 2906 | * we delay the towrite when: | ||
| 2907 | * | ||
| 2908 | * 1. degraded stripe has a non-overwrite to the missing dev, AND this | ||
| 2909 | * stripe has data in journal (for other devices). | ||
| 2910 | * | ||
| 2911 | * In this case, when reading data for the non-overwrite dev, it is | ||
| 2912 | * necessary to handle complex rmw of write back cache (prexor with | ||
| 2913 | * orig_page, and xor with page). To keep read path simple, we would | ||
| 2914 | * like to flush data in journal to RAID disks first, so complex rmw | ||
| 2915 | * is handled in the write path (handle_stripe_dirtying). | ||
| 2916 | * | ||
| 2917 | */ | ||
| 2918 | static inline bool delay_towrite(struct r5dev *dev, | ||
| 2919 | struct stripe_head_state *s) | ||
| 2920 | { | ||
| 2921 | return !test_bit(R5_OVERWRITE, &dev->flags) && | ||
| 2922 | !test_bit(R5_Insync, &dev->flags) && s->injournal; | ||
| 2923 | } | ||
| 2924 | |||
| 2883 | static void | 2925 | static void |
| 2884 | schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | 2926 | schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, |
| 2885 | int rcw, int expand) | 2927 | int rcw, int expand) |
| @@ -2900,7 +2942,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | |||
| 2900 | for (i = disks; i--; ) { | 2942 | for (i = disks; i--; ) { |
| 2901 | struct r5dev *dev = &sh->dev[i]; | 2943 | struct r5dev *dev = &sh->dev[i]; |
| 2902 | 2944 | ||
| 2903 | if (dev->towrite) { | 2945 | if (dev->towrite && !delay_towrite(dev, s)) { |
| 2904 | set_bit(R5_LOCKED, &dev->flags); | 2946 | set_bit(R5_LOCKED, &dev->flags); |
| 2905 | set_bit(R5_Wantdrain, &dev->flags); | 2947 | set_bit(R5_Wantdrain, &dev->flags); |
| 2906 | if (!expand) | 2948 | if (!expand) |
| @@ -3295,13 +3337,6 @@ static int want_replace(struct stripe_head *sh, int disk_idx) | |||
| 3295 | return rv; | 3337 | return rv; |
| 3296 | } | 3338 | } |
| 3297 | 3339 | ||
| 3298 | /* fetch_block - checks the given member device to see if its data needs | ||
| 3299 | * to be read or computed to satisfy a request. | ||
| 3300 | * | ||
| 3301 | * Returns 1 when no more member devices need to be checked, otherwise returns | ||
| 3302 | * 0 to tell the loop in handle_stripe_fill to continue | ||
| 3303 | */ | ||
| 3304 | |||
| 3305 | static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, | 3340 | static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, |
| 3306 | int disk_idx, int disks) | 3341 | int disk_idx, int disks) |
| 3307 | { | 3342 | { |
| @@ -3392,6 +3427,12 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, | |||
| 3392 | return 0; | 3427 | return 0; |
| 3393 | } | 3428 | } |
| 3394 | 3429 | ||
| 3430 | /* fetch_block - checks the given member device to see if its data needs | ||
| 3431 | * to be read or computed to satisfy a request. | ||
| 3432 | * | ||
| 3433 | * Returns 1 when no more member devices need to be checked, otherwise returns | ||
| 3434 | * 0 to tell the loop in handle_stripe_fill to continue | ||
| 3435 | */ | ||
| 3395 | static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, | 3436 | static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, |
| 3396 | int disk_idx, int disks) | 3437 | int disk_idx, int disks) |
| 3397 | { | 3438 | { |
| @@ -3478,10 +3519,26 @@ static void handle_stripe_fill(struct stripe_head *sh, | |||
| 3478 | * midst of changing due to a write | 3519 | * midst of changing due to a write |
| 3479 | */ | 3520 | */ |
| 3480 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && | 3521 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && |
| 3481 | !sh->reconstruct_state) | 3522 | !sh->reconstruct_state) { |
| 3523 | |||
| 3524 | /* | ||
| 3525 | * For degraded stripe with data in journal, do not handle | ||
| 3526 | * read requests yet; instead, flush the stripe to raid | ||
| 3527 | * disks first. This avoids handling complex rmw of write | ||
| 3528 | * back cache (prexor with orig_page, and then xor with | ||
| 3529 | * page) in the read path | ||
| 3530 | */ | ||
| 3531 | if (s->injournal && s->failed) { | ||
| 3532 | if (test_bit(STRIPE_R5C_CACHING, &sh->state)) | ||
| 3533 | r5c_make_stripe_write_out(sh); | ||
| 3534 | goto out; | ||
| 3535 | } | ||
| 3536 | |||
| 3482 | for (i = disks; i--; ) | 3537 | for (i = disks; i--; ) |
| 3483 | if (fetch_block(sh, s, i, disks)) | 3538 | if (fetch_block(sh, s, i, disks)) |
| 3484 | break; | 3539 | break; |
| 3540 | } | ||
| 3541 | out: | ||
| 3485 | set_bit(STRIPE_HANDLE, &sh->state); | 3542 | set_bit(STRIPE_HANDLE, &sh->state); |
| 3486 | } | 3543 | } |
| 3487 | 3544 | ||
| @@ -3594,6 +3651,21 @@ unhash: | |||
| 3594 | break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); | 3651 | break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); |
| 3595 | } | 3652 | } |
| 3596 | 3653 | ||
| 3654 | /* | ||
| 3655 | * For RMW in write back cache, we need extra page in prexor to store the | ||
| 3656 | * old data. This page is stored in dev->orig_page. | ||
| 3657 | * | ||
| 3658 | * This function checks whether we have data for prexor. The exact logic | ||
| 3659 | * is: | ||
| 3660 | * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) | ||
| 3661 | */ | ||
| 3662 | static inline bool uptodate_for_rmw(struct r5dev *dev) | ||
| 3663 | { | ||
| 3664 | return (test_bit(R5_UPTODATE, &dev->flags)) && | ||
| 3665 | (!test_bit(R5_InJournal, &dev->flags) || | ||
| 3666 | test_bit(R5_OrigPageUPTDODATE, &dev->flags)); | ||
| 3667 | } | ||
| 3668 | |||
| 3597 | static int handle_stripe_dirtying(struct r5conf *conf, | 3669 | static int handle_stripe_dirtying(struct r5conf *conf, |
| 3598 | struct stripe_head *sh, | 3670 | struct stripe_head *sh, |
| 3599 | struct stripe_head_state *s, | 3671 | struct stripe_head_state *s, |
| @@ -3622,12 +3694,11 @@ static int handle_stripe_dirtying(struct r5conf *conf, | |||
| 3622 | } else for (i = disks; i--; ) { | 3694 | } else for (i = disks; i--; ) { |
| 3623 | /* would I have to read this buffer for read_modify_write */ | 3695 | /* would I have to read this buffer for read_modify_write */ |
| 3624 | struct r5dev *dev = &sh->dev[i]; | 3696 | struct r5dev *dev = &sh->dev[i]; |
| 3625 | if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx || | 3697 | if (((dev->towrite && !delay_towrite(dev, s)) || |
| 3698 | i == sh->pd_idx || i == sh->qd_idx || | ||
| 3626 | test_bit(R5_InJournal, &dev->flags)) && | 3699 | test_bit(R5_InJournal, &dev->flags)) && |
| 3627 | !test_bit(R5_LOCKED, &dev->flags) && | 3700 | !test_bit(R5_LOCKED, &dev->flags) && |
| 3628 | !((test_bit(R5_UPTODATE, &dev->flags) && | 3701 | !(uptodate_for_rmw(dev) || |
| 3629 | (!test_bit(R5_InJournal, &dev->flags) || | ||
| 3630 | dev->page != dev->orig_page)) || | ||
| 3631 | test_bit(R5_Wantcompute, &dev->flags))) { | 3702 | test_bit(R5_Wantcompute, &dev->flags))) { |
| 3632 | if (test_bit(R5_Insync, &dev->flags)) | 3703 | if (test_bit(R5_Insync, &dev->flags)) |
| 3633 | rmw++; | 3704 | rmw++; |
| @@ -3639,7 +3710,6 @@ static int handle_stripe_dirtying(struct r5conf *conf, | |||
| 3639 | i != sh->pd_idx && i != sh->qd_idx && | 3710 | i != sh->pd_idx && i != sh->qd_idx && |
| 3640 | !test_bit(R5_LOCKED, &dev->flags) && | 3711 | !test_bit(R5_LOCKED, &dev->flags) && |
| 3641 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3712 | !(test_bit(R5_UPTODATE, &dev->flags) || |
| 3642 | test_bit(R5_InJournal, &dev->flags) || | ||
| 3643 | test_bit(R5_Wantcompute, &dev->flags))) { | 3713 | test_bit(R5_Wantcompute, &dev->flags))) { |
| 3644 | if (test_bit(R5_Insync, &dev->flags)) | 3714 | if (test_bit(R5_Insync, &dev->flags)) |
| 3645 | rcw++; | 3715 | rcw++; |
| @@ -3689,13 +3759,11 @@ static int handle_stripe_dirtying(struct r5conf *conf, | |||
| 3689 | 3759 | ||
| 3690 | for (i = disks; i--; ) { | 3760 | for (i = disks; i--; ) { |
| 3691 | struct r5dev *dev = &sh->dev[i]; | 3761 | struct r5dev *dev = &sh->dev[i]; |
| 3692 | if ((dev->towrite || | 3762 | if (((dev->towrite && !delay_towrite(dev, s)) || |
| 3693 | i == sh->pd_idx || i == sh->qd_idx || | 3763 | i == sh->pd_idx || i == sh->qd_idx || |
| 3694 | test_bit(R5_InJournal, &dev->flags)) && | 3764 | test_bit(R5_InJournal, &dev->flags)) && |
| 3695 | !test_bit(R5_LOCKED, &dev->flags) && | 3765 | !test_bit(R5_LOCKED, &dev->flags) && |
| 3696 | !((test_bit(R5_UPTODATE, &dev->flags) && | 3766 | !(uptodate_for_rmw(dev) || |
| 3697 | (!test_bit(R5_InJournal, &dev->flags) || | ||
| 3698 | dev->page != dev->orig_page)) || | ||
| 3699 | test_bit(R5_Wantcompute, &dev->flags)) && | 3767 | test_bit(R5_Wantcompute, &dev->flags)) && |
| 3700 | test_bit(R5_Insync, &dev->flags)) { | 3768 | test_bit(R5_Insync, &dev->flags)) { |
| 3701 | if (test_bit(STRIPE_PREREAD_ACTIVE, | 3769 | if (test_bit(STRIPE_PREREAD_ACTIVE, |
| @@ -3722,7 +3790,6 @@ static int handle_stripe_dirtying(struct r5conf *conf, | |||
| 3722 | i != sh->pd_idx && i != sh->qd_idx && | 3790 | i != sh->pd_idx && i != sh->qd_idx && |
| 3723 | !test_bit(R5_LOCKED, &dev->flags) && | 3791 | !test_bit(R5_LOCKED, &dev->flags) && |
| 3724 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3792 | !(test_bit(R5_UPTODATE, &dev->flags) || |
| 3725 | test_bit(R5_InJournal, &dev->flags) || | ||
| 3726 | test_bit(R5_Wantcompute, &dev->flags))) { | 3793 | test_bit(R5_Wantcompute, &dev->flags))) { |
| 3727 | rcw++; | 3794 | rcw++; |
| 3728 | if (test_bit(R5_Insync, &dev->flags) && | 3795 | if (test_bit(R5_Insync, &dev->flags) && |
| @@ -7025,7 +7092,7 @@ static int raid5_run(struct mddev *mddev) | |||
| 7025 | /* | 7092 | /* |
| 7026 | * 0 for a fully functional array, 1 or 2 for a degraded array. | 7093 | * 0 for a fully functional array, 1 or 2 for a degraded array. |
| 7027 | */ | 7094 | */ |
| 7028 | mddev->degraded = calc_degraded(conf); | 7095 | mddev->degraded = raid5_calc_degraded(conf); |
| 7029 | 7096 | ||
| 7030 | if (has_failed(conf)) { | 7097 | if (has_failed(conf)) { |
| 7031 | pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", | 7098 | pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", |
| @@ -7272,7 +7339,7 @@ static int raid5_spare_active(struct mddev *mddev) | |||
| 7272 | } | 7339 | } |
| 7273 | } | 7340 | } |
| 7274 | spin_lock_irqsave(&conf->device_lock, flags); | 7341 | spin_lock_irqsave(&conf->device_lock, flags); |
| 7275 | mddev->degraded = calc_degraded(conf); | 7342 | mddev->degraded = raid5_calc_degraded(conf); |
| 7276 | spin_unlock_irqrestore(&conf->device_lock, flags); | 7343 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 7277 | print_raid5_conf(conf); | 7344 | print_raid5_conf(conf); |
| 7278 | return count; | 7345 | return count; |
| @@ -7632,7 +7699,7 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
| 7632 | * pre and post number of devices. | 7699 | * pre and post number of devices. |
| 7633 | */ | 7700 | */ |
| 7634 | spin_lock_irqsave(&conf->device_lock, flags); | 7701 | spin_lock_irqsave(&conf->device_lock, flags); |
| 7635 | mddev->degraded = calc_degraded(conf); | 7702 | mddev->degraded = raid5_calc_degraded(conf); |
| 7636 | spin_unlock_irqrestore(&conf->device_lock, flags); | 7703 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 7637 | } | 7704 | } |
| 7638 | mddev->raid_disks = conf->raid_disks; | 7705 | mddev->raid_disks = conf->raid_disks; |
| @@ -7720,7 +7787,7 @@ static void raid5_finish_reshape(struct mddev *mddev) | |||
| 7720 | } else { | 7787 | } else { |
| 7721 | int d; | 7788 | int d; |
| 7722 | spin_lock_irq(&conf->device_lock); | 7789 | spin_lock_irq(&conf->device_lock); |
| 7723 | mddev->degraded = calc_degraded(conf); | 7790 | mddev->degraded = raid5_calc_degraded(conf); |
| 7724 | spin_unlock_irq(&conf->device_lock); | 7791 | spin_unlock_irq(&conf->device_lock); |
| 7725 | for (d = conf->raid_disks ; | 7792 | for (d = conf->raid_disks ; |
| 7726 | d < conf->raid_disks - mddev->delta_disks; | 7793 | d < conf->raid_disks - mddev->delta_disks; |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index ed8e1362ab36..1440fa26e296 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
| @@ -322,6 +322,11 @@ enum r5dev_flags { | |||
| 322 | * data and parity being written are in the journal | 322 | * data and parity being written are in the journal |
| 323 | * device | 323 | * device |
| 324 | */ | 324 | */ |
| 325 | R5_OrigPageUPTDODATE, /* with write back cache, we read old data into | ||
| 326 | * dev->orig_page for prexor. When this flag is | ||
| 327 | * set, orig_page contains latest data in the | ||
| 328 | * raid disk. | ||
| 329 | */ | ||
| 325 | }; | 330 | }; |
| 326 | 331 | ||
| 327 | /* | 332 | /* |
| @@ -753,6 +758,7 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, | |||
| 753 | extern struct stripe_head * | 758 | extern struct stripe_head * |
| 754 | raid5_get_active_stripe(struct r5conf *conf, sector_t sector, | 759 | raid5_get_active_stripe(struct r5conf *conf, sector_t sector, |
| 755 | int previous, int noblock, int noquiesce); | 760 | int previous, int noblock, int noquiesce); |
| 761 | extern int raid5_calc_degraded(struct r5conf *conf); | ||
| 756 | extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); | 762 | extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); |
| 757 | extern void r5l_exit_log(struct r5l_log *log); | 763 | extern void r5l_exit_log(struct r5l_log *log); |
| 758 | extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); | 764 | extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); |
| @@ -781,4 +787,5 @@ extern void r5c_flush_cache(struct r5conf *conf, int num); | |||
| 781 | extern void r5c_check_stripe_cache_usage(struct r5conf *conf); | 787 | extern void r5c_check_stripe_cache_usage(struct r5conf *conf); |
| 782 | extern void r5c_check_cached_full_stripe(struct r5conf *conf); | 788 | extern void r5c_check_cached_full_stripe(struct r5conf *conf); |
| 783 | extern struct md_sysfs_entry r5c_journal_mode; | 789 | extern struct md_sysfs_entry r5c_journal_mode; |
| 790 | extern void r5c_update_on_rdev_error(struct mddev *mddev); | ||
| 784 | #endif | 791 | #endif |
