author      Song Liu <songliubraving@fb.com>    2016-11-17 18:24:39 -0500
committer   Shaohua Li <shli@fb.com>            2016-11-18 16:26:30 -0500
commit      1e6d690b9334b7e1b31d25fd8d93e980e449a5f9 (patch)
tree        878a16fa392d23a942a1bc0efe3ef5e3ae2e3ab0 /drivers/md/raid5.c
parent      2ded370373a400c20cf0c6e941e724e61582a867 (diff)
md/r5cache: caching phase of r5cache
As described in the previous patch, the write back cache operates in two
phases: caching and writing-out.

The caching phase works as:
1. write data to journal
   (r5c_handle_stripe_dirtying, r5c_cache_data)
2. call bio_endio
   (r5c_handle_data_cached, r5c_return_dev_pending_writes)

The writing-out phase then works as:
1. Mark the stripe as write-out (r5c_make_stripe_write_out)
2. Calculate parity (reconstruct or RMW)
3. Write parity (and maybe some other data) to the journal device
4. Write data and parity to the RAID disks

This patch implements the caching phase. The cache is integrated with the
stripe cache of raid456. It leverages the code of r5l_log to write data to
the journal device. The writing-out phase of the cache is implemented in
the next patch.

With r5cache, a write operation does not wait for parity calculation and
write-out, so write latency is lower (one write to the journal device vs.
a read and then a write to the RAID disks). r5cache also reduces RAID
overhead (multiple IOs due to read-modify-write of parity) and provides
more opportunities for full stripe writes.

This patch adds two flags to stripe_head.state:
 - STRIPE_R5C_PARTIAL_STRIPE
 - STRIPE_R5C_FULL_STRIPE

Instead of on the inactive_list, stripes with cached data are tracked in
r5conf->r5c_full_stripe_list and r5conf->r5c_partial_stripe_list.
STRIPE_R5C_FULL_STRIPE and STRIPE_R5C_PARTIAL_STRIPE are flags for stripes
in these lists. Note: stripes in r5c_full/partial_stripe_list are not
considered "active".

For RMW, the code allocates an extra page for each data block being
updated. This is stored in r5dev->orig_page and the old data is read into
it. Then the prexor calculation subtracts ->orig_page from the parity
block, and the reconstruct calculation adds the ->page data back into the
parity block.

r5cache naturally excludes SkipCopy. When the array has a write back
cache, async_copy_data() will not skip the copy.

There are some known limitations of the cache implementation:

1. Write cache only covers full page writes (R5_OVERWRITE). Writes of
   smaller granularity are write through.
2. Only one log io (sh->log_io) is issued for each stripe at any time.
   Later writes for the same stripe have to wait. This can be improved
   by moving log_io to r5dev.
3. With writeback cache, the read path must enter the state machine,
   which is a significant bottleneck for some workloads.
4. There is no per stripe checkpoint (with r5l_payload_flush) in the
   log, so the recovery code has to replay more data than necessary
   (sometimes all the log from last_checkpoint). This reduces the
   availability of the array.

This patch includes a fix proposed by ZhengYuan Liu <liuzhengyuan@kylinos.cn>

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
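[Editor's sketch, not part of the patch] The release-path classification described above can be modelled in stand-alone user-space C: a stripe whose data blocks are all in the journal goes to the full-stripe list, one with only some cached blocks goes to the partial-stripe list, and one with none returns to the inactive list. The type and list names below are stand-ins for the struct r5conf / struct stripe_head fields shown in the diff, not kernel code.

/*
 * Simplified model of the list selection added to do_release_stripe().
 * The real code counts R5_InJournal bits over sh->dev[], flips the
 * STRIPE_R5C_FULL_STRIPE / STRIPE_R5C_PARTIAL_STRIPE state bits and
 * maintains atomic counters in struct r5conf; plain C stands in here.
 */
#include <stdbool.h>
#include <stdio.h>

enum release_target {
        INACTIVE_LIST,                  /* no cached data: old behaviour */
        R5C_PARTIAL_STRIPE_LIST,        /* some data blocks are in the journal */
        R5C_FULL_STRIPE_LIST,           /* every data block is in the journal */
};

struct model_stripe {
        int data_disks;                 /* raid_disks - max_degraded */
        bool in_journal[16];            /* R5_InJournal, per data block */
};

static enum release_target classify_on_release(const struct model_stripe *sh,
                                               bool writeback)
{
        int i, injournal = 0;

        if (!writeback)                 /* write-through or no journal */
                return INACTIVE_LIST;

        for (i = 0; i < sh->data_disks; i++)
                if (sh->in_journal[i])
                        injournal++;

        if (injournal == 0)
                return INACTIVE_LIST;
        if (injournal == sh->data_disks)
                return R5C_FULL_STRIPE_LIST;
        return R5C_PARTIAL_STRIPE_LIST;
}

int main(void)
{
        struct model_stripe sh = {
                .data_disks = 4,
                .in_journal = { true, true, false, false },
        };

        /* Two of four data blocks are cached: expect the partial-stripe list. */
        printf("target list = %d\n", classify_on_release(&sh, true));
        return 0;
}

In the kernel the same decision also moves the stripe between the r5c_cached_full_stripes and r5c_cached_partial_stripes counters, as the first two hunks of the diff below show. For a single updated block the RMW path computes the usual parity update P_new = P_old xor D_old xor D_new, with ->orig_page holding D_old and ->page holding D_new.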
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--   drivers/md/raid5.c   152
1 file changed, 130 insertions(+), 22 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7c98eb06d1b2..f535ce2c267a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -218,8 +218,17 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
                               struct list_head *temp_inactive_list)
 {
+        int i;
+        int injournal = 0;        /* number of date pages with R5_InJournal */
+
         BUG_ON(!list_empty(&sh->lru));
         BUG_ON(atomic_read(&conf->active_stripes)==0);
+
+        if (r5c_is_writeback(conf->log))
+                for (i = sh->disks; i--; )
+                        if (test_bit(R5_InJournal, &sh->dev[i].flags))
+                                injournal++;
+
         if (test_bit(STRIPE_HANDLE, &sh->state)) {
                 if (test_bit(STRIPE_DELAYED, &sh->state) &&
                     !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
@@ -245,8 +254,29 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
                             < IO_THRESHOLD)
                                 md_wakeup_thread(conf->mddev->thread);
                 atomic_dec(&conf->active_stripes);
-                if (!test_bit(STRIPE_EXPANDING, &sh->state))
-                        list_add_tail(&sh->lru, temp_inactive_list);
+                if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+                        if (!r5c_is_writeback(conf->log))
+                                list_add_tail(&sh->lru, temp_inactive_list);
+                        else {
+                                WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
+                                if (injournal == 0)
+                                        list_add_tail(&sh->lru, temp_inactive_list);
+                                else if (injournal == conf->raid_disks - conf->max_degraded) {
+                                        /* full stripe */
+                                        if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
+                                                atomic_inc(&conf->r5c_cached_full_stripes);
+                                        if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
+                                                atomic_dec(&conf->r5c_cached_partial_stripes);
+                                        list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
+                                } else {
+                                        /* partial stripe */
+                                        if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
+                                                              &sh->state))
+                                                atomic_inc(&conf->r5c_cached_partial_stripes);
+                                        list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
+                                }
+                        }
+                }
         }
 }
 
@@ -830,8 +860,17 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
         might_sleep();
 
-        if (r5l_write_stripe(conf->log, sh) == 0)
-                return;
+        if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+                /* writing out phase */
+                if (r5l_write_stripe(conf->log, sh) == 0)
+                        return;
+        } else { /* caching phase */
+                if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
+                        r5c_cache_data(conf->log, sh, s);
+                        return;
+                }
+        }
+
         for (i = disks; i--; ) {
                 int op, op_flags = 0;
                 int replace_only = 0;
@@ -1044,7 +1083,7 @@ again:
 static struct dma_async_tx_descriptor *
 async_copy_data(int frombio, struct bio *bio, struct page **page,
         sector_t sector, struct dma_async_tx_descriptor *tx,
-        struct stripe_head *sh)
+        struct stripe_head *sh, int no_skipcopy)
 {
         struct bio_vec bvl;
         struct bvec_iter iter;
@@ -1084,7 +1123,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
                         if (frombio) {
                                 if (sh->raid_conf->skip_copy &&
                                     b_offset == 0 && page_offset == 0 &&
-                                    clen == STRIPE_SIZE)
+                                    clen == STRIPE_SIZE &&
+                                    !no_skipcopy)
                                         *page = bio_page;
                                 else
                                         tx = async_memcpy(*page, bio_page, page_offset,
@@ -1166,7 +1206,7 @@ static void ops_run_biofill(struct stripe_head *sh)
                         while (rbi && rbi->bi_iter.bi_sector <
                                 dev->sector + STRIPE_SECTORS) {
                                 tx = async_copy_data(0, rbi, &dev->page,
-                                                     dev->sector, tx, sh);
+                                                     dev->sector, tx, sh, 0);
                                 rbi = r5_next_bio(rbi, dev->sector);
                         }
                 }
@@ -1293,10 +1333,15 @@ static int set_syndrome_sources(struct page **srcs,
                 if (i == sh->qd_idx || i == sh->pd_idx ||
                     (srctype == SYNDROME_SRC_ALL) ||
                     (srctype == SYNDROME_SRC_WANT_DRAIN &&
-                     test_bit(R5_Wantdrain, &dev->flags)) ||
+                     (test_bit(R5_Wantdrain, &dev->flags) ||
+                      test_bit(R5_InJournal, &dev->flags))) ||
                     (srctype == SYNDROME_SRC_WRITTEN &&
-                     dev->written))
-                        srcs[slot] = sh->dev[i].page;
+                     dev->written)) {
+                        if (test_bit(R5_InJournal, &dev->flags))
+                                srcs[slot] = sh->dev[i].orig_page;
+                        else
+                                srcs[slot] = sh->dev[i].page;
+                }
                 i = raid6_next_disk(i, disks);
         } while (i != d0_idx);
 
@@ -1475,6 +1520,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
 
         pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
+
+        if (r5c_is_writeback(sh->raid_conf->log))
+                /*
+                 * raid5-cache write back uses orig_page during prexor.
+                 * After prexor, it is time to free orig_page
+                 */
+                r5c_release_extra_page(sh);
 }
 
 static struct dma_async_tx_descriptor *
@@ -1496,7 +1548,9 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
         for (i = disks; i--; ) {
                 struct r5dev *dev = &sh->dev[i];
                 /* Only process blocks that are known to be uptodate */
-                if (test_bit(R5_Wantdrain, &dev->flags))
+                if (test_bit(R5_InJournal, &dev->flags))
+                        xor_srcs[count++] = dev->orig_page;
+                else if (test_bit(R5_Wantdrain, &dev->flags))
                         xor_srcs[count++] = dev->page;
         }
 
@@ -1530,6 +1584,7 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
 static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
+        struct r5conf *conf = sh->raid_conf;
         int disks = sh->disks;
         int i;
         struct stripe_head *head_sh = sh;
@@ -1547,6 +1602,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
 again:
                 dev = &sh->dev[i];
+                /*
+                 * clear R5_InJournal, so when rewriting a page in
+                 * journal, it is not skipped by r5l_log_stripe()
+                 */
+                clear_bit(R5_InJournal, &dev->flags);
                 spin_lock_irq(&sh->stripe_lock);
                 chosen = dev->towrite;
                 dev->towrite = NULL;
@@ -1566,8 +1626,10 @@ again:
                                 set_bit(R5_Discard, &dev->flags);
                         else {
                                 tx = async_copy_data(1, wbi, &dev->page,
-                                                     dev->sector, tx, sh);
-                                if (dev->page != dev->orig_page) {
+                                                     dev->sector, tx, sh,
+                                                     r5c_is_writeback(conf->log));
+                                if (dev->page != dev->orig_page &&
+                                    !r5c_is_writeback(conf->log)) {
                                         set_bit(R5_SkipCopy, &dev->flags);
                                         clear_bit(R5_UPTODATE, &dev->flags);
                                         clear_bit(R5_OVERWRITE, &dev->flags);
@@ -1675,7 +1737,8 @@ again:
                 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
                 for (i = disks; i--; ) {
                         struct r5dev *dev = &sh->dev[i];
-                        if (head_sh->dev[i].written)
+                        if (head_sh->dev[i].written ||
+                            test_bit(R5_InJournal, &head_sh->dev[i].flags))
                                 xor_srcs[count++] = dev->page;
                 }
         } else {
@@ -2796,6 +2859,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
         int level = conf->level;
 
         if (rcw) {
+                /*
+                 * In some cases, handle_stripe_dirtying initially decided to
+                 * run rmw and allocates extra page for prexor. However, rcw is
+                 * cheaper later on. We need to free the extra page now,
+                 * because we won't be able to do that in ops_complete_prexor().
+                 */
+                r5c_release_extra_page(sh);
 
                 for (i = disks; i--; ) {
                         struct r5dev *dev = &sh->dev[i];
@@ -2806,6 +2876,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                         if (!expand)
                                 clear_bit(R5_UPTODATE, &dev->flags);
                         s->locked++;
+                } else if (test_bit(R5_InJournal, &dev->flags)) {
+                        set_bit(R5_LOCKED, &dev->flags);
+                        s->locked++;
                 }
         }
         /* if we are not expanding this is a proper write request, and
@@ -2845,6 +2918,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                         set_bit(R5_LOCKED, &dev->flags);
                         clear_bit(R5_UPTODATE, &dev->flags);
                         s->locked++;
+                } else if (test_bit(R5_InJournal, &dev->flags)) {
+                        set_bit(R5_LOCKED, &dev->flags);
+                        s->locked++;
                 }
         }
         if (!s->locked)
@@ -3516,9 +3592,12 @@ static void handle_stripe_dirtying(struct r5conf *conf,
         } else for (i = disks; i--; ) {
                 /* would I have to read this buffer for read_modify_write */
                 struct r5dev *dev = &sh->dev[i];
-                if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+                if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+                     test_bit(R5_InJournal, &dev->flags)) &&
                     !test_bit(R5_LOCKED, &dev->flags) &&
-                    !(test_bit(R5_UPTODATE, &dev->flags) ||
+                    !((test_bit(R5_UPTODATE, &dev->flags) &&
+                       (!test_bit(R5_InJournal, &dev->flags) ||
+                        dev->page != dev->orig_page)) ||
                       test_bit(R5_Wantcompute, &dev->flags))) {
                         if (test_bit(R5_Insync, &dev->flags))
                                 rmw++;
@@ -3530,13 +3609,15 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                     i != sh->pd_idx && i != sh->qd_idx &&
                     !test_bit(R5_LOCKED, &dev->flags) &&
                     !(test_bit(R5_UPTODATE, &dev->flags) ||
+                      test_bit(R5_InJournal, &dev->flags) ||
                       test_bit(R5_Wantcompute, &dev->flags))) {
                         if (test_bit(R5_Insync, &dev->flags))
                                 rcw++;
                         else
                                 rcw += 2*disks;
                 }
         }
+
         pr_debug("for sector %llu, rmw=%d rcw=%d\n",
                 (unsigned long long)sh->sector, rmw, rcw);
         set_bit(STRIPE_HANDLE, &sh->state);
@@ -3548,10 +3629,24 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                                   (unsigned long long)sh->sector, rmw);
                 for (i = disks; i--; ) {
                         struct r5dev *dev = &sh->dev[i];
-                        if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+                        if (test_bit(R5_InJournal, &dev->flags) &&
+                            dev->page == dev->orig_page &&
+                            !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
+                                /* alloc page for prexor */
+                                dev->orig_page = alloc_page(GFP_NOIO);
+
+                                /* will handle failure in a later patch*/
+                                BUG_ON(!dev->orig_page);
+                        }
+
+                        if ((dev->towrite ||
+                             i == sh->pd_idx || i == sh->qd_idx ||
+                             test_bit(R5_InJournal, &dev->flags)) &&
                             !test_bit(R5_LOCKED, &dev->flags) &&
-                            !(test_bit(R5_UPTODATE, &dev->flags) ||
-                              test_bit(R5_Wantcompute, &dev->flags)) &&
+                            !((test_bit(R5_UPTODATE, &dev->flags) &&
+                               (!test_bit(R5_InJournal, &dev->flags) ||
+                                dev->page != dev->orig_page)) ||
+                              test_bit(R5_Wantcompute, &dev->flags)) &&
                             test_bit(R5_Insync, &dev->flags)) {
                                 if (test_bit(STRIPE_PREREAD_ACTIVE,
                                              &sh->state)) {
@@ -3577,6 +3672,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                     i != sh->pd_idx && i != sh->qd_idx &&
                     !test_bit(R5_LOCKED, &dev->flags) &&
                     !(test_bit(R5_UPTODATE, &dev->flags) ||
+                      test_bit(R5_InJournal, &dev->flags) ||
                       test_bit(R5_Wantcompute, &dev->flags))) {
                         rcw++;
                         if (test_bit(R5_Insync, &dev->flags) &&
@@ -3616,7 +3712,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
          */
         if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
             (s->locked == 0 && (rcw == 0 || rmw == 0) &&
-            !test_bit(STRIPE_BIT_DELAY, &sh->state)))
+             !test_bit(STRIPE_BIT_DELAY, &sh->state)))
                 schedule_reconstruction(sh, s, rcw == 0, 0);
 }
 
@@ -4110,6 +4206,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
                 if (test_bit(R5_InJournal, &dev->flags))
                         s->injournal++;
+                if (test_bit(R5_InJournal, &dev->flags) && dev->written)
+                        s->just_cached++;
         }
         if (test_bit(STRIPE_SYNCING, &sh->state)) {
                 /* If there is a failed device being replaced,
@@ -4338,7 +4436,8 @@ static void handle_stripe(struct stripe_head *sh)
                         struct r5dev *dev = &sh->dev[i];
                         if (test_bit(R5_LOCKED, &dev->flags) &&
                                 (i == sh->pd_idx || i == sh->qd_idx ||
-                                 dev->written)) {
+                                 dev->written || test_bit(R5_InJournal,
+                                                          &dev->flags))) {
                                 pr_debug("Writing block %d\n", i);
                                 set_bit(R5_Wantwrite, &dev->flags);
                                 if (prexor)
@@ -4378,6 +4477,10 @@ static void handle_stripe(struct stripe_head *sh)
              test_bit(R5_Discard, &qdev->flags))))))
                 handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
 
+        if (s.just_cached)
+                r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
+        r5l_stripe_write_finished(sh);
+
         /* Now we might consider reading some blocks, either to check/generate
          * parity, or to satisfy requests
          * or to load a block that is being partially written.
@@ -6499,6 +6602,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
         for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
                 INIT_LIST_HEAD(conf->temp_inactive_list + i);
 
+        atomic_set(&conf->r5c_cached_full_stripes, 0);
+        INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
+        atomic_set(&conf->r5c_cached_partial_stripes, 0);
+        INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
+
         conf->level = mddev->new_level;
         conf->chunk_sectors = mddev->new_chunk_sectors;
         if (raid5_alloc_percpu(conf) != 0)