summary | refs | log | tree | commit | diff | stats
path: root/drivers/md/raid5-ppl.c
diff options
context:
space:
mode:
author: Tomasz Majchrzak <tomasz.majchrzak@intel.com> 2017-12-27 04:31:40 -0500
committer: Shaohua Li <sh.li@alibaba-inc.com> 2018-01-15 17:29:42 -0500
commit: 1532d9e87e8b2377f12929f9e40724d5fbe6ecc5 (patch)
tree: fa8ec94368dfff1ec93b0366833e6f5d3cbcc70c /drivers/md/raid5-ppl.c
parent: 92e6245deab80f0934a102ba969d8b891b8ba5bf (diff)
raid5-ppl: PPL support for disks with write-back cache enabled
In order to provide data consistency with PPL for disks with write-back cache enabled, all data has to be flushed to disks before the next PPL entry. The disks to be flushed are marked in the bitmap. It's modified under a mutex and it's only read after the PPL io unit is submitted.

A limitation of 64 disks in the array has been introduced to keep data structures and implementation simple. RAID5 arrays with so many disks are not likely due to the high risk of multiple disk failures. Such a restriction should not be a real-life limitation.

With write-back cache disabled, the next PPL entry is submitted when the data write for the current one completes. Data flush defers the next log submission, so trigger it when there are no stripes found for handling.

As PPL assures all data is flushed to disk at request completion, just acknowledge the flush request when PPL is enabled.

Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
Signed-off-by: Shaohua Li <sh.li@alibaba-inc.com>
Diffstat (limited to 'drivers/md/raid5-ppl.c')
-rw-r--r--  drivers/md/raid5-ppl.c  167
1 files changed, 155 insertions, 12 deletions
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 628c0bf7b9fd..2764c2290062 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -85,6 +85,9 @@
85 * (for a single member disk). New io_units are added to the end of the list 85 * (for a single member disk). New io_units are added to the end of the list
86 * and the first io_unit is submitted, if it is not submitted already. 86 * and the first io_unit is submitted, if it is not submitted already.
87 * The current io_unit accepting new stripes is always at the end of the list. 87 * The current io_unit accepting new stripes is always at the end of the list.
88 *
89 * If write-back cache is enabled for any of the disks in the array, its data
90 * must be flushed before next io_unit is submitted.
88 */ 91 */
89 92
90#define PPL_SPACE_SIZE (128 * 1024) 93#define PPL_SPACE_SIZE (128 * 1024)
@@ -104,6 +107,7 @@ struct ppl_conf {
104 struct kmem_cache *io_kc; 107 struct kmem_cache *io_kc;
105 mempool_t *io_pool; 108 mempool_t *io_pool;
106 struct bio_set *bs; 109 struct bio_set *bs;
110 struct bio_set *flush_bs;
107 111
108 /* used only for recovery */ 112 /* used only for recovery */
109 int recovered_entries; 113 int recovered_entries;
@@ -128,6 +132,8 @@ struct ppl_log {
128 sector_t next_io_sector; 132 sector_t next_io_sector;
129 unsigned int entry_space; 133 unsigned int entry_space;
130 bool use_multippl; 134 bool use_multippl;
135 bool wb_cache_on;
136 unsigned long disk_flush_bitmap;
131}; 137};
132 138
133#define PPL_IO_INLINE_BVECS 32 139#define PPL_IO_INLINE_BVECS 32
@@ -145,6 +151,7 @@ struct ppl_io_unit {
145 151
146 struct list_head stripe_list; /* stripes added to the io_unit */ 152 struct list_head stripe_list; /* stripes added to the io_unit */
147 atomic_t pending_stripes; /* how many stripes not written to raid */ 153 atomic_t pending_stripes; /* how many stripes not written to raid */
154 atomic_t pending_flushes; /* how many disk flushes are in progress */
148 155
149 bool submitted; /* true if write to log started */ 156 bool submitted; /* true if write to log started */
150 157
@@ -249,6 +256,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
249 INIT_LIST_HEAD(&io->log_sibling); 256 INIT_LIST_HEAD(&io->log_sibling);
250 INIT_LIST_HEAD(&io->stripe_list); 257 INIT_LIST_HEAD(&io->stripe_list);
251 atomic_set(&io->pending_stripes, 0); 258 atomic_set(&io->pending_stripes, 0);
259 atomic_set(&io->pending_flushes, 0);
252 bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS); 260 bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);
253 261
254 pplhdr = page_address(io->header_page); 262 pplhdr = page_address(io->header_page);
@@ -475,7 +483,18 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
475 if (log->use_multippl) 483 if (log->use_multippl)
476 log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9; 484 log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9;
477 485
486 WARN_ON(log->disk_flush_bitmap != 0);
487
478 list_for_each_entry(sh, &io->stripe_list, log_list) { 488 list_for_each_entry(sh, &io->stripe_list, log_list) {
489 for (i = 0; i < sh->disks; i++) {
490 struct r5dev *dev = &sh->dev[i];
491
492 if ((ppl_conf->child_logs[i].wb_cache_on) &&
493 (test_bit(R5_Wantwrite, &dev->flags))) {
494 set_bit(i, &log->disk_flush_bitmap);
495 }
496 }
497
479 /* entries for full stripe writes have no partial parity */ 498 /* entries for full stripe writes have no partial parity */
480 if (test_bit(STRIPE_FULL_WRITE, &sh->state)) 499 if (test_bit(STRIPE_FULL_WRITE, &sh->state))
481 continue; 500 continue;
@@ -540,6 +559,7 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io)
540{ 559{
541 struct ppl_log *log = io->log; 560 struct ppl_log *log = io->log;
542 struct ppl_conf *ppl_conf = log->ppl_conf; 561 struct ppl_conf *ppl_conf = log->ppl_conf;
562 struct r5conf *conf = ppl_conf->mddev->private;
543 unsigned long flags; 563 unsigned long flags;
544 564
545 pr_debug("%s: seq: %llu\n", __func__, io->seq); 565 pr_debug("%s: seq: %llu\n", __func__, io->seq);
@@ -565,6 +585,112 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io)
565 spin_unlock(&ppl_conf->no_mem_stripes_lock); 585 spin_unlock(&ppl_conf->no_mem_stripes_lock);
566 586
567 local_irq_restore(flags); 587 local_irq_restore(flags);
588
589 wake_up(&conf->wait_for_quiescent);
590}
591
592static void ppl_flush_endio(struct bio *bio)
593{
594 struct ppl_io_unit *io = bio->bi_private;
595 struct ppl_log *log = io->log;
596 struct ppl_conf *ppl_conf = log->ppl_conf;
597 struct r5conf *conf = ppl_conf->mddev->private;
598 char b[BDEVNAME_SIZE];
599
600 pr_debug("%s: dev: %s\n", __func__, bio_devname(bio, b));
601
602 if (bio->bi_status) {
603 struct md_rdev *rdev;
604
605 rcu_read_lock();
606 rdev = md_find_rdev_rcu(conf->mddev, bio_dev(bio));
607 if (rdev)
608 md_error(rdev->mddev, rdev);
609 rcu_read_unlock();
610 }
611
612 bio_put(bio);
613
614 if (atomic_dec_and_test(&io->pending_flushes)) {
615 ppl_io_unit_finished(io);
616 md_wakeup_thread(conf->mddev->thread);
617 }
618}
619
620static void ppl_do_flush(struct ppl_io_unit *io)
621{
622 struct ppl_log *log = io->log;
623 struct ppl_conf *ppl_conf = log->ppl_conf;
624 struct r5conf *conf = ppl_conf->mddev->private;
625 int raid_disks = conf->raid_disks;
626 int flushed_disks = 0;
627 int i;
628
629 atomic_set(&io->pending_flushes, raid_disks);
630
631 for_each_set_bit(i, &log->disk_flush_bitmap, raid_disks) {
632 struct md_rdev *rdev;
633 struct block_device *bdev = NULL;
634
635 rcu_read_lock();
636 rdev = rcu_dereference(conf->disks[i].rdev);
637 if (rdev && !test_bit(Faulty, &rdev->flags))
638 bdev = rdev->bdev;
639 rcu_read_unlock();
640
641 if (bdev) {
642 struct bio *bio;
643 char b[BDEVNAME_SIZE];
644
645 bio = bio_alloc_bioset(GFP_NOIO, 0, ppl_conf->flush_bs);
646 bio_set_dev(bio, bdev);
647 bio->bi_private = io;
648 bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
649 bio->bi_end_io = ppl_flush_endio;
650
651 pr_debug("%s: dev: %s\n", __func__,
652 bio_devname(bio, b));
653
654 submit_bio(bio);
655 flushed_disks++;
656 }
657 }
658
659 log->disk_flush_bitmap = 0;
660
661 for (i = flushed_disks ; i < raid_disks; i++) {
662 if (atomic_dec_and_test(&io->pending_flushes))
663 ppl_io_unit_finished(io);
664 }
665}
666
667static inline bool ppl_no_io_unit_submitted(struct r5conf *conf,
668 struct ppl_log *log)
669{
670 struct ppl_io_unit *io;
671
672 io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
673 log_sibling);
674
675 return !io || !io->submitted;
676}
677
678void ppl_quiesce(struct r5conf *conf, int quiesce)
679{
680 struct ppl_conf *ppl_conf = conf->log_private;
681 int i;
682
683 if (quiesce) {
684 for (i = 0; i < ppl_conf->count; i++) {
685 struct ppl_log *log = &ppl_conf->child_logs[i];
686
687 spin_lock_irq(&log->io_list_lock);
688 wait_event_lock_irq(conf->wait_for_quiescent,
689 ppl_no_io_unit_submitted(conf, log),
690 log->io_list_lock);
691 spin_unlock_irq(&log->io_list_lock);
692 }
693 }
568} 694}
569 695
570void ppl_stripe_write_finished(struct stripe_head *sh) 696void ppl_stripe_write_finished(struct stripe_head *sh)
@@ -574,8 +700,12 @@ void ppl_stripe_write_finished(struct stripe_head *sh)
574 io = sh->ppl_io; 700 io = sh->ppl_io;
575 sh->ppl_io = NULL; 701 sh->ppl_io = NULL;
576 702
577 if (io && atomic_dec_and_test(&io->pending_stripes)) 703 if (io && atomic_dec_and_test(&io->pending_stripes)) {
578 ppl_io_unit_finished(io); 704 if (io->log->disk_flush_bitmap)
705 ppl_do_flush(io);
706 else
707 ppl_io_unit_finished(io);
708 }
579} 709}
580 710
581static void ppl_xor(int size, struct page *page1, struct page *page2) 711static void ppl_xor(int size, struct page *page1, struct page *page2)
@@ -1108,6 +1238,8 @@ static void __ppl_exit_log(struct ppl_conf *ppl_conf)
1108 1238
1109 if (ppl_conf->bs) 1239 if (ppl_conf->bs)
1110 bioset_free(ppl_conf->bs); 1240 bioset_free(ppl_conf->bs);
1241 if (ppl_conf->flush_bs)
1242 bioset_free(ppl_conf->flush_bs);
1111 mempool_destroy(ppl_conf->io_pool); 1243 mempool_destroy(ppl_conf->io_pool);
1112 kmem_cache_destroy(ppl_conf->io_kc); 1244 kmem_cache_destroy(ppl_conf->io_kc);
1113 1245
@@ -1173,6 +1305,8 @@ static int ppl_validate_rdev(struct md_rdev *rdev)
1173 1305
1174static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) 1306static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
1175{ 1307{
1308 struct request_queue *q;
1309
1176 if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE + 1310 if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE +
1177 PPL_HEADER_SIZE) * 2) { 1311 PPL_HEADER_SIZE) * 2) {
1178 log->use_multippl = true; 1312 log->use_multippl = true;
@@ -1185,6 +1319,10 @@ static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
1185 PPL_HEADER_SIZE; 1319 PPL_HEADER_SIZE;
1186 } 1320 }
1187 log->next_io_sector = rdev->ppl.sector; 1321 log->next_io_sector = rdev->ppl.sector;
1322
1323 q = bdev_get_queue(rdev->bdev);
1324 if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
1325 log->wb_cache_on = true;
1188} 1326}
1189 1327
1190int ppl_init_log(struct r5conf *conf) 1328int ppl_init_log(struct r5conf *conf)
@@ -1192,8 +1330,8 @@ int ppl_init_log(struct r5conf *conf)
1192 struct ppl_conf *ppl_conf; 1330 struct ppl_conf *ppl_conf;
1193 struct mddev *mddev = conf->mddev; 1331 struct mddev *mddev = conf->mddev;
1194 int ret = 0; 1332 int ret = 0;
1333 int max_disks;
1195 int i; 1334 int i;
1196 bool need_cache_flush = false;
1197 1335
1198 pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n", 1336 pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
1199 mdname(conf->mddev)); 1337 mdname(conf->mddev));
@@ -1219,6 +1357,14 @@ int ppl_init_log(struct r5conf *conf)
1219 return -EINVAL; 1357 return -EINVAL;
1220 } 1358 }
1221 1359
1360 max_disks = FIELD_SIZEOF(struct ppl_log, disk_flush_bitmap) *
1361 BITS_PER_BYTE;
1362 if (conf->raid_disks > max_disks) {
1363 pr_warn("md/raid:%s PPL doesn't support over %d disks in the array\n",
1364 mdname(mddev), max_disks);
1365 return -EINVAL;
1366 }
1367
1222 ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL); 1368 ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
1223 if (!ppl_conf) 1369 if (!ppl_conf)
1224 return -ENOMEM; 1370 return -ENOMEM;
@@ -1244,6 +1390,12 @@ int ppl_init_log(struct r5conf *conf)
1244 goto err; 1390 goto err;
1245 } 1391 }
1246 1392
1393 ppl_conf->flush_bs = bioset_create(conf->raid_disks, 0, 0);
1394 if (!ppl_conf->flush_bs) {
1395 ret = -ENOMEM;
1396 goto err;
1397 }
1398
1247 ppl_conf->count = conf->raid_disks; 1399 ppl_conf->count = conf->raid_disks;
1248 ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log), 1400 ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
1249 GFP_KERNEL); 1401 GFP_KERNEL);
@@ -1275,23 +1427,14 @@ int ppl_init_log(struct r5conf *conf)
1275 log->rdev = rdev; 1427 log->rdev = rdev;
1276 1428
1277 if (rdev) { 1429 if (rdev) {
1278 struct request_queue *q;
1279
1280 ret = ppl_validate_rdev(rdev); 1430 ret = ppl_validate_rdev(rdev);
1281 if (ret) 1431 if (ret)
1282 goto err; 1432 goto err;
1283 1433
1284 q = bdev_get_queue(rdev->bdev);
1285 if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
1286 need_cache_flush = true;
1287 ppl_init_child_log(log, rdev); 1434 ppl_init_child_log(log, rdev);
1288 } 1435 }
1289 } 1436 }
1290 1437
1291 if (need_cache_flush)
1292 pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
1293 mdname(mddev));
1294
1295 /* load and possibly recover the logs from the member disks */ 1438 /* load and possibly recover the logs from the member disks */
1296 ret = ppl_load(ppl_conf); 1439 ret = ppl_load(ppl_conf);
1297 1440