author	Linus Torvalds <torvalds@linux-foundation.org>	2018-01-31 14:03:38 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-01-31 14:03:38 -0500
commit	040639b7fcf73ee39c15d38257f652a2048e96f2 (patch)
tree	7b2b5e7f6af28818e10cd5658d679b236a78d884
parent	20c59c71ae711aff845eef640b25935bc9578c93 (diff)
parent	1532d9e87e8b2377f12929f9e40724d5fbe6ecc5 (diff)
Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull MD updates from Shaohua Li:
 "Some small fixes for MD:

   - fix raid5-cache potential problems if raid5 cache isn't fully
     recovered

   - fix a wait-within-wait warning in raid1/10

   - make raid5-PPL support disks with writeback cache enabled"

* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  raid5-ppl: PPL support for disks with write-back cache enabled
  md/r5cache: print more info of log recovery
  md/raid1,raid10: silence warning about wait-within-wait
  md: introduce new personality funciton start()
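For readers unfamiliar with the run()/start() split this pull introduces: md_run() keeps doing the early set-up that must not depend on mddev->thread, while the new optional pers->start() hook, called from md_start() once the thread exists, handles work such as raid5-cache log recovery. Below is a rough, self-contained userspace sketch of that ordering — plain C, not kernel code; the *_like names are hypothetical stand-ins for the kernel structures shown in the diff further down.

#include <stdio.h>

struct mddev_like;

struct md_personality_like {
	int (*run)(struct mddev_like *mddev);	/* no md_thread available yet */
	int (*start)(struct mddev_like *mddev);	/* md_thread is up; may replay logs */
};

struct mddev_like {
	const struct md_personality_like *pers;
};

static int raid5_run_like(struct mddev_like *mddev)
{
	(void)mddev;
	puts("run: allocate conf, no md_thread yet");
	return 0;
}

static int raid5_start_like(struct mddev_like *mddev)
{
	(void)mddev;
	puts("start: md_thread exists, replay the journal");
	return 0;
}

static int md_start_like(struct mddev_like *mddev)
{
	/* mirrors md_start() in the diff below: the hook is optional */
	return mddev->pers->start ? mddev->pers->start(mddev) : 0;
}

int main(void)
{
	static const struct md_personality_like raid5_like = {
		.run = raid5_run_like,
		.start = raid5_start_like,
	};
	struct mddev_like mddev = { .pers = &raid5_like };

	if (mddev.pers->run(&mddev) == 0)	/* phase 1: what md_run() drives */
		md_start_like(&mddev);		/* phase 2: what md_start() drives */
	return 0;
}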
-rw-r--r--  Documentation/md/raid5-ppl.txt  |   7
-rw-r--r--  drivers/md/dm-raid.c            |   9
-rw-r--r--  drivers/md/md.c                 |  31
-rw-r--r--  drivers/md/md.h                 |   9
-rw-r--r--  drivers/md/raid1.c              |  11
-rw-r--r--  drivers/md/raid10.c             |  12
-rw-r--r--  drivers/md/raid5-cache.c        |  31
-rw-r--r--  drivers/md/raid5-log.h          |  30
-rw-r--r--  drivers/md/raid5-ppl.c          | 167
-rw-r--r--  drivers/md/raid5.c              |  16
10 files changed, 285 insertions, 38 deletions
diff --git a/Documentation/md/raid5-ppl.txt b/Documentation/md/raid5-ppl.txt
index 127072b09363..bfa092589e00 100644
--- a/Documentation/md/raid5-ppl.txt
+++ b/Documentation/md/raid5-ppl.txt
@@ -39,6 +39,7 @@ case the behavior is the same as in plain raid5.
 PPL is available for md version-1 metadata and external (specifically IMSM)
 metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl.
 
-Currently, volatile write-back cache should be disabled on all member drives
-when using PPL. Otherwise it cannot guarantee consistency in case of power
-failure.
+There is a limitation of maximum 64 disks in the array for PPL. It allows to
+keep data structures and implementation simple. RAID5 arrays with so many disks
+are not likely due to high risk of multiple disks failure. Such restriction
+should not be a real life limitation.
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6319d846e0ad..e5ef0757fe23 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3151,6 +3151,14 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
+	r = md_start(&rs->md);
+
+	if (r) {
+		ti->error = "Failed to start raid array";
+		mddev_unlock(&rs->md);
+		goto bad_md_start;
+	}
+
 	rs->callbacks.congested_fn = raid_is_congested;
 	dm_table_add_target_callbacks(ti->table, &rs->callbacks);
 
@@ -3198,6 +3206,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	mddev_unlock(&rs->md);
 	return 0;
 
+bad_md_start:
 bad_journal_mode_set:
 bad_stripe_cache:
 bad_check_reshape:
diff --git a/drivers/md/md.c b/drivers/md/md.c
index cb1476214f3f..0081ace39a64 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -711,7 +711,7 @@ static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
 	return NULL;
 }
 
-static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
+struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
 {
 	struct md_rdev *rdev;
 
@@ -721,6 +721,7 @@ static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
 
 static struct md_personality *find_pers(int level, char *clevel)
 {
@@ -5560,11 +5561,6 @@ int md_run(struct mddev *mddev)
 	if (start_readonly && mddev->ro == 0)
 		mddev->ro = 2; /* read-only, but switch on first write */
 
-	/*
-	 * NOTE: some pers->run(), for example r5l_recovery_log(), wakes
-	 * up mddev->thread. It is important to initialize critical
-	 * resources for mddev->thread BEFORE calling pers->run().
-	 */
 	err = pers->run(mddev);
 	if (err)
 		pr_warn("md: pers->run() failed ...\n");
@@ -5678,6 +5674,9 @@ static int do_md_run(struct mddev *mddev)
 	if (mddev_is_clustered(mddev))
 		md_allow_write(mddev);
 
+	/* run start up tasks that require md_thread */
+	md_start(mddev);
+
 	md_wakeup_thread(mddev->thread);
 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 
@@ -5689,6 +5688,21 @@ out:
 	return err;
 }
 
+int md_start(struct mddev *mddev)
+{
+	int ret = 0;
+
+	if (mddev->pers->start) {
+		set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+		ret = mddev->pers->start(mddev);
+		clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
+		md_wakeup_thread(mddev->sync_thread);
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(md_start);
+
 static int restart_array(struct mddev *mddev)
 {
 	struct gendisk *disk = mddev->gendisk;
@@ -6997,7 +7011,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev)
 		return -ENODEV;
 
 	rcu_read_lock();
-	rdev = find_rdev_rcu(mddev, dev);
+	rdev = md_find_rdev_rcu(mddev, dev);
 	if (!rdev)
 		err = -ENODEV;
 	else {
@@ -8169,7 +8183,8 @@ void md_do_sync(struct md_thread *thread)
 	int ret;
 
 	/* just incase thread restarts... */
-	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
+	    test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
 		return;
 	if (mddev->ro) {/* never try to sync a read-only array */
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7d6bcf0eba0c..58cd20a5e85e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -485,6 +485,7 @@ enum recovery_flags {
 	MD_RECOVERY_RESHAPE,	/* A reshape is happening */
 	MD_RECOVERY_FROZEN,	/* User request to abort, and not restart, any action */
 	MD_RECOVERY_ERROR,	/* sync-action interrupted because io-error */
+	MD_RECOVERY_WAIT,	/* waiting for pers->start() to finish */
 };
 
 static inline int __must_check mddev_lock(struct mddev *mddev)
@@ -523,7 +524,13 @@ struct md_personality
 	struct list_head list;
 	struct module *owner;
 	bool (*make_request)(struct mddev *mddev, struct bio *bio);
+	/*
+	 * start up works that do NOT require md_thread. tasks that
+	 * requires md_thread should go into start()
+	 */
 	int (*run)(struct mddev *mddev);
+	/* start up works that require md threads */
+	int (*start)(struct mddev *mddev);
 	void (*free)(struct mddev *mddev, void *priv);
 	void (*status)(struct seq_file *seq, struct mddev *mddev);
 	/* error_handler must set ->faulty and clear ->in_sync
@@ -687,6 +694,7 @@ extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
 
 extern void mddev_init(struct mddev *mddev);
 extern int md_run(struct mddev *mddev);
+extern int md_start(struct mddev *mddev);
 extern void md_stop(struct mddev *mddev);
 extern void md_stop_writes(struct mddev *mddev);
 extern int md_rdev_init(struct md_rdev *rdev);
@@ -702,6 +710,7 @@ extern void md_reload_sb(struct mddev *mddev, int raid_disk);
 extern void md_update_sb(struct mddev *mddev, int force);
 extern void md_kick_rdev_from_array(struct md_rdev * rdev);
 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
+struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
 
 static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
 {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6df398e3a008..b2eae332e1a2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -815,6 +815,17 @@ static void flush_pending_writes(struct r1conf *conf)
 		bio = bio_list_get(&conf->pending_bio_list);
 		conf->pending_count = 0;
 		spin_unlock_irq(&conf->device_lock);
+
+		/*
+		 * As this is called in a wait_event() loop (see freeze_array),
+		 * current->state might be TASK_UNINTERRUPTIBLE which will
+		 * cause a warning when we prepare to wait again. As it is
+		 * rare that this path is taken, it is perfectly safe to force
+		 * us to go around the wait_event() loop again, so the warning
+		 * is a false-positive. Silence the warning by resetting
+		 * thread state
+		 */
+		__set_current_state(TASK_RUNNING);
 		blk_start_plug(&plug);
 		flush_bio_list(conf, bio);
 		blk_finish_plug(&plug);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c131835cf008..99c9207899a7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -900,6 +900,18 @@ static void flush_pending_writes(struct r10conf *conf)
 		bio = bio_list_get(&conf->pending_bio_list);
 		conf->pending_count = 0;
 		spin_unlock_irq(&conf->device_lock);
+
+		/*
+		 * As this is called in a wait_event() loop (see freeze_array),
+		 * current->state might be TASK_UNINTERRUPTIBLE which will
+		 * cause a warning when we prepare to wait again. As it is
+		 * rare that this path is taken, it is perfectly safe to force
+		 * us to go around the wait_event() loop again, so the warning
+		 * is a false-positive. Silence the warning by resetting
+		 * thread state
+		 */
+		__set_current_state(TASK_RUNNING);
+
 		blk_start_plug(&plug);
 		/* flush any pending bitmap writes to disk
 		 * before proceeding w/ I/O */
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 39f31f07ffe9..3c65f52b68f5 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1111,9 +1111,6 @@ void r5l_write_stripe_run(struct r5l_log *log)
 
 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
 {
-	if (!log)
-		return -ENODEV;
-
 	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
 		/*
 		 * in write through (journal only)
@@ -1592,8 +1589,6 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
 void r5l_quiesce(struct r5l_log *log, int quiesce)
 {
 	struct mddev *mddev;
-	if (!log)
-		return;
 
 	if (quiesce) {
 		/* make sure r5l_write_super_and_discard_space exits */
@@ -2448,7 +2443,6 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
 		raid5_release_stripe(sh);
 	}
 
-	md_wakeup_thread(conf->mddev->thread);
 	/* reuse conf->wait_for_quiescent in recovery */
 	wait_event(conf->wait_for_quiescent,
 		   atomic_read(&conf->active_stripes) == 0);
@@ -2491,10 +2485,10 @@ static int r5l_recovery_log(struct r5l_log *log)
 	ctx->seq += 10000;
 
 	if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
-		pr_debug("md/raid:%s: starting from clean shutdown\n",
+		pr_info("md/raid:%s: starting from clean shutdown\n",
 			 mdname(mddev));
 	else
-		pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
+		pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
 			 mdname(mddev), ctx->data_only_stripes,
 			 ctx->data_parity_stripes);
 
@@ -3036,6 +3030,23 @@ ioerr:
 	return ret;
 }
 
+int r5l_start(struct r5l_log *log)
+{
+	int ret;
+
+	if (!log)
+		return 0;
+
+	ret = r5l_load_log(log);
+	if (ret) {
+		struct mddev *mddev = log->rdev->mddev;
+		struct r5conf *conf = mddev->private;
+
+		r5l_exit_log(conf);
+	}
+	return ret;
+}
+
 void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r5conf *conf = mddev->private;
@@ -3138,13 +3149,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 
 	rcu_assign_pointer(conf->log, log);
 
-	if (r5l_load_log(log))
-		goto error;
-
 	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
 	return 0;
 
-error:
 	rcu_assign_pointer(conf->log, NULL);
 	md_unregister_thread(&log->reclaim_thread);
 reclaim_thread:
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 284578b0a349..0c76bcedfc1c 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -32,6 +32,7 @@ extern struct md_sysfs_entry r5c_journal_mode;
 extern void r5c_update_on_rdev_error(struct mddev *mddev,
 				     struct md_rdev *rdev);
 extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
+extern int r5l_start(struct r5l_log *log);
 
 extern struct dma_async_tx_descriptor *
 ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
@@ -42,6 +43,7 @@ extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
 extern void ppl_write_stripe_run(struct r5conf *conf);
 extern void ppl_stripe_write_finished(struct stripe_head *sh);
 extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
+extern void ppl_quiesce(struct r5conf *conf, int quiesce);
 
 static inline bool raid5_has_ppl(struct r5conf *conf)
 {
@@ -87,6 +89,34 @@ static inline void log_write_stripe_run(struct r5conf *conf)
 		ppl_write_stripe_run(conf);
 }
 
+static inline void log_flush_stripe_to_raid(struct r5conf *conf)
+{
+	if (conf->log)
+		r5l_flush_stripe_to_raid(conf->log);
+	else if (raid5_has_ppl(conf))
+		ppl_write_stripe_run(conf);
+}
+
+static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio)
+{
+	int ret = -ENODEV;
+
+	if (conf->log)
+		ret = r5l_handle_flush_request(conf->log, bio);
+	else if (raid5_has_ppl(conf))
+		ret = 0;
+
+	return ret;
+}
+
+static inline void log_quiesce(struct r5conf *conf, int quiesce)
+{
+	if (conf->log)
+		r5l_quiesce(conf->log, quiesce);
+	else if (raid5_has_ppl(conf))
+		ppl_quiesce(conf, quiesce);
+}
+
 static inline void log_exit(struct r5conf *conf)
 {
 	if (conf->log)
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 628c0bf7b9fd..2764c2290062 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -85,6 +85,9 @@
  * (for a single member disk). New io_units are added to the end of the list
  * and the first io_unit is submitted, if it is not submitted already.
  * The current io_unit accepting new stripes is always at the end of the list.
+ *
+ * If write-back cache is enabled for any of the disks in the array, its data
+ * must be flushed before next io_unit is submitted.
  */
 
 #define PPL_SPACE_SIZE (128 * 1024)
@@ -104,6 +107,7 @@ struct ppl_conf {
 	struct kmem_cache *io_kc;
 	mempool_t *io_pool;
 	struct bio_set *bs;
+	struct bio_set *flush_bs;
 
 	/* used only for recovery */
 	int recovered_entries;
@@ -128,6 +132,8 @@ struct ppl_log {
 	sector_t next_io_sector;
 	unsigned int entry_space;
 	bool use_multippl;
+	bool wb_cache_on;
+	unsigned long disk_flush_bitmap;
 };
 
 #define PPL_IO_INLINE_BVECS 32
@@ -145,6 +151,7 @@ struct ppl_io_unit {
 
 	struct list_head stripe_list;	/* stripes added to the io_unit */
 	atomic_t pending_stripes;	/* how many stripes not written to raid */
+	atomic_t pending_flushes;	/* how many disk flushes are in progress */
 
 	bool submitted;			/* true if write to log started */
 
@@ -249,6 +256,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
 	INIT_LIST_HEAD(&io->log_sibling);
 	INIT_LIST_HEAD(&io->stripe_list);
 	atomic_set(&io->pending_stripes, 0);
+	atomic_set(&io->pending_flushes, 0);
 	bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);
 
 	pplhdr = page_address(io->header_page);
@@ -475,7 +483,18 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
 	if (log->use_multippl)
 		log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9;
 
+	WARN_ON(log->disk_flush_bitmap != 0);
+
 	list_for_each_entry(sh, &io->stripe_list, log_list) {
+		for (i = 0; i < sh->disks; i++) {
+			struct r5dev *dev = &sh->dev[i];
+
+			if ((ppl_conf->child_logs[i].wb_cache_on) &&
+			    (test_bit(R5_Wantwrite, &dev->flags))) {
+				set_bit(i, &log->disk_flush_bitmap);
+			}
+		}
+
 		/* entries for full stripe writes have no partial parity */
 		if (test_bit(STRIPE_FULL_WRITE, &sh->state))
 			continue;
@@ -540,6 +559,7 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io)
 {
 	struct ppl_log *log = io->log;
 	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct r5conf *conf = ppl_conf->mddev->private;
 	unsigned long flags;
 
 	pr_debug("%s: seq: %llu\n", __func__, io->seq);
@@ -565,6 +585,112 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io)
 	spin_unlock(&ppl_conf->no_mem_stripes_lock);
 
 	local_irq_restore(flags);
+
+	wake_up(&conf->wait_for_quiescent);
+}
+
+static void ppl_flush_endio(struct bio *bio)
+{
+	struct ppl_io_unit *io = bio->bi_private;
+	struct ppl_log *log = io->log;
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct r5conf *conf = ppl_conf->mddev->private;
+	char b[BDEVNAME_SIZE];
+
+	pr_debug("%s: dev: %s\n", __func__, bio_devname(bio, b));
+
+	if (bio->bi_status) {
+		struct md_rdev *rdev;
+
+		rcu_read_lock();
+		rdev = md_find_rdev_rcu(conf->mddev, bio_dev(bio));
+		if (rdev)
+			md_error(rdev->mddev, rdev);
+		rcu_read_unlock();
+	}
+
+	bio_put(bio);
+
+	if (atomic_dec_and_test(&io->pending_flushes)) {
+		ppl_io_unit_finished(io);
+		md_wakeup_thread(conf->mddev->thread);
+	}
+}
+
+static void ppl_do_flush(struct ppl_io_unit *io)
+{
+	struct ppl_log *log = io->log;
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct r5conf *conf = ppl_conf->mddev->private;
+	int raid_disks = conf->raid_disks;
+	int flushed_disks = 0;
+	int i;
+
+	atomic_set(&io->pending_flushes, raid_disks);
+
+	for_each_set_bit(i, &log->disk_flush_bitmap, raid_disks) {
+		struct md_rdev *rdev;
+		struct block_device *bdev = NULL;
+
+		rcu_read_lock();
+		rdev = rcu_dereference(conf->disks[i].rdev);
+		if (rdev && !test_bit(Faulty, &rdev->flags))
+			bdev = rdev->bdev;
+		rcu_read_unlock();
+
+		if (bdev) {
+			struct bio *bio;
+			char b[BDEVNAME_SIZE];
+
+			bio = bio_alloc_bioset(GFP_NOIO, 0, ppl_conf->flush_bs);
+			bio_set_dev(bio, bdev);
+			bio->bi_private = io;
+			bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+			bio->bi_end_io = ppl_flush_endio;
+
+			pr_debug("%s: dev: %s\n", __func__,
+				 bio_devname(bio, b));
+
+			submit_bio(bio);
+			flushed_disks++;
+		}
+	}
+
+	log->disk_flush_bitmap = 0;
+
+	for (i = flushed_disks ; i < raid_disks; i++) {
+		if (atomic_dec_and_test(&io->pending_flushes))
+			ppl_io_unit_finished(io);
+	}
+}
+
+static inline bool ppl_no_io_unit_submitted(struct r5conf *conf,
+					    struct ppl_log *log)
+{
+	struct ppl_io_unit *io;
+
+	io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
+				      log_sibling);
+
+	return !io || !io->submitted;
+}
+
+void ppl_quiesce(struct r5conf *conf, int quiesce)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+	int i;
+
+	if (quiesce) {
+		for (i = 0; i < ppl_conf->count; i++) {
+			struct ppl_log *log = &ppl_conf->child_logs[i];
+
+			spin_lock_irq(&log->io_list_lock);
+			wait_event_lock_irq(conf->wait_for_quiescent,
+					    ppl_no_io_unit_submitted(conf, log),
+					    log->io_list_lock);
+			spin_unlock_irq(&log->io_list_lock);
+		}
+	}
 }
 
 void ppl_stripe_write_finished(struct stripe_head *sh)
@@ -574,8 +700,12 @@ void ppl_stripe_write_finished(struct stripe_head *sh)
 	io = sh->ppl_io;
 	sh->ppl_io = NULL;
 
-	if (io && atomic_dec_and_test(&io->pending_stripes))
-		ppl_io_unit_finished(io);
+	if (io && atomic_dec_and_test(&io->pending_stripes)) {
+		if (io->log->disk_flush_bitmap)
+			ppl_do_flush(io);
+		else
+			ppl_io_unit_finished(io);
+	}
 }
 
 static void ppl_xor(int size, struct page *page1, struct page *page2)
@@ -1108,6 +1238,8 @@ static void __ppl_exit_log(struct ppl_conf *ppl_conf)
 
 	if (ppl_conf->bs)
 		bioset_free(ppl_conf->bs);
+	if (ppl_conf->flush_bs)
+		bioset_free(ppl_conf->flush_bs);
 	mempool_destroy(ppl_conf->io_pool);
 	kmem_cache_destroy(ppl_conf->io_kc);
 
@@ -1173,6 +1305,8 @@ static int ppl_validate_rdev(struct md_rdev *rdev)
 
 static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
 {
+	struct request_queue *q;
+
 	if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE +
 				      PPL_HEADER_SIZE) * 2) {
 		log->use_multippl = true;
@@ -1185,6 +1319,10 @@ static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
 				   PPL_HEADER_SIZE;
 	}
 	log->next_io_sector = rdev->ppl.sector;
+
+	q = bdev_get_queue(rdev->bdev);
+	if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+		log->wb_cache_on = true;
 }
 
 int ppl_init_log(struct r5conf *conf)
@@ -1192,8 +1330,8 @@ int ppl_init_log(struct r5conf *conf)
 	struct ppl_conf *ppl_conf;
 	struct mddev *mddev = conf->mddev;
 	int ret = 0;
+	int max_disks;
 	int i;
-	bool need_cache_flush = false;
 
 	pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
 		 mdname(conf->mddev));
@@ -1219,6 +1357,14 @@ int ppl_init_log(struct r5conf *conf)
 		return -EINVAL;
 	}
 
+	max_disks = FIELD_SIZEOF(struct ppl_log, disk_flush_bitmap) *
+		BITS_PER_BYTE;
+	if (conf->raid_disks > max_disks) {
+		pr_warn("md/raid:%s PPL doesn't support over %d disks in the array\n",
+			mdname(mddev), max_disks);
+		return -EINVAL;
+	}
+
 	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
 	if (!ppl_conf)
 		return -ENOMEM;
@@ -1244,6 +1390,12 @@ int ppl_init_log(struct r5conf *conf)
 		goto err;
 	}
 
+	ppl_conf->flush_bs = bioset_create(conf->raid_disks, 0, 0);
+	if (!ppl_conf->flush_bs) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
 	ppl_conf->count = conf->raid_disks;
 	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
 				       GFP_KERNEL);
@@ -1275,23 +1427,14 @@ int ppl_init_log(struct r5conf *conf)
 		log->rdev = rdev;
 
 		if (rdev) {
-			struct request_queue *q;
-
 			ret = ppl_validate_rdev(rdev);
 			if (ret)
 				goto err;
 
-			q = bdev_get_queue(rdev->bdev);
-			if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
-				need_cache_flush = true;
 			ppl_init_child_log(log, rdev);
 		}
 	}
 
-	if (need_cache_flush)
-		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
-			mdname(mddev));
-
 	/* load and possibly recover the logs from the member disks */
 	ret = ppl_load(ppl_conf);
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 98ce4272ace9..50d01144b805 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5563,7 +5563,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 	bool do_flush = false;
 
 	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
-		int ret = r5l_handle_flush_request(conf->log, bi);
+		int ret = log_handle_flush_request(conf, bi);
 
 		if (ret == 0)
 			return true;
@@ -6168,7 +6168,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
 			break;
 	if (i == NR_STRIPE_HASH_LOCKS) {
 		spin_unlock_irq(&conf->device_lock);
-		r5l_flush_stripe_to_raid(conf->log);
+		log_flush_stripe_to_raid(conf);
 		spin_lock_irq(&conf->device_lock);
 		return batch_size;
 	}
@@ -8060,7 +8060,7 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
 		wake_up(&conf->wait_for_overlap);
 		unlock_all_device_hash_locks_irq(conf);
 	}
-	r5l_quiesce(conf->log, quiesce);
+	log_quiesce(conf, quiesce);
 }
 
 static void *raid45_takeover_raid0(struct mddev *mddev, int level)
@@ -8364,6 +8364,13 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
 	return err;
 }
 
+static int raid5_start(struct mddev *mddev)
+{
+	struct r5conf *conf = mddev->private;
+
+	return r5l_start(conf->log);
+}
+
 static struct md_personality raid6_personality =
 {
 	.name		= "raid6",
@@ -8371,6 +8378,7 @@ static struct md_personality raid6_personality =
 	.owner		= THIS_MODULE,
 	.make_request	= raid5_make_request,
 	.run		= raid5_run,
+	.start		= raid5_start,
 	.free		= raid5_free,
 	.status		= raid5_status,
 	.error_handler	= raid5_error,
@@ -8395,6 +8403,7 @@ static struct md_personality raid5_personality =
 	.owner		= THIS_MODULE,
 	.make_request	= raid5_make_request,
 	.run		= raid5_run,
+	.start		= raid5_start,
 	.free		= raid5_free,
 	.status		= raid5_status,
 	.error_handler	= raid5_error,
@@ -8420,6 +8429,7 @@ static struct md_personality raid4_personality =
 	.owner		= THIS_MODULE,
 	.make_request	= raid5_make_request,
 	.run		= raid5_run,
+	.start		= raid5_start,
 	.free		= raid5_free,
 	.status		= raid5_status,
 	.error_handler	= raid5_error,
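
A side note on the 64-disk cap mentioned in the raid5-ppl.txt hunk above: ppl_log tracks which member disks need a cache flush in a single unsigned long bitmap (disk_flush_bitmap), so the limit is simply the width of that word, which is what ppl_init_log() computes with FIELD_SIZEOF() * BITS_PER_BYTE. A standalone sketch of the same arithmetic — userspace C, not kernel code, with struct ppl_log_like as a hypothetical stand-in for the kernel's struct ppl_log:

#include <stdio.h>

/* hypothetical stand-in for the flush-tracking field of struct ppl_log */
struct ppl_log_like {
	unsigned long disk_flush_bitmap;	/* one bit per member disk */
};

int main(void)
{
	struct ppl_log_like log = { 0 };
	/* mirrors: FIELD_SIZEOF(struct ppl_log, disk_flush_bitmap) * BITS_PER_BYTE */
	int max_disks = (int)(sizeof(log.disk_flush_bitmap) * 8);

	printf("disk_flush_bitmap = %#lx, max member disks = %d\n",
	       log.disk_flush_bitmap, max_disks);
	return 0;
}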