path: root/drivers/md
author    Linus Torvalds <torvalds@linux-foundation.org>  2015-04-18 08:14:18 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-04-18 08:14:18 -0400
commit    afad97eee47c1f1f242202e2473929b4ef5d9f43 (patch)
tree      31f68d70760234b582a28bd3f64311ff5307b7b1 /drivers/md
parent    04b7fe6a4a231871ef681bc95e08fe66992f7b1f (diff)
parent    44c144f9c8e8fbd73ede2848da8253b3aae42ec2 (diff)
Merge tag 'dm-4.1-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:

 - the most extensive changes this cycle are the DM core improvements to
   add full blk-mq support to request-based DM.

     - disabled by default but user can opt-in with CONFIG_DM_MQ_DEFAULT

     - depends on some blk-mq changes from Jens' for-4.1/core branch so
       that explains why this pull is built on linux-block.git

 - update DM to use name_to_dev_t() rather than open-coding a less
   capable device parser.

     - includes a couple small improvements to name_to_dev_t() that offer
       stricter constraints than DM's code provided.

 - improvements to the dm-cache "mq" cache replacement policy.

 - a DM crypt crypt_ctr() error path fix and an async crypto deadlock fix

 - a small efficiency improvement for DM crypt decryption by leveraging
   immutable biovecs

 - add error handling modes for corrupted blocks to DM verity

 - a new "log-writes" DM target from Josef Bacik that is meant for file
   system developers to test file system integrity at particular points
   in the life of a file system

 - a few DM log userspace cleanups and fixes

 - a few Documentation fixes (for thin, cache, crypt and switch)

* tag 'dm-4.1-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (34 commits)
  dm crypt: fix missing error code return from crypt_ctr error path
  dm crypt: fix deadlock when async crypto algorithm returns -EBUSY
  dm crypt: leverage immutable biovecs when decrypting on read
  dm crypt: update URLs to new cryptsetup project page
  dm: add log writes target
  dm table: use bool function return values of true/false not 1/0
  dm verity: add error handling modes for corrupted blocks
  dm thin: remove stale 'trim' message documentation
  dm delay: use msecs_to_jiffies for time conversion
  dm log userspace base: fix compile warning
  dm log userspace transfer: match wait_for_completion_timeout return type
  dm table: fall back to getting device using name_to_dev_t()
  init: stricter checking of major:minor root= values
  init: export name_to_dev_t and mark name argument as const
  dm: add 'use_blk_mq' module param and expose in per-device ro sysfs attr
  dm: optimize dm_mq_queue_rq to _not_ use kthread if using pure blk-mq
  dm: add full blk-mq support to request-based DM
  dm: impose configurable deadline for dm_request_fn's merge heuristic
  dm sysfs: introduce ability to add writable attributes
  dm: don't start current request if it would've merged with the previous
  ...
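The CONFIG_DM_MQ_DEFAULT behaviour described above (it only flips the default of
dm_mod.use_blk_mq, which can still be overridden either way) is the usual
Kconfig-driven module-parameter pattern. The dm.c hunks are not part of this
drivers/md excerpt, so the snippet below is only a minimal sketch of that pattern
under stated assumptions, not the merged code; the helper name
dm_use_blk_mq_default() is illustrative.

    /*
     * Minimal sketch (not the merged dm.c): a boolean module parameter whose
     * default tracks a Kconfig option but stays writable at boot/module-load
     * time. Requires <linux/module.h>; IS_ENABLED() is from <linux/kconfig.h>.
     */
    static bool use_blk_mq = IS_ENABLED(CONFIG_DM_MQ_DEFAULT);

    /* Illustrative helper: request-based DM would consult this when deciding
     * whether a new device uses the blk-mq path or the legacy request path. */
    bool dm_use_blk_mq_default(void)
    {
            return use_blk_mq;
    }

    /* Settable as dm_mod.use_blk_mq=Y|N on the kernel command line, or via
     * /sys/module/dm_mod/parameters/use_blk_mq after the module is loaded. */
    module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
    MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");

Per the shortlog entry "dm: add 'use_blk_mq' module param and expose in per-device
ro sysfs attr", the value a given device ends up using is also exported as a
read-only per-device sysfs attribute.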
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                      |  27
-rw-r--r--  drivers/md/Makefile                     |   1
-rw-r--r--  drivers/md/dm-cache-policy-mq.c         | 251
-rw-r--r--  drivers/md/dm-crypt.c                   |  25
-rw-r--r--  drivers/md/dm-delay.c                   |   2
-rw-r--r--  drivers/md/dm-log-userspace-base.c      |  91
-rw-r--r--  drivers/md/dm-log-userspace-transfer.c  |   5
-rw-r--r--  drivers/md/dm-log-writes.c              | 825
-rw-r--r--  drivers/md/dm-mpath.c                   |   6
-rw-r--r--  drivers/md/dm-sysfs.c                   |  43
-rw-r--r--  drivers/md/dm-table.c                   |  71
-rw-r--r--  drivers/md/dm-verity.c                  | 147
-rw-r--r--  drivers/md/dm.c                         | 556
-rw-r--r--  drivers/md/dm.h                         |  10
14 files changed, 1736 insertions, 324 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 63e05e32b462..6ddc983417d5 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -196,6 +196,17 @@ config BLK_DEV_DM
196 196
197 If unsure, say N. 197 If unsure, say N.
198 198
199config DM_MQ_DEFAULT
200 bool "request-based DM: use blk-mq I/O path by default"
201 depends on BLK_DEV_DM
202 ---help---
203 This option enables the blk-mq based I/O path for request-based
204 DM devices by default. With the option the dm_mod.use_blk_mq
205 module/boot option defaults to Y, without it to N, but it can
206 still be overriden either way.
207
208 If unsure say N.
209
199config DM_DEBUG 210config DM_DEBUG
200 bool "Device mapper debugging support" 211 bool "Device mapper debugging support"
201 depends on BLK_DEV_DM 212 depends on BLK_DEV_DM
@@ -432,4 +443,20 @@ config DM_SWITCH
432 443
433 If unsure, say N. 444 If unsure, say N.
434 445
446config DM_LOG_WRITES
447 tristate "Log writes target support"
448 depends on BLK_DEV_DM
449 ---help---
450 This device-mapper target takes two devices, one device to use
451 normally, one to log all write operations done to the first device.
452 This is for use by file system developers wishing to verify that
453 their fs is writing a consitent file system at all times by allowing
454 them to replay the log in a variety of ways and to check the
455 contents.
456
457 To compile this code as a module, choose M here: the module will
458 be called dm-log-writes.
459
460 If unsure, say N.
461
435endif # MD 462endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index a2da532b1c2b..1863feaa5846 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_DM_CACHE) += dm-cache.o
55obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o 55obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o
56obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o 56obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
57obj-$(CONFIG_DM_ERA) += dm-era.o 57obj-$(CONFIG_DM_ERA) += dm-era.o
58obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
58 59
59ifeq ($(CONFIG_DM_UEVENT),y) 60ifeq ($(CONFIG_DM_UEVENT),y)
60dm-mod-objs += dm-uevent.o 61dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 13f547a4eeb6..3ddd1162334d 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -8,6 +8,7 @@
8#include "dm.h" 8#include "dm.h"
9 9
10#include <linux/hash.h> 10#include <linux/hash.h>
11#include <linux/jiffies.h>
11#include <linux/module.h> 12#include <linux/module.h>
12#include <linux/mutex.h> 13#include <linux/mutex.h>
13#include <linux/slab.h> 14#include <linux/slab.h>
@@ -124,32 +125,41 @@ static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
124 * sorted queue. 125 * sorted queue.
125 */ 126 */
126#define NR_QUEUE_LEVELS 16u 127#define NR_QUEUE_LEVELS 16u
128#define NR_SENTINELS NR_QUEUE_LEVELS * 3
129
130#define WRITEBACK_PERIOD HZ
127 131
128struct queue { 132struct queue {
133 unsigned nr_elts;
134 bool current_writeback_sentinels;
135 unsigned long next_writeback;
129 struct list_head qs[NR_QUEUE_LEVELS]; 136 struct list_head qs[NR_QUEUE_LEVELS];
137 struct list_head sentinels[NR_SENTINELS];
130}; 138};
131 139
132static void queue_init(struct queue *q) 140static void queue_init(struct queue *q)
133{ 141{
134 unsigned i; 142 unsigned i;
135 143
136 for (i = 0; i < NR_QUEUE_LEVELS; i++) 144 q->nr_elts = 0;
145 q->current_writeback_sentinels = false;
146 q->next_writeback = 0;
147 for (i = 0; i < NR_QUEUE_LEVELS; i++) {
137 INIT_LIST_HEAD(q->qs + i); 148 INIT_LIST_HEAD(q->qs + i);
149 INIT_LIST_HEAD(q->sentinels + i);
150 INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i);
151 INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i);
152 }
138} 153}
139 154
140/* 155static unsigned queue_size(struct queue *q)
141 * Checks to see if the queue is empty.
142 * FIXME: reduce cpu usage.
143 */
144static bool queue_empty(struct queue *q)
145{ 156{
146 unsigned i; 157 return q->nr_elts;
147 158}
148 for (i = 0; i < NR_QUEUE_LEVELS; i++)
149 if (!list_empty(q->qs + i))
150 return false;
151 159
152 return true; 160static bool queue_empty(struct queue *q)
161{
162 return q->nr_elts == 0;
153} 163}
154 164
155/* 165/*
@@ -157,24 +167,19 @@ static bool queue_empty(struct queue *q)
157 */ 167 */
158static void queue_push(struct queue *q, unsigned level, struct list_head *elt) 168static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
159{ 169{
170 q->nr_elts++;
160 list_add_tail(elt, q->qs + level); 171 list_add_tail(elt, q->qs + level);
161} 172}
162 173
163static void queue_remove(struct list_head *elt) 174static void queue_remove(struct queue *q, struct list_head *elt)
164{ 175{
176 q->nr_elts--;
165 list_del(elt); 177 list_del(elt);
166} 178}
167 179
168/* 180static bool is_sentinel(struct queue *q, struct list_head *h)
169 * Shifts all regions down one level. This has no effect on the order of
170 * the queue.
171 */
172static void queue_shift_down(struct queue *q)
173{ 181{
174 unsigned level; 182 return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS));
175
176 for (level = 1; level < NR_QUEUE_LEVELS; level++)
177 list_splice_init(q->qs + level, q->qs + level - 1);
178} 183}
179 184
180/* 185/*
@@ -184,10 +189,12 @@ static void queue_shift_down(struct queue *q)
184static struct list_head *queue_peek(struct queue *q) 189static struct list_head *queue_peek(struct queue *q)
185{ 190{
186 unsigned level; 191 unsigned level;
192 struct list_head *h;
187 193
188 for (level = 0; level < NR_QUEUE_LEVELS; level++) 194 for (level = 0; level < NR_QUEUE_LEVELS; level++)
189 if (!list_empty(q->qs + level)) 195 list_for_each(h, q->qs + level)
190 return q->qs[level].next; 196 if (!is_sentinel(q, h))
197 return h;
191 198
192 return NULL; 199 return NULL;
193} 200}
@@ -197,16 +204,34 @@ static struct list_head *queue_pop(struct queue *q)
197 struct list_head *r = queue_peek(q); 204 struct list_head *r = queue_peek(q);
198 205
199 if (r) { 206 if (r) {
207 q->nr_elts--;
200 list_del(r); 208 list_del(r);
201
202 /* have we just emptied the bottom level? */
203 if (list_empty(q->qs))
204 queue_shift_down(q);
205 } 209 }
206 210
207 return r; 211 return r;
208} 212}
209 213
214/*
215 * Pops an entry from a level that is not past a sentinel.
216 */
217static struct list_head *queue_pop_old(struct queue *q)
218{
219 unsigned level;
220 struct list_head *h;
221
222 for (level = 0; level < NR_QUEUE_LEVELS; level++)
223 list_for_each(h, q->qs + level) {
224 if (is_sentinel(q, h))
225 break;
226
227 q->nr_elts--;
228 list_del(h);
229 return h;
230 }
231
232 return NULL;
233}
234
210static struct list_head *list_pop(struct list_head *lh) 235static struct list_head *list_pop(struct list_head *lh)
211{ 236{
212 struct list_head *r = lh->next; 237 struct list_head *r = lh->next;
@@ -217,6 +242,62 @@ static struct list_head *list_pop(struct list_head *lh)
217 return r; 242 return r;
218} 243}
219 244
245static struct list_head *writeback_sentinel(struct queue *q, unsigned level)
246{
247 if (q->current_writeback_sentinels)
248 return q->sentinels + NR_QUEUE_LEVELS + level;
249 else
250 return q->sentinels + 2 * NR_QUEUE_LEVELS + level;
251}
252
253static void queue_update_writeback_sentinels(struct queue *q)
254{
255 unsigned i;
256 struct list_head *h;
257
258 if (time_after(jiffies, q->next_writeback)) {
259 for (i = 0; i < NR_QUEUE_LEVELS; i++) {
260 h = writeback_sentinel(q, i);
261 list_del(h);
262 list_add_tail(h, q->qs + i);
263 }
264
265 q->next_writeback = jiffies + WRITEBACK_PERIOD;
266 q->current_writeback_sentinels = !q->current_writeback_sentinels;
267 }
268}
269
270/*
271 * Sometimes we want to iterate through entries that have been pushed since
272 * a certain event. We use sentinel entries on the queues to delimit these
273 * 'tick' events.
274 */
275static void queue_tick(struct queue *q)
276{
277 unsigned i;
278
279 for (i = 0; i < NR_QUEUE_LEVELS; i++) {
280 list_del(q->sentinels + i);
281 list_add_tail(q->sentinels + i, q->qs + i);
282 }
283}
284
285typedef void (*iter_fn)(struct list_head *, void *);
286static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context)
287{
288 unsigned i;
289 struct list_head *h;
290
291 for (i = 0; i < NR_QUEUE_LEVELS; i++) {
292 list_for_each_prev(h, q->qs + i) {
293 if (is_sentinel(q, h))
294 break;
295
296 fn(h, context);
297 }
298 }
299}
300
220/*----------------------------------------------------------------*/ 301/*----------------------------------------------------------------*/
221 302
222/* 303/*
@@ -232,8 +313,6 @@ struct entry {
232 */ 313 */
233 bool dirty:1; 314 bool dirty:1;
234 unsigned hit_count; 315 unsigned hit_count;
235 unsigned generation;
236 unsigned tick;
237}; 316};
238 317
239/* 318/*
@@ -481,7 +560,6 @@ static bool in_cache(struct mq_policy *mq, struct entry *e)
481 */ 560 */
482static void push(struct mq_policy *mq, struct entry *e) 561static void push(struct mq_policy *mq, struct entry *e)
483{ 562{
484 e->tick = mq->tick;
485 hash_insert(mq, e); 563 hash_insert(mq, e);
486 564
487 if (in_cache(mq, e)) 565 if (in_cache(mq, e))
@@ -496,7 +574,11 @@ static void push(struct mq_policy *mq, struct entry *e)
496 */ 574 */
497static void del(struct mq_policy *mq, struct entry *e) 575static void del(struct mq_policy *mq, struct entry *e)
498{ 576{
499 queue_remove(&e->list); 577 if (in_cache(mq, e))
578 queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list);
579 else
580 queue_remove(&mq->pre_cache, &e->list);
581
500 hash_remove(e); 582 hash_remove(e);
501} 583}
502 584
@@ -518,18 +600,24 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q)
518 return e; 600 return e;
519} 601}
520 602
521static struct entry *peek(struct queue *q) 603static struct entry *pop_old(struct mq_policy *mq, struct queue *q)
522{ 604{
523 struct list_head *h = queue_peek(q); 605 struct entry *e;
524 return h ? container_of(h, struct entry, list) : NULL; 606 struct list_head *h = queue_pop_old(q);
607
608 if (!h)
609 return NULL;
610
611 e = container_of(h, struct entry, list);
612 hash_remove(e);
613
614 return e;
525} 615}
526 616
527/* 617static struct entry *peek(struct queue *q)
528 * Has this entry already been updated?
529 */
530static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
531{ 618{
532 return mq->tick == e->tick; 619 struct list_head *h = queue_peek(q);
620 return h ? container_of(h, struct entry, list) : NULL;
533} 621}
534 622
535/* 623/*
@@ -583,20 +671,9 @@ static void check_generation(struct mq_policy *mq)
583 * Whenever we use an entry we bump up it's hit counter, and push it to the 671 * Whenever we use an entry we bump up it's hit counter, and push it to the
584 * back to it's current level. 672 * back to it's current level.
585 */ 673 */
586static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e) 674static void requeue(struct mq_policy *mq, struct entry *e)
587{ 675{
588 if (updated_this_tick(mq, e))
589 return;
590
591 e->hit_count++;
592 mq->hit_count++;
593 check_generation(mq); 676 check_generation(mq);
594
595 /* generation adjustment, to stop the counts increasing forever. */
596 /* FIXME: divide? */
597 /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
598 e->generation = mq->generation;
599
600 del(mq, e); 677 del(mq, e);
601 push(mq, e); 678 push(mq, e);
602} 679}
@@ -703,7 +780,7 @@ static int cache_entry_found(struct mq_policy *mq,
703 struct entry *e, 780 struct entry *e,
704 struct policy_result *result) 781 struct policy_result *result)
705{ 782{
706 requeue_and_update_tick(mq, e); 783 requeue(mq, e);
707 784
708 if (in_cache(mq, e)) { 785 if (in_cache(mq, e)) {
709 result->op = POLICY_HIT; 786 result->op = POLICY_HIT;
@@ -740,8 +817,6 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
740 new_e->oblock = e->oblock; 817 new_e->oblock = e->oblock;
741 new_e->dirty = false; 818 new_e->dirty = false;
742 new_e->hit_count = e->hit_count; 819 new_e->hit_count = e->hit_count;
743 new_e->generation = e->generation;
744 new_e->tick = e->tick;
745 820
746 del(mq, e); 821 del(mq, e);
747 free_entry(&mq->pre_cache_pool, e); 822 free_entry(&mq->pre_cache_pool, e);
@@ -757,18 +832,16 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
757 int data_dir, struct policy_result *result) 832 int data_dir, struct policy_result *result)
758{ 833{
759 int r = 0; 834 int r = 0;
760 bool updated = updated_this_tick(mq, e);
761 835
762 if ((!discarded_oblock && updated) || 836 if (!should_promote(mq, e, discarded_oblock, data_dir)) {
763 !should_promote(mq, e, discarded_oblock, data_dir)) { 837 requeue(mq, e);
764 requeue_and_update_tick(mq, e);
765 result->op = POLICY_MISS; 838 result->op = POLICY_MISS;
766 839
767 } else if (!can_migrate) 840 } else if (!can_migrate)
768 r = -EWOULDBLOCK; 841 r = -EWOULDBLOCK;
769 842
770 else { 843 else {
771 requeue_and_update_tick(mq, e); 844 requeue(mq, e);
772 r = pre_cache_to_cache(mq, e, result); 845 r = pre_cache_to_cache(mq, e, result);
773 } 846 }
774 847
@@ -795,7 +868,6 @@ static void insert_in_pre_cache(struct mq_policy *mq,
795 e->dirty = false; 868 e->dirty = false;
796 e->oblock = oblock; 869 e->oblock = oblock;
797 e->hit_count = 1; 870 e->hit_count = 1;
798 e->generation = mq->generation;
799 push(mq, e); 871 push(mq, e);
800} 872}
801 873
@@ -828,7 +900,6 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
828 e->oblock = oblock; 900 e->oblock = oblock;
829 e->dirty = false; 901 e->dirty = false;
830 e->hit_count = 1; 902 e->hit_count = 1;
831 e->generation = mq->generation;
832 push(mq, e); 903 push(mq, e);
833 904
834 result->cblock = infer_cblock(&mq->cache_pool, e); 905 result->cblock = infer_cblock(&mq->cache_pool, e);
@@ -905,12 +976,37 @@ static void mq_destroy(struct dm_cache_policy *p)
905 kfree(mq); 976 kfree(mq);
906} 977}
907 978
979static void update_pre_cache_hits(struct list_head *h, void *context)
980{
981 struct entry *e = container_of(h, struct entry, list);
982 e->hit_count++;
983}
984
985static void update_cache_hits(struct list_head *h, void *context)
986{
987 struct mq_policy *mq = context;
988 struct entry *e = container_of(h, struct entry, list);
989 e->hit_count++;
990 mq->hit_count++;
991}
992
908static void copy_tick(struct mq_policy *mq) 993static void copy_tick(struct mq_policy *mq)
909{ 994{
910 unsigned long flags; 995 unsigned long flags, tick;
911 996
912 spin_lock_irqsave(&mq->tick_lock, flags); 997 spin_lock_irqsave(&mq->tick_lock, flags);
913 mq->tick = mq->tick_protected; 998 tick = mq->tick_protected;
999 if (tick != mq->tick) {
1000 queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq);
1001 queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq);
1002 queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq);
1003 mq->tick = tick;
1004 }
1005
1006 queue_tick(&mq->pre_cache);
1007 queue_tick(&mq->cache_dirty);
1008 queue_tick(&mq->cache_clean);
1009 queue_update_writeback_sentinels(&mq->cache_dirty);
914 spin_unlock_irqrestore(&mq->tick_lock, flags); 1010 spin_unlock_irqrestore(&mq->tick_lock, flags);
915} 1011}
916 1012
@@ -1001,7 +1097,6 @@ static int mq_load_mapping(struct dm_cache_policy *p,
1001 e->oblock = oblock; 1097 e->oblock = oblock;
1002 e->dirty = false; /* this gets corrected in a minute */ 1098 e->dirty = false; /* this gets corrected in a minute */
1003 e->hit_count = hint_valid ? hint : 1; 1099 e->hit_count = hint_valid ? hint : 1;
1004 e->generation = mq->generation;
1005 push(mq, e); 1100 push(mq, e);
1006 1101
1007 return 0; 1102 return 0;
@@ -1012,10 +1107,15 @@ static int mq_save_hints(struct mq_policy *mq, struct queue *q,
1012{ 1107{
1013 int r; 1108 int r;
1014 unsigned level; 1109 unsigned level;
1110 struct list_head *h;
1015 struct entry *e; 1111 struct entry *e;
1016 1112
1017 for (level = 0; level < NR_QUEUE_LEVELS; level++) 1113 for (level = 0; level < NR_QUEUE_LEVELS; level++)
1018 list_for_each_entry(e, q->qs + level, list) { 1114 list_for_each(h, q->qs + level) {
1115 if (is_sentinel(q, h))
1116 continue;
1117
1118 e = container_of(h, struct entry, list);
1019 r = fn(context, infer_cblock(&mq->cache_pool, e), 1119 r = fn(context, infer_cblock(&mq->cache_pool, e),
1020 e->oblock, e->hit_count); 1120 e->oblock, e->hit_count);
1021 if (r) 1121 if (r)
@@ -1087,10 +1187,27 @@ static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
1087 return r; 1187 return r;
1088} 1188}
1089 1189
1190#define CLEAN_TARGET_PERCENTAGE 25
1191
1192static bool clean_target_met(struct mq_policy *mq)
1193{
1194 /*
1195 * Cache entries may not be populated. So we're cannot rely on the
1196 * size of the clean queue.
1197 */
1198 unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty);
1199 unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100;
1200
1201 return nr_clean >= target;
1202}
1203
1090static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock, 1204static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
1091 dm_cblock_t *cblock) 1205 dm_cblock_t *cblock)
1092{ 1206{
1093 struct entry *e = pop(mq, &mq->cache_dirty); 1207 struct entry *e = pop_old(mq, &mq->cache_dirty);
1208
1209 if (!e && !clean_target_met(mq))
1210 e = pop(mq, &mq->cache_dirty);
1094 1211
1095 if (!e) 1212 if (!e)
1096 return -ENODATA; 1213 return -ENODATA;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 713a96237a80..9eeea196328a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -228,7 +228,7 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
228 * 228 *
229 * tcw: Compatible implementation of the block chaining mode used 229 * tcw: Compatible implementation of the block chaining mode used
230 * by the TrueCrypt device encryption system (prior to version 4.1). 230 * by the TrueCrypt device encryption system (prior to version 4.1).
231 * For more info see: http://www.truecrypt.org 231 * For more info see: https://gitlab.com/cryptsetup/cryptsetup/wikis/TrueCryptOnDiskFormat
232 * It operates on full 512 byte sectors and uses CBC 232 * It operates on full 512 byte sectors and uses CBC
233 * with an IV derived from initial key and the sector number. 233 * with an IV derived from initial key and the sector number.
234 * In addition, whitening value is applied on every sector, whitening 234 * In addition, whitening value is applied on every sector, whitening
@@ -925,11 +925,10 @@ static int crypt_convert(struct crypt_config *cc,
925 925
926 switch (r) { 926 switch (r) {
927 /* async */ 927 /* async */
928 case -EINPROGRESS:
928 case -EBUSY: 929 case -EBUSY:
929 wait_for_completion(&ctx->restart); 930 wait_for_completion(&ctx->restart);
930 reinit_completion(&ctx->restart); 931 reinit_completion(&ctx->restart);
931 /* fall through*/
932 case -EINPROGRESS:
933 ctx->req = NULL; 932 ctx->req = NULL;
934 ctx->cc_sector++; 933 ctx->cc_sector++;
935 continue; 934 continue;
@@ -1124,15 +1123,15 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
1124static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) 1123static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
1125{ 1124{
1126 struct crypt_config *cc = io->cc; 1125 struct crypt_config *cc = io->cc;
1127 struct bio *base_bio = io->base_bio;
1128 struct bio *clone; 1126 struct bio *clone;
1129 1127
1130 /* 1128 /*
1131 * The block layer might modify the bvec array, so always 1129 * We need the original biovec array in order to decrypt
1132 * copy the required bvecs because we need the original 1130 * the whole bio data *afterwards* -- thanks to immutable
1133 * one in order to decrypt the whole bio data *afterwards*. 1131 * biovecs we don't need to worry about the block layer
1132 * modifying the biovec array; so leverage bio_clone_fast().
1134 */ 1133 */
1135 clone = bio_clone_bioset(base_bio, gfp, cc->bs); 1134 clone = bio_clone_fast(io->base_bio, gfp, cc->bs);
1136 if (!clone) 1135 if (!clone)
1137 return 1; 1136 return 1;
1138 1137
@@ -1346,10 +1345,8 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
1346 struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); 1345 struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
1347 struct crypt_config *cc = io->cc; 1346 struct crypt_config *cc = io->cc;
1348 1347
1349 if (error == -EINPROGRESS) { 1348 if (error == -EINPROGRESS)
1350 complete(&ctx->restart);
1351 return; 1349 return;
1352 }
1353 1350
1354 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) 1351 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
1355 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); 1352 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
@@ -1360,12 +1357,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
1360 crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); 1357 crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
1361 1358
1362 if (!atomic_dec_and_test(&ctx->cc_pending)) 1359 if (!atomic_dec_and_test(&ctx->cc_pending))
1363 return; 1360 goto done;
1364 1361
1365 if (bio_data_dir(io->base_bio) == READ) 1362 if (bio_data_dir(io->base_bio) == READ)
1366 kcryptd_crypt_read_done(io); 1363 kcryptd_crypt_read_done(io);
1367 else 1364 else
1368 kcryptd_crypt_write_io_submit(io, 1); 1365 kcryptd_crypt_write_io_submit(io, 1);
1366done:
1367 if (!completion_done(&ctx->restart))
1368 complete(&ctx->restart);
1369} 1369}
1370 1370
1371static void kcryptd_crypt(struct work_struct *work) 1371static void kcryptd_crypt(struct work_struct *work)
@@ -1816,6 +1816,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1816 if (ret) 1816 if (ret)
1817 goto bad; 1817 goto bad;
1818 1818
1819 ret = -EINVAL;
1819 while (opt_params--) { 1820 while (opt_params--) {
1820 opt_string = dm_shift_arg(&as); 1821 opt_string = dm_shift_arg(&as);
1821 if (!opt_string) { 1822 if (!opt_string) {
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 42c3a27a14cc..57b6a1901c91 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -236,7 +236,7 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
236 delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); 236 delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
237 237
238 delayed->context = dc; 238 delayed->context = dc;
239 delayed->expires = expires = jiffies + (delay * HZ / 1000); 239 delayed->expires = expires = jiffies + msecs_to_jiffies(delay);
240 240
241 mutex_lock(&delayed_bios_lock); 241 mutex_lock(&delayed_bios_lock);
242 242
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 03177ca0b009..058256d2eeea 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -17,7 +17,9 @@
17 17
18#define DM_LOG_USERSPACE_VSN "1.3.0" 18#define DM_LOG_USERSPACE_VSN "1.3.0"
19 19
20struct flush_entry { 20#define FLUSH_ENTRY_POOL_SIZE 16
21
22struct dm_dirty_log_flush_entry {
21 int type; 23 int type;
22 region_t region; 24 region_t region;
23 struct list_head list; 25 struct list_head list;
@@ -34,22 +36,14 @@ struct flush_entry {
34struct log_c { 36struct log_c {
35 struct dm_target *ti; 37 struct dm_target *ti;
36 struct dm_dev *log_dev; 38 struct dm_dev *log_dev;
37 uint32_t region_size;
38 region_t region_count;
39 uint64_t luid;
40 char uuid[DM_UUID_LEN];
41 39
42 char *usr_argv_str; 40 char *usr_argv_str;
43 uint32_t usr_argc; 41 uint32_t usr_argc;
44 42
45 /* 43 uint32_t region_size;
46 * in_sync_hint gets set when doing is_remote_recovering. It 44 region_t region_count;
47 * represents the first region that needs recovery. IOW, the 45 uint64_t luid;
48 * first zero bit of sync_bits. This can be useful for to limit 46 char uuid[DM_UUID_LEN];
49 * traffic for calls like is_remote_recovering and get_resync_work,
50 * but be take care in its use for anything else.
51 */
52 uint64_t in_sync_hint;
53 47
54 /* 48 /*
55 * Mark and clear requests are held until a flush is issued 49 * Mark and clear requests are held until a flush is issued
@@ -62,6 +56,15 @@ struct log_c {
62 struct list_head clear_list; 56 struct list_head clear_list;
63 57
64 /* 58 /*
59 * in_sync_hint gets set when doing is_remote_recovering. It
60 * represents the first region that needs recovery. IOW, the
61 * first zero bit of sync_bits. This can be useful for to limit
62 * traffic for calls like is_remote_recovering and get_resync_work,
63 * but be take care in its use for anything else.
64 */
65 uint64_t in_sync_hint;
66
67 /*
65 * Workqueue for flush of clear region requests. 68 * Workqueue for flush of clear region requests.
66 */ 69 */
67 struct workqueue_struct *dmlog_wq; 70 struct workqueue_struct *dmlog_wq;
@@ -72,19 +75,11 @@ struct log_c {
72 * Combine userspace flush and mark requests for efficiency. 75 * Combine userspace flush and mark requests for efficiency.
73 */ 76 */
74 uint32_t integrated_flush; 77 uint32_t integrated_flush;
75};
76
77static mempool_t *flush_entry_pool;
78 78
79static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) 79 mempool_t *flush_entry_pool;
80{ 80};
81 return kmalloc(sizeof(struct flush_entry), gfp_mask);
82}
83 81
84static void flush_entry_free(void *element, void *pool_data) 82static struct kmem_cache *_flush_entry_cache;
85{
86 kfree(element);
87}
88 83
89static int userspace_do_request(struct log_c *lc, const char *uuid, 84static int userspace_do_request(struct log_c *lc, const char *uuid,
90 int request_type, char *data, size_t data_size, 85 int request_type, char *data, size_t data_size,
@@ -254,6 +249,14 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
254 goto out; 249 goto out;
255 } 250 }
256 251
252 lc->flush_entry_pool = mempool_create_slab_pool(FLUSH_ENTRY_POOL_SIZE,
253 _flush_entry_cache);
254 if (!lc->flush_entry_pool) {
255 DMERR("Failed to create flush_entry_pool");
256 r = -ENOMEM;
257 goto out;
258 }
259
257 /* 260 /*
258 * Send table string and get back any opened device. 261 * Send table string and get back any opened device.
259 */ 262 */
@@ -310,6 +313,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
310out: 313out:
311 kfree(devices_rdata); 314 kfree(devices_rdata);
312 if (r) { 315 if (r) {
316 if (lc->flush_entry_pool)
317 mempool_destroy(lc->flush_entry_pool);
313 kfree(lc); 318 kfree(lc);
314 kfree(ctr_str); 319 kfree(ctr_str);
315 } else { 320 } else {
@@ -338,6 +343,8 @@ static void userspace_dtr(struct dm_dirty_log *log)
338 if (lc->log_dev) 343 if (lc->log_dev)
339 dm_put_device(lc->ti, lc->log_dev); 344 dm_put_device(lc->ti, lc->log_dev);
340 345
346 mempool_destroy(lc->flush_entry_pool);
347
341 kfree(lc->usr_argv_str); 348 kfree(lc->usr_argv_str);
342 kfree(lc); 349 kfree(lc);
343 350
@@ -461,7 +468,7 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
461static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) 468static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
462{ 469{
463 int r = 0; 470 int r = 0;
464 struct flush_entry *fe; 471 struct dm_dirty_log_flush_entry *fe;
465 472
466 list_for_each_entry(fe, flush_list, list) { 473 list_for_each_entry(fe, flush_list, list) {
467 r = userspace_do_request(lc, lc->uuid, fe->type, 474 r = userspace_do_request(lc, lc->uuid, fe->type,
@@ -481,7 +488,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list,
481 int r = 0; 488 int r = 0;
482 int count; 489 int count;
483 uint32_t type = 0; 490 uint32_t type = 0;
484 struct flush_entry *fe, *tmp_fe; 491 struct dm_dirty_log_flush_entry *fe, *tmp_fe;
485 LIST_HEAD(tmp_list); 492 LIST_HEAD(tmp_list);
486 uint64_t group[MAX_FLUSH_GROUP_COUNT]; 493 uint64_t group[MAX_FLUSH_GROUP_COUNT];
487 494
@@ -563,7 +570,8 @@ static int userspace_flush(struct dm_dirty_log *log)
563 LIST_HEAD(clear_list); 570 LIST_HEAD(clear_list);
564 int mark_list_is_empty; 571 int mark_list_is_empty;
565 int clear_list_is_empty; 572 int clear_list_is_empty;
566 struct flush_entry *fe, *tmp_fe; 573 struct dm_dirty_log_flush_entry *fe, *tmp_fe;
574 mempool_t *flush_entry_pool = lc->flush_entry_pool;
567 575
568 spin_lock_irqsave(&lc->flush_lock, flags); 576 spin_lock_irqsave(&lc->flush_lock, flags);
569 list_splice_init(&lc->mark_list, &mark_list); 577 list_splice_init(&lc->mark_list, &mark_list);
@@ -643,10 +651,10 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
643{ 651{
644 unsigned long flags; 652 unsigned long flags;
645 struct log_c *lc = log->context; 653 struct log_c *lc = log->context;
646 struct flush_entry *fe; 654 struct dm_dirty_log_flush_entry *fe;
647 655
648 /* Wait for an allocation, but _never_ fail */ 656 /* Wait for an allocation, but _never_ fail */
649 fe = mempool_alloc(flush_entry_pool, GFP_NOIO); 657 fe = mempool_alloc(lc->flush_entry_pool, GFP_NOIO);
650 BUG_ON(!fe); 658 BUG_ON(!fe);
651 659
652 spin_lock_irqsave(&lc->flush_lock, flags); 660 spin_lock_irqsave(&lc->flush_lock, flags);
@@ -672,7 +680,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
672{ 680{
673 unsigned long flags; 681 unsigned long flags;
674 struct log_c *lc = log->context; 682 struct log_c *lc = log->context;
675 struct flush_entry *fe; 683 struct dm_dirty_log_flush_entry *fe;
676 684
677 /* 685 /*
678 * If we fail to allocate, we skip the clearing of 686 * If we fail to allocate, we skip the clearing of
@@ -680,7 +688,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
680 * to cause the region to be resync'ed when the 688 * to cause the region to be resync'ed when the
681 * device is activated next time. 689 * device is activated next time.
682 */ 690 */
683 fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); 691 fe = mempool_alloc(lc->flush_entry_pool, GFP_ATOMIC);
684 if (!fe) { 692 if (!fe) {
685 DMERR("Failed to allocate memory to clear region."); 693 DMERR("Failed to allocate memory to clear region.");
686 return; 694 return;
@@ -733,7 +741,6 @@ static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
733static void userspace_set_region_sync(struct dm_dirty_log *log, 741static void userspace_set_region_sync(struct dm_dirty_log *log,
734 region_t region, int in_sync) 742 region_t region, int in_sync)
735{ 743{
736 int r;
737 struct log_c *lc = log->context; 744 struct log_c *lc = log->context;
738 struct { 745 struct {
739 region_t r; 746 region_t r;
@@ -743,12 +750,12 @@ static void userspace_set_region_sync(struct dm_dirty_log *log,
743 pkg.r = region; 750 pkg.r = region;
744 pkg.i = (int64_t)in_sync; 751 pkg.i = (int64_t)in_sync;
745 752
746 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, 753 (void) userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
747 (char *)&pkg, sizeof(pkg), NULL, NULL); 754 (char *)&pkg, sizeof(pkg), NULL, NULL);
748 755
749 /* 756 /*
750 * It would be nice to be able to report failures. 757 * It would be nice to be able to report failures.
751 * However, it is easy emough to detect and resolve. 758 * However, it is easy enough to detect and resolve.
752 */ 759 */
753 return; 760 return;
754} 761}
@@ -886,18 +893,16 @@ static int __init userspace_dirty_log_init(void)
886{ 893{
887 int r = 0; 894 int r = 0;
888 895
889 flush_entry_pool = mempool_create(100, flush_entry_alloc, 896 _flush_entry_cache = KMEM_CACHE(dm_dirty_log_flush_entry, 0);
890 flush_entry_free, NULL); 897 if (!_flush_entry_cache) {
891 898 DMWARN("Unable to create flush_entry_cache: No memory.");
892 if (!flush_entry_pool) {
893 DMWARN("Unable to create flush_entry_pool: No memory.");
894 return -ENOMEM; 899 return -ENOMEM;
895 } 900 }
896 901
897 r = dm_ulog_tfr_init(); 902 r = dm_ulog_tfr_init();
898 if (r) { 903 if (r) {
899 DMWARN("Unable to initialize userspace log communications"); 904 DMWARN("Unable to initialize userspace log communications");
900 mempool_destroy(flush_entry_pool); 905 kmem_cache_destroy(_flush_entry_cache);
901 return r; 906 return r;
902 } 907 }
903 908
@@ -905,7 +910,7 @@ static int __init userspace_dirty_log_init(void)
905 if (r) { 910 if (r) {
906 DMWARN("Couldn't register userspace dirty log type"); 911 DMWARN("Couldn't register userspace dirty log type");
907 dm_ulog_tfr_exit(); 912 dm_ulog_tfr_exit();
908 mempool_destroy(flush_entry_pool); 913 kmem_cache_destroy(_flush_entry_cache);
909 return r; 914 return r;
910 } 915 }
911 916
@@ -917,7 +922,7 @@ static void __exit userspace_dirty_log_exit(void)
917{ 922{
918 dm_dirty_log_type_unregister(&_userspace_type); 923 dm_dirty_log_type_unregister(&_userspace_type);
919 dm_ulog_tfr_exit(); 924 dm_ulog_tfr_exit();
920 mempool_destroy(flush_entry_pool); 925 kmem_cache_destroy(_flush_entry_cache);
921 926
922 DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); 927 DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
923 return; 928 return;
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 39ad9664d397..fdf8ec304f8d 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -172,6 +172,7 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
172 char *rdata, size_t *rdata_size) 172 char *rdata, size_t *rdata_size)
173{ 173{
174 int r = 0; 174 int r = 0;
175 unsigned long tmo;
175 size_t dummy = 0; 176 size_t dummy = 0;
176 int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg); 177 int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);
177 struct dm_ulog_request *tfr = prealloced_ulog_tfr; 178 struct dm_ulog_request *tfr = prealloced_ulog_tfr;
@@ -236,11 +237,11 @@ resend:
236 goto out; 237 goto out;
237 } 238 }
238 239
239 r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); 240 tmo = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
240 spin_lock(&receiving_list_lock); 241 spin_lock(&receiving_list_lock);
241 list_del_init(&(pkg.list)); 242 list_del_init(&(pkg.list));
242 spin_unlock(&receiving_list_lock); 243 spin_unlock(&receiving_list_lock);
243 if (!r) { 244 if (!tmo) {
244 DMWARN("[%s] Request timed out: [%u/%u] - retrying", 245 DMWARN("[%s] Request timed out: [%u/%u] - retrying",
245 (strlen(uuid) > 8) ? 246 (strlen(uuid) > 8) ?
246 (uuid + (strlen(uuid) - 8)) : (uuid), 247 (uuid + (strlen(uuid) - 8)) : (uuid),
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
new file mode 100644
index 000000000000..93e08446a87d
--- /dev/null
+++ b/drivers/md/dm-log-writes.c
@@ -0,0 +1,825 @@
1/*
2 * Copyright (C) 2014 Facebook. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include <linux/device-mapper.h>
8
9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/blkdev.h>
12#include <linux/bio.h>
13#include <linux/slab.h>
14#include <linux/kthread.h>
15#include <linux/freezer.h>
16
17#define DM_MSG_PREFIX "log-writes"
18
19/*
20 * This target will sequentially log all writes to the target device onto the
21 * log device. This is helpful for replaying writes to check for fs consistency
22 * at all times. This target provides a mechanism to mark specific events to
23 * check data at a later time. So for example you would:
24 *
25 * write data
26 * fsync
27 * dmsetup message /dev/whatever mark mymark
28 * unmount /mnt/test
29 *
30 * Then replay the log up to mymark and check the contents of the replay to
31 * verify it matches what was written.
32 *
33 * We log writes only after they have been flushed, this makes the log describe
34 * close to the order in which the data hits the actual disk, not its cache. So
35 * for example the following sequence (W means write, C means complete)
36 *
37 * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
38 *
39 * Would result in the log looking like this:
40 *
41 * c,a,flush,fuad,b,<other writes>,<next flush>
42 *
43 * This is meant to help expose problems where file systems do not properly wait
44 * on data being written before invoking a FLUSH. FUA bypasses cache so once it
45 * completes it is added to the log as it should be on disk.
46 *
47 * We treat DISCARDs as if they don't bypass cache so that they are logged in
48 * order of completion along with the normal writes. If we didn't do it this
49 * way we would process all the discards first and then write all the data, when
50 * in fact we want to do the data and the discard in the order that they
51 * completed.
52 */
53#define LOG_FLUSH_FLAG (1 << 0)
54#define LOG_FUA_FLAG (1 << 1)
55#define LOG_DISCARD_FLAG (1 << 2)
56#define LOG_MARK_FLAG (1 << 3)
57
58#define WRITE_LOG_VERSION 1
59#define WRITE_LOG_MAGIC 0x6a736677736872
60
61/*
62 * The disk format for this is braindead simple.
63 *
64 * At byte 0 we have our super, followed by the following sequence for
65 * nr_entries:
66 *
67 * [ 1 sector ][ entry->nr_sectors ]
68 * [log_write_entry][ data written ]
69 *
70 * The log_write_entry takes up a full sector so we can have arbitrary length
71 * marks and it leaves us room for extra content in the future.
72 */
73
74/*
75 * Basic info about the log for userspace.
76 */
77struct log_write_super {
78 __le64 magic;
79 __le64 version;
80 __le64 nr_entries;
81 __le32 sectorsize;
82};
83
84/*
85 * sector - the sector we wrote.
86 * nr_sectors - the number of sectors we wrote.
87 * flags - flags for this log entry.
88 * data_len - the size of the data in this log entry, this is for private log
89 * entry stuff, the MARK data provided by userspace for example.
90 */
91struct log_write_entry {
92 __le64 sector;
93 __le64 nr_sectors;
94 __le64 flags;
95 __le64 data_len;
96};
97
98struct log_writes_c {
99 struct dm_dev *dev;
100 struct dm_dev *logdev;
101 u64 logged_entries;
102 u32 sectorsize;
103 atomic_t io_blocks;
104 atomic_t pending_blocks;
105 sector_t next_sector;
106 sector_t end_sector;
107 bool logging_enabled;
108 bool device_supports_discard;
109 spinlock_t blocks_lock;
110 struct list_head unflushed_blocks;
111 struct list_head logging_blocks;
112 wait_queue_head_t wait;
113 struct task_struct *log_kthread;
114};
115
116struct pending_block {
117 int vec_cnt;
118 u64 flags;
119 sector_t sector;
120 sector_t nr_sectors;
121 char *data;
122 u32 datalen;
123 struct list_head list;
124 struct bio_vec vecs[0];
125};
126
127struct per_bio_data {
128 struct pending_block *block;
129};
130
131static void put_pending_block(struct log_writes_c *lc)
132{
133 if (atomic_dec_and_test(&lc->pending_blocks)) {
134 smp_mb__after_atomic();
135 if (waitqueue_active(&lc->wait))
136 wake_up(&lc->wait);
137 }
138}
139
140static void put_io_block(struct log_writes_c *lc)
141{
142 if (atomic_dec_and_test(&lc->io_blocks)) {
143 smp_mb__after_atomic();
144 if (waitqueue_active(&lc->wait))
145 wake_up(&lc->wait);
146 }
147}
148
149static void log_end_io(struct bio *bio, int err)
150{
151 struct log_writes_c *lc = bio->bi_private;
152 struct bio_vec *bvec;
153 int i;
154
155 if (err) {
156 unsigned long flags;
157
158 DMERR("Error writing log block, error=%d", err);
159 spin_lock_irqsave(&lc->blocks_lock, flags);
160 lc->logging_enabled = false;
161 spin_unlock_irqrestore(&lc->blocks_lock, flags);
162 }
163
164 bio_for_each_segment_all(bvec, bio, i)
165 __free_page(bvec->bv_page);
166
167 put_io_block(lc);
168 bio_put(bio);
169}
170
171/*
172 * Meant to be called if there is an error, it will free all the pages
173 * associated with the block.
174 */
175static void free_pending_block(struct log_writes_c *lc,
176 struct pending_block *block)
177{
178 int i;
179
180 for (i = 0; i < block->vec_cnt; i++) {
181 if (block->vecs[i].bv_page)
182 __free_page(block->vecs[i].bv_page);
183 }
184 kfree(block->data);
185 kfree(block);
186 put_pending_block(lc);
187}
188
189static int write_metadata(struct log_writes_c *lc, void *entry,
190 size_t entrylen, void *data, size_t datalen,
191 sector_t sector)
192{
193 struct bio *bio;
194 struct page *page;
195 void *ptr;
196 size_t ret;
197
198 bio = bio_alloc(GFP_KERNEL, 1);
199 if (!bio) {
200 DMERR("Couldn't alloc log bio");
201 goto error;
202 }
203 bio->bi_iter.bi_size = 0;
204 bio->bi_iter.bi_sector = sector;
205 bio->bi_bdev = lc->logdev->bdev;
206 bio->bi_end_io = log_end_io;
207 bio->bi_private = lc;
208 set_bit(BIO_UPTODATE, &bio->bi_flags);
209
210 page = alloc_page(GFP_KERNEL);
211 if (!page) {
212 DMERR("Couldn't alloc log page");
213 bio_put(bio);
214 goto error;
215 }
216
217 ptr = kmap_atomic(page);
218 memcpy(ptr, entry, entrylen);
219 if (datalen)
220 memcpy(ptr + entrylen, data, datalen);
221 memset(ptr + entrylen + datalen, 0,
222 lc->sectorsize - entrylen - datalen);
223 kunmap_atomic(ptr);
224
225 ret = bio_add_page(bio, page, lc->sectorsize, 0);
226 if (ret != lc->sectorsize) {
227 DMERR("Couldn't add page to the log block");
228 goto error_bio;
229 }
230 submit_bio(WRITE, bio);
231 return 0;
232error_bio:
233 bio_put(bio);
234 __free_page(page);
235error:
236 put_io_block(lc);
237 return -1;
238}
239
240static int log_one_block(struct log_writes_c *lc,
241 struct pending_block *block, sector_t sector)
242{
243 struct bio *bio;
244 struct log_write_entry entry;
245 size_t ret;
246 int i;
247
248 entry.sector = cpu_to_le64(block->sector);
249 entry.nr_sectors = cpu_to_le64(block->nr_sectors);
250 entry.flags = cpu_to_le64(block->flags);
251 entry.data_len = cpu_to_le64(block->datalen);
252 if (write_metadata(lc, &entry, sizeof(entry), block->data,
253 block->datalen, sector)) {
254 free_pending_block(lc, block);
255 return -1;
256 }
257
258 if (!block->vec_cnt)
259 goto out;
260 sector++;
261
262 bio = bio_alloc(GFP_KERNEL, block->vec_cnt);
263 if (!bio) {
264 DMERR("Couldn't alloc log bio");
265 goto error;
266 }
267 atomic_inc(&lc->io_blocks);
268 bio->bi_iter.bi_size = 0;
269 bio->bi_iter.bi_sector = sector;
270 bio->bi_bdev = lc->logdev->bdev;
271 bio->bi_end_io = log_end_io;
272 bio->bi_private = lc;
273 set_bit(BIO_UPTODATE, &bio->bi_flags);
274
275 for (i = 0; i < block->vec_cnt; i++) {
276 /*
277 * The page offset is always 0 because we allocate a new page
278 * for every bvec in the original bio for simplicity sake.
279 */
280 ret = bio_add_page(bio, block->vecs[i].bv_page,
281 block->vecs[i].bv_len, 0);
282 if (ret != block->vecs[i].bv_len) {
283 atomic_inc(&lc->io_blocks);
284 submit_bio(WRITE, bio);
285 bio = bio_alloc(GFP_KERNEL, block->vec_cnt - i);
286 if (!bio) {
287 DMERR("Couldn't alloc log bio");
288 goto error;
289 }
290 bio->bi_iter.bi_size = 0;
291 bio->bi_iter.bi_sector = sector;
292 bio->bi_bdev = lc->logdev->bdev;
293 bio->bi_end_io = log_end_io;
294 bio->bi_private = lc;
295 set_bit(BIO_UPTODATE, &bio->bi_flags);
296
297 ret = bio_add_page(bio, block->vecs[i].bv_page,
298 block->vecs[i].bv_len, 0);
299 if (ret != block->vecs[i].bv_len) {
300 DMERR("Couldn't add page on new bio?");
301 bio_put(bio);
302 goto error;
303 }
304 }
305 sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
306 }
307 submit_bio(WRITE, bio);
308out:
309 kfree(block->data);
310 kfree(block);
311 put_pending_block(lc);
312 return 0;
313error:
314 free_pending_block(lc, block);
315 put_io_block(lc);
316 return -1;
317}
318
319static int log_super(struct log_writes_c *lc)
320{
321 struct log_write_super super;
322
323 super.magic = cpu_to_le64(WRITE_LOG_MAGIC);
324 super.version = cpu_to_le64(WRITE_LOG_VERSION);
325 super.nr_entries = cpu_to_le64(lc->logged_entries);
326 super.sectorsize = cpu_to_le32(lc->sectorsize);
327
328 if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) {
329 DMERR("Couldn't write super");
330 return -1;
331 }
332
333 return 0;
334}
335
336static inline sector_t logdev_last_sector(struct log_writes_c *lc)
337{
338 return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
339}
340
341static int log_writes_kthread(void *arg)
342{
343 struct log_writes_c *lc = (struct log_writes_c *)arg;
344 sector_t sector = 0;
345
346 while (!kthread_should_stop()) {
347 bool super = false;
348 bool logging_enabled;
349 struct pending_block *block = NULL;
350 int ret;
351
352 spin_lock_irq(&lc->blocks_lock);
353 if (!list_empty(&lc->logging_blocks)) {
354 block = list_first_entry(&lc->logging_blocks,
355 struct pending_block, list);
356 list_del_init(&block->list);
357 if (!lc->logging_enabled)
358 goto next;
359
360 sector = lc->next_sector;
361 if (block->flags & LOG_DISCARD_FLAG)
362 lc->next_sector++;
363 else
364 lc->next_sector += block->nr_sectors + 1;
365
366 /*
367 * Apparently the size of the device may not be known
368 * right away, so handle this properly.
369 */
370 if (!lc->end_sector)
371 lc->end_sector = logdev_last_sector(lc);
372 if (lc->end_sector &&
373 lc->next_sector >= lc->end_sector) {
374 DMERR("Ran out of space on the logdev");
375 lc->logging_enabled = false;
376 goto next;
377 }
378 lc->logged_entries++;
379 atomic_inc(&lc->io_blocks);
380
381 super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
382 if (super)
383 atomic_inc(&lc->io_blocks);
384 }
385next:
386 logging_enabled = lc->logging_enabled;
387 spin_unlock_irq(&lc->blocks_lock);
388 if (block) {
389 if (logging_enabled) {
390 ret = log_one_block(lc, block, sector);
391 if (!ret && super)
392 ret = log_super(lc);
393 if (ret) {
394 spin_lock_irq(&lc->blocks_lock);
395 lc->logging_enabled = false;
396 spin_unlock_irq(&lc->blocks_lock);
397 }
398 } else
399 free_pending_block(lc, block);
400 continue;
401 }
402
403 if (!try_to_freeze()) {
404 set_current_state(TASK_INTERRUPTIBLE);
405 if (!kthread_should_stop() &&
406 !atomic_read(&lc->pending_blocks))
407 schedule();
408 __set_current_state(TASK_RUNNING);
409 }
410 }
411 return 0;
412}
413
414/*
415 * Construct a log-writes mapping:
416 * log-writes <dev_path> <log_dev_path>
417 */
418static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
419{
420 struct log_writes_c *lc;
421 struct dm_arg_set as;
422 const char *devname, *logdevname;
423
424 as.argc = argc;
425 as.argv = argv;
426
427 if (argc < 2) {
428 ti->error = "Invalid argument count";
429 return -EINVAL;
430 }
431
432 lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
433 if (!lc) {
434 ti->error = "Cannot allocate context";
435 return -ENOMEM;
436 }
437 spin_lock_init(&lc->blocks_lock);
438 INIT_LIST_HEAD(&lc->unflushed_blocks);
439 INIT_LIST_HEAD(&lc->logging_blocks);
440 init_waitqueue_head(&lc->wait);
441 lc->sectorsize = 1 << SECTOR_SHIFT;
442 atomic_set(&lc->io_blocks, 0);
443 atomic_set(&lc->pending_blocks, 0);
444
445 devname = dm_shift_arg(&as);
446 if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev)) {
447 ti->error = "Device lookup failed";
448 goto bad;
449 }
450
451 logdevname = dm_shift_arg(&as);
452 if (dm_get_device(ti, logdevname, dm_table_get_mode(ti->table), &lc->logdev)) {
453 ti->error = "Log device lookup failed";
454 dm_put_device(ti, lc->dev);
455 goto bad;
456 }
457
458 lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
459 if (!lc->log_kthread) {
460 ti->error = "Couldn't alloc kthread";
461 dm_put_device(ti, lc->dev);
462 dm_put_device(ti, lc->logdev);
463 goto bad;
464 }
465
466 /* We put the super at sector 0, start logging at sector 1 */
467 lc->next_sector = 1;
468 lc->logging_enabled = true;
469 lc->end_sector = logdev_last_sector(lc);
470 lc->device_supports_discard = true;
471
472 ti->num_flush_bios = 1;
473 ti->flush_supported = true;
474 ti->num_discard_bios = 1;
475 ti->discards_supported = true;
476 ti->per_bio_data_size = sizeof(struct per_bio_data);
477 ti->private = lc;
478 return 0;
479
480bad:
481 kfree(lc);
482 return -EINVAL;
483}
484
485static int log_mark(struct log_writes_c *lc, char *data)
486{
487 struct pending_block *block;
488 size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry);
489
490 block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
491 if (!block) {
492 DMERR("Error allocating pending block");
493 return -ENOMEM;
494 }
495
496 block->data = kstrndup(data, maxsize, GFP_KERNEL);
497 if (!block->data) {
498 DMERR("Error copying mark data");
499 kfree(block);
500 return -ENOMEM;
501 }
502 atomic_inc(&lc->pending_blocks);
503 block->datalen = strlen(block->data);
504 block->flags |= LOG_MARK_FLAG;
505 spin_lock_irq(&lc->blocks_lock);
506 list_add_tail(&block->list, &lc->logging_blocks);
507 spin_unlock_irq(&lc->blocks_lock);
508 wake_up_process(lc->log_kthread);
509 return 0;
510}
511
512static void log_writes_dtr(struct dm_target *ti)
513{
514 struct log_writes_c *lc = ti->private;
515
516 spin_lock_irq(&lc->blocks_lock);
517 list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
518 spin_unlock_irq(&lc->blocks_lock);
519
520 /*
521 * This is just nice to have since it'll update the super to include the
522 * unflushed blocks, if it fails we don't really care.
523 */
524 log_mark(lc, "dm-log-writes-end");
525 wake_up_process(lc->log_kthread);
526 wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
527 !atomic_read(&lc->pending_blocks));
528 kthread_stop(lc->log_kthread);
529
530 WARN_ON(!list_empty(&lc->logging_blocks));
531 WARN_ON(!list_empty(&lc->unflushed_blocks));
532 dm_put_device(ti, lc->dev);
533 dm_put_device(ti, lc->logdev);
534 kfree(lc);
535}
536
537static void normal_map_bio(struct dm_target *ti, struct bio *bio)
538{
539 struct log_writes_c *lc = ti->private;
540
541 bio->bi_bdev = lc->dev->bdev;
542}
543
544static int log_writes_map(struct dm_target *ti, struct bio *bio)
545{
546 struct log_writes_c *lc = ti->private;
547 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
548 struct pending_block *block;
549 struct bvec_iter iter;
550 struct bio_vec bv;
551 size_t alloc_size;
552 int i = 0;
553 bool flush_bio = (bio->bi_rw & REQ_FLUSH);
554 bool fua_bio = (bio->bi_rw & REQ_FUA);
555 bool discard_bio = (bio->bi_rw & REQ_DISCARD);
556
557 pb->block = NULL;
558
559 /* Don't bother doing anything if logging has been disabled */
560 if (!lc->logging_enabled)
561 goto map_bio;
562
563 /*
564 * Map reads as normal.
565 */
566 if (bio_data_dir(bio) == READ)
567 goto map_bio;
568
569 /* No sectors and not a flush? Don't care */
570 if (!bio_sectors(bio) && !flush_bio)
571 goto map_bio;
572
573 /*
574 * Discards will have bi_size set but there's no actual data, so just
575 * allocate the size of the pending block.
576 */
577 if (discard_bio)
578 alloc_size = sizeof(struct pending_block);
579 else
580 alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio);
581
582 block = kzalloc(alloc_size, GFP_NOIO);
583 if (!block) {
584 DMERR("Error allocating pending block");
585 spin_lock_irq(&lc->blocks_lock);
586 lc->logging_enabled = false;
587 spin_unlock_irq(&lc->blocks_lock);
588 return -ENOMEM;
589 }
590 INIT_LIST_HEAD(&block->list);
591 pb->block = block;
592 atomic_inc(&lc->pending_blocks);
593
594 if (flush_bio)
595 block->flags |= LOG_FLUSH_FLAG;
596 if (fua_bio)
597 block->flags |= LOG_FUA_FLAG;
598 if (discard_bio)
599 block->flags |= LOG_DISCARD_FLAG;
600
601 block->sector = bio->bi_iter.bi_sector;
602 block->nr_sectors = bio_sectors(bio);
603
604 /* We don't need the data, just submit */
605 if (discard_bio) {
606 WARN_ON(flush_bio || fua_bio);
607 if (lc->device_supports_discard)
608 goto map_bio;
609 bio_endio(bio, 0);
610 return DM_MAPIO_SUBMITTED;
611 }
612
613 /* Flush bio, splice the unflushed blocks onto this list and submit */
614 if (flush_bio && !bio_sectors(bio)) {
615 spin_lock_irq(&lc->blocks_lock);
616 list_splice_init(&lc->unflushed_blocks, &block->list);
617 spin_unlock_irq(&lc->blocks_lock);
618 goto map_bio;
619 }
620
621 /*
622 * We will write this bio somewhere else way later so we need to copy
623 * the actual contents into new pages so we know the data will always be
624 * there.
625 *
626 * We do this because this could be a bio from O_DIRECT in which case we
627 * can't just hold onto the page until some later point, we have to
628 * manually copy the contents.
629 */
630 bio_for_each_segment(bv, bio, iter) {
631 struct page *page;
632 void *src, *dst;
633
634 page = alloc_page(GFP_NOIO);
635 if (!page) {
636 DMERR("Error allocing page");
637 free_pending_block(lc, block);
638 spin_lock_irq(&lc->blocks_lock);
639 lc->logging_enabled = false;
640 spin_unlock_irq(&lc->blocks_lock);
641 return -ENOMEM;
642 }
643
644 src = kmap_atomic(bv.bv_page);
645 dst = kmap_atomic(page);
646 memcpy(dst, src + bv.bv_offset, bv.bv_len);
647 kunmap_atomic(dst);
648 kunmap_atomic(src);
649 block->vecs[i].bv_page = page;
650 block->vecs[i].bv_len = bv.bv_len;
651 block->vec_cnt++;
652 i++;
653 }
654
655 /* Had a flush with data in it, weird */
656 if (flush_bio) {
657 spin_lock_irq(&lc->blocks_lock);
658 list_splice_init(&lc->unflushed_blocks, &block->list);
659 spin_unlock_irq(&lc->blocks_lock);
660 }
661map_bio:
662 normal_map_bio(ti, bio);
663 return DM_MAPIO_REMAPPED;
664}
665
666static int normal_end_io(struct dm_target *ti, struct bio *bio, int error)
667{
668 struct log_writes_c *lc = ti->private;
669 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
670
671 if (bio_data_dir(bio) == WRITE && pb->block) {
672 struct pending_block *block = pb->block;
673 unsigned long flags;
674
675 spin_lock_irqsave(&lc->blocks_lock, flags);
676 if (block->flags & LOG_FLUSH_FLAG) {
677 list_splice_tail_init(&block->list, &lc->logging_blocks);
678 list_add_tail(&block->list, &lc->logging_blocks);
679 wake_up_process(lc->log_kthread);
680 } else if (block->flags & LOG_FUA_FLAG) {
681 list_add_tail(&block->list, &lc->logging_blocks);
682 wake_up_process(lc->log_kthread);
683 } else
684 list_add_tail(&block->list, &lc->unflushed_blocks);
685 spin_unlock_irqrestore(&lc->blocks_lock, flags);
686 }
687
688 return error;
689}
690
691/*
692 * INFO format: <logged entries> <highest allocated sector>
693 */
694static void log_writes_status(struct dm_target *ti, status_type_t type,
695 unsigned status_flags, char *result,
696 unsigned maxlen)
697{
698 unsigned sz = 0;
699 struct log_writes_c *lc = ti->private;
700
701 switch (type) {
702 case STATUSTYPE_INFO:
703 DMEMIT("%llu %llu", lc->logged_entries,
704 (unsigned long long)lc->next_sector - 1);
705 if (!lc->logging_enabled)
706 DMEMIT(" logging_disabled");
707 break;
708
709 case STATUSTYPE_TABLE:
710 DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
711 break;
712 }
713}
714
715static int log_writes_ioctl(struct dm_target *ti, unsigned int cmd,
716 unsigned long arg)
717{
718 struct log_writes_c *lc = ti->private;
719 struct dm_dev *dev = lc->dev;
720 int r = 0;
721
722 /*
723 * Only pass ioctls through if the device sizes match exactly.
724 */
725 if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
726 r = scsi_verify_blk_ioctl(NULL, cmd);
727
728 return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
729}
730
731static int log_writes_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
732 struct bio_vec *biovec, int max_size)
733{
734 struct log_writes_c *lc = ti->private;
735 struct request_queue *q = bdev_get_queue(lc->dev->bdev);
736
737 if (!q->merge_bvec_fn)
738 return max_size;
739
740 bvm->bi_bdev = lc->dev->bdev;
741 bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
742
743 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
744}
745
746static int log_writes_iterate_devices(struct dm_target *ti,
747 iterate_devices_callout_fn fn,
748 void *data)
749{
750 struct log_writes_c *lc = ti->private;
751
752 return fn(ti, lc->dev, 0, ti->len, data);
753}
754
755/*
756 * Messages supported:
757 * mark <mark data> - specify the marked data.
758 */
759static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv)
760{
761 int r = -EINVAL;
762 struct log_writes_c *lc = ti->private;
763
764 if (argc != 2) {
765		DMWARN("Invalid log-writes message arguments, expected 2 arguments, got %d", argc);
766 return r;
767 }
768
769 if (!strcasecmp(argv[0], "mark"))
770 r = log_mark(lc, argv[1]);
771 else
772 DMWARN("Unrecognised log writes target message received: %s", argv[0]);
773
774 return r;
775}
776
777static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
778{
779 struct log_writes_c *lc = ti->private;
780 struct request_queue *q = bdev_get_queue(lc->dev->bdev);
781
782 if (!q || !blk_queue_discard(q)) {
783 lc->device_supports_discard = false;
784 limits->discard_granularity = 1 << SECTOR_SHIFT;
785 limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
786 }
787}
788
789static struct target_type log_writes_target = {
790 .name = "log-writes",
791 .version = {1, 0, 0},
792 .module = THIS_MODULE,
793 .ctr = log_writes_ctr,
794 .dtr = log_writes_dtr,
795 .map = log_writes_map,
796 .end_io = normal_end_io,
797 .status = log_writes_status,
798 .ioctl = log_writes_ioctl,
799 .merge = log_writes_merge,
800 .message = log_writes_message,
801 .iterate_devices = log_writes_iterate_devices,
802 .io_hints = log_writes_io_hints,
803};
804
805static int __init dm_log_writes_init(void)
806{
807 int r = dm_register_target(&log_writes_target);
808
809 if (r < 0)
810 DMERR("register failed %d", r);
811
812 return r;
813}
814
815static void __exit dm_log_writes_exit(void)
816{
817 dm_unregister_target(&log_writes_target);
818}
819
820module_init(dm_log_writes_init);
821module_exit(dm_log_writes_exit);
822
823MODULE_DESCRIPTION(DM_NAME " log writes target");
824MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
825MODULE_LICENSE("GPL");
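
The list handling in log_writes_map() and normal_end_io() above boils down to one ordering rule: an ordinary completed write is parked on lc->unflushed_blocks, a FUA write is queued for logging immediately, and a flush splices everything parked so far onto the logging list ahead of its own entry. Below is a minimal user-space sketch of that rule (plain C, not kernel code; the flag values, the list helpers and the main() scenario are invented purely for illustration):

/* toy model of the log-writes flush/FUA ordering */
#include <stdio.h>

#define LOG_FLUSH_FLAG (1 << 0)	/* names mirror the target's flags; values illustrative */
#define LOG_FUA_FLAG   (1 << 1)

struct block {
	unsigned int flags;
	unsigned long long sector;
	struct block *next;
};

/* append a block to the tail of a singly linked list */
static void list_append(struct block **head, struct block *b)
{
	while (*head)
		head = &(*head)->next;
	b->next = NULL;
	*head = b;
}

/* move every entry of *src to the tail of *dst, emptying *src */
static void list_splice_tail(struct block **dst, struct block **src)
{
	struct block *b = *src;

	*src = NULL;
	while (b) {
		struct block *next = b->next;

		list_append(dst, b);
		b = next;
	}
}

static struct block *unflushed;	/* stands in for lc->unflushed_blocks */
static struct block *logging;	/* stands in for lc->logging_blocks */

/* roughly what normal_end_io() does when a write completes */
static void complete_write(struct block *b)
{
	if (b->flags & LOG_FLUSH_FLAG) {
		list_splice_tail(&logging, &unflushed);	/* parked writes first */
		list_append(&logging, b);		/* then the flush itself */
	} else if (b->flags & LOG_FUA_FLAG) {
		list_append(&logging, b);		/* FUA: log right away */
	} else {
		list_append(&unflushed, b);		/* ordinary write: park it */
	}
}

int main(void)
{
	struct block w1 = { 0, 8, NULL };
	struct block w2 = { 0, 16, NULL };
	struct block f = { LOG_FLUSH_FLAG, 0, NULL };
	struct block *b;

	complete_write(&w1);
	complete_write(&w2);
	complete_write(&f);	/* pulls w1 and w2 onto the log ahead of it */

	for (b = logging; b; b = b->next)
		printf("log sector %llu flags %#x\n", b->sector, b->flags);

	return 0;
}

Run standalone, the sketch prints the two parked writes (sectors 8 and 16) before the flush entry, which is the order in which the log kthread (lc->log_kthread above) would later persist them.
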
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index d376dc87716e..63953477a07c 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -428,7 +428,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
428 } else { 428 } else {
429 /* blk-mq request-based interface */ 429 /* blk-mq request-based interface */
430 *__clone = blk_get_request(bdev_get_queue(bdev), 430 *__clone = blk_get_request(bdev_get_queue(bdev),
431 rq_data_dir(rq), GFP_KERNEL); 431 rq_data_dir(rq), GFP_ATOMIC);
432 if (IS_ERR(*__clone)) 432 if (IS_ERR(*__clone))
433 /* ENOMEM, requeue */ 433 /* ENOMEM, requeue */
434 return r; 434 return r;
@@ -1627,7 +1627,7 @@ static int __pgpath_busy(struct pgpath *pgpath)
1627{ 1627{
1628 struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); 1628 struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1629 1629
1630 return dm_underlying_device_busy(q); 1630 return blk_lld_busy(q);
1631} 1631}
1632 1632
1633/* 1633/*
@@ -1703,7 +1703,7 @@ out:
1703 *---------------------------------------------------------------*/ 1703 *---------------------------------------------------------------*/
1704static struct target_type multipath_target = { 1704static struct target_type multipath_target = {
1705 .name = "multipath", 1705 .name = "multipath",
1706 .version = {1, 8, 0}, 1706 .version = {1, 9, 0},
1707 .module = THIS_MODULE, 1707 .module = THIS_MODULE,
1708 .ctr = multipath_ctr, 1708 .ctr = multipath_ctr,
1709 .dtr = multipath_dtr, 1709 .dtr = multipath_dtr,
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index c62c5ab6aed5..7e818f5f1dc4 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -11,7 +11,7 @@
11struct dm_sysfs_attr { 11struct dm_sysfs_attr {
12 struct attribute attr; 12 struct attribute attr;
13 ssize_t (*show)(struct mapped_device *, char *); 13 ssize_t (*show)(struct mapped_device *, char *);
14 ssize_t (*store)(struct mapped_device *, char *); 14 ssize_t (*store)(struct mapped_device *, const char *, size_t count);
15}; 15};
16 16
17#define DM_ATTR_RO(_name) \ 17#define DM_ATTR_RO(_name) \
@@ -39,6 +39,31 @@ static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr,
39 return ret; 39 return ret;
40} 40}
41 41
42#define DM_ATTR_RW(_name) \
43struct dm_sysfs_attr dm_attr_##_name = \
44 __ATTR(_name, S_IRUGO | S_IWUSR, dm_attr_##_name##_show, dm_attr_##_name##_store)
45
46static ssize_t dm_attr_store(struct kobject *kobj, struct attribute *attr,
47 const char *page, size_t count)
48{
49 struct dm_sysfs_attr *dm_attr;
50 struct mapped_device *md;
51 ssize_t ret;
52
53 dm_attr = container_of(attr, struct dm_sysfs_attr, attr);
54 if (!dm_attr->store)
55 return -EIO;
56
57 md = dm_get_from_kobject(kobj);
58 if (!md)
59 return -EINVAL;
60
61 ret = dm_attr->store(md, page, count);
62 dm_put(md);
63
64 return ret;
65}
66
42static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf) 67static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf)
43{ 68{
44 if (dm_copy_name_and_uuid(md, buf, NULL)) 69 if (dm_copy_name_and_uuid(md, buf, NULL))
@@ -64,25 +89,33 @@ static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
64 return strlen(buf); 89 return strlen(buf);
65} 90}
66 91
92static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf)
93{
94 sprintf(buf, "%d\n", dm_use_blk_mq(md));
95
96 return strlen(buf);
97}
98
67static DM_ATTR_RO(name); 99static DM_ATTR_RO(name);
68static DM_ATTR_RO(uuid); 100static DM_ATTR_RO(uuid);
69static DM_ATTR_RO(suspended); 101static DM_ATTR_RO(suspended);
102static DM_ATTR_RO(use_blk_mq);
103static DM_ATTR_RW(rq_based_seq_io_merge_deadline);
70 104
71static struct attribute *dm_attrs[] = { 105static struct attribute *dm_attrs[] = {
72 &dm_attr_name.attr, 106 &dm_attr_name.attr,
73 &dm_attr_uuid.attr, 107 &dm_attr_uuid.attr,
74 &dm_attr_suspended.attr, 108 &dm_attr_suspended.attr,
109 &dm_attr_use_blk_mq.attr,
110 &dm_attr_rq_based_seq_io_merge_deadline.attr,
75 NULL, 111 NULL,
76}; 112};
77 113
78static const struct sysfs_ops dm_sysfs_ops = { 114static const struct sysfs_ops dm_sysfs_ops = {
79 .show = dm_attr_show, 115 .show = dm_attr_show,
116 .store = dm_attr_store,
80}; 117};
81 118
82/*
83 * dm kobject is embedded in mapped_device structure
84 * no need to define release function here
85 */
86static struct kobj_type dm_ktype = { 119static struct kobj_type dm_ktype = {
87 .sysfs_ops = &dm_sysfs_ops, 120 .sysfs_ops = &dm_sysfs_ops,
88 .default_attrs = dm_attrs, 121 .default_attrs = dm_attrs,
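
For orientation (an inference from device-mapper's existing sysfs layout rather than anything stated in this hunk): the dm attribute group sits under each mapped device's block node, so the two new entries registered here would be expected at paths such as /sys/block/dm-0/dm/use_blk_mq (read-only, printing 0 or 1) and /sys/block/dm-0/dm/rq_based_seq_io_merge_deadline (writable; the store handler added to dm.c later in this diff parses a decimal microsecond value and caps it at MAX_SEQ_RQ_MERGE_DEADLINE_USECS, i.e. 100000).
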
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 6554d9148927..d9b00b8565c6 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -18,6 +18,8 @@
18#include <linux/mutex.h> 18#include <linux/mutex.h>
19#include <linux/delay.h> 19#include <linux/delay.h>
20#include <linux/atomic.h> 20#include <linux/atomic.h>
21#include <linux/blk-mq.h>
22#include <linux/mount.h>
21 23
22#define DM_MSG_PREFIX "table" 24#define DM_MSG_PREFIX "table"
23 25
@@ -372,23 +374,18 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
372 int r; 374 int r;
373 dev_t uninitialized_var(dev); 375 dev_t uninitialized_var(dev);
374 struct dm_dev_internal *dd; 376 struct dm_dev_internal *dd;
375 unsigned int major, minor;
376 struct dm_table *t = ti->table; 377 struct dm_table *t = ti->table;
377 char dummy; 378 struct block_device *bdev;
378 379
379 BUG_ON(!t); 380 BUG_ON(!t);
380 381
381 if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { 382 /* convert the path to a device */
382 /* Extract the major/minor numbers */ 383 bdev = lookup_bdev(path);
383 dev = MKDEV(major, minor); 384 if (IS_ERR(bdev)) {
384 if (MAJOR(dev) != major || MINOR(dev) != minor) 385 dev = name_to_dev_t(path);
385 return -EOVERFLOW; 386 if (!dev)
387 return -ENODEV;
386 } else { 388 } else {
387 /* convert the path to a device */
388 struct block_device *bdev = lookup_bdev(path);
389
390 if (IS_ERR(bdev))
391 return PTR_ERR(bdev);
392 dev = bdev->bd_dev; 389 dev = bdev->bd_dev;
393 bdput(bdev); 390 bdput(bdev);
394 } 391 }
@@ -939,7 +936,7 @@ bool dm_table_mq_request_based(struct dm_table *t)
939 return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED; 936 return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
940} 937}
941 938
942static int dm_table_alloc_md_mempools(struct dm_table *t) 939static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
943{ 940{
944 unsigned type = dm_table_get_type(t); 941 unsigned type = dm_table_get_type(t);
945 unsigned per_bio_data_size = 0; 942 unsigned per_bio_data_size = 0;
@@ -957,7 +954,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t)
957 per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size); 954 per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
958 } 955 }
959 956
960 t->mempools = dm_alloc_md_mempools(type, t->integrity_supported, per_bio_data_size); 957 t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
961 if (!t->mempools) 958 if (!t->mempools)
962 return -ENOMEM; 959 return -ENOMEM;
963 960
@@ -1127,7 +1124,7 @@ int dm_table_complete(struct dm_table *t)
1127 return r; 1124 return r;
1128 } 1125 }
1129 1126
1130 r = dm_table_alloc_md_mempools(t); 1127 r = dm_table_alloc_md_mempools(t, t->md);
1131 if (r) 1128 if (r)
1132 DMERR("unable to allocate mempools"); 1129 DMERR("unable to allocate mempools");
1133 1130
@@ -1339,14 +1336,14 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
1339 continue; 1336 continue;
1340 1337
1341 if (ti->flush_supported) 1338 if (ti->flush_supported)
1342 return 1; 1339 return true;
1343 1340
1344 if (ti->type->iterate_devices && 1341 if (ti->type->iterate_devices &&
1345 ti->type->iterate_devices(ti, device_flush_capable, &flush)) 1342 ti->type->iterate_devices(ti, device_flush_capable, &flush))
1346 return 1; 1343 return true;
1347 } 1344 }
1348 1345
1349 return 0; 1346 return false;
1350} 1347}
1351 1348
1352static bool dm_table_discard_zeroes_data(struct dm_table *t) 1349static bool dm_table_discard_zeroes_data(struct dm_table *t)
@@ -1359,10 +1356,10 @@ static bool dm_table_discard_zeroes_data(struct dm_table *t)
1359 ti = dm_table_get_target(t, i++); 1356 ti = dm_table_get_target(t, i++);
1360 1357
1361 if (ti->discard_zeroes_data_unsupported) 1358 if (ti->discard_zeroes_data_unsupported)
1362 return 0; 1359 return false;
1363 } 1360 }
1364 1361
1365 return 1; 1362 return true;
1366} 1363}
1367 1364
1368static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, 1365static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
@@ -1408,10 +1405,10 @@ static bool dm_table_all_devices_attribute(struct dm_table *t,
1408 1405
1409 if (!ti->type->iterate_devices || 1406 if (!ti->type->iterate_devices ||
1410 !ti->type->iterate_devices(ti, func, NULL)) 1407 !ti->type->iterate_devices(ti, func, NULL))
1411 return 0; 1408 return false;
1412 } 1409 }
1413 1410
1414 return 1; 1411 return true;
1415} 1412}
1416 1413
1417static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev, 1414static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
@@ -1468,14 +1465,14 @@ static bool dm_table_supports_discards(struct dm_table *t)
1468 continue; 1465 continue;
1469 1466
1470 if (ti->discards_supported) 1467 if (ti->discards_supported)
1471 return 1; 1468 return true;
1472 1469
1473 if (ti->type->iterate_devices && 1470 if (ti->type->iterate_devices &&
1474 ti->type->iterate_devices(ti, device_discard_capable, NULL)) 1471 ti->type->iterate_devices(ti, device_discard_capable, NULL))
1475 return 1; 1472 return true;
1476 } 1473 }
1477 1474
1478 return 0; 1475 return false;
1479} 1476}
1480 1477
1481void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 1478void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
@@ -1677,20 +1674,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1677 return r; 1674 return r;
1678} 1675}
1679 1676
1680int dm_table_any_busy_target(struct dm_table *t)
1681{
1682 unsigned i;
1683 struct dm_target *ti;
1684
1685 for (i = 0; i < t->num_targets; i++) {
1686 ti = t->targets + i;
1687 if (ti->type->busy && ti->type->busy(ti))
1688 return 1;
1689 }
1690
1691 return 0;
1692}
1693
1694struct mapped_device *dm_table_get_md(struct dm_table *t) 1677struct mapped_device *dm_table_get_md(struct dm_table *t)
1695{ 1678{
1696 return t->md; 1679 return t->md;
@@ -1709,9 +1692,13 @@ void dm_table_run_md_queue_async(struct dm_table *t)
1709 md = dm_table_get_md(t); 1692 md = dm_table_get_md(t);
1710 queue = dm_get_md_queue(md); 1693 queue = dm_get_md_queue(md);
1711 if (queue) { 1694 if (queue) {
1712 spin_lock_irqsave(queue->queue_lock, flags); 1695 if (queue->mq_ops)
1713 blk_run_queue_async(queue); 1696 blk_mq_run_hw_queues(queue, true);
1714 spin_unlock_irqrestore(queue->queue_lock, flags); 1697 else {
1698 spin_lock_irqsave(queue->queue_lock, flags);
1699 blk_run_queue_async(queue);
1700 spin_unlock_irqrestore(queue->queue_lock, flags);
1701 }
1715 } 1702 }
1716} 1703}
1717EXPORT_SYMBOL(dm_table_run_md_queue_async); 1704EXPORT_SYMBOL(dm_table_run_md_queue_async);
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 7a7bab8947ae..66616db33e6f 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -18,20 +18,39 @@
18 18
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/device-mapper.h> 20#include <linux/device-mapper.h>
21#include <linux/reboot.h>
21#include <crypto/hash.h> 22#include <crypto/hash.h>
22 23
23#define DM_MSG_PREFIX "verity" 24#define DM_MSG_PREFIX "verity"
24 25
26#define DM_VERITY_ENV_LENGTH 42
27#define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR"
28
25#define DM_VERITY_IO_VEC_INLINE 16 29#define DM_VERITY_IO_VEC_INLINE 16
26#define DM_VERITY_MEMPOOL_SIZE 4 30#define DM_VERITY_MEMPOOL_SIZE 4
27#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 31#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
28 32
29#define DM_VERITY_MAX_LEVELS 63 33#define DM_VERITY_MAX_LEVELS 63
34#define DM_VERITY_MAX_CORRUPTED_ERRS 100
35
36#define DM_VERITY_OPT_LOGGING "ignore_corruption"
37#define DM_VERITY_OPT_RESTART "restart_on_corruption"
30 38
31static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; 39static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
32 40
33module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); 41module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
34 42
43enum verity_mode {
44 DM_VERITY_MODE_EIO,
45 DM_VERITY_MODE_LOGGING,
46 DM_VERITY_MODE_RESTART
47};
48
49enum verity_block_type {
50 DM_VERITY_BLOCK_TYPE_DATA,
51 DM_VERITY_BLOCK_TYPE_METADATA
52};
53
35struct dm_verity { 54struct dm_verity {
36 struct dm_dev *data_dev; 55 struct dm_dev *data_dev;
37 struct dm_dev *hash_dev; 56 struct dm_dev *hash_dev;
@@ -54,6 +73,8 @@ struct dm_verity {
54 unsigned digest_size; /* digest size for the current hash algorithm */ 73 unsigned digest_size; /* digest size for the current hash algorithm */
55 unsigned shash_descsize;/* the size of temporary space for crypto */ 74 unsigned shash_descsize;/* the size of temporary space for crypto */
56 int hash_failed; /* set to 1 if hash of any block failed */ 75 int hash_failed; /* set to 1 if hash of any block failed */
76 enum verity_mode mode; /* mode for handling verification errors */
77 unsigned corrupted_errs;/* Number of errors for corrupted blocks */
57 78
58 mempool_t *vec_mempool; /* mempool of bio vector */ 79 mempool_t *vec_mempool; /* mempool of bio vector */
59 80
@@ -175,6 +196,57 @@ static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
175} 196}
176 197
177/* 198/*
199 * Handle verification errors.
200 */
201static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
202 unsigned long long block)
203{
204 char verity_env[DM_VERITY_ENV_LENGTH];
205 char *envp[] = { verity_env, NULL };
206 const char *type_str = "";
207 struct mapped_device *md = dm_table_get_md(v->ti->table);
208
209 /* Corruption should be visible in device status in all modes */
210 v->hash_failed = 1;
211
212 if (v->corrupted_errs >= DM_VERITY_MAX_CORRUPTED_ERRS)
213 goto out;
214
215 v->corrupted_errs++;
216
217 switch (type) {
218 case DM_VERITY_BLOCK_TYPE_DATA:
219 type_str = "data";
220 break;
221 case DM_VERITY_BLOCK_TYPE_METADATA:
222 type_str = "metadata";
223 break;
224 default:
225 BUG();
226 }
227
228 DMERR("%s: %s block %llu is corrupted", v->data_dev->name, type_str,
229 block);
230
231 if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS)
232 DMERR("%s: reached maximum errors", v->data_dev->name);
233
234 snprintf(verity_env, DM_VERITY_ENV_LENGTH, "%s=%d,%llu",
235 DM_VERITY_ENV_VAR_NAME, type, block);
236
237 kobject_uevent_env(&disk_to_dev(dm_disk(md))->kobj, KOBJ_CHANGE, envp);
238
239out:
240 if (v->mode == DM_VERITY_MODE_LOGGING)
241 return 0;
242
243 if (v->mode == DM_VERITY_MODE_RESTART)
244 kernel_restart("dm-verity device corrupted");
245
246 return 1;
247}
248
249/*
178 * Verify hash of a metadata block pertaining to the specified data block 250 * Verify hash of a metadata block pertaining to the specified data block
179 * ("block" argument) at a specified level ("level" argument). 251 * ("block" argument) at a specified level ("level" argument).
180 * 252 *
@@ -251,11 +323,11 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block,
251 goto release_ret_r; 323 goto release_ret_r;
252 } 324 }
253 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { 325 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
254 DMERR_LIMIT("metadata block %llu is corrupted", 326 if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA,
255 (unsigned long long)hash_block); 327 hash_block)) {
256 v->hash_failed = 1; 328 r = -EIO;
257 r = -EIO; 329 goto release_ret_r;
258 goto release_ret_r; 330 }
259 } else 331 } else
260 aux->hash_verified = 1; 332 aux->hash_verified = 1;
261 } 333 }
@@ -367,10 +439,9 @@ test_block_hash:
367 return r; 439 return r;
368 } 440 }
369 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { 441 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
370 DMERR_LIMIT("data block %llu is corrupted", 442 if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
371 (unsigned long long)(io->block + b)); 443 io->block + b))
372 v->hash_failed = 1; 444 return -EIO;
373 return -EIO;
374 } 445 }
375 } 446 }
376 447
@@ -546,6 +617,19 @@ static void verity_status(struct dm_target *ti, status_type_t type,
546 else 617 else
547 for (x = 0; x < v->salt_size; x++) 618 for (x = 0; x < v->salt_size; x++)
548 DMEMIT("%02x", v->salt[x]); 619 DMEMIT("%02x", v->salt[x]);
620 if (v->mode != DM_VERITY_MODE_EIO) {
621 DMEMIT(" 1 ");
622 switch (v->mode) {
623 case DM_VERITY_MODE_LOGGING:
624 DMEMIT(DM_VERITY_OPT_LOGGING);
625 break;
626 case DM_VERITY_MODE_RESTART:
627 DMEMIT(DM_VERITY_OPT_RESTART);
628 break;
629 default:
630 BUG();
631 }
632 }
549 break; 633 break;
550 } 634 }
551} 635}
@@ -647,13 +731,19 @@ static void verity_dtr(struct dm_target *ti)
647static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) 731static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
648{ 732{
649 struct dm_verity *v; 733 struct dm_verity *v;
650 unsigned num; 734 struct dm_arg_set as;
735 const char *opt_string;
736 unsigned int num, opt_params;
651 unsigned long long num_ll; 737 unsigned long long num_ll;
652 int r; 738 int r;
653 int i; 739 int i;
654 sector_t hash_position; 740 sector_t hash_position;
655 char dummy; 741 char dummy;
656 742
743 static struct dm_arg _args[] = {
744 {0, 1, "Invalid number of feature args"},
745 };
746
657 v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); 747 v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
658 if (!v) { 748 if (!v) {
659 ti->error = "Cannot allocate verity structure"; 749 ti->error = "Cannot allocate verity structure";
@@ -668,8 +758,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
668 goto bad; 758 goto bad;
669 } 759 }
670 760
671 if (argc != 10) { 761 if (argc < 10) {
672 ti->error = "Invalid argument count: exactly 10 arguments required"; 762 ti->error = "Not enough arguments";
673 r = -EINVAL; 763 r = -EINVAL;
674 goto bad; 764 goto bad;
675 } 765 }
@@ -790,6 +880,39 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
790 } 880 }
791 } 881 }
792 882
883 argv += 10;
884 argc -= 10;
885
886 /* Optional parameters */
887 if (argc) {
888 as.argc = argc;
889 as.argv = argv;
890
891 r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
892 if (r)
893 goto bad;
894
895 while (opt_params) {
896 opt_params--;
897 opt_string = dm_shift_arg(&as);
898 if (!opt_string) {
899 ti->error = "Not enough feature arguments";
900 r = -EINVAL;
901 goto bad;
902 }
903
904 if (!strcasecmp(opt_string, DM_VERITY_OPT_LOGGING))
905 v->mode = DM_VERITY_MODE_LOGGING;
906 else if (!strcasecmp(opt_string, DM_VERITY_OPT_RESTART))
907 v->mode = DM_VERITY_MODE_RESTART;
908 else {
909 ti->error = "Invalid feature arguments";
910 r = -EINVAL;
911 goto bad;
912 }
913 }
914 }
915
793 v->hash_per_block_bits = 916 v->hash_per_block_bits =
794 __fls((1 << v->hash_dev_block_bits) / v->digest_size); 917 __fls((1 << v->hash_dev_block_bits) / v->digest_size);
795 918
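
Putting the constructor change together: after the ten mandatory verity arguments, an optional feature-argument group of at most one entry now selects the error-handling mode. A hypothetical dmsetup table line might look like the following (device paths, sizes, digest and salt are placeholders, and the layout assumes the usual verity argument order of version, data device, hash device, data block size, hash block size, data block count, hash start block, algorithm, root digest, salt):

0 409600 verity 1 /dev/sdX1 /dev/sdX2 4096 4096 51200 1 sha256 <root_digest> <salt> 1 ignore_corruption

With that trailing "1 ignore_corruption", verity_handle_err() logs each corrupted block and sends a DM_VERITY_ERR_BLOCK_NR uevent but lets the read complete instead of returning -EIO (and it stops logging after DM_VERITY_MAX_CORRUPTED_ERRS, i.e. 100, corrupted blocks), while verity_status() reports the mode back as "1 ignore_corruption"; choosing "restart_on_corruption" instead makes verity_handle_err() call kernel_restart().
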
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8001fe9e3434..f8c7ca3e8947 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -21,6 +21,9 @@
21#include <linux/delay.h> 21#include <linux/delay.h>
22#include <linux/wait.h> 22#include <linux/wait.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/ktime.h>
25#include <linux/elevator.h> /* for rq_end_sector() */
26#include <linux/blk-mq.h>
24 27
25#include <trace/events/block.h> 28#include <trace/events/block.h>
26 29
@@ -216,8 +219,29 @@ struct mapped_device {
216 219
217 struct kthread_worker kworker; 220 struct kthread_worker kworker;
218 struct task_struct *kworker_task; 221 struct task_struct *kworker_task;
222
223 /* for request-based merge heuristic in dm_request_fn() */
224 unsigned seq_rq_merge_deadline_usecs;
225 int last_rq_rw;
226 sector_t last_rq_pos;
227 ktime_t last_rq_start_time;
228
229 /* for blk-mq request-based DM support */
230 struct blk_mq_tag_set tag_set;
231 bool use_blk_mq;
219}; 232};
220 233
234#ifdef CONFIG_DM_MQ_DEFAULT
235static bool use_blk_mq = true;
236#else
237static bool use_blk_mq = false;
238#endif
239
240bool dm_use_blk_mq(struct mapped_device *md)
241{
242 return md->use_blk_mq;
243}
244
221/* 245/*
222 * For mempools pre-allocation at the table loading time. 246 * For mempools pre-allocation at the table loading time.
223 */ 247 */
@@ -250,35 +274,35 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
250 */ 274 */
251static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; 275static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
252 276
253static unsigned __dm_get_reserved_ios(unsigned *reserved_ios, 277static unsigned __dm_get_module_param(unsigned *module_param,
254 unsigned def, unsigned max) 278 unsigned def, unsigned max)
255{ 279{
256 unsigned ios = ACCESS_ONCE(*reserved_ios); 280 unsigned param = ACCESS_ONCE(*module_param);
257 unsigned modified_ios = 0; 281 unsigned modified_param = 0;
258 282
259 if (!ios) 283 if (!param)
260 modified_ios = def; 284 modified_param = def;
261 else if (ios > max) 285 else if (param > max)
262 modified_ios = max; 286 modified_param = max;
263 287
264 if (modified_ios) { 288 if (modified_param) {
265 (void)cmpxchg(reserved_ios, ios, modified_ios); 289 (void)cmpxchg(module_param, param, modified_param);
266 ios = modified_ios; 290 param = modified_param;
267 } 291 }
268 292
269 return ios; 293 return param;
270} 294}
271 295
272unsigned dm_get_reserved_bio_based_ios(void) 296unsigned dm_get_reserved_bio_based_ios(void)
273{ 297{
274 return __dm_get_reserved_ios(&reserved_bio_based_ios, 298 return __dm_get_module_param(&reserved_bio_based_ios,
275 RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS); 299 RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
276} 300}
277EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); 301EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
278 302
279unsigned dm_get_reserved_rq_based_ios(void) 303unsigned dm_get_reserved_rq_based_ios(void)
280{ 304{
281 return __dm_get_reserved_ios(&reserved_rq_based_ios, 305 return __dm_get_module_param(&reserved_rq_based_ios,
282 RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS); 306 RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
283} 307}
284EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); 308EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
@@ -1017,6 +1041,11 @@ static void end_clone_bio(struct bio *clone, int error)
1017 blk_update_request(tio->orig, 0, nr_bytes); 1041 blk_update_request(tio->orig, 0, nr_bytes);
1018} 1042}
1019 1043
1044static struct dm_rq_target_io *tio_from_request(struct request *rq)
1045{
1046 return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
1047}
1048
1020/* 1049/*
1021 * Don't touch any member of the md after calling this function because 1050 * Don't touch any member of the md after calling this function because
1022 * the md may be freed in dm_put() at the end of this function. 1051 * the md may be freed in dm_put() at the end of this function.
@@ -1024,10 +1053,13 @@ static void end_clone_bio(struct bio *clone, int error)
1024 */ 1053 */
1025static void rq_completed(struct mapped_device *md, int rw, bool run_queue) 1054static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
1026{ 1055{
1056 int nr_requests_pending;
1057
1027 atomic_dec(&md->pending[rw]); 1058 atomic_dec(&md->pending[rw]);
1028 1059
1029 /* nudge anyone waiting on suspend queue */ 1060 /* nudge anyone waiting on suspend queue */
1030 if (!md_in_flight(md)) 1061 nr_requests_pending = md_in_flight(md);
1062 if (!nr_requests_pending)
1031 wake_up(&md->wait); 1063 wake_up(&md->wait);
1032 1064
1033 /* 1065 /*
@@ -1036,8 +1068,13 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
1036 * back into ->request_fn() could deadlock attempting to grab the 1068 * back into ->request_fn() could deadlock attempting to grab the
1037 * queue lock again. 1069 * queue lock again.
1038 */ 1070 */
1039 if (run_queue) 1071 if (run_queue) {
1040 blk_run_queue_async(md->queue); 1072 if (md->queue->mq_ops)
1073 blk_mq_run_hw_queues(md->queue, true);
1074 else if (!nr_requests_pending ||
1075 (nr_requests_pending >= md->queue->nr_congestion_on))
1076 blk_run_queue_async(md->queue);
1077 }
1041 1078
1042 /* 1079 /*
1043 * dm_put() must be at the end of this function. See the comment above 1080 * dm_put() must be at the end of this function. See the comment above
@@ -1048,13 +1085,18 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
1048static void free_rq_clone(struct request *clone) 1085static void free_rq_clone(struct request *clone)
1049{ 1086{
1050 struct dm_rq_target_io *tio = clone->end_io_data; 1087 struct dm_rq_target_io *tio = clone->end_io_data;
1088 struct mapped_device *md = tio->md;
1051 1089
1052 blk_rq_unprep_clone(clone); 1090 blk_rq_unprep_clone(clone);
1053 if (clone->q && clone->q->mq_ops) 1091
1092 if (clone->q->mq_ops)
1054 tio->ti->type->release_clone_rq(clone); 1093 tio->ti->type->release_clone_rq(clone);
1055 else 1094 else if (!md->queue->mq_ops)
1056 free_clone_request(tio->md, clone); 1095 /* request_fn queue stacked on request_fn queue(s) */
1057 free_rq_tio(tio); 1096 free_clone_request(md, clone);
1097
1098 if (!md->queue->mq_ops)
1099 free_rq_tio(tio);
1058} 1100}
1059 1101
1060/* 1102/*
@@ -1083,17 +1125,22 @@ static void dm_end_request(struct request *clone, int error)
1083 } 1125 }
1084 1126
1085 free_rq_clone(clone); 1127 free_rq_clone(clone);
1086 blk_end_request_all(rq, error); 1128 if (!rq->q->mq_ops)
1129 blk_end_request_all(rq, error);
1130 else
1131 blk_mq_end_request(rq, error);
1087 rq_completed(md, rw, true); 1132 rq_completed(md, rw, true);
1088} 1133}
1089 1134
1090static void dm_unprep_request(struct request *rq) 1135static void dm_unprep_request(struct request *rq)
1091{ 1136{
1092 struct dm_rq_target_io *tio = rq->special; 1137 struct dm_rq_target_io *tio = tio_from_request(rq);
1093 struct request *clone = tio->clone; 1138 struct request *clone = tio->clone;
1094 1139
1095 rq->special = NULL; 1140 if (!rq->q->mq_ops) {
1096 rq->cmd_flags &= ~REQ_DONTPREP; 1141 rq->special = NULL;
1142 rq->cmd_flags &= ~REQ_DONTPREP;
1143 }
1097 1144
1098 if (clone) 1145 if (clone)
1099 free_rq_clone(clone); 1146 free_rq_clone(clone);
@@ -1102,18 +1149,29 @@ static void dm_unprep_request(struct request *rq)
1102/* 1149/*
1103 * Requeue the original request of a clone. 1150 * Requeue the original request of a clone.
1104 */ 1151 */
1105static void dm_requeue_unmapped_original_request(struct mapped_device *md, 1152static void old_requeue_request(struct request *rq)
1106 struct request *rq)
1107{ 1153{
1108 int rw = rq_data_dir(rq);
1109 struct request_queue *q = rq->q; 1154 struct request_queue *q = rq->q;
1110 unsigned long flags; 1155 unsigned long flags;
1111 1156
1112 dm_unprep_request(rq);
1113
1114 spin_lock_irqsave(q->queue_lock, flags); 1157 spin_lock_irqsave(q->queue_lock, flags);
1115 blk_requeue_request(q, rq); 1158 blk_requeue_request(q, rq);
1116 spin_unlock_irqrestore(q->queue_lock, flags); 1159 spin_unlock_irqrestore(q->queue_lock, flags);
1160}
1161
1162static void dm_requeue_unmapped_original_request(struct mapped_device *md,
1163 struct request *rq)
1164{
1165 int rw = rq_data_dir(rq);
1166
1167 dm_unprep_request(rq);
1168
1169 if (!rq->q->mq_ops)
1170 old_requeue_request(rq);
1171 else {
1172 blk_mq_requeue_request(rq);
1173 blk_mq_kick_requeue_list(rq->q);
1174 }
1117 1175
1118 rq_completed(md, rw, false); 1176 rq_completed(md, rw, false);
1119} 1177}
@@ -1125,35 +1183,44 @@ static void dm_requeue_unmapped_request(struct request *clone)
1125 dm_requeue_unmapped_original_request(tio->md, tio->orig); 1183 dm_requeue_unmapped_original_request(tio->md, tio->orig);
1126} 1184}
1127 1185
1128static void __stop_queue(struct request_queue *q) 1186static void old_stop_queue(struct request_queue *q)
1129{
1130 blk_stop_queue(q);
1131}
1132
1133static void stop_queue(struct request_queue *q)
1134{ 1187{
1135 unsigned long flags; 1188 unsigned long flags;
1136 1189
1190 if (blk_queue_stopped(q))
1191 return;
1192
1137 spin_lock_irqsave(q->queue_lock, flags); 1193 spin_lock_irqsave(q->queue_lock, flags);
1138 __stop_queue(q); 1194 blk_stop_queue(q);
1139 spin_unlock_irqrestore(q->queue_lock, flags); 1195 spin_unlock_irqrestore(q->queue_lock, flags);
1140} 1196}
1141 1197
1142static void __start_queue(struct request_queue *q) 1198static void stop_queue(struct request_queue *q)
1143{ 1199{
1144 if (blk_queue_stopped(q)) 1200 if (!q->mq_ops)
1145 blk_start_queue(q); 1201 old_stop_queue(q);
1202 else
1203 blk_mq_stop_hw_queues(q);
1146} 1204}
1147 1205
1148static void start_queue(struct request_queue *q) 1206static void old_start_queue(struct request_queue *q)
1149{ 1207{
1150 unsigned long flags; 1208 unsigned long flags;
1151 1209
1152 spin_lock_irqsave(q->queue_lock, flags); 1210 spin_lock_irqsave(q->queue_lock, flags);
1153 __start_queue(q); 1211 if (blk_queue_stopped(q))
1212 blk_start_queue(q);
1154 spin_unlock_irqrestore(q->queue_lock, flags); 1213 spin_unlock_irqrestore(q->queue_lock, flags);
1155} 1214}
1156 1215
1216static void start_queue(struct request_queue *q)
1217{
1218 if (!q->mq_ops)
1219 old_start_queue(q);
1220 else
1221 blk_mq_start_stopped_hw_queues(q, true);
1222}
1223
1157static void dm_done(struct request *clone, int error, bool mapped) 1224static void dm_done(struct request *clone, int error, bool mapped)
1158{ 1225{
1159 int r = error; 1226 int r = error;
@@ -1192,13 +1259,20 @@ static void dm_done(struct request *clone, int error, bool mapped)
1192static void dm_softirq_done(struct request *rq) 1259static void dm_softirq_done(struct request *rq)
1193{ 1260{
1194 bool mapped = true; 1261 bool mapped = true;
1195 struct dm_rq_target_io *tio = rq->special; 1262 struct dm_rq_target_io *tio = tio_from_request(rq);
1196 struct request *clone = tio->clone; 1263 struct request *clone = tio->clone;
1264 int rw;
1197 1265
1198 if (!clone) { 1266 if (!clone) {
1199 blk_end_request_all(rq, tio->error); 1267 rw = rq_data_dir(rq);
1200 rq_completed(tio->md, rq_data_dir(rq), false); 1268 if (!rq->q->mq_ops) {
1201 free_rq_tio(tio); 1269 blk_end_request_all(rq, tio->error);
1270 rq_completed(tio->md, rw, false);
1271 free_rq_tio(tio);
1272 } else {
1273 blk_mq_end_request(rq, tio->error);
1274 rq_completed(tio->md, rw, false);
1275 }
1202 return; 1276 return;
1203 } 1277 }
1204 1278
@@ -1214,7 +1288,7 @@ static void dm_softirq_done(struct request *rq)
1214 */ 1288 */
1215static void dm_complete_request(struct request *rq, int error) 1289static void dm_complete_request(struct request *rq, int error)
1216{ 1290{
1217 struct dm_rq_target_io *tio = rq->special; 1291 struct dm_rq_target_io *tio = tio_from_request(rq);
1218 1292
1219 tio->error = error; 1293 tio->error = error;
1220 blk_complete_request(rq); 1294 blk_complete_request(rq);
@@ -1233,7 +1307,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error)
1233} 1307}
1234 1308
1235/* 1309/*
1236 * Called with the clone's queue lock held 1310 * Called with the clone's queue lock held (for non-blk-mq)
1237 */ 1311 */
1238static void end_clone_request(struct request *clone, int error) 1312static void end_clone_request(struct request *clone, int error)
1239{ 1313{
@@ -1693,7 +1767,7 @@ out:
1693 * The request function that just remaps the bio built up by 1767 * The request function that just remaps the bio built up by
1694 * dm_merge_bvec. 1768 * dm_merge_bvec.
1695 */ 1769 */
1696static void _dm_request(struct request_queue *q, struct bio *bio) 1770static void dm_make_request(struct request_queue *q, struct bio *bio)
1697{ 1771{
1698 int rw = bio_data_dir(bio); 1772 int rw = bio_data_dir(bio);
1699 struct mapped_device *md = q->queuedata; 1773 struct mapped_device *md = q->queuedata;
@@ -1725,16 +1799,6 @@ int dm_request_based(struct mapped_device *md)
1725 return blk_queue_stackable(md->queue); 1799 return blk_queue_stackable(md->queue);
1726} 1800}
1727 1801
1728static void dm_request(struct request_queue *q, struct bio *bio)
1729{
1730 struct mapped_device *md = q->queuedata;
1731
1732 if (dm_request_based(md))
1733 blk_queue_bio(q, bio);
1734 else
1735 _dm_request(q, bio);
1736}
1737
1738static void dm_dispatch_clone_request(struct request *clone, struct request *rq) 1802static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
1739{ 1803{
1740 int r; 1804 int r;
@@ -1787,15 +1851,25 @@ static int setup_clone(struct request *clone, struct request *rq,
1787static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1851static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1788 struct dm_rq_target_io *tio, gfp_t gfp_mask) 1852 struct dm_rq_target_io *tio, gfp_t gfp_mask)
1789{ 1853{
1790 struct request *clone = alloc_clone_request(md, gfp_mask); 1854 /*
1855 * Do not allocate a clone if tio->clone was already set
1856 * (see: dm_mq_queue_rq).
1857 */
1858 bool alloc_clone = !tio->clone;
1859 struct request *clone;
1791 1860
1792 if (!clone) 1861 if (alloc_clone) {
1793 return NULL; 1862 clone = alloc_clone_request(md, gfp_mask);
1863 if (!clone)
1864 return NULL;
1865 } else
1866 clone = tio->clone;
1794 1867
1795 blk_rq_init(NULL, clone); 1868 blk_rq_init(NULL, clone);
1796 if (setup_clone(clone, rq, tio, gfp_mask)) { 1869 if (setup_clone(clone, rq, tio, gfp_mask)) {
1797 /* -ENOMEM */ 1870 /* -ENOMEM */
1798 free_clone_request(md, clone); 1871 if (alloc_clone)
1872 free_clone_request(md, clone);
1799 return NULL; 1873 return NULL;
1800 } 1874 }
1801 1875
@@ -1804,6 +1878,19 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1804 1878
1805static void map_tio_request(struct kthread_work *work); 1879static void map_tio_request(struct kthread_work *work);
1806 1880
1881static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
1882 struct mapped_device *md)
1883{
1884 tio->md = md;
1885 tio->ti = NULL;
1886 tio->clone = NULL;
1887 tio->orig = rq;
1888 tio->error = 0;
1889 memset(&tio->info, 0, sizeof(tio->info));
1890 if (md->kworker_task)
1891 init_kthread_work(&tio->work, map_tio_request);
1892}
1893
1807static struct dm_rq_target_io *prep_tio(struct request *rq, 1894static struct dm_rq_target_io *prep_tio(struct request *rq,
1808 struct mapped_device *md, gfp_t gfp_mask) 1895 struct mapped_device *md, gfp_t gfp_mask)
1809{ 1896{
@@ -1815,13 +1902,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq,
1815 if (!tio) 1902 if (!tio)
1816 return NULL; 1903 return NULL;
1817 1904
1818 tio->md = md; 1905 init_tio(tio, rq, md);
1819 tio->ti = NULL;
1820 tio->clone = NULL;
1821 tio->orig = rq;
1822 tio->error = 0;
1823 memset(&tio->info, 0, sizeof(tio->info));
1824 init_kthread_work(&tio->work, map_tio_request);
1825 1906
1826 table = dm_get_live_table(md, &srcu_idx); 1907 table = dm_get_live_table(md, &srcu_idx);
1827 if (!dm_table_mq_request_based(table)) { 1908 if (!dm_table_mq_request_based(table)) {
@@ -1865,11 +1946,11 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
1865 * DM_MAPIO_REQUEUE : the original request needs to be requeued 1946 * DM_MAPIO_REQUEUE : the original request needs to be requeued
1866 * < 0 : the request was completed due to failure 1947 * < 0 : the request was completed due to failure
1867 */ 1948 */
1868static int map_request(struct dm_target *ti, struct request *rq, 1949static int map_request(struct dm_rq_target_io *tio, struct request *rq,
1869 struct mapped_device *md) 1950 struct mapped_device *md)
1870{ 1951{
1871 int r; 1952 int r;
1872 struct dm_rq_target_io *tio = rq->special; 1953 struct dm_target *ti = tio->ti;
1873 struct request *clone = NULL; 1954 struct request *clone = NULL;
1874 1955
1875 if (tio->clone) { 1956 if (tio->clone) {
@@ -1884,7 +1965,7 @@ static int map_request(struct dm_target *ti, struct request *rq,
1884 } 1965 }
1885 if (IS_ERR(clone)) 1966 if (IS_ERR(clone))
1886 return DM_MAPIO_REQUEUE; 1967 return DM_MAPIO_REQUEUE;
1887 if (setup_clone(clone, rq, tio, GFP_KERNEL)) { 1968 if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
1888 /* -ENOMEM */ 1969 /* -ENOMEM */
1889 ti->type->release_clone_rq(clone); 1970 ti->type->release_clone_rq(clone);
1890 return DM_MAPIO_REQUEUE; 1971 return DM_MAPIO_REQUEUE;
@@ -1925,15 +2006,24 @@ static void map_tio_request(struct kthread_work *work)
1925 struct request *rq = tio->orig; 2006 struct request *rq = tio->orig;
1926 struct mapped_device *md = tio->md; 2007 struct mapped_device *md = tio->md;
1927 2008
1928 if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE) 2009 if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
1929 dm_requeue_unmapped_original_request(md, rq); 2010 dm_requeue_unmapped_original_request(md, rq);
1930} 2011}
1931 2012
1932static void dm_start_request(struct mapped_device *md, struct request *orig) 2013static void dm_start_request(struct mapped_device *md, struct request *orig)
1933{ 2014{
1934 blk_start_request(orig); 2015 if (!orig->q->mq_ops)
2016 blk_start_request(orig);
2017 else
2018 blk_mq_start_request(orig);
1935 atomic_inc(&md->pending[rq_data_dir(orig)]); 2019 atomic_inc(&md->pending[rq_data_dir(orig)]);
1936 2020
2021 if (md->seq_rq_merge_deadline_usecs) {
2022 md->last_rq_pos = rq_end_sector(orig);
2023 md->last_rq_rw = rq_data_dir(orig);
2024 md->last_rq_start_time = ktime_get();
2025 }
2026
1937 /* 2027 /*
1938 * Hold the md reference here for the in-flight I/O. 2028 * Hold the md reference here for the in-flight I/O.
1939 * We can't rely on the reference count by device opener, 2029 * We can't rely on the reference count by device opener,
@@ -1944,6 +2034,45 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
1944 dm_get(md); 2034 dm_get(md);
1945} 2035}
1946 2036
2037#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
2038
2039ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
2040{
2041 return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
2042}
2043
2044ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
2045 const char *buf, size_t count)
2046{
2047 unsigned deadline;
2048
2049 if (!dm_request_based(md) || md->use_blk_mq)
2050 return count;
2051
2052 if (kstrtouint(buf, 10, &deadline))
2053 return -EINVAL;
2054
2055 if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
2056 deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
2057
2058 md->seq_rq_merge_deadline_usecs = deadline;
2059
2060 return count;
2061}
2062
2063static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
2064{
2065 ktime_t kt_deadline;
2066
2067 if (!md->seq_rq_merge_deadline_usecs)
2068 return false;
2069
2070 kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
2071 kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
2072
2073 return !ktime_after(ktime_get(), kt_deadline);
2074}
2075
1947/* 2076/*
1948 * q->request_fn for request-based dm. 2077 * q->request_fn for request-based dm.
1949 * Called with the queue lock held. 2078 * Called with the queue lock held.
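
Taken together with the check added to dm_request_fn() in the next hunk, the deadline works as follows (illustrative numbers, assuming HZ=1000): suppose rq_based_seq_io_merge_deadline has been set to 8000 via sysfs (it stays 0, i.e. disabled, by default). When a write is started, dm_start_request() records its end sector, direction and start time t0. If another single-bvec write for exactly that sector is peeked at t0 + 3 ms, dm_request_peeked_before_merge_deadline() sees that the 8000 us deadline has not yet lapsed, and dm_request_fn() jumps to delay_and_out, re-running the queue after blk_delay_queue(q, HZ / 100), roughly 10 ms here, so the block layer gets a chance to merge the sequential I/O rather than dispatching an undersized request. Once the deadline passes, or if md_in_flight() reports nothing outstanding, the heuristic no longer holds the request back.
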
@@ -1967,7 +2096,7 @@ static void dm_request_fn(struct request_queue *q)
1967 while (!blk_queue_stopped(q)) { 2096 while (!blk_queue_stopped(q)) {
1968 rq = blk_peek_request(q); 2097 rq = blk_peek_request(q);
1969 if (!rq) 2098 if (!rq)
1970 goto delay_and_out; 2099 goto out;
1971 2100
1972 /* always use block 0 to find the target for flushes for now */ 2101 /* always use block 0 to find the target for flushes for now */
1973 pos = 0; 2102 pos = 0;
@@ -1986,12 +2115,17 @@ static void dm_request_fn(struct request_queue *q)
1986 continue; 2115 continue;
1987 } 2116 }
1988 2117
2118 if (dm_request_peeked_before_merge_deadline(md) &&
2119 md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
2120 md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
2121 goto delay_and_out;
2122
1989 if (ti->type->busy && ti->type->busy(ti)) 2123 if (ti->type->busy && ti->type->busy(ti))
1990 goto delay_and_out; 2124 goto delay_and_out;
1991 2125
1992 dm_start_request(md, rq); 2126 dm_start_request(md, rq);
1993 2127
1994 tio = rq->special; 2128 tio = tio_from_request(rq);
1995 /* Establish tio->ti before queuing work (map_tio_request) */ 2129 /* Establish tio->ti before queuing work (map_tio_request) */
1996 tio->ti = ti; 2130 tio->ti = ti;
1997 queue_kthread_work(&md->kworker, &tio->work); 2131 queue_kthread_work(&md->kworker, &tio->work);
@@ -2001,33 +2135,11 @@ static void dm_request_fn(struct request_queue *q)
2001 goto out; 2135 goto out;
2002 2136
2003delay_and_out: 2137delay_and_out:
2004 blk_delay_queue(q, HZ / 10); 2138 blk_delay_queue(q, HZ / 100);
2005out: 2139out:
2006 dm_put_live_table(md, srcu_idx); 2140 dm_put_live_table(md, srcu_idx);
2007} 2141}
2008 2142
2009int dm_underlying_device_busy(struct request_queue *q)
2010{
2011 return blk_lld_busy(q);
2012}
2013EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
2014
2015static int dm_lld_busy(struct request_queue *q)
2016{
2017 int r;
2018 struct mapped_device *md = q->queuedata;
2019 struct dm_table *map = dm_get_live_table_fast(md);
2020
2021 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
2022 r = 1;
2023 else
2024 r = dm_table_any_busy_target(map);
2025
2026 dm_put_live_table_fast(md);
2027
2028 return r;
2029}
2030
2031static int dm_any_congested(void *congested_data, int bdi_bits) 2143static int dm_any_congested(void *congested_data, int bdi_bits)
2032{ 2144{
2033 int r = bdi_bits; 2145 int r = bdi_bits;
@@ -2110,7 +2222,7 @@ static void dm_init_md_queue(struct mapped_device *md)
2110{ 2222{
2111 /* 2223 /*
2112 * Request-based dm devices cannot be stacked on top of bio-based dm 2224 * Request-based dm devices cannot be stacked on top of bio-based dm
2113 * devices. The type of this dm device has not been decided yet. 2225 * devices. The type of this dm device may not have been decided yet.
2114 * The type is decided at the first table loading time. 2226 * The type is decided at the first table loading time.
2115 * To prevent problematic device stacking, clear the queue flag 2227 * To prevent problematic device stacking, clear the queue flag
2116 * for request stacking support until then. 2228 * for request stacking support until then.
@@ -2118,13 +2230,21 @@ static void dm_init_md_queue(struct mapped_device *md)
2118 * This queue is new, so no concurrency on the queue_flags. 2230 * This queue is new, so no concurrency on the queue_flags.
2119 */ 2231 */
2120 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 2232 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
2233}
2234
2235static void dm_init_old_md_queue(struct mapped_device *md)
2236{
2237 md->use_blk_mq = false;
2238 dm_init_md_queue(md);
2121 2239
2240 /*
2241 * Initialize aspects of queue that aren't relevant for blk-mq
2242 */
2122 md->queue->queuedata = md; 2243 md->queue->queuedata = md;
2123 md->queue->backing_dev_info.congested_fn = dm_any_congested; 2244 md->queue->backing_dev_info.congested_fn = dm_any_congested;
2124 md->queue->backing_dev_info.congested_data = md; 2245 md->queue->backing_dev_info.congested_data = md;
2125 blk_queue_make_request(md->queue, dm_request); 2246
2126 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 2247 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
2127 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
2128} 2248}
2129 2249
2130/* 2250/*
@@ -2156,6 +2276,7 @@ static struct mapped_device *alloc_dev(int minor)
2156 if (r < 0) 2276 if (r < 0)
2157 goto bad_io_barrier; 2277 goto bad_io_barrier;
2158 2278
2279 md->use_blk_mq = use_blk_mq;
2159 md->type = DM_TYPE_NONE; 2280 md->type = DM_TYPE_NONE;
2160 mutex_init(&md->suspend_lock); 2281 mutex_init(&md->suspend_lock);
2161 mutex_init(&md->type_lock); 2282 mutex_init(&md->type_lock);
@@ -2267,6 +2388,8 @@ static void free_dev(struct mapped_device *md)
2267 del_gendisk(md->disk); 2388 del_gendisk(md->disk);
2268 put_disk(md->disk); 2389 put_disk(md->disk);
2269 blk_cleanup_queue(md->queue); 2390 blk_cleanup_queue(md->queue);
2391 if (md->use_blk_mq)
2392 blk_mq_free_tag_set(&md->tag_set);
2270 bdput(md->bdev); 2393 bdput(md->bdev);
2271 free_minor(minor); 2394 free_minor(minor);
2272 2395
@@ -2278,7 +2401,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
2278{ 2401{
2279 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 2402 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2280 2403
2281 if (md->io_pool && md->bs) { 2404 if (md->bs) {
2282 /* The md already has necessary mempools. */ 2405 /* The md already has necessary mempools. */
2283 if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) { 2406 if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
2284 /* 2407 /*
@@ -2310,7 +2433,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
2310 p->bs = NULL; 2433 p->bs = NULL;
2311 2434
2312out: 2435out:
2313 /* mempool bind completed, now no need any mempools in the table */ 2436 /* mempool bind completed, no longer need any mempools in the table */
2314 dm_table_free_md_mempools(t); 2437 dm_table_free_md_mempools(t);
2315} 2438}
2316 2439
@@ -2357,7 +2480,7 @@ int dm_queue_merge_is_compulsory(struct request_queue *q)
2357 if (!q->merge_bvec_fn) 2480 if (!q->merge_bvec_fn)
2358 return 0; 2481 return 0;
2359 2482
2360 if (q->make_request_fn == dm_request) { 2483 if (q->make_request_fn == dm_make_request) {
2361 dev_md = q->queuedata; 2484 dev_md = q->queuedata;
2362 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) 2485 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
2363 return 0; 2486 return 0;
@@ -2426,7 +2549,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2426 * This must be done before setting the queue restrictions, 2549 * This must be done before setting the queue restrictions,
2427 * because request-based dm may be run just after the setting. 2550 * because request-based dm may be run just after the setting.
2428 */ 2551 */
2429 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2552 if (dm_table_request_based(t))
2430 stop_queue(q); 2553 stop_queue(q);
2431 2554
2432 __bind_mempools(md, t); 2555 __bind_mempools(md, t);
@@ -2508,14 +2631,6 @@ unsigned dm_get_md_type(struct mapped_device *md)
2508 return md->type; 2631 return md->type;
2509} 2632}
2510 2633
2511static bool dm_md_type_request_based(struct mapped_device *md)
2512{
2513 unsigned table_type = dm_get_md_type(md);
2514
2515 return (table_type == DM_TYPE_REQUEST_BASED ||
2516 table_type == DM_TYPE_MQ_REQUEST_BASED);
2517}
2518
2519struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2634struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2520{ 2635{
2521 return md->immutable_target_type; 2636 return md->immutable_target_type;
@@ -2532,6 +2647,14 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2532} 2647}
2533EXPORT_SYMBOL_GPL(dm_get_queue_limits); 2648EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2534 2649
2650static void init_rq_based_worker_thread(struct mapped_device *md)
2651{
2652 /* Initialize the request-based DM worker thread */
2653 init_kthread_worker(&md->kworker);
2654 md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
2655 "kdmwork-%s", dm_device_name(md));
2656}
2657
2535/* 2658/*
2536 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 2659 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2537 */ 2660 */
@@ -2540,27 +2663,160 @@ static int dm_init_request_based_queue(struct mapped_device *md)
2540 struct request_queue *q = NULL; 2663 struct request_queue *q = NULL;
2541 2664
2542 if (md->queue->elevator) 2665 if (md->queue->elevator)
2543 return 1; 2666 return 0;
2544 2667
2545 /* Fully initialize the queue */ 2668 /* Fully initialize the queue */
2546 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2669 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2547 if (!q) 2670 if (!q)
2548 return 0; 2671 return -EINVAL;
2672
2673 /* disable dm_request_fn's merge heuristic by default */
2674 md->seq_rq_merge_deadline_usecs = 0;
2549 2675
2550 md->queue = q; 2676 md->queue = q;
2551 dm_init_md_queue(md); 2677 dm_init_old_md_queue(md);
2552 blk_queue_softirq_done(md->queue, dm_softirq_done); 2678 blk_queue_softirq_done(md->queue, dm_softirq_done);
2553 blk_queue_prep_rq(md->queue, dm_prep_fn); 2679 blk_queue_prep_rq(md->queue, dm_prep_fn);
2554 blk_queue_lld_busy(md->queue, dm_lld_busy);
2555 2680
2556 /* Also initialize the request-based DM worker thread */ 2681 init_rq_based_worker_thread(md);
2557 init_kthread_worker(&md->kworker);
2558 md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
2559 "kdmwork-%s", dm_device_name(md));
2560 2682
2561 elv_register_queue(md->queue); 2683 elv_register_queue(md->queue);
2562 2684
2563 return 1; 2685 return 0;
2686}
2687
2688static int dm_mq_init_request(void *data, struct request *rq,
2689 unsigned int hctx_idx, unsigned int request_idx,
2690 unsigned int numa_node)
2691{
2692 struct mapped_device *md = data;
2693 struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
2694
2695 /*
2696 * Must initialize md member of tio, otherwise it won't
2697 * be available in dm_mq_queue_rq.
2698 */
2699 tio->md = md;
2700
2701 return 0;
2702}
2703
2704static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
2705 const struct blk_mq_queue_data *bd)
2706{
2707 struct request *rq = bd->rq;
2708 struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
2709 struct mapped_device *md = tio->md;
2710 int srcu_idx;
2711 struct dm_table *map = dm_get_live_table(md, &srcu_idx);
2712 struct dm_target *ti;
2713 sector_t pos;
2714
2715 /* always use block 0 to find the target for flushes for now */
2716 pos = 0;
2717 if (!(rq->cmd_flags & REQ_FLUSH))
2718 pos = blk_rq_pos(rq);
2719
2720 ti = dm_table_find_target(map, pos);
2721 if (!dm_target_is_valid(ti)) {
2722 dm_put_live_table(md, srcu_idx);
2723 DMERR_LIMIT("request attempted access beyond the end of device");
2724 /*
2725		 * Must perform the setup that rq_completed() requires
2726		 * before returning BLK_MQ_RQ_QUEUE_ERROR.
2727 */
2728 dm_start_request(md, rq);
2729 return BLK_MQ_RQ_QUEUE_ERROR;
2730 }
2731 dm_put_live_table(md, srcu_idx);
2732
2733 if (ti->type->busy && ti->type->busy(ti))
2734 return BLK_MQ_RQ_QUEUE_BUSY;
2735
2736 dm_start_request(md, rq);
2737
2738 /* Init tio using md established in .init_request */
2739 init_tio(tio, rq, md);
2740
2741 /*
2742 * Establish tio->ti before queuing work (map_tio_request)
2743 * or making direct call to map_request().
2744 */
2745 tio->ti = ti;
2746
2747 /* Clone the request if underlying devices aren't blk-mq */
2748 if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
2749 /* clone request is allocated at the end of the pdu */
2750 tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
2751 if (!clone_rq(rq, md, tio, GFP_ATOMIC))
2752 return BLK_MQ_RQ_QUEUE_BUSY;
2753 queue_kthread_work(&md->kworker, &tio->work);
2754 } else {
2755 /* Direct call is fine since .queue_rq allows allocations */
2756 if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
2757 dm_requeue_unmapped_original_request(md, rq);
2758 }
2759
2760 return BLK_MQ_RQ_QUEUE_OK;
2761}
2762
2763static struct blk_mq_ops dm_mq_ops = {
2764 .queue_rq = dm_mq_queue_rq,
2765 .map_queue = blk_mq_map_queue,
2766 .complete = dm_softirq_done,
2767 .init_request = dm_mq_init_request,
2768};
2769
2770static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
2771{
2772 unsigned md_type = dm_get_md_type(md);
2773 struct request_queue *q;
2774 int err;
2775
2776 memset(&md->tag_set, 0, sizeof(md->tag_set));
2777 md->tag_set.ops = &dm_mq_ops;
2778 md->tag_set.queue_depth = BLKDEV_MAX_RQ;
2779 md->tag_set.numa_node = NUMA_NO_NODE;
2780 md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
2781 md->tag_set.nr_hw_queues = 1;
2782 if (md_type == DM_TYPE_REQUEST_BASED) {
2783 /* make the memory for non-blk-mq clone part of the pdu */
2784 md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
2785 } else
2786 md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
2787 md->tag_set.driver_data = md;
2788
2789 err = blk_mq_alloc_tag_set(&md->tag_set);
2790 if (err)
2791 return err;
2792
2793 q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
2794 if (IS_ERR(q)) {
2795 err = PTR_ERR(q);
2796 goto out_tag_set;
2797 }
2798 md->queue = q;
2799 dm_init_md_queue(md);
2800
2801 /* backfill 'mq' sysfs registration normally done in blk_register_queue */
2802 blk_mq_register_disk(md->disk);
2803
2804 if (md_type == DM_TYPE_REQUEST_BASED)
2805 init_rq_based_worker_thread(md);
2806
2807 return 0;
2808
2809out_tag_set:
2810 blk_mq_free_tag_set(&md->tag_set);
2811 return err;
2812}
2813
2814static unsigned filter_md_type(unsigned type, struct mapped_device *md)
2815{
2816 if (type == DM_TYPE_BIO_BASED)
2817 return type;
2818
2819 return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
2564} 2820}
2565 2821
2566/* 2822/*
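[Editor's note] The hunk above wires DM into blk-mq: it fills a blk_mq_tag_set, points .queue_rq at dm_mq_queue_rq(), and uses cmd_size so each preallocated request carries its dm_rq_target_io (plus, for the non-blk-mq underlying path, an embedded clone request) reachable via blk_mq_rq_to_pdu(). For readers less familiar with that registration pattern, here is a minimal, self-contained sketch of the same idioms in a hypothetical driver; all my_* names are illustrative and this is not DM code, only the circa-4.0/4.1 blk-mq API shape that the hunk follows.

/*
 * Illustrative sketch only: a trivial blk-mq driver that completes every
 * request immediately.  Mirrors the tag_set/ops/cmd_size pattern used by
 * dm_init_request_based_blk_mq_queue() above; "my_*" names are hypothetical.
 */
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/err.h>

struct my_rq_pdu {                      /* per-request data, like dm_rq_target_io */
        int flags;
};

static int my_queue_rq(struct blk_mq_hw_ctx *hctx,
                       const struct blk_mq_queue_data *bd)
{
        struct request *rq = bd->rq;
        struct my_rq_pdu *pdu = blk_mq_rq_to_pdu(rq);   /* lives in the cmd_size area */

        pdu->flags = 0;
        blk_mq_start_request(rq);
        /* a real driver would dispatch rq to hardware or a lower device here */
        blk_mq_end_request(rq, 0);
        return BLK_MQ_RQ_QUEUE_OK;
}

static int my_init_request(void *data, struct request *rq,
                           unsigned int hctx_idx, unsigned int request_idx,
                           unsigned int numa_node)
{
        /* called once per preallocated request; stash driver context if needed */
        return 0;
}

static struct blk_mq_ops my_mq_ops = {
        .queue_rq     = my_queue_rq,
        .map_queue    = blk_mq_map_queue,
        .init_request = my_init_request,
};

static struct blk_mq_tag_set my_tag_set;

static struct request_queue *my_create_queue(void *driver_data)
{
        struct request_queue *q;
        int err;

        memset(&my_tag_set, 0, sizeof(my_tag_set));
        my_tag_set.ops = &my_mq_ops;
        my_tag_set.nr_hw_queues = 1;
        my_tag_set.queue_depth = 64;
        my_tag_set.numa_node = NUMA_NO_NODE;
        my_tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        my_tag_set.cmd_size = sizeof(struct my_rq_pdu); /* per-request pdu */
        my_tag_set.driver_data = driver_data;

        err = blk_mq_alloc_tag_set(&my_tag_set);
        if (err)
                return ERR_PTR(err);

        q = blk_mq_init_queue(&my_tag_set);     /* DM uses the *_allocated_queue variant */
        if (IS_ERR(q))
                blk_mq_free_tag_set(&my_tag_set);
        return q;
}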
@@ -2568,9 +2824,29 @@ static int dm_init_request_based_queue(struct mapped_device *md)
  */
 int dm_setup_md_queue(struct mapped_device *md)
 {
-        if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) {
-                DMWARN("Cannot initialize queue for request-based mapped device");
-                return -EINVAL;
+        int r;
+        unsigned md_type = filter_md_type(dm_get_md_type(md), md);
+
+        switch (md_type) {
+        case DM_TYPE_REQUEST_BASED:
+                r = dm_init_request_based_queue(md);
+                if (r) {
+                        DMWARN("Cannot initialize queue for request-based mapped device");
+                        return r;
+                }
+                break;
+        case DM_TYPE_MQ_REQUEST_BASED:
+                r = dm_init_request_based_blk_mq_queue(md);
+                if (r) {
+                        DMWARN("Cannot initialize queue for request-based blk-mq mapped device");
+                        return r;
+                }
+                break;
+        case DM_TYPE_BIO_BASED:
+                dm_init_old_md_queue(md);
+                blk_queue_make_request(md->queue, dm_make_request);
+                blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+                break;
         }
 
         return 0;
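[Editor's note] dm_setup_md_queue() now dispatches on the blk-mq-filtered device type: the two request-based cases build a request queue, while the DM_TYPE_BIO_BASED case only registers a make_request function (and a merge_bvec hook). As a reminder of what that bio-based registration pattern looks like in isolation, here is a hedged sketch for a hypothetical driver using the ~4.1-era make_request_fn signature (void return, two-argument bio_endio); the my_* names are illustrative, not DM's.

/*
 * Illustrative sketch only: make_request-style registration, as performed
 * by the DM_TYPE_BIO_BASED branch above, for a hypothetical bio-based driver.
 */
#include <linux/blkdev.h>
#include <linux/bio.h>

static void my_make_request(struct request_queue *q, struct bio *bio)
{
        /* remap, split or complete the bio here; no struct request involved */
        bio_endio(bio, 0);              /* 4.1-era signature: (bio, error) */
}

static struct request_queue *my_bio_based_queue(void)
{
        struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

        if (!q)
                return NULL;
        blk_queue_make_request(q, my_make_request);
        return q;
}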
@@ -2654,7 +2930,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
         set_bit(DMF_FREEING, &md->flags);
         spin_unlock(&_minor_lock);
 
-        if (dm_request_based(md))
+        if (dm_request_based(md) && md->kworker_task)
                 flush_kthread_worker(&md->kworker);
 
         /*
@@ -2908,7 +3184,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
          */
         if (dm_request_based(md)) {
                 stop_queue(md->queue);
-                flush_kthread_worker(&md->kworker);
+                if (md->kworker_task)
+                        flush_kthread_worker(&md->kworker);
         }
 
         flush_workqueue(md->wq);
@@ -3206,6 +3483,7 @@ struct gendisk *dm_disk(struct mapped_device *md)
 {
         return md->disk;
 }
+EXPORT_SYMBOL_GPL(dm_disk);
 
 struct kobject *dm_kobject(struct mapped_device *md)
 {
@@ -3253,16 +3531,19 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+                                            unsigned integrity, unsigned per_bio_data_size)
 {
         struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-        struct kmem_cache *cachep;
+        struct kmem_cache *cachep = NULL;
         unsigned int pool_size = 0;
         unsigned int front_pad;
 
         if (!pools)
                 return NULL;
 
+        type = filter_md_type(type, md);
+
         switch (type) {
         case DM_TYPE_BIO_BASED:
                 cachep = _io_cache;
@@ -3270,13 +3551,13 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
                 front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
                 break;
         case DM_TYPE_REQUEST_BASED:
+                cachep = _rq_tio_cache;
                 pool_size = dm_get_reserved_rq_based_ios();
                 pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
                 if (!pools->rq_pool)
                         goto out;
                 /* fall through to setup remaining rq-based pools */
         case DM_TYPE_MQ_REQUEST_BASED:
-                cachep = _rq_tio_cache;
                 if (!pool_size)
                         pool_size = dm_get_reserved_rq_based_ios();
                 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
@@ -3284,12 +3565,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
                 WARN_ON(per_bio_data_size != 0);
                 break;
         default:
-                goto out;
+                BUG();
         }
 
-        pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
-        if (!pools->io_pool)
-                goto out;
+        if (cachep) {
+                pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
+                if (!pools->io_pool)
+                        goto out;
+        }
 
         pools->bs = bioset_create_nobvec(pool_size, front_pad);
         if (!pools->bs)
@@ -3346,6 +3629,9 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
 
+module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
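[Editor's note] The new use_blk_mq parameter is declared with S_IRUGO | S_IWUSR, so it can be set at load/boot time (e.g. dm_mod.use_blk_mq=y, assuming the usual dm_mod module name) or toggled later through /sys/module/dm_mod/parameters/use_blk_mq; filter_md_type() above consults the per-device md->use_blk_mq copy when deciding between DM_TYPE_REQUEST_BASED and DM_TYPE_MQ_REQUEST_BASED. The declarations that back this live earlier in dm.c and are not part of these hunks, so the following is only a hedged sketch of the usual shape of such wiring, with my_dev standing in for the opaque mapped_device.

/*
 * Sketch (assumption, not the actual dm.c code): a writable bool module
 * parameter backing a per-device flag that is snapshotted at creation time.
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/stat.h>

static bool use_blk_mq;                 /* e.g. dm_mod.use_blk_mq=y */
module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");

struct my_dev {                         /* stand-in for the opaque mapped_device */
        bool use_blk_mq;
};

static void my_dev_init(struct my_dev *d)
{
        /* capture the global at device-creation time (assumed behaviour) */
        d->use_blk_mq = use_blk_mq;
}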
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 59f53e79db82..6123c2bf9150 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -70,7 +70,6 @@ void dm_table_presuspend_undo_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
 int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
-int dm_table_any_busy_target(struct dm_table *t);
 unsigned dm_table_get_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
@@ -212,6 +211,8 @@ int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
 void dm_internal_suspend(struct mapped_device *md);
 void dm_internal_resume(struct mapped_device *md);
 
+bool dm_use_blk_mq(struct mapped_device *md);
+
 int dm_io_init(void);
 void dm_io_exit(void);
 
@@ -221,7 +222,8 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+                                            unsigned integrity, unsigned per_bio_data_size);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 /*
@@ -235,4 +237,8 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen
         return !maxlen || strlen(result) + 1 >= maxlen;
 }
 
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+                                                     const char *buf, size_t count);
+
 #endif
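[Editor's note] The dm.h hunk above exports a show/store pair so the new writable rq_based_seq_io_merge_deadline sysfs attribute can be hooked up from dm-sysfs.c. The real handlers live in dm.c and are not shown in these hunks; what follows is only a hedged sketch of the conventional shape of such a pair (kstrtouint parse, clamp, echo back), with my_dev, the field name, and the 100000 bound all hypothetical.

/*
 * Illustrative sketch only: a typical show/store handler pair for a
 * writable, microsecond-valued sysfs attribute.  Not the dm.c implementation.
 */
#include <linux/kernel.h>
#include <linux/errno.h>

struct my_dev {                         /* stand-in for the opaque mapped_device */
        unsigned deadline_usecs;
};

static ssize_t my_deadline_show(struct my_dev *d, char *buf)
{
        return sprintf(buf, "%u\n", d->deadline_usecs);
}

static ssize_t my_deadline_store(struct my_dev *d, const char *buf, size_t count)
{
        unsigned deadline;

        if (kstrtouint(buf, 10, &deadline))
                return -EINVAL;

        d->deadline_usecs = min(deadline, 100000u);     /* clamp; bound is hypothetical */
        return count;
}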