Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm-bio-prison.c | 186
-rw-r--r--  drivers/md/dm-bio-prison.h | 28
-rw-r--r--  drivers/md/dm-bufio.c | 226
-rw-r--r--  drivers/md/dm-cache-block-types.h | 11
-rw-r--r--  drivers/md/dm-cache-metadata.c | 34
-rw-r--r--  drivers/md/dm-cache-metadata.h | 6
-rw-r--r--  drivers/md/dm-cache-policy-mq.c | 82
-rw-r--r--  drivers/md/dm-cache-target.c | 378
-rw-r--r--  drivers/md/dm-crypt.c | 2
-rw-r--r--  drivers/md/dm-ioctl.c | 5
-rw-r--r--  drivers/md/dm-stats.c | 2
-rw-r--r--  drivers/md/dm-table.c | 36
-rw-r--r--  drivers/md/dm-thin-metadata.c | 35
-rw-r--r--  drivers/md/dm-thin-metadata.h | 9
-rw-r--r--  drivers/md/dm-thin.c | 760
-rw-r--r--  drivers/md/dm.c | 273
-rw-r--r--  drivers/md/dm.h | 10
-rw-r--r--  drivers/md/persistent-data/dm-array.c | 4
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c | 8
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.c | 77
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.h | 7
21 files changed, 1610 insertions(+), 569 deletions(-)
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index f752d12081ff..be065300e93c 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -14,68 +14,38 @@
14 14
15/*----------------------------------------------------------------*/ 15/*----------------------------------------------------------------*/
16 16
17struct bucket { 17#define MIN_CELLS 1024
18 spinlock_t lock;
19 struct hlist_head cells;
20};
21 18
22struct dm_bio_prison { 19struct dm_bio_prison {
20 spinlock_t lock;
23 mempool_t *cell_pool; 21 mempool_t *cell_pool;
24 22 struct rb_root cells;
25 unsigned nr_buckets;
26 unsigned hash_mask;
27 struct bucket *buckets;
28}; 23};
29 24
30/*----------------------------------------------------------------*/
31
32static uint32_t calc_nr_buckets(unsigned nr_cells)
33{
34 uint32_t n = 128;
35
36 nr_cells /= 4;
37 nr_cells = min(nr_cells, 8192u);
38
39 while (n < nr_cells)
40 n <<= 1;
41
42 return n;
43}
44
45static struct kmem_cache *_cell_cache; 25static struct kmem_cache *_cell_cache;
46 26
47static void init_bucket(struct bucket *b) 27/*----------------------------------------------------------------*/
48{
49 spin_lock_init(&b->lock);
50 INIT_HLIST_HEAD(&b->cells);
51}
52 28
53/* 29/*
54 * @nr_cells should be the number of cells you want in use _concurrently_. 30 * @nr_cells should be the number of cells you want in use _concurrently_.
55 * Don't confuse it with the number of distinct keys. 31 * Don't confuse it with the number of distinct keys.
56 */ 32 */
57struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells) 33struct dm_bio_prison *dm_bio_prison_create(void)
58{ 34{
59 unsigned i; 35 struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
60 uint32_t nr_buckets = calc_nr_buckets(nr_cells);
61 size_t len = sizeof(struct dm_bio_prison) +
62 (sizeof(struct bucket) * nr_buckets);
63 struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL);
64 36
65 if (!prison) 37 if (!prison)
66 return NULL; 38 return NULL;
67 39
68 prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache); 40 spin_lock_init(&prison->lock);
41
42 prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
69 if (!prison->cell_pool) { 43 if (!prison->cell_pool) {
70 kfree(prison); 44 kfree(prison);
71 return NULL; 45 return NULL;
72 } 46 }
73 47
74 prison->nr_buckets = nr_buckets; 48 prison->cells = RB_ROOT;
75 prison->hash_mask = nr_buckets - 1;
76 prison->buckets = (struct bucket *) (prison + 1);
77 for (i = 0; i < nr_buckets; i++)
78 init_bucket(prison->buckets + i);
79 49
80 return prison; 50 return prison;
81} 51}
@@ -101,68 +71,73 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
101} 71}
102EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell); 72EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
103 73
104static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key) 74static void __setup_new_cell(struct dm_cell_key *key,
75 struct bio *holder,
76 struct dm_bio_prison_cell *cell)
105{ 77{
106 const unsigned long BIG_PRIME = 4294967291UL; 78 memcpy(&cell->key, key, sizeof(cell->key));
107 uint64_t hash = key->block * BIG_PRIME; 79 cell->holder = holder;
108 80 bio_list_init(&cell->bios);
109 return (uint32_t) (hash & prison->hash_mask);
110} 81}
111 82
112static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs) 83static int cmp_keys(struct dm_cell_key *lhs,
84 struct dm_cell_key *rhs)
113{ 85{
114 return (lhs->virtual == rhs->virtual) && 86 if (lhs->virtual < rhs->virtual)
115 (lhs->dev == rhs->dev) && 87 return -1;
116 (lhs->block == rhs->block);
117}
118 88
119static struct bucket *get_bucket(struct dm_bio_prison *prison, 89 if (lhs->virtual > rhs->virtual)
120 struct dm_cell_key *key) 90 return 1;
121{
122 return prison->buckets + hash_key(prison, key);
123}
124 91
125static struct dm_bio_prison_cell *__search_bucket(struct bucket *b, 92 if (lhs->dev < rhs->dev)
126 struct dm_cell_key *key) 93 return -1;
127{
128 struct dm_bio_prison_cell *cell;
129 94
130 hlist_for_each_entry(cell, &b->cells, list) 95 if (lhs->dev > rhs->dev)
131 if (keys_equal(&cell->key, key)) 96 return 1;
132 return cell;
133 97
134 return NULL; 98 if (lhs->block_end <= rhs->block_begin)
135} 99 return -1;
136 100
137static void __setup_new_cell(struct bucket *b, 101 if (lhs->block_begin >= rhs->block_end)
138 struct dm_cell_key *key, 102 return 1;
139 struct bio *holder, 103
140 struct dm_bio_prison_cell *cell) 104 return 0;
141{
142 memcpy(&cell->key, key, sizeof(cell->key));
143 cell->holder = holder;
144 bio_list_init(&cell->bios);
145 hlist_add_head(&cell->list, &b->cells);
146} 105}
147 106
148static int __bio_detain(struct bucket *b, 107static int __bio_detain(struct dm_bio_prison *prison,
149 struct dm_cell_key *key, 108 struct dm_cell_key *key,
150 struct bio *inmate, 109 struct bio *inmate,
151 struct dm_bio_prison_cell *cell_prealloc, 110 struct dm_bio_prison_cell *cell_prealloc,
152 struct dm_bio_prison_cell **cell_result) 111 struct dm_bio_prison_cell **cell_result)
153{ 112{
154 struct dm_bio_prison_cell *cell; 113 int r;
155 114 struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
156 cell = __search_bucket(b, key); 115
157 if (cell) { 116 while (*new) {
158 if (inmate) 117 struct dm_bio_prison_cell *cell =
159 bio_list_add(&cell->bios, inmate); 118 container_of(*new, struct dm_bio_prison_cell, node);
160 *cell_result = cell; 119
161 return 1; 120 r = cmp_keys(key, &cell->key);
121
122 parent = *new;
123 if (r < 0)
124 new = &((*new)->rb_left);
125 else if (r > 0)
126 new = &((*new)->rb_right);
127 else {
128 if (inmate)
129 bio_list_add(&cell->bios, inmate);
130 *cell_result = cell;
131 return 1;
132 }
162 } 133 }
163 134
164 __setup_new_cell(b, key, inmate, cell_prealloc); 135 __setup_new_cell(key, inmate, cell_prealloc);
165 *cell_result = cell_prealloc; 136 *cell_result = cell_prealloc;
137
138 rb_link_node(&cell_prealloc->node, parent, new);
139 rb_insert_color(&cell_prealloc->node, &prison->cells);
140
166 return 0; 141 return 0;
167} 142}
168 143
@@ -174,11 +149,10 @@ static int bio_detain(struct dm_bio_prison *prison,
174{ 149{
175 int r; 150 int r;
176 unsigned long flags; 151 unsigned long flags;
177 struct bucket *b = get_bucket(prison, key);
178 152
179 spin_lock_irqsave(&b->lock, flags); 153 spin_lock_irqsave(&prison->lock, flags);
180 r = __bio_detain(b, key, inmate, cell_prealloc, cell_result); 154 r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
181 spin_unlock_irqrestore(&b->lock, flags); 155 spin_unlock_irqrestore(&prison->lock, flags);
182 156
183 return r; 157 return r;
184} 158}
@@ -205,10 +179,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell);
205/* 179/*
206 * @inmates must have been initialised prior to this call 180 * @inmates must have been initialised prior to this call
207 */ 181 */
208static void __cell_release(struct dm_bio_prison_cell *cell, 182static void __cell_release(struct dm_bio_prison *prison,
183 struct dm_bio_prison_cell *cell,
209 struct bio_list *inmates) 184 struct bio_list *inmates)
210{ 185{
211 hlist_del(&cell->list); 186 rb_erase(&cell->node, &prison->cells);
212 187
213 if (inmates) { 188 if (inmates) {
214 if (cell->holder) 189 if (cell->holder)
@@ -222,21 +197,21 @@ void dm_cell_release(struct dm_bio_prison *prison,
222 struct bio_list *bios) 197 struct bio_list *bios)
223{ 198{
224 unsigned long flags; 199 unsigned long flags;
225 struct bucket *b = get_bucket(prison, &cell->key);
226 200
227 spin_lock_irqsave(&b->lock, flags); 201 spin_lock_irqsave(&prison->lock, flags);
228 __cell_release(cell, bios); 202 __cell_release(prison, cell, bios);
229 spin_unlock_irqrestore(&b->lock, flags); 203 spin_unlock_irqrestore(&prison->lock, flags);
230} 204}
231EXPORT_SYMBOL_GPL(dm_cell_release); 205EXPORT_SYMBOL_GPL(dm_cell_release);
232 206
233/* 207/*
234 * Sometimes we don't want the holder, just the additional bios. 208 * Sometimes we don't want the holder, just the additional bios.
235 */ 209 */
236static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, 210static void __cell_release_no_holder(struct dm_bio_prison *prison,
211 struct dm_bio_prison_cell *cell,
237 struct bio_list *inmates) 212 struct bio_list *inmates)
238{ 213{
239 hlist_del(&cell->list); 214 rb_erase(&cell->node, &prison->cells);
240 bio_list_merge(inmates, &cell->bios); 215 bio_list_merge(inmates, &cell->bios);
241} 216}
242 217
@@ -245,11 +220,10 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
245 struct bio_list *inmates) 220 struct bio_list *inmates)
246{ 221{
247 unsigned long flags; 222 unsigned long flags;
248 struct bucket *b = get_bucket(prison, &cell->key);
249 223
250 spin_lock_irqsave(&b->lock, flags); 224 spin_lock_irqsave(&prison->lock, flags);
251 __cell_release_no_holder(cell, inmates); 225 __cell_release_no_holder(prison, cell, inmates);
252 spin_unlock_irqrestore(&b->lock, flags); 226 spin_unlock_irqrestore(&prison->lock, flags);
253} 227}
254EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); 228EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
255 229
@@ -267,6 +241,20 @@ void dm_cell_error(struct dm_bio_prison *prison,
267} 241}
268EXPORT_SYMBOL_GPL(dm_cell_error); 242EXPORT_SYMBOL_GPL(dm_cell_error);
269 243
244void dm_cell_visit_release(struct dm_bio_prison *prison,
245 void (*visit_fn)(void *, struct dm_bio_prison_cell *),
246 void *context,
247 struct dm_bio_prison_cell *cell)
248{
249 unsigned long flags;
250
251 spin_lock_irqsave(&prison->lock, flags);
252 visit_fn(context, cell);
253 rb_erase(&cell->node, &prison->cells);
254 spin_unlock_irqrestore(&prison->lock, flags);
255}
256EXPORT_SYMBOL_GPL(dm_cell_visit_release);
257
270/*----------------------------------------------------------------*/ 258/*----------------------------------------------------------------*/
271 259
272#define DEFERRED_SET_SIZE 64 260#define DEFERRED_SET_SIZE 64
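The dm-bio-prison change above replaces the per-bucket hash table with a single rb-tree whose keys describe block ranges rather than single blocks: cmp_keys() orders cells by (virtual, dev) and then treats any two overlapping ranges as equal, so every bio touching a block inside a detained range joins the same cell. Below is a minimal userspace sketch of that comparison; the struct and field names only mirror dm_cell_key, and everything outside cmp_keys() is illustrative.

/*
 * Minimal userspace sketch of the range-overlap comparison used by
 * cmp_keys() above. Field names mirror struct dm_cell_key; main() is
 * purely illustrative.
 */
#include <stdint.h>
#include <stdio.h>

struct key {
	int virt;                        /* virtual vs. physical namespace */
	uint64_t dev;                    /* device id */
	uint64_t block_begin, block_end; /* half-open range [begin, end) */
};

static int cmp_keys(const struct key *lhs, const struct key *rhs)
{
	if (lhs->virt != rhs->virt)
		return lhs->virt < rhs->virt ? -1 : 1;

	if (lhs->dev != rhs->dev)
		return lhs->dev < rhs->dev ? -1 : 1;

	if (lhs->block_end <= rhs->block_begin)
		return -1;               /* lhs lies entirely below rhs */

	if (lhs->block_begin >= rhs->block_end)
		return 1;                /* lhs lies entirely above rhs */

	return 0;                        /* ranges overlap: same cell */
}

int main(void)
{
	struct key held = { 0, 1, 100, 108 }; /* cell covering blocks 100..107 */
	struct key bio  = { 0, 1, 104, 105 }; /* single-block io inside it */

	printf("%d\n", cmp_keys(&bio, &held)); /* prints 0: the bio joins the cell */
	return 0;
}

Treating overlap as equality works inside an rb-tree because the detain path keeps stored cells pairwise non-overlapping: __bio_detain() either finds an overlapping cell and adds the bio to it, or inserts the whole new range atomically under the single prison spinlock.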
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 6805a142b750..74cf01144b1f 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -10,8 +10,8 @@
10#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */ 10#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */
11#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */ 11#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */
12 12
13#include <linux/list.h>
14#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/rbtree.h>
15 15
16/*----------------------------------------------------------------*/ 16/*----------------------------------------------------------------*/
17 17
@@ -23,11 +23,14 @@
23 */ 23 */
24struct dm_bio_prison; 24struct dm_bio_prison;
25 25
26/* FIXME: this needs to be more abstract */ 26/*
27 * Keys define a range of blocks within either a virtual or physical
28 * device.
29 */
27struct dm_cell_key { 30struct dm_cell_key {
28 int virtual; 31 int virtual;
29 dm_thin_id dev; 32 dm_thin_id dev;
30 dm_block_t block; 33 dm_block_t block_begin, block_end;
31}; 34};
32 35
33/* 36/*
@@ -35,13 +38,15 @@ struct dm_cell_key {
35 * themselves. 38 * themselves.
36 */ 39 */
37struct dm_bio_prison_cell { 40struct dm_bio_prison_cell {
38 struct hlist_node list; 41 struct list_head user_list; /* for client use */
42 struct rb_node node;
43
39 struct dm_cell_key key; 44 struct dm_cell_key key;
40 struct bio *holder; 45 struct bio *holder;
41 struct bio_list bios; 46 struct bio_list bios;
42}; 47};
43 48
44struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells); 49struct dm_bio_prison *dm_bio_prison_create(void);
45void dm_bio_prison_destroy(struct dm_bio_prison *prison); 50void dm_bio_prison_destroy(struct dm_bio_prison *prison);
46 51
47/* 52/*
@@ -57,7 +62,7 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
57 struct dm_bio_prison_cell *cell); 62 struct dm_bio_prison_cell *cell);
58 63
59/* 64/*
60 * Creates, or retrieves a cell for the given key. 65 * Creates, or retrieves a cell that overlaps the given key.
61 * 66 *
62 * Returns 1 if pre-existing cell returned, zero if new cell created using 67 * Returns 1 if pre-existing cell returned, zero if new cell created using
63 * @cell_prealloc. 68 * @cell_prealloc.
@@ -68,7 +73,8 @@ int dm_get_cell(struct dm_bio_prison *prison,
68 struct dm_bio_prison_cell **cell_result); 73 struct dm_bio_prison_cell **cell_result);
69 74
70/* 75/*
71 * An atomic op that combines retrieving a cell, and adding a bio to it. 76 * An atomic op that combines retrieving or creating a cell, and adding a
77 * bio to it.
72 * 78 *
73 * Returns 1 if the cell was already held, 0 if @inmate is the new holder. 79 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
74 */ 80 */
@@ -87,6 +93,14 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
87void dm_cell_error(struct dm_bio_prison *prison, 93void dm_cell_error(struct dm_bio_prison *prison,
88 struct dm_bio_prison_cell *cell, int error); 94 struct dm_bio_prison_cell *cell, int error);
89 95
96/*
97 * Visits the cell and then releases. Guarantees no new inmates are
98 * inserted between the visit and release.
99 */
100void dm_cell_visit_release(struct dm_bio_prison *prison,
101 void (*visit_fn)(void *, struct dm_bio_prison_cell *),
102 void *context, struct dm_bio_prison_cell *cell);
103
90/*----------------------------------------------------------------*/ 104/*----------------------------------------------------------------*/
91 105
92/* 106/*
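The new dm_cell_visit_release() declared above lets a client inspect a cell while the prison lock is still held, so no new inmate can slip in between the visit and the removal from the tree. A hypothetical caller might use it to pull all queued bios out atomically; the context structure and callback below are assumptions for illustration, not code from this patch, and the cell still has to be returned with dm_bio_prison_free_cell() afterwards.

/*
 * Hypothetical client of dm_cell_visit_release(): copy the holder and all
 * queued inmates out of the cell while the prison lock is held, then free
 * the cell once it has been removed from the tree.
 */
struct visit_ctx {
	struct bio_list bios;	/* illustrative: collected bios */
};

static void collect_bios(void *context, struct dm_bio_prison_cell *cell)
{
	struct visit_ctx *ctx = context;

	if (cell->holder)
		bio_list_add(&ctx->bios, cell->holder);
	bio_list_merge(&ctx->bios, &cell->bios);
}

/*
 * bio_list_init(&ctx.bios);
 * dm_cell_visit_release(prison, collect_bios, &ctx, cell);
 * dm_bio_prison_free_cell(prison, cell);
 */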
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index afe79719ea32..c33b49792b87 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -14,6 +14,7 @@
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/shrinker.h> 15#include <linux/shrinker.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/rbtree.h>
17 18
18#define DM_MSG_PREFIX "bufio" 19#define DM_MSG_PREFIX "bufio"
19 20
@@ -34,26 +35,23 @@
34/* 35/*
35 * Check buffer ages in this interval (seconds) 36 * Check buffer ages in this interval (seconds)
36 */ 37 */
37#define DM_BUFIO_WORK_TIMER_SECS 10 38#define DM_BUFIO_WORK_TIMER_SECS 30
38 39
39/* 40/*
40 * Free buffers when they are older than this (seconds) 41 * Free buffers when they are older than this (seconds)
41 */ 42 */
42#define DM_BUFIO_DEFAULT_AGE_SECS 60 43#define DM_BUFIO_DEFAULT_AGE_SECS 300
43 44
44/* 45/*
45 * The number of bvec entries that are embedded directly in the buffer. 46 * The nr of bytes of cached data to keep around.
46 * If the chunk size is larger, dm-io is used to do the io.
47 */ 47 */
48#define DM_BUFIO_INLINE_VECS 16 48#define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024)
49 49
50/* 50/*
51 * Buffer hash 51 * The number of bvec entries that are embedded directly in the buffer.
52 * If the chunk size is larger, dm-io is used to do the io.
52 */ 53 */
53#define DM_BUFIO_HASH_BITS 20 54#define DM_BUFIO_INLINE_VECS 16
54#define DM_BUFIO_HASH(block) \
55 ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
56 ((1 << DM_BUFIO_HASH_BITS) - 1))
57 55
58/* 56/*
59 * Don't try to use kmem_cache_alloc for blocks larger than this. 57 * Don't try to use kmem_cache_alloc for blocks larger than this.
@@ -106,7 +104,7 @@ struct dm_bufio_client {
106 104
107 unsigned minimum_buffers; 105 unsigned minimum_buffers;
108 106
109 struct hlist_head *cache_hash; 107 struct rb_root buffer_tree;
110 wait_queue_head_t free_buffer_wait; 108 wait_queue_head_t free_buffer_wait;
111 109
112 int async_write_error; 110 int async_write_error;
@@ -135,7 +133,7 @@ enum data_mode {
135}; 133};
136 134
137struct dm_buffer { 135struct dm_buffer {
138 struct hlist_node hash_list; 136 struct rb_node node;
139 struct list_head lru_list; 137 struct list_head lru_list;
140 sector_t block; 138 sector_t block;
141 void *data; 139 void *data;
@@ -223,6 +221,7 @@ static DEFINE_SPINLOCK(param_spinlock);
223 * Buffers are freed after this timeout 221 * Buffers are freed after this timeout
224 */ 222 */
225static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; 223static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
224static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
226 225
227static unsigned long dm_bufio_peak_allocated; 226static unsigned long dm_bufio_peak_allocated;
228static unsigned long dm_bufio_allocated_kmem_cache; 227static unsigned long dm_bufio_allocated_kmem_cache;
@@ -253,6 +252,53 @@ static LIST_HEAD(dm_bufio_all_clients);
253 */ 252 */
254static DEFINE_MUTEX(dm_bufio_clients_lock); 253static DEFINE_MUTEX(dm_bufio_clients_lock);
255 254
255/*----------------------------------------------------------------
256 * A red/black tree acts as an index for all the buffers.
257 *--------------------------------------------------------------*/
258static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
259{
260 struct rb_node *n = c->buffer_tree.rb_node;
261 struct dm_buffer *b;
262
263 while (n) {
264 b = container_of(n, struct dm_buffer, node);
265
266 if (b->block == block)
267 return b;
268
269 n = (b->block < block) ? n->rb_left : n->rb_right;
270 }
271
272 return NULL;
273}
274
275static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
276{
277 struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
278 struct dm_buffer *found;
279
280 while (*new) {
281 found = container_of(*new, struct dm_buffer, node);
282
283 if (found->block == b->block) {
284 BUG_ON(found != b);
285 return;
286 }
287
288 parent = *new;
289 new = (found->block < b->block) ?
290 &((*new)->rb_left) : &((*new)->rb_right);
291 }
292
293 rb_link_node(&b->node, parent, new);
294 rb_insert_color(&b->node, &c->buffer_tree);
295}
296
297static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
298{
299 rb_erase(&b->node, &c->buffer_tree);
300}
301
256/*----------------------------------------------------------------*/ 302/*----------------------------------------------------------------*/
257 303
258static void adjust_total_allocated(enum data_mode data_mode, long diff) 304static void adjust_total_allocated(enum data_mode data_mode, long diff)
@@ -434,7 +480,7 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
434 b->block = block; 480 b->block = block;
435 b->list_mode = dirty; 481 b->list_mode = dirty;
436 list_add(&b->lru_list, &c->lru[dirty]); 482 list_add(&b->lru_list, &c->lru[dirty]);
437 hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]); 483 __insert(b->c, b);
438 b->last_accessed = jiffies; 484 b->last_accessed = jiffies;
439} 485}
440 486
@@ -448,7 +494,7 @@ static void __unlink_buffer(struct dm_buffer *b)
448 BUG_ON(!c->n_buffers[b->list_mode]); 494 BUG_ON(!c->n_buffers[b->list_mode]);
449 495
450 c->n_buffers[b->list_mode]--; 496 c->n_buffers[b->list_mode]--;
451 hlist_del(&b->hash_list); 497 __remove(b->c, b);
452 list_del(&b->lru_list); 498 list_del(&b->lru_list);
453} 499}
454 500
@@ -532,6 +578,19 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
532 end_io(&b->bio, r); 578 end_io(&b->bio, r);
533} 579}
534 580
581static void inline_endio(struct bio *bio, int error)
582{
583 bio_end_io_t *end_fn = bio->bi_private;
584
585 /*
586 * Reset the bio to free any attached resources
587 * (e.g. bio integrity profiles).
588 */
589 bio_reset(bio);
590
591 end_fn(bio, error);
592}
593
535static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, 594static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
536 bio_end_io_t *end_io) 595 bio_end_io_t *end_io)
537{ 596{
@@ -543,7 +602,12 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
543 b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; 602 b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
544 b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits; 603 b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
545 b->bio.bi_bdev = b->c->bdev; 604 b->bio.bi_bdev = b->c->bdev;
546 b->bio.bi_end_io = end_io; 605 b->bio.bi_end_io = inline_endio;
606 /*
607 * Use of .bi_private isn't a problem here because
608 * the dm_buffer's inline bio is local to bufio.
609 */
610 b->bio.bi_private = end_io;
547 611
548 /* 612 /*
549 * We assume that if len >= PAGE_SIZE ptr is page-aligned. 613 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
@@ -887,23 +951,6 @@ static void __check_watermark(struct dm_bufio_client *c,
887 __write_dirty_buffers_async(c, 1, write_list); 951 __write_dirty_buffers_async(c, 1, write_list);
888} 952}
889 953
890/*
891 * Find a buffer in the hash.
892 */
893static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
894{
895 struct dm_buffer *b;
896
897 hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
898 hash_list) {
899 dm_bufio_cond_resched();
900 if (b->block == block)
901 return b;
902 }
903
904 return NULL;
905}
906
907/*---------------------------------------------------------------- 954/*----------------------------------------------------------------
908 * Getting a buffer 955 * Getting a buffer
909 *--------------------------------------------------------------*/ 956 *--------------------------------------------------------------*/
@@ -1433,45 +1480,52 @@ static void drop_buffers(struct dm_bufio_client *c)
1433} 1480}
1434 1481
1435/* 1482/*
1436 * Test if the buffer is unused and too old, and commit it. 1483 * We may not be able to evict this buffer if IO pending or the client
1484 * is still using it. Caller is expected to know buffer is too old.
1485 *
1437 * And if GFP_NOFS is used, we must not do any I/O because we hold 1486 * And if GFP_NOFS is used, we must not do any I/O because we hold
1438 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets 1487 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
1439 * rerouted to different bufio client. 1488 * rerouted to different bufio client.
1440 */ 1489 */
1441static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp, 1490static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
1442 unsigned long max_jiffies)
1443{ 1491{
1444 if (jiffies - b->last_accessed < max_jiffies)
1445 return 0;
1446
1447 if (!(gfp & __GFP_FS)) { 1492 if (!(gfp & __GFP_FS)) {
1448 if (test_bit(B_READING, &b->state) || 1493 if (test_bit(B_READING, &b->state) ||
1449 test_bit(B_WRITING, &b->state) || 1494 test_bit(B_WRITING, &b->state) ||
1450 test_bit(B_DIRTY, &b->state)) 1495 test_bit(B_DIRTY, &b->state))
1451 return 0; 1496 return false;
1452 } 1497 }
1453 1498
1454 if (b->hold_count) 1499 if (b->hold_count)
1455 return 0; 1500 return false;
1456 1501
1457 __make_buffer_clean(b); 1502 __make_buffer_clean(b);
1458 __unlink_buffer(b); 1503 __unlink_buffer(b);
1459 __free_buffer_wake(b); 1504 __free_buffer_wake(b);
1460 1505
1461 return 1; 1506 return true;
1462} 1507}
1463 1508
1464static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, 1509static unsigned get_retain_buffers(struct dm_bufio_client *c)
1465 gfp_t gfp_mask) 1510{
1511 unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
1512 return retain_bytes / c->block_size;
1513}
1514
1515static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
1516 gfp_t gfp_mask)
1466{ 1517{
1467 int l; 1518 int l;
1468 struct dm_buffer *b, *tmp; 1519 struct dm_buffer *b, *tmp;
1469 long freed = 0; 1520 unsigned long freed = 0;
1521 unsigned long count = nr_to_scan;
1522 unsigned retain_target = get_retain_buffers(c);
1470 1523
1471 for (l = 0; l < LIST_SIZE; l++) { 1524 for (l = 0; l < LIST_SIZE; l++) {
1472 list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { 1525 list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
1473 freed += __cleanup_old_buffer(b, gfp_mask, 0); 1526 if (__try_evict_buffer(b, gfp_mask))
1474 if (!--nr_to_scan) 1527 freed++;
1528 if (!--nr_to_scan || ((count - freed) <= retain_target))
1475 return freed; 1529 return freed;
1476 dm_bufio_cond_resched(); 1530 dm_bufio_cond_resched();
1477 } 1531 }
@@ -1533,11 +1587,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
1533 r = -ENOMEM; 1587 r = -ENOMEM;
1534 goto bad_client; 1588 goto bad_client;
1535 } 1589 }
1536 c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS); 1590 c->buffer_tree = RB_ROOT;
1537 if (!c->cache_hash) {
1538 r = -ENOMEM;
1539 goto bad_hash;
1540 }
1541 1591
1542 c->bdev = bdev; 1592 c->bdev = bdev;
1543 c->block_size = block_size; 1593 c->block_size = block_size;
@@ -1556,9 +1606,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
1556 c->n_buffers[i] = 0; 1606 c->n_buffers[i] = 0;
1557 } 1607 }
1558 1608
1559 for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
1560 INIT_HLIST_HEAD(&c->cache_hash[i]);
1561
1562 mutex_init(&c->lock); 1609 mutex_init(&c->lock);
1563 INIT_LIST_HEAD(&c->reserved_buffers); 1610 INIT_LIST_HEAD(&c->reserved_buffers);
1564 c->need_reserved_buffers = reserved_buffers; 1611 c->need_reserved_buffers = reserved_buffers;
@@ -1632,8 +1679,6 @@ bad_cache:
1632 } 1679 }
1633 dm_io_client_destroy(c->dm_io); 1680 dm_io_client_destroy(c->dm_io);
1634bad_dm_io: 1681bad_dm_io:
1635 vfree(c->cache_hash);
1636bad_hash:
1637 kfree(c); 1682 kfree(c);
1638bad_client: 1683bad_client:
1639 return ERR_PTR(r); 1684 return ERR_PTR(r);
@@ -1660,9 +1705,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
1660 1705
1661 mutex_unlock(&dm_bufio_clients_lock); 1706 mutex_unlock(&dm_bufio_clients_lock);
1662 1707
1663 for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) 1708 BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
1664 BUG_ON(!hlist_empty(&c->cache_hash[i]));
1665
1666 BUG_ON(c->need_reserved_buffers); 1709 BUG_ON(c->need_reserved_buffers);
1667 1710
1668 while (!list_empty(&c->reserved_buffers)) { 1711 while (!list_empty(&c->reserved_buffers)) {
@@ -1680,36 +1723,60 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
1680 BUG_ON(c->n_buffers[i]); 1723 BUG_ON(c->n_buffers[i]);
1681 1724
1682 dm_io_client_destroy(c->dm_io); 1725 dm_io_client_destroy(c->dm_io);
1683 vfree(c->cache_hash);
1684 kfree(c); 1726 kfree(c);
1685} 1727}
1686EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); 1728EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
1687 1729
1688static void cleanup_old_buffers(void) 1730static unsigned get_max_age_hz(void)
1689{ 1731{
1690 unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age); 1732 unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);
1691 struct dm_bufio_client *c;
1692 1733
1693 if (max_age > ULONG_MAX / HZ) 1734 if (max_age > UINT_MAX / HZ)
1694 max_age = ULONG_MAX / HZ; 1735 max_age = UINT_MAX / HZ;
1695 1736
1696 mutex_lock(&dm_bufio_clients_lock); 1737 return max_age * HZ;
1697 list_for_each_entry(c, &dm_bufio_all_clients, client_list) { 1738}
1698 if (!dm_bufio_trylock(c))
1699 continue;
1700 1739
1701 while (!list_empty(&c->lru[LIST_CLEAN])) { 1740static bool older_than(struct dm_buffer *b, unsigned long age_hz)
1702 struct dm_buffer *b; 1741{
1703 b = list_entry(c->lru[LIST_CLEAN].prev, 1742 return (jiffies - b->last_accessed) >= age_hz;
1704 struct dm_buffer, lru_list); 1743}
1705 if (!__cleanup_old_buffer(b, 0, max_age * HZ)) 1744
1706 break; 1745static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
1707 dm_bufio_cond_resched(); 1746{
1708 } 1747 struct dm_buffer *b, *tmp;
1748 unsigned retain_target = get_retain_buffers(c);
1749 unsigned count;
1750
1751 dm_bufio_lock(c);
1752
1753 count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1754 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
1755 if (count <= retain_target)
1756 break;
1757
1758 if (!older_than(b, age_hz))
1759 break;
1760
1761 if (__try_evict_buffer(b, 0))
1762 count--;
1709 1763
1710 dm_bufio_unlock(c);
1711 dm_bufio_cond_resched(); 1764 dm_bufio_cond_resched();
1712 } 1765 }
1766
1767 dm_bufio_unlock(c);
1768}
1769
1770static void cleanup_old_buffers(void)
1771{
1772 unsigned long max_age_hz = get_max_age_hz();
1773 struct dm_bufio_client *c;
1774
1775 mutex_lock(&dm_bufio_clients_lock);
1776
1777 list_for_each_entry(c, &dm_bufio_all_clients, client_list)
1778 __evict_old_buffers(c, max_age_hz);
1779
1713 mutex_unlock(&dm_bufio_clients_lock); 1780 mutex_unlock(&dm_bufio_clients_lock);
1714} 1781}
1715 1782
@@ -1834,6 +1901,9 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
1834module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR); 1901module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
1835MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); 1902MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
1836 1903
1904module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
1905MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
1906
1837module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR); 1907module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
1838MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory"); 1908MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
1839 1909
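Besides swapping the buffer hash for an rb-tree, the dm-bufio changes above relax the aging policy (30 s scan interval, 300 s maximum age) and add retain_bytes: both the shrinker path (__scan()) and the periodic cleaner (__evict_old_buffers()) stop evicting once the client is down to retain_bytes / block_size buffers. The following userspace sketch shows that arithmetic; the 256 KiB default comes from the patch, while the block size and starting buffer count are assumed values.

/*
 * Userspace sketch of the retain_bytes target: eviction stops once only
 * retain_target buffers remain.
 */
#include <stdio.h>

#define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024)

static unsigned get_retain_buffers(unsigned block_size)
{
	return DM_BUFIO_DEFAULT_RETAIN_BYTES / block_size;
}

int main(void)
{
	unsigned block_size = 4096;	/* assumed 4 KiB metadata blocks */
	unsigned count = 100;		/* assumed buffers currently cached */
	unsigned retain_target = get_retain_buffers(block_size);
	unsigned freed = 0;

	/* mimics the (count - freed) <= retain_target cut-off in __scan() */
	while (count - freed > retain_target)
		freed++;

	printf("retain_target=%u freed=%u left=%u\n",
	       retain_target, freed, count - freed);	/* 64, 36, 64 */
	return 0;
}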
diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h
index aac0e2df06be..bed4ad4e1b7c 100644
--- a/drivers/md/dm-cache-block-types.h
+++ b/drivers/md/dm-cache-block-types.h
@@ -19,6 +19,7 @@
19 19
20typedef dm_block_t __bitwise__ dm_oblock_t; 20typedef dm_block_t __bitwise__ dm_oblock_t;
21typedef uint32_t __bitwise__ dm_cblock_t; 21typedef uint32_t __bitwise__ dm_cblock_t;
22typedef dm_block_t __bitwise__ dm_dblock_t;
22 23
23static inline dm_oblock_t to_oblock(dm_block_t b) 24static inline dm_oblock_t to_oblock(dm_block_t b)
24{ 25{
@@ -40,4 +41,14 @@ static inline uint32_t from_cblock(dm_cblock_t b)
40 return (__force uint32_t) b; 41 return (__force uint32_t) b;
41} 42}
42 43
44static inline dm_dblock_t to_dblock(dm_block_t b)
45{
46 return (__force dm_dblock_t) b;
47}
48
49static inline dm_block_t from_dblock(dm_dblock_t b)
50{
51 return (__force dm_block_t) b;
52}
53
43#endif /* DM_CACHE_BLOCK_TYPES_H */ 54#endif /* DM_CACHE_BLOCK_TYPES_H */
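The new dm_dblock_t gives discard-bitset indexes their own __bitwise__ type, so sparse can catch accidental mixing with origin blocks (dm_oblock_t); the cache target later in this series converts between the two with oblock_to_dblock() and dblock_to_oblock(). Below is a runnable sketch of that mapping, assuming the discard block size is a whole multiple of the cache block size; the block counts used are illustrative.

/*
 * Userspace sketch of the origin-block <-> discard-block mapping that the
 * new dm_dblock_t type indexes. oblocks_per_dblock and the sample oblock
 * are assumed values.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t oblocks_per_dblock = 16;	/* cache blocks per discard block */
	uint64_t oblock = 37;			/* an origin block */

	uint64_t dblock = oblock / oblocks_per_dblock;	/* -> 2 */
	uint64_t first = dblock * oblocks_per_dblock;	/* -> 32 */

	printf("oblock %" PRIu64 " -> dblock %" PRIu64
	       " (covers oblocks %" PRIu64 "..%" PRIu64 ")\n",
	       oblock, dblock, first, first + oblocks_per_dblock - 1);
	return 0;
}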
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 06709257adde..9fc616c2755e 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -109,7 +109,7 @@ struct dm_cache_metadata {
109 dm_block_t discard_root; 109 dm_block_t discard_root;
110 110
111 sector_t discard_block_size; 111 sector_t discard_block_size;
112 dm_oblock_t discard_nr_blocks; 112 dm_dblock_t discard_nr_blocks;
113 113
114 sector_t data_block_size; 114 sector_t data_block_size;
115 dm_cblock_t cache_blocks; 115 dm_cblock_t cache_blocks;
@@ -329,7 +329,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
329 disk_super->hint_root = cpu_to_le64(cmd->hint_root); 329 disk_super->hint_root = cpu_to_le64(cmd->hint_root);
330 disk_super->discard_root = cpu_to_le64(cmd->discard_root); 330 disk_super->discard_root = cpu_to_le64(cmd->discard_root);
331 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); 331 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
332 disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks)); 332 disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
333 disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE); 333 disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE);
334 disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); 334 disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
335 disk_super->cache_blocks = cpu_to_le32(0); 335 disk_super->cache_blocks = cpu_to_le32(0);
@@ -528,7 +528,7 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,
528 cmd->hint_root = le64_to_cpu(disk_super->hint_root); 528 cmd->hint_root = le64_to_cpu(disk_super->hint_root);
529 cmd->discard_root = le64_to_cpu(disk_super->discard_root); 529 cmd->discard_root = le64_to_cpu(disk_super->discard_root);
530 cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size); 530 cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size);
531 cmd->discard_nr_blocks = to_oblock(le64_to_cpu(disk_super->discard_nr_blocks)); 531 cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks));
532 cmd->data_block_size = le32_to_cpu(disk_super->data_block_size); 532 cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
533 cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); 533 cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
534 strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); 534 strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
@@ -626,7 +626,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
626 disk_super->hint_root = cpu_to_le64(cmd->hint_root); 626 disk_super->hint_root = cpu_to_le64(cmd->hint_root);
627 disk_super->discard_root = cpu_to_le64(cmd->discard_root); 627 disk_super->discard_root = cpu_to_le64(cmd->discard_root);
628 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); 628 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
629 disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks)); 629 disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
630 disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); 630 disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
631 strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); 631 strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
632 disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); 632 disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
@@ -797,15 +797,15 @@ out:
797 797
798int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, 798int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
799 sector_t discard_block_size, 799 sector_t discard_block_size,
800 dm_oblock_t new_nr_entries) 800 dm_dblock_t new_nr_entries)
801{ 801{
802 int r; 802 int r;
803 803
804 down_write(&cmd->root_lock); 804 down_write(&cmd->root_lock);
805 r = dm_bitset_resize(&cmd->discard_info, 805 r = dm_bitset_resize(&cmd->discard_info,
806 cmd->discard_root, 806 cmd->discard_root,
807 from_oblock(cmd->discard_nr_blocks), 807 from_dblock(cmd->discard_nr_blocks),
808 from_oblock(new_nr_entries), 808 from_dblock(new_nr_entries),
809 false, &cmd->discard_root); 809 false, &cmd->discard_root);
810 if (!r) { 810 if (!r) {
811 cmd->discard_block_size = discard_block_size; 811 cmd->discard_block_size = discard_block_size;
@@ -818,28 +818,28 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
818 return r; 818 return r;
819} 819}
820 820
821static int __set_discard(struct dm_cache_metadata *cmd, dm_oblock_t b) 821static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
822{ 822{
823 return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root, 823 return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root,
824 from_oblock(b), &cmd->discard_root); 824 from_dblock(b), &cmd->discard_root);
825} 825}
826 826
827static int __clear_discard(struct dm_cache_metadata *cmd, dm_oblock_t b) 827static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
828{ 828{
829 return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root, 829 return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root,
830 from_oblock(b), &cmd->discard_root); 830 from_dblock(b), &cmd->discard_root);
831} 831}
832 832
833static int __is_discarded(struct dm_cache_metadata *cmd, dm_oblock_t b, 833static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
834 bool *is_discarded) 834 bool *is_discarded)
835{ 835{
836 return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root, 836 return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
837 from_oblock(b), &cmd->discard_root, 837 from_dblock(b), &cmd->discard_root,
838 is_discarded); 838 is_discarded);
839} 839}
840 840
841static int __discard(struct dm_cache_metadata *cmd, 841static int __discard(struct dm_cache_metadata *cmd,
842 dm_oblock_t dblock, bool discard) 842 dm_dblock_t dblock, bool discard)
843{ 843{
844 int r; 844 int r;
845 845
@@ -852,7 +852,7 @@ static int __discard(struct dm_cache_metadata *cmd,
852} 852}
853 853
854int dm_cache_set_discard(struct dm_cache_metadata *cmd, 854int dm_cache_set_discard(struct dm_cache_metadata *cmd,
855 dm_oblock_t dblock, bool discard) 855 dm_dblock_t dblock, bool discard)
856{ 856{
857 int r; 857 int r;
858 858
@@ -870,8 +870,8 @@ static int __load_discards(struct dm_cache_metadata *cmd,
870 dm_block_t b; 870 dm_block_t b;
871 bool discard; 871 bool discard;
872 872
873 for (b = 0; b < from_oblock(cmd->discard_nr_blocks); b++) { 873 for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
874 dm_oblock_t dblock = to_oblock(b); 874 dm_dblock_t dblock = to_dblock(b);
875 875
876 if (cmd->clean_when_opened) { 876 if (cmd->clean_when_opened) {
877 r = __is_discarded(cmd, dblock, &discard); 877 r = __is_discarded(cmd, dblock, &discard);
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 7383c90ccdb8..4ecc403be283 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -70,14 +70,14 @@ dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd);
70 70
71int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, 71int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
72 sector_t discard_block_size, 72 sector_t discard_block_size,
73 dm_oblock_t new_nr_entries); 73 dm_dblock_t new_nr_entries);
74 74
75typedef int (*load_discard_fn)(void *context, sector_t discard_block_size, 75typedef int (*load_discard_fn)(void *context, sector_t discard_block_size,
76 dm_oblock_t dblock, bool discarded); 76 dm_dblock_t dblock, bool discarded);
77int dm_cache_load_discards(struct dm_cache_metadata *cmd, 77int dm_cache_load_discards(struct dm_cache_metadata *cmd,
78 load_discard_fn fn, void *context); 78 load_discard_fn fn, void *context);
79 79
80int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_oblock_t dblock, bool discard); 80int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard);
81 81
82int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock); 82int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock);
83int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock); 83int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock);
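dm_cache_load_discards() walks the on-disk discard bitset and reports each block through the load_discard_fn callback typed above, now indexed by dm_dblock_t. A hypothetical callback is sketched below, recording each block's state in an in-core bitmap; the context structure is an assumption for illustration, not code from this patch.

/*
 * Hypothetical load_discard_fn implementation: record each discard block's
 * state in a client-side bitmap as the metadata is walked.
 */
struct discard_load_context {
	unsigned long *bitset;	/* one bit per discard block */
};

static int load_discard(void *context, sector_t discard_block_size,
			dm_dblock_t dblock, bool discarded)
{
	struct discard_load_context *ctx = context;

	if (discarded)
		set_bit(from_dblock(dblock), ctx->bitset);
	else
		clear_bit(from_dblock(dblock), ctx->bitset);

	return 0;
}

/* ... r = dm_cache_load_discards(cmd, load_discard, &ctx); ... */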
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 0e385e40909e..13f547a4eeb6 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -181,24 +181,30 @@ static void queue_shift_down(struct queue *q)
181 * Gives us the oldest entry of the lowest popoulated level. If the first 181 * Gives us the oldest entry of the lowest popoulated level. If the first
182 * level is emptied then we shift down one level. 182 * level is emptied then we shift down one level.
183 */ 183 */
184static struct list_head *queue_pop(struct queue *q) 184static struct list_head *queue_peek(struct queue *q)
185{ 185{
186 unsigned level; 186 unsigned level;
187 struct list_head *r;
188 187
189 for (level = 0; level < NR_QUEUE_LEVELS; level++) 188 for (level = 0; level < NR_QUEUE_LEVELS; level++)
190 if (!list_empty(q->qs + level)) { 189 if (!list_empty(q->qs + level))
191 r = q->qs[level].next; 190 return q->qs[level].next;
192 list_del(r);
193 191
194 /* have we just emptied the bottom level? */ 192 return NULL;
195 if (level == 0 && list_empty(q->qs)) 193}
196 queue_shift_down(q);
197 194
198 return r; 195static struct list_head *queue_pop(struct queue *q)
199 } 196{
197 struct list_head *r = queue_peek(q);
200 198
201 return NULL; 199 if (r) {
200 list_del(r);
201
202 /* have we just emptied the bottom level? */
203 if (list_empty(q->qs))
204 queue_shift_down(q);
205 }
206
207 return r;
202} 208}
203 209
204static struct list_head *list_pop(struct list_head *lh) 210static struct list_head *list_pop(struct list_head *lh)
@@ -383,13 +389,6 @@ struct mq_policy {
383 unsigned generation; 389 unsigned generation;
384 unsigned generation_period; /* in lookups (will probably change) */ 390 unsigned generation_period; /* in lookups (will probably change) */
385 391
386 /*
387 * Entries in the pre_cache whose hit count passes the promotion
388 * threshold move to the cache proper. Working out the correct
389 * value for the promotion_threshold is crucial to this policy.
390 */
391 unsigned promote_threshold;
392
393 unsigned discard_promote_adjustment; 392 unsigned discard_promote_adjustment;
394 unsigned read_promote_adjustment; 393 unsigned read_promote_adjustment;
395 unsigned write_promote_adjustment; 394 unsigned write_promote_adjustment;
@@ -406,6 +405,7 @@ struct mq_policy {
406#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1 405#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
407#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4 406#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
408#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8 407#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
408#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128
409 409
410/*----------------------------------------------------------------*/ 410/*----------------------------------------------------------------*/
411 411
@@ -518,6 +518,12 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q)
518 return e; 518 return e;
519} 519}
520 520
521static struct entry *peek(struct queue *q)
522{
523 struct list_head *h = queue_peek(q);
524 return h ? container_of(h, struct entry, list) : NULL;
525}
526
521/* 527/*
522 * Has this entry already been updated? 528 * Has this entry already been updated?
523 */ 529 */
@@ -570,10 +576,6 @@ static void check_generation(struct mq_policy *mq)
570 break; 576 break;
571 } 577 }
572 } 578 }
573
574 mq->promote_threshold = nr ? total / nr : 1;
575 if (mq->promote_threshold * nr < total)
576 mq->promote_threshold++;
577 } 579 }
578} 580}
579 581
@@ -641,6 +643,30 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
641} 643}
642 644
643/* 645/*
646 * Entries in the pre_cache whose hit count passes the promotion
647 * threshold move to the cache proper. Working out the correct
648 * value for the promotion_threshold is crucial to this policy.
649 */
650static unsigned promote_threshold(struct mq_policy *mq)
651{
652 struct entry *e;
653
654 if (any_free_cblocks(mq))
655 return 0;
656
657 e = peek(&mq->cache_clean);
658 if (e)
659 return e->hit_count;
660
661 e = peek(&mq->cache_dirty);
662 if (e)
663 return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD;
664
665 /* This should never happen */
666 return 0;
667}
668
669/*
644 * We modify the basic promotion_threshold depending on the specific io. 670 * We modify the basic promotion_threshold depending on the specific io.
645 * 671 *
646 * If the origin block has been discarded then there's no cost to copy it 672 * If the origin block has been discarded then there's no cost to copy it
@@ -653,7 +679,7 @@ static unsigned adjusted_promote_threshold(struct mq_policy *mq,
653 bool discarded_oblock, int data_dir) 679 bool discarded_oblock, int data_dir)
654{ 680{
655 if (data_dir == READ) 681 if (data_dir == READ)
656 return mq->promote_threshold + mq->read_promote_adjustment; 682 return promote_threshold(mq) + mq->read_promote_adjustment;
657 683
658 if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { 684 if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
659 /* 685 /*
@@ -663,7 +689,7 @@ static unsigned adjusted_promote_threshold(struct mq_policy *mq,
663 return mq->discard_promote_adjustment; 689 return mq->discard_promote_adjustment;
664 } 690 }
665 691
666 return mq->promote_threshold + mq->write_promote_adjustment; 692 return promote_threshold(mq) + mq->write_promote_adjustment;
667} 693}
668 694
669static bool should_promote(struct mq_policy *mq, struct entry *e, 695static bool should_promote(struct mq_policy *mq, struct entry *e,
@@ -839,7 +865,8 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock,
839 if (e && in_cache(mq, e)) 865 if (e && in_cache(mq, e))
840 r = cache_entry_found(mq, e, result); 866 r = cache_entry_found(mq, e, result);
841 867
842 else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) 868 else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] &&
869 iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
843 result->op = POLICY_MISS; 870 result->op = POLICY_MISS;
844 871
845 else if (e) 872 else if (e)
@@ -1230,7 +1257,6 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1230 mq->tick = 0; 1257 mq->tick = 0;
1231 mq->hit_count = 0; 1258 mq->hit_count = 0;
1232 mq->generation = 0; 1259 mq->generation = 0;
1233 mq->promote_threshold = 0;
1234 mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT; 1260 mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
1235 mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT; 1261 mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
1236 mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT; 1262 mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
@@ -1265,7 +1291,7 @@ bad_pre_cache_init:
1265 1291
1266static struct dm_cache_policy_type mq_policy_type = { 1292static struct dm_cache_policy_type mq_policy_type = {
1267 .name = "mq", 1293 .name = "mq",
1268 .version = {1, 2, 0}, 1294 .version = {1, 3, 0},
1269 .hint_size = 4, 1295 .hint_size = 4,
1270 .owner = THIS_MODULE, 1296 .owner = THIS_MODULE,
1271 .create = mq_create 1297 .create = mq_create
@@ -1273,7 +1299,7 @@ static struct dm_cache_policy_type mq_policy_type = {
1273 1299
1274static struct dm_cache_policy_type default_policy_type = { 1300static struct dm_cache_policy_type default_policy_type = {
1275 .name = "default", 1301 .name = "default",
1276 .version = {1, 2, 0}, 1302 .version = {1, 3, 0},
1277 .hint_size = 4, 1303 .hint_size = 4,
1278 .owner = THIS_MODULE, 1304 .owner = THIS_MODULE,
1279 .create = mq_create, 1305 .create = mq_create,
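The mq policy change above drops the periodically recomputed promote_threshold field and instead derives the threshold on demand from whatever would be evicted next: zero while free cache blocks remain, otherwise the hit count of the oldest clean entry, or the oldest dirty entry's count plus a penalty that discourages demoting dirty blocks. The standalone sketch below shows that decision; DISCOURAGE_DEMOTING_DIRTY_THRESHOLD matches the patch, while the entries and hit counts are made up for illustration.

/*
 * Userspace sketch of the mq policy's on-demand promotion threshold.
 */
#include <stdio.h>

#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128

struct entry { unsigned hit_count; };

static unsigned promote_threshold(int any_free_cblocks,
				  const struct entry *oldest_clean,
				  const struct entry *oldest_dirty)
{
	if (any_free_cblocks)
		return 0;		/* free slots: promote on first touch */

	if (oldest_clean)
		return oldest_clean->hit_count;

	if (oldest_dirty)
		return oldest_dirty->hit_count +
		       DISCOURAGE_DEMOTING_DIRTY_THRESHOLD;

	return 0;			/* cache empty: should not happen */
}

int main(void)
{
	struct entry clean = { .hit_count = 7 };
	struct entry dirty = { .hit_count = 3 };

	printf("%u\n", promote_threshold(0, &clean, &dirty));	/* 7 */
	printf("%u\n", promote_threshold(0, NULL, &dirty));	/* 131 */
	printf("%u\n", promote_threshold(1, &clean, &dirty));	/* 0 */
	return 0;
}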
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 7130505c2425..1e96d7889f51 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -95,7 +95,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
95 95
96/*----------------------------------------------------------------*/ 96/*----------------------------------------------------------------*/
97 97
98#define PRISON_CELLS 1024
99#define MIGRATION_POOL_SIZE 128 98#define MIGRATION_POOL_SIZE 128
100#define COMMIT_PERIOD HZ 99#define COMMIT_PERIOD HZ
101#define MIGRATION_COUNT_WINDOW 10 100#define MIGRATION_COUNT_WINDOW 10
@@ -237,8 +236,9 @@ struct cache {
237 /* 236 /*
238 * origin_blocks entries, discarded if set. 237 * origin_blocks entries, discarded if set.
239 */ 238 */
240 dm_oblock_t discard_nr_blocks; 239 dm_dblock_t discard_nr_blocks;
241 unsigned long *discard_bitset; 240 unsigned long *discard_bitset;
241 uint32_t discard_block_size; /* a power of 2 times sectors per block */
242 242
243 /* 243 /*
244 * Rather than reconstructing the table line for the status we just 244 * Rather than reconstructing the table line for the status we just
@@ -310,6 +310,7 @@ struct dm_cache_migration {
310 dm_cblock_t cblock; 310 dm_cblock_t cblock;
311 311
312 bool err:1; 312 bool err:1;
313 bool discard:1;
313 bool writeback:1; 314 bool writeback:1;
314 bool demote:1; 315 bool demote:1;
315 bool promote:1; 316 bool promote:1;
@@ -433,11 +434,12 @@ static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cel
433 434
434/*----------------------------------------------------------------*/ 435/*----------------------------------------------------------------*/
435 436
436static void build_key(dm_oblock_t oblock, struct dm_cell_key *key) 437static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
437{ 438{
438 key->virtual = 0; 439 key->virtual = 0;
439 key->dev = 0; 440 key->dev = 0;
440 key->block = from_oblock(oblock); 441 key->block_begin = from_oblock(begin);
442 key->block_end = from_oblock(end);
441} 443}
442 444
443/* 445/*
@@ -447,15 +449,15 @@ static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
447 */ 449 */
448typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); 450typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
449 451
450static int bio_detain(struct cache *cache, dm_oblock_t oblock, 452static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
451 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 453 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
452 cell_free_fn free_fn, void *free_context, 454 cell_free_fn free_fn, void *free_context,
453 struct dm_bio_prison_cell **cell_result) 455 struct dm_bio_prison_cell **cell_result)
454{ 456{
455 int r; 457 int r;
456 struct dm_cell_key key; 458 struct dm_cell_key key;
457 459
458 build_key(oblock, &key); 460 build_key(oblock_begin, oblock_end, &key);
459 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 461 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
460 if (r) 462 if (r)
461 free_fn(free_context, cell_prealloc); 463 free_fn(free_context, cell_prealloc);
@@ -463,6 +465,16 @@ static int bio_detain(struct cache *cache, dm_oblock_t oblock,
463 return r; 465 return r;
464} 466}
465 467
468static int bio_detain(struct cache *cache, dm_oblock_t oblock,
469 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
470 cell_free_fn free_fn, void *free_context,
471 struct dm_bio_prison_cell **cell_result)
472{
473 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
474 return bio_detain_range(cache, oblock, end, bio,
475 cell_prealloc, free_fn, free_context, cell_result);
476}
477
466static int get_cell(struct cache *cache, 478static int get_cell(struct cache *cache,
467 dm_oblock_t oblock, 479 dm_oblock_t oblock,
468 struct prealloc *structs, 480 struct prealloc *structs,
@@ -474,7 +486,7 @@ static int get_cell(struct cache *cache,
474 486
475 cell_prealloc = prealloc_get_cell(structs); 487 cell_prealloc = prealloc_get_cell(structs);
476 488
477 build_key(oblock, &key); 489 build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
478 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 490 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
479 if (r) 491 if (r)
480 prealloc_put_cell(structs, cell_prealloc); 492 prealloc_put_cell(structs, cell_prealloc);
@@ -524,33 +536,57 @@ static dm_block_t block_div(dm_block_t b, uint32_t n)
524 return b; 536 return b;
525} 537}
526 538
527static void set_discard(struct cache *cache, dm_oblock_t b) 539static dm_block_t oblocks_per_dblock(struct cache *cache)
540{
541 dm_block_t oblocks = cache->discard_block_size;
542
543 if (block_size_is_power_of_two(cache))
544 oblocks >>= cache->sectors_per_block_shift;
545 else
546 oblocks = block_div(oblocks, cache->sectors_per_block);
547
548 return oblocks;
549}
550
551static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
552{
553 return to_dblock(block_div(from_oblock(oblock),
554 oblocks_per_dblock(cache)));
555}
556
557static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
558{
559 return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
560}
561
562static void set_discard(struct cache *cache, dm_dblock_t b)
528{ 563{
529 unsigned long flags; 564 unsigned long flags;
530 565
566 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
531 atomic_inc(&cache->stats.discard_count); 567 atomic_inc(&cache->stats.discard_count);
532 568
533 spin_lock_irqsave(&cache->lock, flags); 569 spin_lock_irqsave(&cache->lock, flags);
534 set_bit(from_oblock(b), cache->discard_bitset); 570 set_bit(from_dblock(b), cache->discard_bitset);
535 spin_unlock_irqrestore(&cache->lock, flags); 571 spin_unlock_irqrestore(&cache->lock, flags);
536} 572}
537 573
538static void clear_discard(struct cache *cache, dm_oblock_t b) 574static void clear_discard(struct cache *cache, dm_dblock_t b)
539{ 575{
540 unsigned long flags; 576 unsigned long flags;
541 577
542 spin_lock_irqsave(&cache->lock, flags); 578 spin_lock_irqsave(&cache->lock, flags);
543 clear_bit(from_oblock(b), cache->discard_bitset); 579 clear_bit(from_dblock(b), cache->discard_bitset);
544 spin_unlock_irqrestore(&cache->lock, flags); 580 spin_unlock_irqrestore(&cache->lock, flags);
545} 581}
546 582
547static bool is_discarded(struct cache *cache, dm_oblock_t b) 583static bool is_discarded(struct cache *cache, dm_dblock_t b)
548{ 584{
549 int r; 585 int r;
550 unsigned long flags; 586 unsigned long flags;
551 587
552 spin_lock_irqsave(&cache->lock, flags); 588 spin_lock_irqsave(&cache->lock, flags);
553 r = test_bit(from_oblock(b), cache->discard_bitset); 589 r = test_bit(from_dblock(b), cache->discard_bitset);
554 spin_unlock_irqrestore(&cache->lock, flags); 590 spin_unlock_irqrestore(&cache->lock, flags);
555 591
556 return r; 592 return r;
@@ -562,7 +598,8 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
562 unsigned long flags; 598 unsigned long flags;
563 599
564 spin_lock_irqsave(&cache->lock, flags); 600 spin_lock_irqsave(&cache->lock, flags);
565 r = test_bit(from_oblock(b), cache->discard_bitset); 601 r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
602 cache->discard_bitset);
566 spin_unlock_irqrestore(&cache->lock, flags); 603 spin_unlock_irqrestore(&cache->lock, flags);
567 604
568 return r; 605 return r;
@@ -687,7 +724,7 @@ static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
687 check_if_tick_bio_needed(cache, bio); 724 check_if_tick_bio_needed(cache, bio);
688 remap_to_origin(cache, bio); 725 remap_to_origin(cache, bio);
689 if (bio_data_dir(bio) == WRITE) 726 if (bio_data_dir(bio) == WRITE)
690 clear_discard(cache, oblock); 727 clear_discard(cache, oblock_to_dblock(cache, oblock));
691} 728}
692 729
693static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 730static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
@@ -697,7 +734,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
697 remap_to_cache(cache, bio, cblock); 734 remap_to_cache(cache, bio, cblock);
698 if (bio_data_dir(bio) == WRITE) { 735 if (bio_data_dir(bio) == WRITE) {
699 set_dirty(cache, oblock, cblock); 736 set_dirty(cache, oblock, cblock);
700 clear_discard(cache, oblock); 737 clear_discard(cache, oblock_to_dblock(cache, oblock));
701 } 738 }
702} 739}
703 740
@@ -951,10 +988,14 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
951 } 988 }
952 989
953 } else { 990 } else {
954 clear_dirty(cache, mg->new_oblock, mg->cblock); 991 if (mg->requeue_holder) {
955 if (mg->requeue_holder) 992 clear_dirty(cache, mg->new_oblock, mg->cblock);
956 cell_defer(cache, mg->new_ocell, true); 993 cell_defer(cache, mg->new_ocell, true);
957 else { 994 } else {
995 /*
996 * The block was promoted via an overwrite, so it's dirty.
997 */
998 set_dirty(cache, mg->new_oblock, mg->cblock);
958 bio_endio(mg->new_ocell->holder, 0); 999 bio_endio(mg->new_ocell->holder, 0);
959 cell_defer(cache, mg->new_ocell, false); 1000 cell_defer(cache, mg->new_ocell, false);
960 } 1001 }
@@ -978,7 +1019,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
978 wake_worker(cache); 1019 wake_worker(cache);
979} 1020}
980 1021
981static void issue_copy_real(struct dm_cache_migration *mg) 1022static void issue_copy(struct dm_cache_migration *mg)
982{ 1023{
983 int r; 1024 int r;
984 struct dm_io_region o_region, c_region; 1025 struct dm_io_region o_region, c_region;
@@ -1057,11 +1098,46 @@ static void avoid_copy(struct dm_cache_migration *mg)
1057 migration_success_pre_commit(mg); 1098 migration_success_pre_commit(mg);
1058} 1099}
1059 1100
1060static void issue_copy(struct dm_cache_migration *mg) 1101static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1102 dm_dblock_t *b, dm_dblock_t *e)
1103{
1104 sector_t sb = bio->bi_iter.bi_sector;
1105 sector_t se = bio_end_sector(bio);
1106
1107 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1108
1109 if (se - sb < cache->discard_block_size)
1110 *e = *b;
1111 else
1112 *e = to_dblock(block_div(se, cache->discard_block_size));
1113}
1114
1115static void issue_discard(struct dm_cache_migration *mg)
1116{
1117 dm_dblock_t b, e;
1118 struct bio *bio = mg->new_ocell->holder;
1119
1120 calc_discard_block_range(mg->cache, bio, &b, &e);
1121 while (b != e) {
1122 set_discard(mg->cache, b);
1123 b = to_dblock(from_dblock(b) + 1);
1124 }
1125
1126 bio_endio(bio, 0);
1127 cell_defer(mg->cache, mg->new_ocell, false);
1128 free_migration(mg);
1129}
1130
1131static void issue_copy_or_discard(struct dm_cache_migration *mg)
1061{ 1132{
1062 bool avoid; 1133 bool avoid;
1063 struct cache *cache = mg->cache; 1134 struct cache *cache = mg->cache;
1064 1135
1136 if (mg->discard) {
1137 issue_discard(mg);
1138 return;
1139 }
1140
1065 if (mg->writeback || mg->demote) 1141 if (mg->writeback || mg->demote)
1066 avoid = !is_dirty(cache, mg->cblock) || 1142 avoid = !is_dirty(cache, mg->cblock) ||
1067 is_discarded_oblock(cache, mg->old_oblock); 1143 is_discarded_oblock(cache, mg->old_oblock);
@@ -1070,13 +1146,14 @@ static void issue_copy(struct dm_cache_migration *mg)
1070 1146
1071 avoid = is_discarded_oblock(cache, mg->new_oblock); 1147 avoid = is_discarded_oblock(cache, mg->new_oblock);
1072 1148
1073 if (!avoid && bio_writes_complete_block(cache, bio)) { 1149 if (writeback_mode(&cache->features) &&
1150 !avoid && bio_writes_complete_block(cache, bio)) {
1074 issue_overwrite(mg, bio); 1151 issue_overwrite(mg, bio);
1075 return; 1152 return;
1076 } 1153 }
1077 } 1154 }
1078 1155
1079 avoid ? avoid_copy(mg) : issue_copy_real(mg); 1156 avoid ? avoid_copy(mg) : issue_copy(mg);
1080} 1157}
1081 1158
1082static void complete_migration(struct dm_cache_migration *mg) 1159static void complete_migration(struct dm_cache_migration *mg)
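calc_discard_block_range() above only marks discard blocks that the bio covers completely: the start sector is rounded up to a discard-block boundary, the end sector is rounded down, and a bio shorter than one block yields an empty range. A minimal userspace sketch of that arithmetic; the names and numbers here are illustrative and not part of the patch:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Model of calc_discard_block_range(): round the start up and the end
 * down so that only whole discard blocks are marked. */
static void discard_block_range(sector_t bi_sector, sector_t nr_sectors,
                                sector_t discard_block_size,
                                uint64_t *b, uint64_t *e)
{
        sector_t sb = bi_sector;
        sector_t se = bi_sector + nr_sectors;   /* bio_end_sector() */

        *b = (sb + discard_block_size - 1) / discard_block_size;

        if (se - sb < discard_block_size)
                *e = *b;        /* bio smaller than one block: empty range */
        else
                *e = se / discard_block_size;
}

int main(void)
{
        uint64_t b, e;

        /* A 1 MiB discard starting at sector 100 with 128-sector (64 KiB)
         * discard blocks marks blocks [1, 16); the partially covered head
         * and tail blocks are skipped. */
        discard_block_range(100, 2048, 128, &b, &e);
        printf("blocks [%llu, %llu)\n",
               (unsigned long long) b, (unsigned long long) e);
        return 0;
}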
@@ -1161,6 +1238,7 @@ static void promote(struct cache *cache, struct prealloc *structs,
1161 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1238 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1162 1239
1163 mg->err = false; 1240 mg->err = false;
1241 mg->discard = false;
1164 mg->writeback = false; 1242 mg->writeback = false;
1165 mg->demote = false; 1243 mg->demote = false;
1166 mg->promote = true; 1244 mg->promote = true;
@@ -1184,6 +1262,7 @@ static void writeback(struct cache *cache, struct prealloc *structs,
1184 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1262 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1185 1263
1186 mg->err = false; 1264 mg->err = false;
1265 mg->discard = false;
1187 mg->writeback = true; 1266 mg->writeback = true;
1188 mg->demote = false; 1267 mg->demote = false;
1189 mg->promote = false; 1268 mg->promote = false;
@@ -1209,6 +1288,7 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1209 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1288 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1210 1289
1211 mg->err = false; 1290 mg->err = false;
1291 mg->discard = false;
1212 mg->writeback = false; 1292 mg->writeback = false;
1213 mg->demote = true; 1293 mg->demote = true;
1214 mg->promote = true; 1294 mg->promote = true;
@@ -1237,6 +1317,7 @@ static void invalidate(struct cache *cache, struct prealloc *structs,
1237 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1317 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1238 1318
1239 mg->err = false; 1319 mg->err = false;
1320 mg->discard = false;
1240 mg->writeback = false; 1321 mg->writeback = false;
1241 mg->demote = true; 1322 mg->demote = true;
1242 mg->promote = false; 1323 mg->promote = false;
@@ -1253,6 +1334,26 @@ static void invalidate(struct cache *cache, struct prealloc *structs,
1253 quiesce_migration(mg); 1334 quiesce_migration(mg);
1254} 1335}
1255 1336
1337static void discard(struct cache *cache, struct prealloc *structs,
1338 struct dm_bio_prison_cell *cell)
1339{
1340 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1341
1342 mg->err = false;
1343 mg->discard = true;
1344 mg->writeback = false;
1345 mg->demote = false;
1346 mg->promote = false;
1347 mg->requeue_holder = false;
1348 mg->invalidate = false;
1349 mg->cache = cache;
1350 mg->old_ocell = NULL;
1351 mg->new_ocell = cell;
1352 mg->start_jiffies = jiffies;
1353
1354 quiesce_migration(mg);
1355}
1356
1256/*---------------------------------------------------------------- 1357/*----------------------------------------------------------------
1257 * bio processing 1358 * bio processing
1258 *--------------------------------------------------------------*/ 1359 *--------------------------------------------------------------*/
@@ -1286,31 +1387,27 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
1286 issue(cache, bio); 1387 issue(cache, bio);
1287} 1388}
1288 1389
1289/* 1390static void process_discard_bio(struct cache *cache, struct prealloc *structs,
1290 * People generally discard large parts of a device, eg, the whole device 1391 struct bio *bio)
1291 * when formatting. Splitting these large discards up into cache block
1292 * sized ios and then quiescing (always neccessary for discard) takes too
1293 * long.
1294 *
1295 * We keep it simple, and allow any size of discard to come in, and just
1296 * mark off blocks on the discard bitset. No passdown occurs!
1297 *
1298 * To implement passdown we need to change the bio_prison such that a cell
1299 * can have a key that spans many blocks.
1300 */
1301static void process_discard_bio(struct cache *cache, struct bio *bio)
1302{ 1392{
1303 dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector, 1393 int r;
1304 cache->sectors_per_block); 1394 dm_dblock_t b, e;
1305 dm_block_t end_block = bio_end_sector(bio); 1395 struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1306 dm_block_t b;
1307 1396
1308 end_block = block_div(end_block, cache->sectors_per_block); 1397 calc_discard_block_range(cache, bio, &b, &e);
1398 if (b == e) {
1399 bio_endio(bio, 0);
1400 return;
1401 }
1309 1402
1310 for (b = start_block; b < end_block; b++) 1403 cell_prealloc = prealloc_get_cell(structs);
1311 set_discard(cache, to_oblock(b)); 1404 r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
1405 (cell_free_fn) prealloc_put_cell,
1406 structs, &new_ocell);
1407 if (r > 0)
1408 return;
1312 1409
1313 bio_endio(bio, 0); 1410 discard(cache, structs, new_ocell);
1314} 1411}
1315 1412
1316static bool spare_migration_bandwidth(struct cache *cache) 1413static bool spare_migration_bandwidth(struct cache *cache)
@@ -1340,9 +1437,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
1340 dm_oblock_t block = get_bio_block(cache, bio); 1437 dm_oblock_t block = get_bio_block(cache, bio);
1341 struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; 1438 struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1342 struct policy_result lookup_result; 1439 struct policy_result lookup_result;
1343 bool discarded_block = is_discarded_oblock(cache, block);
1344 bool passthrough = passthrough_mode(&cache->features); 1440 bool passthrough = passthrough_mode(&cache->features);
1345 bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); 1441 bool discarded_block, can_migrate;
1346 1442
1347 /* 1443 /*
1348 * Check to see if that block is currently migrating. 1444 * Check to see if that block is currently migrating.
@@ -1354,6 +1450,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
1354 if (r > 0) 1450 if (r > 0)
1355 return; 1451 return;
1356 1452
1453 discarded_block = is_discarded_oblock(cache, block);
1454 can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
1455
1357 r = policy_map(cache->policy, block, true, can_migrate, discarded_block, 1456 r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1358 bio, &lookup_result); 1457 bio, &lookup_result);
1359 1458
@@ -1500,7 +1599,7 @@ static void process_deferred_bios(struct cache *cache)
1500 if (bio->bi_rw & REQ_FLUSH) 1599 if (bio->bi_rw & REQ_FLUSH)
1501 process_flush_bio(cache, bio); 1600 process_flush_bio(cache, bio);
1502 else if (bio->bi_rw & REQ_DISCARD) 1601 else if (bio->bi_rw & REQ_DISCARD)
1503 process_discard_bio(cache, bio); 1602 process_discard_bio(cache, &structs, bio);
1504 else 1603 else
1505 process_bio(cache, &structs, bio); 1604 process_bio(cache, &structs, bio);
1506 } 1605 }
@@ -1715,7 +1814,7 @@ static void do_worker(struct work_struct *ws)
1715 process_invalidation_requests(cache); 1814 process_invalidation_requests(cache);
1716 } 1815 }
1717 1816
1718 process_migrations(cache, &cache->quiesced_migrations, issue_copy); 1817 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
1719 process_migrations(cache, &cache->completed_migrations, complete_migration); 1818 process_migrations(cache, &cache->completed_migrations, complete_migration);
1720 1819
1721 if (commit_if_needed(cache)) { 1820 if (commit_if_needed(cache)) {
@@ -2180,6 +2279,45 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2180 return 0; 2279 return 0;
2181} 2280}
2182 2281
2282/*
 2283 * We want the discard block size to be at least as large as the cache
 2284 * block size, and we want no more than 2^14 discard blocks across the origin.
2285 */
2286#define MAX_DISCARD_BLOCKS (1 << 14)
2287
2288static bool too_many_discard_blocks(sector_t discard_block_size,
2289 sector_t origin_size)
2290{
2291 (void) sector_div(origin_size, discard_block_size);
2292
2293 return origin_size > MAX_DISCARD_BLOCKS;
2294}
2295
2296static sector_t calculate_discard_block_size(sector_t cache_block_size,
2297 sector_t origin_size)
2298{
2299 sector_t discard_block_size = cache_block_size;
2300
2301 if (origin_size)
2302 while (too_many_discard_blocks(discard_block_size, origin_size))
2303 discard_block_size *= 2;
2304
2305 return discard_block_size;
2306}
2307
2308static void set_cache_size(struct cache *cache, dm_cblock_t size)
2309{
2310 dm_block_t nr_blocks = from_cblock(size);
2311
2312 if (nr_blocks > (1 << 20) && cache->cache_size != size)
2313 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2314 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2315 "Please consider increasing the cache block size to reduce the overall cache block count.",
2316 (unsigned long long) nr_blocks);
2317
2318 cache->cache_size = size;
2319}
2320
2183#define DEFAULT_MIGRATION_THRESHOLD 2048 2321#define DEFAULT_MIGRATION_THRESHOLD 2048
2184 2322
2185static int cache_create(struct cache_args *ca, struct cache **result) 2323static int cache_create(struct cache_args *ca, struct cache **result)
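To make the sizing rule in this hunk concrete, here is a minimal userspace model of calculate_discard_block_size() with an illustrative worked example; the standalone function and the numbers are not part of the patch:

#include <stdio.h>
#include <stdint.h>

#define MAX_DISCARD_BLOCKS (1 << 14)

/* Model of calculate_discard_block_size(): start at the cache block size
 * and keep doubling until the origin is covered by at most 2^14 blocks. */
static uint64_t discard_block_size(uint64_t cache_block_size,
                                   uint64_t origin_size)
{
        uint64_t dbs = cache_block_size;

        if (origin_size)
                while (origin_size / dbs > MAX_DISCARD_BLOCKS)
                        dbs *= 2;

        return dbs;
}

int main(void)
{
        /* 64 KiB cache blocks (128 sectors) on a 2 TiB origin (1 << 32
         * sectors) would mean 2^25 cache-block-sized discard blocks, so the
         * size doubles up to 2^18 sectors (128 MiB), leaving exactly 2^14
         * discard blocks. */
        printf("%llu sectors\n",
               (unsigned long long) discard_block_size(128, 1ULL << 32));
        return 0;
}

The set_cache_size() warning in the same hunk is a similar back-of-the-envelope check: a cache with more than 2^20 mappings (for example a 1 TiB cache using 32 KiB blocks, which has 2^25) starts to cost noticeable kernel memory and metadata load/save time.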
@@ -2204,8 +2342,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2204 ti->num_discard_bios = 1; 2342 ti->num_discard_bios = 1;
2205 ti->discards_supported = true; 2343 ti->discards_supported = true;
2206 ti->discard_zeroes_data_unsupported = true; 2344 ti->discard_zeroes_data_unsupported = true;
2207 /* Discard bios must be split on a block boundary */ 2345 ti->split_discard_bios = false;
2208 ti->split_discard_bios = true;
2209 2346
2210 cache->features = ca->features; 2347 cache->features = ca->features;
2211 ti->per_bio_data_size = get_per_bio_data_size(cache); 2348 ti->per_bio_data_size = get_per_bio_data_size(cache);
@@ -2235,10 +2372,10 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2235 2372
2236 cache->sectors_per_block_shift = -1; 2373 cache->sectors_per_block_shift = -1;
2237 cache_size = block_div(cache_size, ca->block_size); 2374 cache_size = block_div(cache_size, ca->block_size);
2238 cache->cache_size = to_cblock(cache_size); 2375 set_cache_size(cache, to_cblock(cache_size));
2239 } else { 2376 } else {
2240 cache->sectors_per_block_shift = __ffs(ca->block_size); 2377 cache->sectors_per_block_shift = __ffs(ca->block_size);
2241 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift); 2378 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2242 } 2379 }
2243 2380
2244 r = create_cache_policy(cache, ca, error); 2381 r = create_cache_policy(cache, ca, error);
@@ -2303,13 +2440,17 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2303 } 2440 }
2304 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2441 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2305 2442
2306 cache->discard_nr_blocks = cache->origin_blocks; 2443 cache->discard_block_size =
2307 cache->discard_bitset = alloc_bitset(from_oblock(cache->discard_nr_blocks)); 2444 calculate_discard_block_size(cache->sectors_per_block,
2445 cache->origin_sectors);
2446 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2447 cache->discard_block_size));
2448 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2308 if (!cache->discard_bitset) { 2449 if (!cache->discard_bitset) {
2309 *error = "could not allocate discard bitset"; 2450 *error = "could not allocate discard bitset";
2310 goto bad; 2451 goto bad;
2311 } 2452 }
2312 clear_bitset(cache->discard_bitset, from_oblock(cache->discard_nr_blocks)); 2453 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2313 2454
2314 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2455 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2315 if (IS_ERR(cache->copier)) { 2456 if (IS_ERR(cache->copier)) {
@@ -2327,7 +2468,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2327 INIT_DELAYED_WORK(&cache->waker, do_waker); 2468 INIT_DELAYED_WORK(&cache->waker, do_waker);
2328 cache->last_commit_jiffies = jiffies; 2469 cache->last_commit_jiffies = jiffies;
2329 2470
2330 cache->prison = dm_bio_prison_create(PRISON_CELLS); 2471 cache->prison = dm_bio_prison_create();
2331 if (!cache->prison) { 2472 if (!cache->prison) {
2332 *error = "could not create bio prison"; 2473 *error = "could not create bio prison";
2333 goto bad; 2474 goto bad;
@@ -2549,11 +2690,11 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
2549static int cache_map(struct dm_target *ti, struct bio *bio) 2690static int cache_map(struct dm_target *ti, struct bio *bio)
2550{ 2691{
2551 int r; 2692 int r;
2552 struct dm_bio_prison_cell *cell; 2693 struct dm_bio_prison_cell *cell = NULL;
2553 struct cache *cache = ti->private; 2694 struct cache *cache = ti->private;
2554 2695
2555 r = __cache_map(cache, bio, &cell); 2696 r = __cache_map(cache, bio, &cell);
2556 if (r == DM_MAPIO_REMAPPED) { 2697 if (r == DM_MAPIO_REMAPPED && cell) {
2557 inc_ds(cache, bio, cell); 2698 inc_ds(cache, bio, cell);
2558 cell_defer(cache, cell, false); 2699 cell_defer(cache, cell, false);
2559 } 2700 }
@@ -2599,16 +2740,16 @@ static int write_discard_bitset(struct cache *cache)
2599{ 2740{
2600 unsigned i, r; 2741 unsigned i, r;
2601 2742
2602 r = dm_cache_discard_bitset_resize(cache->cmd, cache->sectors_per_block, 2743 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2603 cache->origin_blocks); 2744 cache->discard_nr_blocks);
2604 if (r) { 2745 if (r) {
2605 DMERR("could not resize on-disk discard bitset"); 2746 DMERR("could not resize on-disk discard bitset");
2606 return r; 2747 return r;
2607 } 2748 }
2608 2749
2609 for (i = 0; i < from_oblock(cache->discard_nr_blocks); i++) { 2750 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2610 r = dm_cache_set_discard(cache->cmd, to_oblock(i), 2751 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2611 is_discarded(cache, to_oblock(i))); 2752 is_discarded(cache, to_dblock(i)));
2612 if (r) 2753 if (r)
2613 return r; 2754 return r;
2614 } 2755 }
@@ -2680,15 +2821,86 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2680 return 0; 2821 return 0;
2681} 2822}
2682 2823
2824/*
 2825 * The discard block size in the on-disk metadata is not
 2826 * necessarily the same as the one we're currently using. So we have to
2827 * be careful to only set the discarded attribute if we know it
2828 * covers a complete block of the new size.
2829 */
2830struct discard_load_info {
2831 struct cache *cache;
2832
2833 /*
2834 * These blocks are sized using the on disk dblock size, rather
2835 * than the current one.
2836 */
2837 dm_block_t block_size;
2838 dm_block_t discard_begin, discard_end;
2839};
2840
2841static void discard_load_info_init(struct cache *cache,
2842 struct discard_load_info *li)
2843{
2844 li->cache = cache;
2845 li->discard_begin = li->discard_end = 0;
2846}
2847
2848static void set_discard_range(struct discard_load_info *li)
2849{
2850 sector_t b, e;
2851
2852 if (li->discard_begin == li->discard_end)
2853 return;
2854
2855 /*
2856 * Convert to sectors.
2857 */
2858 b = li->discard_begin * li->block_size;
2859 e = li->discard_end * li->block_size;
2860
2861 /*
2862 * Then convert back to the current dblock size.
2863 */
2864 b = dm_sector_div_up(b, li->cache->discard_block_size);
2865 sector_div(e, li->cache->discard_block_size);
2866
2867 /*
2868 * The origin may have shrunk, so we need to check we're still in
2869 * bounds.
2870 */
2871 if (e > from_dblock(li->cache->discard_nr_blocks))
2872 e = from_dblock(li->cache->discard_nr_blocks);
2873
2874 for (; b < e; b++)
2875 set_discard(li->cache, to_dblock(b));
2876}
2877
2683static int load_discard(void *context, sector_t discard_block_size, 2878static int load_discard(void *context, sector_t discard_block_size,
2684 dm_oblock_t oblock, bool discard) 2879 dm_dblock_t dblock, bool discard)
2685{ 2880{
2686 struct cache *cache = context; 2881 struct discard_load_info *li = context;
2687 2882
2688 if (discard) 2883 li->block_size = discard_block_size;
2689 set_discard(cache, oblock); 2884
2690 else 2885 if (discard) {
2691 clear_discard(cache, oblock); 2886 if (from_dblock(dblock) == li->discard_end)
2887 /*
2888 * We're already in a discard range, just extend it.
2889 */
2890 li->discard_end = li->discard_end + 1ULL;
2891
2892 else {
2893 /*
2894 * Emit the old range and start a new one.
2895 */
2896 set_discard_range(li);
2897 li->discard_begin = from_dblock(dblock);
2898 li->discard_end = li->discard_begin + 1ULL;
2899 }
2900 } else {
2901 set_discard_range(li);
2902 li->discard_begin = li->discard_end = 0;
2903 }
2692 2904
2693 return 0; 2905 return 0;
2694} 2906}
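A worked example of the conversion set_discard_range() performs (the numbers are illustrative): if the metadata was written with a 128-sector discard block and records dblocks [10, 20) as discarded, that is sectors [1280, 2560). With a current discard block size of 512 sectors, the begin is rounded up to dblock 3 and the end rounded down to dblock 5, so only blocks 3 and 4 (sectors [1536, 2560)) are re-marked; block 2, which is only partially covered, stays clear, in line with the "only set the discarded attribute if we know it covers a complete block of the new size" rule above.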
@@ -2730,7 +2942,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
2730 return r; 2942 return r;
2731 } 2943 }
2732 2944
2733 cache->cache_size = new_size; 2945 set_cache_size(cache, new_size);
2734 2946
2735 return 0; 2947 return 0;
2736} 2948}
@@ -2772,11 +2984,22 @@ static int cache_preresume(struct dm_target *ti)
2772 } 2984 }
2773 2985
2774 if (!cache->loaded_discards) { 2986 if (!cache->loaded_discards) {
2775 r = dm_cache_load_discards(cache->cmd, load_discard, cache); 2987 struct discard_load_info li;
2988
2989 /*
2990 * The discard bitset could have been resized, or the
2991 * discard block size changed. To be safe we start by
2992 * setting every dblock to not discarded.
2993 */
2994 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2995
2996 discard_load_info_init(cache, &li);
2997 r = dm_cache_load_discards(cache->cmd, load_discard, &li);
2776 if (r) { 2998 if (r) {
2777 DMERR("could not load origin discards"); 2999 DMERR("could not load origin discards");
2778 return r; 3000 return r;
2779 } 3001 }
3002 set_discard_range(&li);
2780 3003
2781 cache->loaded_discards = true; 3004 cache->loaded_discards = true;
2782 } 3005 }
@@ -3079,8 +3302,9 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3079 /* 3302 /*
3080 * FIXME: these limits may be incompatible with the cache device 3303 * FIXME: these limits may be incompatible with the cache device
3081 */ 3304 */
3082 limits->max_discard_sectors = cache->sectors_per_block; 3305 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3083 limits->discard_granularity = cache->sectors_per_block << SECTOR_SHIFT; 3306 cache->origin_sectors);
3307 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3084} 3308}
3085 3309
3086static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3310static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -3104,7 +3328,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3104 3328
3105static struct target_type cache_target = { 3329static struct target_type cache_target = {
3106 .name = "cache", 3330 .name = "cache",
3107 .version = {1, 5, 0}, 3331 .version = {1, 6, 0},
3108 .module = THIS_MODULE, 3332 .module = THIS_MODULE,
3109 .ctr = cache_ctr, 3333 .ctr = cache_ctr,
3110 .dtr = cache_dtr, 3334 .dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index fc93b9330af4..08981be7baa1 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -705,7 +705,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc,
705 for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) 705 for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++)
706 crypto_xor(data + i * 8, buf, 8); 706 crypto_xor(data + i * 8, buf, 8);
707out: 707out:
708 memset(buf, 0, sizeof(buf)); 708 memzero_explicit(buf, sizeof(buf));
709 return r; 709 return r;
710} 710}
711 711
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 0be9381365d7..73f791bb9ea4 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -684,11 +684,14 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
684 int srcu_idx; 684 int srcu_idx;
685 685
686 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | 686 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
687 DM_ACTIVE_PRESENT_FLAG); 687 DM_ACTIVE_PRESENT_FLAG | DM_INTERNAL_SUSPEND_FLAG);
688 688
689 if (dm_suspended_md(md)) 689 if (dm_suspended_md(md))
690 param->flags |= DM_SUSPEND_FLAG; 690 param->flags |= DM_SUSPEND_FLAG;
691 691
692 if (dm_suspended_internally_md(md))
693 param->flags |= DM_INTERNAL_SUSPEND_FLAG;
694
692 if (dm_test_deferred_remove_flag(md)) 695 if (dm_test_deferred_remove_flag(md))
693 param->flags |= DM_DEFERRED_REMOVE; 696 param->flags |= DM_DEFERRED_REMOVE;
694 697
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 87f86c77b094..f478a4c96d2f 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -824,7 +824,7 @@ static int message_stats_create(struct mapped_device *md,
824 return 1; 824 return 1;
825 825
826 id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data, 826 id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
827 dm_internal_suspend, dm_internal_resume, md); 827 dm_internal_suspend_fast, dm_internal_resume_fast, md);
828 if (id < 0) 828 if (id < 0)
829 return id; 829 return id;
830 830
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index b2bd1ebf4562..3afae9e062f8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1521,18 +1521,32 @@ fmode_t dm_table_get_mode(struct dm_table *t)
1521} 1521}
1522EXPORT_SYMBOL(dm_table_get_mode); 1522EXPORT_SYMBOL(dm_table_get_mode);
1523 1523
1524static void suspend_targets(struct dm_table *t, unsigned postsuspend) 1524enum suspend_mode {
1525 PRESUSPEND,
1526 PRESUSPEND_UNDO,
1527 POSTSUSPEND,
1528};
1529
1530static void suspend_targets(struct dm_table *t, enum suspend_mode mode)
1525{ 1531{
1526 int i = t->num_targets; 1532 int i = t->num_targets;
1527 struct dm_target *ti = t->targets; 1533 struct dm_target *ti = t->targets;
1528 1534
1529 while (i--) { 1535 while (i--) {
1530 if (postsuspend) { 1536 switch (mode) {
1537 case PRESUSPEND:
1538 if (ti->type->presuspend)
1539 ti->type->presuspend(ti);
1540 break;
1541 case PRESUSPEND_UNDO:
1542 if (ti->type->presuspend_undo)
1543 ti->type->presuspend_undo(ti);
1544 break;
1545 case POSTSUSPEND:
1531 if (ti->type->postsuspend) 1546 if (ti->type->postsuspend)
1532 ti->type->postsuspend(ti); 1547 ti->type->postsuspend(ti);
1533 } else if (ti->type->presuspend) 1548 break;
1534 ti->type->presuspend(ti); 1549 }
1535
1536 ti++; 1550 ti++;
1537 } 1551 }
1538} 1552}
@@ -1542,7 +1556,15 @@ void dm_table_presuspend_targets(struct dm_table *t)
1542 if (!t) 1556 if (!t)
1543 return; 1557 return;
1544 1558
1545 suspend_targets(t, 0); 1559 suspend_targets(t, PRESUSPEND);
1560}
1561
1562void dm_table_presuspend_undo_targets(struct dm_table *t)
1563{
1564 if (!t)
1565 return;
1566
1567 suspend_targets(t, PRESUSPEND_UNDO);
1546} 1568}
1547 1569
1548void dm_table_postsuspend_targets(struct dm_table *t) 1570void dm_table_postsuspend_targets(struct dm_table *t)
@@ -1550,7 +1572,7 @@ void dm_table_postsuspend_targets(struct dm_table *t)
1550 if (!t) 1572 if (!t)
1551 return; 1573 return;
1552 1574
1553 suspend_targets(t, 1); 1575 suspend_targets(t, POSTSUSPEND);
1554} 1576}
1555 1577
1556int dm_table_resume_targets(struct dm_table *t) 1578int dm_table_resume_targets(struct dm_table *t)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index e9d33ad59df5..43adbb863f5a 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1384,42 +1384,38 @@ static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1384} 1384}
1385 1385
1386int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, 1386int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
1387 int can_block, struct dm_thin_lookup_result *result) 1387 int can_issue_io, struct dm_thin_lookup_result *result)
1388{ 1388{
1389 int r = -EINVAL; 1389 int r;
1390 uint64_t block_time = 0;
1391 __le64 value; 1390 __le64 value;
1392 struct dm_pool_metadata *pmd = td->pmd; 1391 struct dm_pool_metadata *pmd = td->pmd;
1393 dm_block_t keys[2] = { td->id, block }; 1392 dm_block_t keys[2] = { td->id, block };
1394 struct dm_btree_info *info; 1393 struct dm_btree_info *info;
1395 1394
1396 if (can_block) {
1397 down_read(&pmd->root_lock);
1398 info = &pmd->info;
1399 } else if (down_read_trylock(&pmd->root_lock))
1400 info = &pmd->nb_info;
1401 else
1402 return -EWOULDBLOCK;
1403
1404 if (pmd->fail_io) 1395 if (pmd->fail_io)
1405 goto out; 1396 return -EINVAL;
1406 1397
1407 r = dm_btree_lookup(info, pmd->root, keys, &value); 1398 down_read(&pmd->root_lock);
1408 if (!r)
1409 block_time = le64_to_cpu(value);
1410 1399
1411out: 1400 if (can_issue_io) {
1412 up_read(&pmd->root_lock); 1401 info = &pmd->info;
1402 } else
1403 info = &pmd->nb_info;
1413 1404
1405 r = dm_btree_lookup(info, pmd->root, keys, &value);
1414 if (!r) { 1406 if (!r) {
1407 uint64_t block_time = 0;
1415 dm_block_t exception_block; 1408 dm_block_t exception_block;
1416 uint32_t exception_time; 1409 uint32_t exception_time;
1410
1411 block_time = le64_to_cpu(value);
1417 unpack_block_time(block_time, &exception_block, 1412 unpack_block_time(block_time, &exception_block,
1418 &exception_time); 1413 &exception_time);
1419 result->block = exception_block; 1414 result->block = exception_block;
1420 result->shared = __snapshotted_since(td, exception_time); 1415 result->shared = __snapshotted_since(td, exception_time);
1421 } 1416 }
1422 1417
1418 up_read(&pmd->root_lock);
1423 return r; 1419 return r;
1424} 1420}
1425 1421
@@ -1813,3 +1809,8 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
1813 1809
1814 return needs_check; 1810 return needs_check;
1815} 1811}
1812
1813void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
1814{
1815 dm_tm_issue_prefetches(pmd->tm);
1816}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index e3c857db195a..921d15ee56a0 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -139,12 +139,12 @@ struct dm_thin_lookup_result {
139 139
140/* 140/*
141 * Returns: 141 * Returns:
142 * -EWOULDBLOCK iff @can_block is set and would block. 142 * -EWOULDBLOCK iff @can_issue_io is set and would issue IO
143 * -ENODATA iff that mapping is not present. 143 * -ENODATA iff that mapping is not present.
144 * 0 success 144 * 0 success
145 */ 145 */
146int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, 146int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
147 int can_block, struct dm_thin_lookup_result *result); 147 int can_issue_io, struct dm_thin_lookup_result *result);
148 148
149/* 149/*
150 * Obtain an unused block. 150 * Obtain an unused block.
@@ -213,6 +213,11 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
213int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd); 213int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
214bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd); 214bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
215 215
216/*
217 * Issue any prefetches that may be useful.
218 */
219void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd);
220
216/*----------------------------------------------------------------*/ 221/*----------------------------------------------------------------*/
217 222
218#endif 223#endif
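The reworked dm_thin_find_block() now always takes the read lock and merely chooses between the blocking (pmd->info) and non-blocking (pmd->nb_info) btree variants. As an illustration only (this fragment is not part of the patch), a fast-path caller that must not block would typically treat -EWOULDBLOCK as "defer to the worker", which retries the lookup with can_issue_io set:

/* Illustrative sketch: non-blocking lookup from a map path. */
r = dm_thin_find_block(tc->td, block, 0, &result);
if (r == -EWOULDBLOCK) {
        /*
         * The lookup would have to read metadata; hand the bio to the
         * worker thread, which calls dm_thin_find_block() with
         * can_issue_io = 1.
         */
        thin_defer_bio(tc, bio);
        return DM_MAPIO_SUBMITTED;
}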
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 0f86d802b533..8735543eacdb 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -11,11 +11,13 @@
11#include <linux/device-mapper.h> 11#include <linux/device-mapper.h>
12#include <linux/dm-io.h> 12#include <linux/dm-io.h>
13#include <linux/dm-kcopyd.h> 13#include <linux/dm-kcopyd.h>
14#include <linux/log2.h>
14#include <linux/list.h> 15#include <linux/list.h>
15#include <linux/rculist.h> 16#include <linux/rculist.h>
16#include <linux/init.h> 17#include <linux/init.h>
17#include <linux/module.h> 18#include <linux/module.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/sort.h>
19#include <linux/rbtree.h> 21#include <linux/rbtree.h>
20 22
21#define DM_MSG_PREFIX "thin" 23#define DM_MSG_PREFIX "thin"
@@ -25,7 +27,6 @@
25 */ 27 */
26#define ENDIO_HOOK_POOL_SIZE 1024 28#define ENDIO_HOOK_POOL_SIZE 1024
27#define MAPPING_POOL_SIZE 1024 29#define MAPPING_POOL_SIZE 1024
28#define PRISON_CELLS 1024
29#define COMMIT_PERIOD HZ 30#define COMMIT_PERIOD HZ
30#define NO_SPACE_TIMEOUT_SECS 60 31#define NO_SPACE_TIMEOUT_SECS 60
31 32
@@ -114,7 +115,8 @@ static void build_data_key(struct dm_thin_device *td,
114{ 115{
115 key->virtual = 0; 116 key->virtual = 0;
116 key->dev = dm_thin_dev_id(td); 117 key->dev = dm_thin_dev_id(td);
117 key->block = b; 118 key->block_begin = b;
119 key->block_end = b + 1ULL;
118} 120}
119 121
120static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, 122static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
@@ -122,7 +124,55 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
122{ 124{
123 key->virtual = 1; 125 key->virtual = 1;
124 key->dev = dm_thin_dev_id(td); 126 key->dev = dm_thin_dev_id(td);
125 key->block = b; 127 key->block_begin = b;
128 key->block_end = b + 1ULL;
129}
130
131/*----------------------------------------------------------------*/
132
133#define THROTTLE_THRESHOLD (1 * HZ)
134
135struct throttle {
136 struct rw_semaphore lock;
137 unsigned long threshold;
138 bool throttle_applied;
139};
140
141static void throttle_init(struct throttle *t)
142{
143 init_rwsem(&t->lock);
144 t->throttle_applied = false;
145}
146
147static void throttle_work_start(struct throttle *t)
148{
149 t->threshold = jiffies + THROTTLE_THRESHOLD;
150}
151
152static void throttle_work_update(struct throttle *t)
153{
154 if (!t->throttle_applied && jiffies > t->threshold) {
155 down_write(&t->lock);
156 t->throttle_applied = true;
157 }
158}
159
160static void throttle_work_complete(struct throttle *t)
161{
162 if (t->throttle_applied) {
163 t->throttle_applied = false;
164 up_write(&t->lock);
165 }
166}
167
168static void throttle_lock(struct throttle *t)
169{
170 down_read(&t->lock);
171}
172
173static void throttle_unlock(struct throttle *t)
174{
175 up_read(&t->lock);
126} 176}
127 177
128/*----------------------------------------------------------------*/ 178/*----------------------------------------------------------------*/
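Nothing in this hunk shows the throttle's callers, so the following is only a sketch of the intended protocol, assuming a struct throttle embedded in struct pool (as added further down) and the thin_defer_bio() declared later in this patch: the worker brackets each pass with throttle_work_start()/throttle_work_complete() and calls throttle_work_update() between phases, while submitters wrap their deferrals in throttle_lock()/throttle_unlock(). Once a pass has been running longer than THROTTLE_THRESHOLD the worker takes the rw_semaphore for write, so new submitters (readers) wait until that pass completes.

/* Illustrative sketch only; not part of the patch. */
static void example_worker_pass(struct pool *pool)
{
        throttle_work_start(&pool->throttle);
        /* ... process one batch of deferred bios/cells ... */
        throttle_work_update(&pool->throttle);
        /* ... process the next batch ... */
        throttle_work_complete(&pool->throttle);
}

static void example_defer_bio(struct thin_c *tc, struct bio *bio)
{
        throttle_lock(&tc->pool->throttle);
        thin_defer_bio(tc, bio);
        throttle_unlock(&tc->pool->throttle);
}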
@@ -155,8 +205,11 @@ struct pool_features {
155 205
156struct thin_c; 206struct thin_c;
157typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); 207typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
208typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
158typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); 209typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
159 210
211#define CELL_SORT_ARRAY_SIZE 8192
212
160struct pool { 213struct pool {
161 struct list_head list; 214 struct list_head list;
162 struct dm_target *ti; /* Only set if a pool target is bound */ 215 struct dm_target *ti; /* Only set if a pool target is bound */
@@ -171,11 +224,13 @@ struct pool {
171 224
172 struct pool_features pf; 225 struct pool_features pf;
173 bool low_water_triggered:1; /* A dm event has been sent */ 226 bool low_water_triggered:1; /* A dm event has been sent */
227 bool suspended:1;
174 228
175 struct dm_bio_prison *prison; 229 struct dm_bio_prison *prison;
176 struct dm_kcopyd_client *copier; 230 struct dm_kcopyd_client *copier;
177 231
178 struct workqueue_struct *wq; 232 struct workqueue_struct *wq;
233 struct throttle throttle;
179 struct work_struct worker; 234 struct work_struct worker;
180 struct delayed_work waker; 235 struct delayed_work waker;
181 struct delayed_work no_space_timeout; 236 struct delayed_work no_space_timeout;
@@ -198,8 +253,13 @@ struct pool {
198 process_bio_fn process_bio; 253 process_bio_fn process_bio;
199 process_bio_fn process_discard; 254 process_bio_fn process_discard;
200 255
256 process_cell_fn process_cell;
257 process_cell_fn process_discard_cell;
258
201 process_mapping_fn process_prepared_mapping; 259 process_mapping_fn process_prepared_mapping;
202 process_mapping_fn process_prepared_discard; 260 process_mapping_fn process_prepared_discard;
261
262 struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE];
203}; 263};
204 264
205static enum pool_mode get_pool_mode(struct pool *pool); 265static enum pool_mode get_pool_mode(struct pool *pool);
@@ -232,8 +292,11 @@ struct thin_c {
232 292
233 struct pool *pool; 293 struct pool *pool;
234 struct dm_thin_device *td; 294 struct dm_thin_device *td;
295 struct mapped_device *thin_md;
296
235 bool requeue_mode:1; 297 bool requeue_mode:1;
236 spinlock_t lock; 298 spinlock_t lock;
299 struct list_head deferred_cells;
237 struct bio_list deferred_bio_list; 300 struct bio_list deferred_bio_list;
238 struct bio_list retry_on_resume_list; 301 struct bio_list retry_on_resume_list;
239 struct rb_root sort_bio_list; /* sorted list of deferred bios */ 302 struct rb_root sort_bio_list; /* sorted list of deferred bios */
@@ -290,6 +353,15 @@ static void cell_release(struct pool *pool,
290 dm_bio_prison_free_cell(pool->prison, cell); 353 dm_bio_prison_free_cell(pool->prison, cell);
291} 354}
292 355
356static void cell_visit_release(struct pool *pool,
357 void (*fn)(void *, struct dm_bio_prison_cell *),
358 void *context,
359 struct dm_bio_prison_cell *cell)
360{
361 dm_cell_visit_release(pool->prison, fn, context, cell);
362 dm_bio_prison_free_cell(pool->prison, cell);
363}
364
293static void cell_release_no_holder(struct pool *pool, 365static void cell_release_no_holder(struct pool *pool,
294 struct dm_bio_prison_cell *cell, 366 struct dm_bio_prison_cell *cell,
295 struct bio_list *bios) 367 struct bio_list *bios)
@@ -298,19 +370,6 @@ static void cell_release_no_holder(struct pool *pool,
298 dm_bio_prison_free_cell(pool->prison, cell); 370 dm_bio_prison_free_cell(pool->prison, cell);
299} 371}
300 372
301static void cell_defer_no_holder_no_free(struct thin_c *tc,
302 struct dm_bio_prison_cell *cell)
303{
304 struct pool *pool = tc->pool;
305 unsigned long flags;
306
307 spin_lock_irqsave(&tc->lock, flags);
308 dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list);
309 spin_unlock_irqrestore(&tc->lock, flags);
310
311 wake_worker(pool);
312}
313
314static void cell_error_with_code(struct pool *pool, 373static void cell_error_with_code(struct pool *pool,
315 struct dm_bio_prison_cell *cell, int error_code) 374 struct dm_bio_prison_cell *cell, int error_code)
316{ 375{
@@ -323,6 +382,16 @@ static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
323 cell_error_with_code(pool, cell, -EIO); 382 cell_error_with_code(pool, cell, -EIO);
324} 383}
325 384
385static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
386{
387 cell_error_with_code(pool, cell, 0);
388}
389
390static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
391{
392 cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE);
393}
394
326/*----------------------------------------------------------------*/ 395/*----------------------------------------------------------------*/
327 396
328/* 397/*
@@ -393,44 +462,65 @@ struct dm_thin_endio_hook {
393 struct rb_node rb_node; 462 struct rb_node rb_node;
394}; 463};
395 464
396static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) 465static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
466{
467 bio_list_merge(bios, master);
468 bio_list_init(master);
469}
470
471static void error_bio_list(struct bio_list *bios, int error)
397{ 472{
398 struct bio *bio; 473 struct bio *bio;
474
475 while ((bio = bio_list_pop(bios)))
476 bio_endio(bio, error);
477}
478
479static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
480{
399 struct bio_list bios; 481 struct bio_list bios;
400 unsigned long flags; 482 unsigned long flags;
401 483
402 bio_list_init(&bios); 484 bio_list_init(&bios);
403 485
404 spin_lock_irqsave(&tc->lock, flags); 486 spin_lock_irqsave(&tc->lock, flags);
405 bio_list_merge(&bios, master); 487 __merge_bio_list(&bios, master);
406 bio_list_init(master);
407 spin_unlock_irqrestore(&tc->lock, flags); 488 spin_unlock_irqrestore(&tc->lock, flags);
408 489
409 while ((bio = bio_list_pop(&bios))) 490 error_bio_list(&bios, error);
410 bio_endio(bio, DM_ENDIO_REQUEUE);
411} 491}
412 492
413static void requeue_io(struct thin_c *tc) 493static void requeue_deferred_cells(struct thin_c *tc)
414{ 494{
415 requeue_bio_list(tc, &tc->deferred_bio_list); 495 struct pool *pool = tc->pool;
416 requeue_bio_list(tc, &tc->retry_on_resume_list); 496 unsigned long flags;
497 struct list_head cells;
498 struct dm_bio_prison_cell *cell, *tmp;
499
500 INIT_LIST_HEAD(&cells);
501
502 spin_lock_irqsave(&tc->lock, flags);
503 list_splice_init(&tc->deferred_cells, &cells);
504 spin_unlock_irqrestore(&tc->lock, flags);
505
506 list_for_each_entry_safe(cell, tmp, &cells, user_list)
507 cell_requeue(pool, cell);
417} 508}
418 509
419static void error_thin_retry_list(struct thin_c *tc) 510static void requeue_io(struct thin_c *tc)
420{ 511{
421 struct bio *bio;
422 unsigned long flags;
423 struct bio_list bios; 512 struct bio_list bios;
513 unsigned long flags;
424 514
425 bio_list_init(&bios); 515 bio_list_init(&bios);
426 516
427 spin_lock_irqsave(&tc->lock, flags); 517 spin_lock_irqsave(&tc->lock, flags);
428 bio_list_merge(&bios, &tc->retry_on_resume_list); 518 __merge_bio_list(&bios, &tc->deferred_bio_list);
429 bio_list_init(&tc->retry_on_resume_list); 519 __merge_bio_list(&bios, &tc->retry_on_resume_list);
430 spin_unlock_irqrestore(&tc->lock, flags); 520 spin_unlock_irqrestore(&tc->lock, flags);
431 521
432 while ((bio = bio_list_pop(&bios))) 522 error_bio_list(&bios, DM_ENDIO_REQUEUE);
433 bio_io_error(bio); 523 requeue_deferred_cells(tc);
434} 524}
435 525
436static void error_retry_list(struct pool *pool) 526static void error_retry_list(struct pool *pool)
@@ -439,7 +529,7 @@ static void error_retry_list(struct pool *pool)
439 529
440 rcu_read_lock(); 530 rcu_read_lock();
441 list_for_each_entry_rcu(tc, &pool->active_thins, list) 531 list_for_each_entry_rcu(tc, &pool->active_thins, list)
442 error_thin_retry_list(tc); 532 error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO);
443 rcu_read_unlock(); 533 rcu_read_unlock();
444} 534}
445 535
@@ -629,33 +719,75 @@ static void overwrite_endio(struct bio *bio, int err)
629 */ 719 */
630 720
631/* 721/*
632 * This sends the bios in the cell back to the deferred_bios list. 722 * This sends the bios in the cell, except the original holder, back
723 * to the deferred_bios list.
633 */ 724 */
634static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell) 725static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
635{ 726{
636 struct pool *pool = tc->pool; 727 struct pool *pool = tc->pool;
637 unsigned long flags; 728 unsigned long flags;
638 729
639 spin_lock_irqsave(&tc->lock, flags); 730 spin_lock_irqsave(&tc->lock, flags);
640 cell_release(pool, cell, &tc->deferred_bio_list); 731 cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
641 spin_unlock_irqrestore(&tc->lock, flags); 732 spin_unlock_irqrestore(&tc->lock, flags);
642 733
643 wake_worker(pool); 734 wake_worker(pool);
644} 735}
645 736
646/* 737static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
647 * Same as cell_defer above, except it omits the original holder of the cell. 738
648 */ 739struct remap_info {
649static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) 740 struct thin_c *tc;
741 struct bio_list defer_bios;
742 struct bio_list issue_bios;
743};
744
745static void __inc_remap_and_issue_cell(void *context,
746 struct dm_bio_prison_cell *cell)
650{ 747{
651 struct pool *pool = tc->pool; 748 struct remap_info *info = context;
652 unsigned long flags; 749 struct bio *bio;
653 750
654 spin_lock_irqsave(&tc->lock, flags); 751 while ((bio = bio_list_pop(&cell->bios))) {
655 cell_release_no_holder(pool, cell, &tc->deferred_bio_list); 752 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))
656 spin_unlock_irqrestore(&tc->lock, flags); 753 bio_list_add(&info->defer_bios, bio);
754 else {
755 inc_all_io_entry(info->tc->pool, bio);
657 756
658 wake_worker(pool); 757 /*
758 * We can't issue the bios with the bio prison lock
759 * held, so we add them to a list to issue on
760 * return from this function.
761 */
762 bio_list_add(&info->issue_bios, bio);
763 }
764 }
765}
766
767static void inc_remap_and_issue_cell(struct thin_c *tc,
768 struct dm_bio_prison_cell *cell,
769 dm_block_t block)
770{
771 struct bio *bio;
772 struct remap_info info;
773
774 info.tc = tc;
775 bio_list_init(&info.defer_bios);
776 bio_list_init(&info.issue_bios);
777
778 /*
779 * We have to be careful to inc any bios we're about to issue
780 * before the cell is released, and avoid a race with new bios
781 * being added to the cell.
782 */
783 cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
784 &info, cell);
785
786 while ((bio = bio_list_pop(&info.defer_bios)))
787 thin_defer_bio(tc, bio);
788
789 while ((bio = bio_list_pop(&info.issue_bios)))
790 remap_and_issue(info.tc, bio, block);
659} 791}
660 792
661static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) 793static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
@@ -706,10 +838,13 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
706 * the bios in the cell. 838 * the bios in the cell.
707 */ 839 */
708 if (bio) { 840 if (bio) {
709 cell_defer_no_holder(tc, m->cell); 841 inc_remap_and_issue_cell(tc, m->cell, m->data_block);
710 bio_endio(bio, 0); 842 bio_endio(bio, 0);
711 } else 843 } else {
712 cell_defer(tc, m->cell); 844 inc_all_io_entry(tc->pool, m->cell->holder);
845 remap_and_issue(tc, m->cell->holder, m->data_block);
846 inc_remap_and_issue_cell(tc, m->cell, m->data_block);
847 }
713 848
714out: 849out:
715 list_del(&m->list); 850 list_del(&m->list);
@@ -842,6 +977,20 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
842 } 977 }
843} 978}
844 979
980static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
981 dm_block_t data_block,
982 struct dm_thin_new_mapping *m)
983{
984 struct pool *pool = tc->pool;
985 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
986
987 h->overwrite_mapping = m;
988 m->bio = bio;
989 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
990 inc_all_io_entry(pool, bio);
991 remap_and_issue(tc, bio, data_block);
992}
993
845/* 994/*
846 * A partial copy also needs to zero the uncopied region. 995 * A partial copy also needs to zero the uncopied region.
847 */ 996 */
@@ -876,15 +1025,9 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
876 * If the whole block of data is being overwritten, we can issue the 1025 * If the whole block of data is being overwritten, we can issue the
877 * bio immediately. Otherwise we use kcopyd to clone the data first. 1026 * bio immediately. Otherwise we use kcopyd to clone the data first.
878 */ 1027 */
879 if (io_overwrites_block(pool, bio)) { 1028 if (io_overwrites_block(pool, bio))
880 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1029 remap_and_issue_overwrite(tc, bio, data_dest, m);
881 1030 else {
882 h->overwrite_mapping = m;
883 m->bio = bio;
884 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
885 inc_all_io_entry(pool, bio);
886 remap_and_issue(tc, bio, data_dest);
887 } else {
888 struct dm_io_region from, to; 1031 struct dm_io_region from, to;
889 1032
890 from.bdev = origin->bdev; 1033 from.bdev = origin->bdev;
@@ -953,16 +1096,10 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
953 if (!pool->pf.zero_new_blocks) 1096 if (!pool->pf.zero_new_blocks)
954 process_prepared_mapping(m); 1097 process_prepared_mapping(m);
955 1098
956 else if (io_overwrites_block(pool, bio)) { 1099 else if (io_overwrites_block(pool, bio))
957 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1100 remap_and_issue_overwrite(tc, bio, data_block, m);
958
959 h->overwrite_mapping = m;
960 m->bio = bio;
961 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
962 inc_all_io_entry(pool, bio);
963 remap_and_issue(tc, bio, data_block);
964 1101
965 } else 1102 else
966 ll_zero(tc, m, 1103 ll_zero(tc, m,
967 data_block * pool->sectors_per_block, 1104 data_block * pool->sectors_per_block,
968 (data_block + 1) * pool->sectors_per_block); 1105 (data_block + 1) * pool->sectors_per_block);
@@ -1134,29 +1271,25 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
1134 bio_list_init(&bios); 1271 bio_list_init(&bios);
1135 cell_release(pool, cell, &bios); 1272 cell_release(pool, cell, &bios);
1136 1273
1137 error = should_error_unserviceable_bio(pool); 1274 while ((bio = bio_list_pop(&bios)))
1138 if (error) 1275 retry_on_resume(bio);
1139 while ((bio = bio_list_pop(&bios)))
1140 bio_endio(bio, error);
1141 else
1142 while ((bio = bio_list_pop(&bios)))
1143 retry_on_resume(bio);
1144} 1276}
1145 1277
1146static void process_discard(struct thin_c *tc, struct bio *bio) 1278static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1147{ 1279{
1148 int r; 1280 int r;
1149 unsigned long flags; 1281 struct bio *bio = cell->holder;
1150 struct pool *pool = tc->pool; 1282 struct pool *pool = tc->pool;
1151 struct dm_bio_prison_cell *cell, *cell2; 1283 struct dm_bio_prison_cell *cell2;
1152 struct dm_cell_key key, key2; 1284 struct dm_cell_key key2;
1153 dm_block_t block = get_bio_block(tc, bio); 1285 dm_block_t block = get_bio_block(tc, bio);
1154 struct dm_thin_lookup_result lookup_result; 1286 struct dm_thin_lookup_result lookup_result;
1155 struct dm_thin_new_mapping *m; 1287 struct dm_thin_new_mapping *m;
1156 1288
1157 build_virtual_key(tc->td, block, &key); 1289 if (tc->requeue_mode) {
1158 if (bio_detain(tc->pool, &key, bio, &cell)) 1290 cell_requeue(pool, cell);
1159 return; 1291 return;
1292 }
1160 1293
1161 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1294 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1162 switch (r) { 1295 switch (r) {
@@ -1187,12 +1320,9 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
1187 m->cell2 = cell2; 1320 m->cell2 = cell2;
1188 m->bio = bio; 1321 m->bio = bio;
1189 1322
1190 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { 1323 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1191 spin_lock_irqsave(&pool->lock, flags); 1324 pool->process_prepared_discard(m);
1192 list_add_tail(&m->list, &pool->prepared_discards); 1325
1193 spin_unlock_irqrestore(&pool->lock, flags);
1194 wake_worker(pool);
1195 }
1196 } else { 1326 } else {
1197 inc_all_io_entry(pool, bio); 1327 inc_all_io_entry(pool, bio);
1198 cell_defer_no_holder(tc, cell); 1328 cell_defer_no_holder(tc, cell);
@@ -1227,6 +1357,19 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
1227 } 1357 }
1228} 1358}
1229 1359
1360static void process_discard_bio(struct thin_c *tc, struct bio *bio)
1361{
1362 struct dm_bio_prison_cell *cell;
1363 struct dm_cell_key key;
1364 dm_block_t block = get_bio_block(tc, bio);
1365
1366 build_virtual_key(tc->td, block, &key);
1367 if (bio_detain(tc->pool, &key, bio, &cell))
1368 return;
1369
1370 process_discard_cell(tc, cell);
1371}
1372
1230static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1373static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1231 struct dm_cell_key *key, 1374 struct dm_cell_key *key,
1232 struct dm_thin_lookup_result *lookup_result, 1375 struct dm_thin_lookup_result *lookup_result,
@@ -1255,11 +1398,53 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1255 } 1398 }
1256} 1399}
1257 1400
1401static void __remap_and_issue_shared_cell(void *context,
1402 struct dm_bio_prison_cell *cell)
1403{
1404 struct remap_info *info = context;
1405 struct bio *bio;
1406
1407 while ((bio = bio_list_pop(&cell->bios))) {
1408 if ((bio_data_dir(bio) == WRITE) ||
1409 (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)))
1410 bio_list_add(&info->defer_bios, bio);
1411 else {
 1412 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1413
1414 h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
1415 inc_all_io_entry(info->tc->pool, bio);
1416 bio_list_add(&info->issue_bios, bio);
1417 }
1418 }
1419}
1420
1421static void remap_and_issue_shared_cell(struct thin_c *tc,
1422 struct dm_bio_prison_cell *cell,
1423 dm_block_t block)
1424{
1425 struct bio *bio;
1426 struct remap_info info;
1427
1428 info.tc = tc;
1429 bio_list_init(&info.defer_bios);
1430 bio_list_init(&info.issue_bios);
1431
1432 cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
1433 &info, cell);
1434
1435 while ((bio = bio_list_pop(&info.defer_bios)))
1436 thin_defer_bio(tc, bio);
1437
1438 while ((bio = bio_list_pop(&info.issue_bios)))
1439 remap_and_issue(tc, bio, block);
1440}
1441
1258static void process_shared_bio(struct thin_c *tc, struct bio *bio, 1442static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1259 dm_block_t block, 1443 dm_block_t block,
1260 struct dm_thin_lookup_result *lookup_result) 1444 struct dm_thin_lookup_result *lookup_result,
1445 struct dm_bio_prison_cell *virt_cell)
1261{ 1446{
1262 struct dm_bio_prison_cell *cell; 1447 struct dm_bio_prison_cell *data_cell;
1263 struct pool *pool = tc->pool; 1448 struct pool *pool = tc->pool;
1264 struct dm_cell_key key; 1449 struct dm_cell_key key;
1265 1450
@@ -1268,19 +1453,23 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1268 * of being broken so we have nothing further to do here. 1453 * of being broken so we have nothing further to do here.
1269 */ 1454 */
1270 build_data_key(tc->td, lookup_result->block, &key); 1455 build_data_key(tc->td, lookup_result->block, &key);
1271 if (bio_detain(pool, &key, bio, &cell)) 1456 if (bio_detain(pool, &key, bio, &data_cell)) {
1457 cell_defer_no_holder(tc, virt_cell);
1272 return; 1458 return;
1459 }
1273 1460
1274 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) 1461 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
1275 break_sharing(tc, bio, block, &key, lookup_result, cell); 1462 break_sharing(tc, bio, block, &key, lookup_result, data_cell);
1276 else { 1463 cell_defer_no_holder(tc, virt_cell);
1464 } else {
1277 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1465 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1278 1466
1279 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds); 1467 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1280 inc_all_io_entry(pool, bio); 1468 inc_all_io_entry(pool, bio);
1281 cell_defer_no_holder(tc, cell);
1282
1283 remap_and_issue(tc, bio, lookup_result->block); 1469 remap_and_issue(tc, bio, lookup_result->block);
1470
1471 remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
1472 remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
1284 } 1473 }
1285} 1474}
1286 1475
@@ -1333,34 +1522,28 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1333 } 1522 }
1334} 1523}
1335 1524
1336static void process_bio(struct thin_c *tc, struct bio *bio) 1525static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1337{ 1526{
1338 int r; 1527 int r;
1339 struct pool *pool = tc->pool; 1528 struct pool *pool = tc->pool;
1529 struct bio *bio = cell->holder;
1340 dm_block_t block = get_bio_block(tc, bio); 1530 dm_block_t block = get_bio_block(tc, bio);
1341 struct dm_bio_prison_cell *cell;
1342 struct dm_cell_key key;
1343 struct dm_thin_lookup_result lookup_result; 1531 struct dm_thin_lookup_result lookup_result;
1344 1532
1345 /* 1533 if (tc->requeue_mode) {
1346 * If cell is already occupied, then the block is already 1534 cell_requeue(pool, cell);
1347 * being provisioned so we have nothing further to do here.
1348 */
1349 build_virtual_key(tc->td, block, &key);
1350 if (bio_detain(pool, &key, bio, &cell))
1351 return; 1535 return;
1536 }
1352 1537
1353 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1538 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1354 switch (r) { 1539 switch (r) {
1355 case 0: 1540 case 0:
1356 if (lookup_result.shared) { 1541 if (lookup_result.shared)
1357 process_shared_bio(tc, bio, block, &lookup_result); 1542 process_shared_bio(tc, bio, block, &lookup_result, cell);
1358 cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */ 1543 else {
1359 } else {
1360 inc_all_io_entry(pool, bio); 1544 inc_all_io_entry(pool, bio);
1361 cell_defer_no_holder(tc, cell);
1362
1363 remap_and_issue(tc, bio, lookup_result.block); 1545 remap_and_issue(tc, bio, lookup_result.block);
1546 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
1364 } 1547 }
1365 break; 1548 break;
1366 1549
@@ -1394,7 +1577,26 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
1394 } 1577 }
1395} 1578}
1396 1579
1397static void process_bio_read_only(struct thin_c *tc, struct bio *bio) 1580static void process_bio(struct thin_c *tc, struct bio *bio)
1581{
1582 struct pool *pool = tc->pool;
1583 dm_block_t block = get_bio_block(tc, bio);
1584 struct dm_bio_prison_cell *cell;
1585 struct dm_cell_key key;
1586
1587 /*
1588 * If cell is already occupied, then the block is already
1589 * being provisioned so we have nothing further to do here.
1590 */
1591 build_virtual_key(tc->td, block, &key);
1592 if (bio_detain(pool, &key, bio, &cell))
1593 return;
1594
1595 process_cell(tc, cell);
1596}
1597
1598static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
1599 struct dm_bio_prison_cell *cell)
1398{ 1600{
1399 int r; 1601 int r;
1400 int rw = bio_data_dir(bio); 1602 int rw = bio_data_dir(bio);
@@ -1404,15 +1606,21 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1404 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1606 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1405 switch (r) { 1607 switch (r) {
1406 case 0: 1608 case 0:
1407 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) 1609 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
1408 handle_unserviceable_bio(tc->pool, bio); 1610 handle_unserviceable_bio(tc->pool, bio);
1409 else { 1611 if (cell)
1612 cell_defer_no_holder(tc, cell);
1613 } else {
1410 inc_all_io_entry(tc->pool, bio); 1614 inc_all_io_entry(tc->pool, bio);
1411 remap_and_issue(tc, bio, lookup_result.block); 1615 remap_and_issue(tc, bio, lookup_result.block);
1616 if (cell)
1617 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
1412 } 1618 }
1413 break; 1619 break;
1414 1620
1415 case -ENODATA: 1621 case -ENODATA:
1622 if (cell)
1623 cell_defer_no_holder(tc, cell);
1416 if (rw != READ) { 1624 if (rw != READ) {
1417 handle_unserviceable_bio(tc->pool, bio); 1625 handle_unserviceable_bio(tc->pool, bio);
1418 break; 1626 break;
@@ -1431,11 +1639,23 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1431 default: 1639 default:
1432 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", 1640 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1433 __func__, r); 1641 __func__, r);
1642 if (cell)
1643 cell_defer_no_holder(tc, cell);
1434 bio_io_error(bio); 1644 bio_io_error(bio);
1435 break; 1645 break;
1436 } 1646 }
1437} 1647}
1438 1648
1649static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1650{
1651 __process_bio_read_only(tc, bio, NULL);
1652}
1653
1654static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1655{
1656 __process_bio_read_only(tc, cell->holder, cell);
1657}
1658
1439static void process_bio_success(struct thin_c *tc, struct bio *bio) 1659static void process_bio_success(struct thin_c *tc, struct bio *bio)
1440{ 1660{
1441 bio_endio(bio, 0); 1661 bio_endio(bio, 0);
@@ -1446,6 +1666,16 @@ static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1446 bio_io_error(bio); 1666 bio_io_error(bio);
1447} 1667}
1448 1668
1669static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1670{
1671 cell_success(tc->pool, cell);
1672}
1673
1674static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1675{
1676 cell_error(tc->pool, cell);
1677}
1678
1449/* 1679/*
1450 * FIXME: should we also commit due to size of transaction, measured in 1680 * FIXME: should we also commit due to size of transaction, measured in
1451 * metadata blocks? 1681 * metadata blocks?
@@ -1527,9 +1757,10 @@ static void process_thin_deferred_bios(struct thin_c *tc)
1527 struct bio *bio; 1757 struct bio *bio;
1528 struct bio_list bios; 1758 struct bio_list bios;
1529 struct blk_plug plug; 1759 struct blk_plug plug;
1760 unsigned count = 0;
1530 1761
1531 if (tc->requeue_mode) { 1762 if (tc->requeue_mode) {
1532 requeue_bio_list(tc, &tc->deferred_bio_list); 1763 error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE);
1533 return; 1764 return;
1534 } 1765 }
1535 1766
@@ -1568,10 +1799,97 @@ static void process_thin_deferred_bios(struct thin_c *tc)
1568 pool->process_discard(tc, bio); 1799 pool->process_discard(tc, bio);
1569 else 1800 else
1570 pool->process_bio(tc, bio); 1801 pool->process_bio(tc, bio);
1802
1803 if ((count++ & 127) == 0) {
1804 throttle_work_update(&pool->throttle);
1805 dm_pool_issue_prefetches(pool->pmd);
1806 }
1571 } 1807 }
1572 blk_finish_plug(&plug); 1808 blk_finish_plug(&plug);
1573} 1809}
1574 1810
1811static int cmp_cells(const void *lhs, const void *rhs)
1812{
1813 struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs);
1814 struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs);
1815
1816 BUG_ON(!lhs_cell->holder);
1817 BUG_ON(!rhs_cell->holder);
1818
1819 if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
1820 return -1;
1821
1822 if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
1823 return 1;
1824
1825 return 0;
1826}
1827
1828static unsigned sort_cells(struct pool *pool, struct list_head *cells)
1829{
1830 unsigned count = 0;
1831 struct dm_bio_prison_cell *cell, *tmp;
1832
1833 list_for_each_entry_safe(cell, tmp, cells, user_list) {
1834 if (count >= CELL_SORT_ARRAY_SIZE)
1835 break;
1836
1837 pool->cell_sort_array[count++] = cell;
1838 list_del(&cell->user_list);
1839 }
1840
1841 sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
1842
1843 return count;
1844}
1845
1846static void process_thin_deferred_cells(struct thin_c *tc)
1847{
1848 struct pool *pool = tc->pool;
1849 unsigned long flags;
1850 struct list_head cells;
1851 struct dm_bio_prison_cell *cell;
1852 unsigned i, j, count;
1853
1854 INIT_LIST_HEAD(&cells);
1855
1856 spin_lock_irqsave(&tc->lock, flags);
1857 list_splice_init(&tc->deferred_cells, &cells);
1858 spin_unlock_irqrestore(&tc->lock, flags);
1859
1860 if (list_empty(&cells))
1861 return;
1862
1863 do {
1864 count = sort_cells(tc->pool, &cells);
1865
1866 for (i = 0; i < count; i++) {
1867 cell = pool->cell_sort_array[i];
1868 BUG_ON(!cell->holder);
1869
1870 /*
1871 * If we've got no free new_mapping structs, and processing
1872 * this bio might require one, we pause until there are some
1873 * prepared mappings to process.
1874 */
1875 if (ensure_next_mapping(pool)) {
1876 for (j = i; j < count; j++)
1877 list_add(&pool->cell_sort_array[j]->user_list, &cells);
1878
1879 spin_lock_irqsave(&tc->lock, flags);
1880 list_splice(&cells, &tc->deferred_cells);
1881 spin_unlock_irqrestore(&tc->lock, flags);
1882 return;
1883 }
1884
1885 if (cell->holder->bi_rw & REQ_DISCARD)
1886 pool->process_discard_cell(tc, cell);
1887 else
1888 pool->process_cell(tc, cell);
1889 }
1890 } while (!list_empty(&cells));
1891}
1892
1575static void thin_get(struct thin_c *tc); 1893static void thin_get(struct thin_c *tc);
1576static void thin_put(struct thin_c *tc); 1894static void thin_put(struct thin_c *tc);
1577 1895
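The cmp_cells()/sort_cells() pair in the hunk above orders each batch of detained cells by the starting sector of the cell's holder bio, capping a batch at CELL_SORT_ARRAY_SIZE, so process_thin_deferred_cells() works through its backlog in roughly ascending disk order. Below is a minimal userspace sketch of the same sort-by-sector idea using plain qsort(); struct cell and its field are simplified stand-ins for illustration, not the kernel types.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Simplified stand-in for a bio prison cell: just the holder bio's start sector. */
struct cell {
	uint64_t bi_sector;
};

/* Same shape as cmp_cells(): the array holds cell pointers, so compare through them. */
static int cmp_cells(const void *lhs, const void *rhs)
{
	const struct cell *l = *(struct cell * const *) lhs;
	const struct cell *r = *(struct cell * const *) rhs;

	if (l->bi_sector < r->bi_sector)
		return -1;
	if (l->bi_sector > r->bi_sector)
		return 1;
	return 0;
}

int main(void)
{
	struct cell c[] = { { 4096 }, { 128 }, { 1024 } };
	struct cell *sorted[] = { &c[0], &c[1], &c[2] };
	unsigned i;

	/* sort the pointer array by start sector, as sort_cells() does */
	qsort(sorted, 3, sizeof(sorted[0]), cmp_cells);

	for (i = 0; i < 3; i++)
		printf("%llu\n", (unsigned long long) sorted[i]->bi_sector);

	return 0;
}

As in sort_cells(), the array holds pointers to cells, so the comparator dereferences twice: once to pull the cell pointer out of the array slot and once to reach the sector.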
@@ -1620,6 +1938,7 @@ static void process_deferred_bios(struct pool *pool)
1620 1938
1621 tc = get_first_thin(pool); 1939 tc = get_first_thin(pool);
1622 while (tc) { 1940 while (tc) {
1941 process_thin_deferred_cells(tc);
1623 process_thin_deferred_bios(tc); 1942 process_thin_deferred_bios(tc);
1624 tc = get_next_thin(pool, tc); 1943 tc = get_next_thin(pool, tc);
1625 } 1944 }
@@ -1653,9 +1972,15 @@ static void do_worker(struct work_struct *ws)
1653{ 1972{
1654 struct pool *pool = container_of(ws, struct pool, worker); 1973 struct pool *pool = container_of(ws, struct pool, worker);
1655 1974
1975 throttle_work_start(&pool->throttle);
1976 dm_pool_issue_prefetches(pool->pmd);
1977 throttle_work_update(&pool->throttle);
1656 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); 1978 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1979 throttle_work_update(&pool->throttle);
1657 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); 1980 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1981 throttle_work_update(&pool->throttle);
1658 process_deferred_bios(pool); 1982 process_deferred_bios(pool);
1983 throttle_work_complete(&pool->throttle);
1659} 1984}
1660 1985
1661/* 1986/*
@@ -1792,6 +2117,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1792 dm_pool_metadata_read_only(pool->pmd); 2117 dm_pool_metadata_read_only(pool->pmd);
1793 pool->process_bio = process_bio_fail; 2118 pool->process_bio = process_bio_fail;
1794 pool->process_discard = process_bio_fail; 2119 pool->process_discard = process_bio_fail;
2120 pool->process_cell = process_cell_fail;
2121 pool->process_discard_cell = process_cell_fail;
1795 pool->process_prepared_mapping = process_prepared_mapping_fail; 2122 pool->process_prepared_mapping = process_prepared_mapping_fail;
1796 pool->process_prepared_discard = process_prepared_discard_fail; 2123 pool->process_prepared_discard = process_prepared_discard_fail;
1797 2124
@@ -1804,6 +2131,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1804 dm_pool_metadata_read_only(pool->pmd); 2131 dm_pool_metadata_read_only(pool->pmd);
1805 pool->process_bio = process_bio_read_only; 2132 pool->process_bio = process_bio_read_only;
1806 pool->process_discard = process_bio_success; 2133 pool->process_discard = process_bio_success;
2134 pool->process_cell = process_cell_read_only;
2135 pool->process_discard_cell = process_cell_success;
1807 pool->process_prepared_mapping = process_prepared_mapping_fail; 2136 pool->process_prepared_mapping = process_prepared_mapping_fail;
1808 pool->process_prepared_discard = process_prepared_discard_passdown; 2137 pool->process_prepared_discard = process_prepared_discard_passdown;
1809 2138
@@ -1822,7 +2151,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1822 if (old_mode != new_mode) 2151 if (old_mode != new_mode)
1823 notify_of_pool_mode_change(pool, "out-of-data-space"); 2152 notify_of_pool_mode_change(pool, "out-of-data-space");
1824 pool->process_bio = process_bio_read_only; 2153 pool->process_bio = process_bio_read_only;
1825 pool->process_discard = process_discard; 2154 pool->process_discard = process_discard_bio;
2155 pool->process_cell = process_cell_read_only;
2156 pool->process_discard_cell = process_discard_cell;
1826 pool->process_prepared_mapping = process_prepared_mapping; 2157 pool->process_prepared_mapping = process_prepared_mapping;
1827 pool->process_prepared_discard = process_prepared_discard_passdown; 2158 pool->process_prepared_discard = process_prepared_discard_passdown;
1828 2159
@@ -1835,7 +2166,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1835 notify_of_pool_mode_change(pool, "write"); 2166 notify_of_pool_mode_change(pool, "write");
1836 dm_pool_metadata_read_write(pool->pmd); 2167 dm_pool_metadata_read_write(pool->pmd);
1837 pool->process_bio = process_bio; 2168 pool->process_bio = process_bio;
1838 pool->process_discard = process_discard; 2169 pool->process_discard = process_discard_bio;
2170 pool->process_cell = process_cell;
2171 pool->process_discard_cell = process_discard_cell;
1839 pool->process_prepared_mapping = process_prepared_mapping; 2172 pool->process_prepared_mapping = process_prepared_mapping;
1840 pool->process_prepared_discard = process_prepared_discard; 2173 pool->process_prepared_discard = process_prepared_discard;
1841 break; 2174 break;
@@ -1895,6 +2228,29 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1895 wake_worker(pool); 2228 wake_worker(pool);
1896} 2229}
1897 2230
2231static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
2232{
2233 struct pool *pool = tc->pool;
2234
2235 throttle_lock(&pool->throttle);
2236 thin_defer_bio(tc, bio);
2237 throttle_unlock(&pool->throttle);
2238}
2239
2240static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2241{
2242 unsigned long flags;
2243 struct pool *pool = tc->pool;
2244
2245 throttle_lock(&pool->throttle);
2246 spin_lock_irqsave(&tc->lock, flags);
2247 list_add_tail(&cell->user_list, &tc->deferred_cells);
2248 spin_unlock_irqrestore(&tc->lock, flags);
2249 throttle_unlock(&pool->throttle);
2250
2251 wake_worker(pool);
2252}
2253
1898static void thin_hook_bio(struct thin_c *tc, struct bio *bio) 2254static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
1899{ 2255{
1900 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 2256 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
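do_worker() now brackets its passes with throttle_work_start()/throttle_work_update()/throttle_work_complete(), while the submission paths (thin_defer_bio_with_throttle() and thin_defer_cell() above) wrap the deferral in throttle_lock()/throttle_unlock(). The throttle itself is defined earlier in this patch and is not shown in these hunks; the sketch below is only a rough userspace analogue, assuming the throttle behaves as a time-triggered reader/writer gate in which submitters take the read side and a long-running worker grabs the write side to briefly hold new submissions off.

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

/* Assumed behaviour only: a time-triggered reader/writer gate. */
struct throttle {
	pthread_rwlock_t lock;
	time_t threshold;		/* when the worker starts throttling */
	bool applied;
};

static void throttle_work_start(struct throttle *t)
{
	t->threshold = time(NULL) + 2;	/* stand-in for a jiffies-based threshold */
	t->applied = false;
}

static void throttle_work_update(struct throttle *t)
{
	/* once the worker has run past the threshold, hold submitters off */
	if (!t->applied && time(NULL) >= t->threshold) {
		pthread_rwlock_wrlock(&t->lock);
		t->applied = true;
	}
}

static void throttle_work_complete(struct throttle *t)
{
	if (t->applied) {
		t->applied = false;
		pthread_rwlock_unlock(&t->lock);
	}
}

/* Submission side, as in thin_defer_bio_with_throttle()/thin_defer_cell(). */
static void throttle_lock(struct throttle *t)   { pthread_rwlock_rdlock(&t->lock); }
static void throttle_unlock(struct throttle *t) { pthread_rwlock_unlock(&t->lock); }

int main(void)
{
	struct throttle t = { .lock = PTHREAD_RWLOCK_INITIALIZER };

	throttle_work_start(&t);	/* worker begins a pass */
	throttle_lock(&t);		/* a submitter defers a bio/cell */
	throttle_unlock(&t);
	throttle_work_update(&t);	/* worker checks whether to throttle */
	throttle_work_complete(&t);	/* worker finishes, gate reopened */
	return 0;
}

Under that assumption, a worker that has been busy for a while briefly wins over fresh submissions instead of being starved by them.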
@@ -1915,8 +2271,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1915 dm_block_t block = get_bio_block(tc, bio); 2271 dm_block_t block = get_bio_block(tc, bio);
1916 struct dm_thin_device *td = tc->td; 2272 struct dm_thin_device *td = tc->td;
1917 struct dm_thin_lookup_result result; 2273 struct dm_thin_lookup_result result;
1918 struct dm_bio_prison_cell cell1, cell2; 2274 struct dm_bio_prison_cell *virt_cell, *data_cell;
1919 struct dm_bio_prison_cell *cell_result;
1920 struct dm_cell_key key; 2275 struct dm_cell_key key;
1921 2276
1922 thin_hook_bio(tc, bio); 2277 thin_hook_bio(tc, bio);
@@ -1932,7 +2287,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1932 } 2287 }
1933 2288
1934 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { 2289 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1935 thin_defer_bio(tc, bio); 2290 thin_defer_bio_with_throttle(tc, bio);
1936 return DM_MAPIO_SUBMITTED; 2291 return DM_MAPIO_SUBMITTED;
1937 } 2292 }
1938 2293
@@ -1941,7 +2296,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1941 * there's a race with discard. 2296 * there's a race with discard.
1942 */ 2297 */
1943 build_virtual_key(tc->td, block, &key); 2298 build_virtual_key(tc->td, block, &key);
1944 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) 2299 if (bio_detain(tc->pool, &key, bio, &virt_cell))
1945 return DM_MAPIO_SUBMITTED; 2300 return DM_MAPIO_SUBMITTED;
1946 2301
1947 r = dm_thin_find_block(td, block, 0, &result); 2302 r = dm_thin_find_block(td, block, 0, &result);
@@ -1966,20 +2321,19 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1966 * More distant ancestors are irrelevant. The 2321 * More distant ancestors are irrelevant. The
1967 * shared flag will be set in their case. 2322 * shared flag will be set in their case.
1968 */ 2323 */
1969 thin_defer_bio(tc, bio); 2324 thin_defer_cell(tc, virt_cell);
1970 cell_defer_no_holder_no_free(tc, &cell1);
1971 return DM_MAPIO_SUBMITTED; 2325 return DM_MAPIO_SUBMITTED;
1972 } 2326 }
1973 2327
1974 build_data_key(tc->td, result.block, &key); 2328 build_data_key(tc->td, result.block, &key);
1975 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) { 2329 if (bio_detain(tc->pool, &key, bio, &data_cell)) {
1976 cell_defer_no_holder_no_free(tc, &cell1); 2330 cell_defer_no_holder(tc, virt_cell);
1977 return DM_MAPIO_SUBMITTED; 2331 return DM_MAPIO_SUBMITTED;
1978 } 2332 }
1979 2333
1980 inc_all_io_entry(tc->pool, bio); 2334 inc_all_io_entry(tc->pool, bio);
1981 cell_defer_no_holder_no_free(tc, &cell2); 2335 cell_defer_no_holder(tc, data_cell);
1982 cell_defer_no_holder_no_free(tc, &cell1); 2336 cell_defer_no_holder(tc, virt_cell);
1983 2337
1984 remap(tc, bio, result.block); 2338 remap(tc, bio, result.block);
1985 return DM_MAPIO_REMAPPED; 2339 return DM_MAPIO_REMAPPED;
@@ -1991,18 +2345,13 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1991 * of doing so. 2345 * of doing so.
1992 */ 2346 */
1993 handle_unserviceable_bio(tc->pool, bio); 2347 handle_unserviceable_bio(tc->pool, bio);
1994 cell_defer_no_holder_no_free(tc, &cell1); 2348 cell_defer_no_holder(tc, virt_cell);
1995 return DM_MAPIO_SUBMITTED; 2349 return DM_MAPIO_SUBMITTED;
1996 } 2350 }
1997 /* fall through */ 2351 /* fall through */
1998 2352
1999 case -EWOULDBLOCK: 2353 case -EWOULDBLOCK:
2000 /* 2354 thin_defer_cell(tc, virt_cell);
2001 * In future, the failed dm_thin_find_block above could
2002 * provide the hint to load the metadata into cache.
2003 */
2004 thin_defer_bio(tc, bio);
2005 cell_defer_no_holder_no_free(tc, &cell1);
2006 return DM_MAPIO_SUBMITTED; 2355 return DM_MAPIO_SUBMITTED;
2007 2356
2008 default: 2357 default:
@@ -2012,7 +2361,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
2012 * pool is switched to fail-io mode. 2361 * pool is switched to fail-io mode.
2013 */ 2362 */
2014 bio_io_error(bio); 2363 bio_io_error(bio);
2015 cell_defer_no_holder_no_free(tc, &cell1); 2364 cell_defer_no_holder(tc, virt_cell);
2016 return DM_MAPIO_SUBMITTED; 2365 return DM_MAPIO_SUBMITTED;
2017 } 2366 }
2018} 2367}
@@ -2193,7 +2542,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
2193 pool->sectors_per_block_shift = __ffs(block_size); 2542 pool->sectors_per_block_shift = __ffs(block_size);
2194 pool->low_water_blocks = 0; 2543 pool->low_water_blocks = 0;
2195 pool_features_init(&pool->pf); 2544 pool_features_init(&pool->pf);
2196 pool->prison = dm_bio_prison_create(PRISON_CELLS); 2545 pool->prison = dm_bio_prison_create();
2197 if (!pool->prison) { 2546 if (!pool->prison) {
2198 *error = "Error creating pool's bio prison"; 2547 *error = "Error creating pool's bio prison";
2199 err_p = ERR_PTR(-ENOMEM); 2548 err_p = ERR_PTR(-ENOMEM);
@@ -2219,6 +2568,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
2219 goto bad_wq; 2568 goto bad_wq;
2220 } 2569 }
2221 2570
2571 throttle_init(&pool->throttle);
2222 INIT_WORK(&pool->worker, do_worker); 2572 INIT_WORK(&pool->worker, do_worker);
2223 INIT_DELAYED_WORK(&pool->waker, do_waker); 2573 INIT_DELAYED_WORK(&pool->waker, do_waker);
2224 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); 2574 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
@@ -2228,6 +2578,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
2228 INIT_LIST_HEAD(&pool->prepared_discards); 2578 INIT_LIST_HEAD(&pool->prepared_discards);
2229 INIT_LIST_HEAD(&pool->active_thins); 2579 INIT_LIST_HEAD(&pool->active_thins);
2230 pool->low_water_triggered = false; 2580 pool->low_water_triggered = false;
2581 pool->suspended = true;
2231 2582
2232 pool->shared_read_ds = dm_deferred_set_create(); 2583 pool->shared_read_ds = dm_deferred_set_create();
2233 if (!pool->shared_read_ds) { 2584 if (!pool->shared_read_ds) {
@@ -2764,20 +3115,77 @@ static int pool_preresume(struct dm_target *ti)
2764 return 0; 3115 return 0;
2765} 3116}
2766 3117
3118static void pool_suspend_active_thins(struct pool *pool)
3119{
3120 struct thin_c *tc;
3121
3122 /* Suspend all active thin devices */
3123 tc = get_first_thin(pool);
3124 while (tc) {
3125 dm_internal_suspend_noflush(tc->thin_md);
3126 tc = get_next_thin(pool, tc);
3127 }
3128}
3129
3130static void pool_resume_active_thins(struct pool *pool)
3131{
3132 struct thin_c *tc;
3133
3134 /* Resume all active thin devices */
3135 tc = get_first_thin(pool);
3136 while (tc) {
3137 dm_internal_resume(tc->thin_md);
3138 tc = get_next_thin(pool, tc);
3139 }
3140}
3141
2767static void pool_resume(struct dm_target *ti) 3142static void pool_resume(struct dm_target *ti)
2768{ 3143{
2769 struct pool_c *pt = ti->private; 3144 struct pool_c *pt = ti->private;
2770 struct pool *pool = pt->pool; 3145 struct pool *pool = pt->pool;
2771 unsigned long flags; 3146 unsigned long flags;
2772 3147
3148 /*
3149 * Must requeue active_thins' bios and then resume
3150 * active_thins _before_ clearing 'suspend' flag.
3151 */
3152 requeue_bios(pool);
3153 pool_resume_active_thins(pool);
3154
2773 spin_lock_irqsave(&pool->lock, flags); 3155 spin_lock_irqsave(&pool->lock, flags);
2774 pool->low_water_triggered = false; 3156 pool->low_water_triggered = false;
3157 pool->suspended = false;
2775 spin_unlock_irqrestore(&pool->lock, flags); 3158 spin_unlock_irqrestore(&pool->lock, flags);
2776 requeue_bios(pool);
2777 3159
2778 do_waker(&pool->waker.work); 3160 do_waker(&pool->waker.work);
2779} 3161}
2780 3162
3163static void pool_presuspend(struct dm_target *ti)
3164{
3165 struct pool_c *pt = ti->private;
3166 struct pool *pool = pt->pool;
3167 unsigned long flags;
3168
3169 spin_lock_irqsave(&pool->lock, flags);
3170 pool->suspended = true;
3171 spin_unlock_irqrestore(&pool->lock, flags);
3172
3173 pool_suspend_active_thins(pool);
3174}
3175
3176static void pool_presuspend_undo(struct dm_target *ti)
3177{
3178 struct pool_c *pt = ti->private;
3179 struct pool *pool = pt->pool;
3180 unsigned long flags;
3181
3182 pool_resume_active_thins(pool);
3183
3184 spin_lock_irqsave(&pool->lock, flags);
3185 pool->suspended = false;
3186 spin_unlock_irqrestore(&pool->lock, flags);
3187}
3188
2781static void pool_postsuspend(struct dm_target *ti) 3189static void pool_postsuspend(struct dm_target *ti)
2782{ 3190{
2783 struct pool_c *pt = ti->private; 3191 struct pool_c *pt = ti->private;
@@ -2949,7 +3357,6 @@ static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct
2949 * create_thin <dev_id> 3357 * create_thin <dev_id>
2950 * create_snap <dev_id> <origin_id> 3358 * create_snap <dev_id> <origin_id>
2951 * delete <dev_id> 3359 * delete <dev_id>
2952 * trim <dev_id> <new_size_in_sectors>
2953 * set_transaction_id <current_trans_id> <new_trans_id> 3360 * set_transaction_id <current_trans_id> <new_trans_id>
2954 * reserve_metadata_snap 3361 * reserve_metadata_snap
2955 * release_metadata_snap 3362 * release_metadata_snap
@@ -3177,15 +3584,35 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
3177{ 3584{
3178 struct pool_c *pt = ti->private; 3585 struct pool_c *pt = ti->private;
3179 struct pool *pool = pt->pool; 3586 struct pool *pool = pt->pool;
3180 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3587 sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3588
3589 /*
3590 * If max_sectors is smaller than pool->sectors_per_block adjust it
3591 * to the highest possible power-of-2 factor of pool->sectors_per_block.
3592 * This is especially beneficial when the pool's data device is a RAID
3593 * device that has a full stripe width that matches pool->sectors_per_block
3594 * -- because even though partial RAID stripe-sized IOs will be issued to a
3595 * single RAID stripe, when aggregated they will end on a full RAID stripe
3596 * boundary, which avoids cascading additional partial RAID stripe writes.
3597 */
3598 if (limits->max_sectors < pool->sectors_per_block) {
3599 while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
3600 if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
3601 limits->max_sectors--;
3602 limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
3603 }
3604 }
3181 3605
3182 /* 3606 /*
3183 * If the system-determined stacked limits are compatible with the 3607 * If the system-determined stacked limits are compatible with the
3184 * pool's blocksize (io_opt is a factor) do not override them. 3608 * pool's blocksize (io_opt is a factor) do not override them.
3185 */ 3609 */
3186 if (io_opt_sectors < pool->sectors_per_block || 3610 if (io_opt_sectors < pool->sectors_per_block ||
3187 do_div(io_opt_sectors, pool->sectors_per_block)) { 3611 !is_factor(io_opt_sectors, pool->sectors_per_block)) {
3188 blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); 3612 if (is_factor(pool->sectors_per_block, limits->max_sectors))
3613 blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
3614 else
3615 blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
3189 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 3616 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
3190 } 3617 }
3191 3618
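To make the max_sectors adjustment above concrete: with pool->sectors_per_block = 384 (a 192 KiB block) and a stacked limits->max_sectors of 320, the loop rounds 320 down to 256; 256 is a power of two that still does not divide 384, so it steps to 255, rounds down again, and settles on 128, the largest power of two that divides 384. The standalone program below walks the same loop; is_factor() and rounddown_pow_of_two() are local stand-ins for the helpers the kernel code uses, written here only for illustration.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Illustrative stand-ins for the kernel helpers used in pool_io_hints(). */
static bool is_factor(uint64_t block_size, uint32_t n)
{
	return n && (block_size % n) == 0;
}

static uint32_t rounddown_pow_of_two(uint32_t n)
{
	uint32_t p = 1;

	while (p * 2 <= n)
		p *= 2;
	return p;
}

int main(void)
{
	uint64_t sectors_per_block = 384;	/* e.g. a 192 KiB thin-pool block */
	uint32_t max_sectors = 320;		/* stacked limit from the data device */

	/* same adjustment as the hunk above */
	if (max_sectors < sectors_per_block) {
		while (!is_factor(sectors_per_block, max_sectors)) {
			if ((max_sectors & (max_sectors - 1)) == 0)
				max_sectors--;
			max_sectors = rounddown_pow_of_two(max_sectors);
		}
	}

	printf("max_sectors adjusted to %u\n", max_sectors);	/* prints 128 */
	return 0;
}

If the io_opt check that follows then fires, io_min is taken from the adjusted max_sectors (64 KiB here) rather than from the full block size, since 128 divides sectors_per_block.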
@@ -3214,11 +3641,13 @@ static struct target_type pool_target = {
3214 .name = "thin-pool", 3641 .name = "thin-pool",
3215 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 3642 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
3216 DM_TARGET_IMMUTABLE, 3643 DM_TARGET_IMMUTABLE,
3217 .version = {1, 13, 0}, 3644 .version = {1, 14, 0},
3218 .module = THIS_MODULE, 3645 .module = THIS_MODULE,
3219 .ctr = pool_ctr, 3646 .ctr = pool_ctr,
3220 .dtr = pool_dtr, 3647 .dtr = pool_dtr,
3221 .map = pool_map, 3648 .map = pool_map,
3649 .presuspend = pool_presuspend,
3650 .presuspend_undo = pool_presuspend_undo,
3222 .postsuspend = pool_postsuspend, 3651 .postsuspend = pool_postsuspend,
3223 .preresume = pool_preresume, 3652 .preresume = pool_preresume,
3224 .resume = pool_resume, 3653 .resume = pool_resume,
@@ -3248,14 +3677,14 @@ static void thin_dtr(struct dm_target *ti)
3248 struct thin_c *tc = ti->private; 3677 struct thin_c *tc = ti->private;
3249 unsigned long flags; 3678 unsigned long flags;
3250 3679
3251 thin_put(tc);
3252 wait_for_completion(&tc->can_destroy);
3253
3254 spin_lock_irqsave(&tc->pool->lock, flags); 3680 spin_lock_irqsave(&tc->pool->lock, flags);
3255 list_del_rcu(&tc->list); 3681 list_del_rcu(&tc->list);
3256 spin_unlock_irqrestore(&tc->pool->lock, flags); 3682 spin_unlock_irqrestore(&tc->pool->lock, flags);
3257 synchronize_rcu(); 3683 synchronize_rcu();
3258 3684
3685 thin_put(tc);
3686 wait_for_completion(&tc->can_destroy);
3687
3259 mutex_lock(&dm_thin_pool_table.mutex); 3688 mutex_lock(&dm_thin_pool_table.mutex);
3260 3689
3261 __pool_dec(tc->pool); 3690 __pool_dec(tc->pool);
@@ -3302,7 +3731,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3302 r = -ENOMEM; 3731 r = -ENOMEM;
3303 goto out_unlock; 3732 goto out_unlock;
3304 } 3733 }
3734 tc->thin_md = dm_table_get_md(ti->table);
3305 spin_lock_init(&tc->lock); 3735 spin_lock_init(&tc->lock);
3736 INIT_LIST_HEAD(&tc->deferred_cells);
3306 bio_list_init(&tc->deferred_bio_list); 3737 bio_list_init(&tc->deferred_bio_list);
3307 bio_list_init(&tc->retry_on_resume_list); 3738 bio_list_init(&tc->retry_on_resume_list);
3308 tc->sort_bio_list = RB_ROOT; 3739 tc->sort_bio_list = RB_ROOT;
@@ -3347,18 +3778,18 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3347 if (get_pool_mode(tc->pool) == PM_FAIL) { 3778 if (get_pool_mode(tc->pool) == PM_FAIL) {
3348 ti->error = "Couldn't open thin device, Pool is in fail mode"; 3779 ti->error = "Couldn't open thin device, Pool is in fail mode";
3349 r = -EINVAL; 3780 r = -EINVAL;
3350 goto bad_thin_open; 3781 goto bad_pool;
3351 } 3782 }
3352 3783
3353 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); 3784 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
3354 if (r) { 3785 if (r) {
3355 ti->error = "Couldn't open thin internal device"; 3786 ti->error = "Couldn't open thin internal device";
3356 goto bad_thin_open; 3787 goto bad_pool;
3357 } 3788 }
3358 3789
3359 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); 3790 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
3360 if (r) 3791 if (r)
3361 goto bad_target_max_io_len; 3792 goto bad;
3362 3793
3363 ti->num_flush_bios = 1; 3794 ti->num_flush_bios = 1;
3364 ti->flush_supported = true; 3795 ti->flush_supported = true;
@@ -3373,14 +3804,16 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3373 ti->split_discard_bios = true; 3804 ti->split_discard_bios = true;
3374 } 3805 }
3375 3806
3376 dm_put(pool_md);
3377
3378 mutex_unlock(&dm_thin_pool_table.mutex); 3807 mutex_unlock(&dm_thin_pool_table.mutex);
3379 3808
3380 atomic_set(&tc->refcount, 1);
3381 init_completion(&tc->can_destroy);
3382
3383 spin_lock_irqsave(&tc->pool->lock, flags); 3809 spin_lock_irqsave(&tc->pool->lock, flags);
3810 if (tc->pool->suspended) {
3811 spin_unlock_irqrestore(&tc->pool->lock, flags);
3812 mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
3813 ti->error = "Unable to activate thin device while pool is suspended";
3814 r = -EINVAL;
3815 goto bad;
3816 }
3384 list_add_tail_rcu(&tc->list, &tc->pool->active_thins); 3817 list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
3385 spin_unlock_irqrestore(&tc->pool->lock, flags); 3818 spin_unlock_irqrestore(&tc->pool->lock, flags);
3386 /* 3819 /*
@@ -3391,11 +3824,16 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3391 */ 3824 */
3392 synchronize_rcu(); 3825 synchronize_rcu();
3393 3826
3827 dm_put(pool_md);
3828
3829 atomic_set(&tc->refcount, 1);
3830 init_completion(&tc->can_destroy);
3831
3394 return 0; 3832 return 0;
3395 3833
3396bad_target_max_io_len: 3834bad:
3397 dm_pool_close_thin_device(tc->td); 3835 dm_pool_close_thin_device(tc->td);
3398bad_thin_open: 3836bad_pool:
3399 __pool_dec(tc->pool); 3837 __pool_dec(tc->pool);
3400bad_pool_lookup: 3838bad_pool_lookup:
3401 dm_put(pool_md); 3839 dm_put(pool_md);
@@ -3541,6 +3979,21 @@ err:
3541 DMEMIT("Error"); 3979 DMEMIT("Error");
3542} 3980}
3543 3981
3982static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
3983 struct bio_vec *biovec, int max_size)
3984{
3985 struct thin_c *tc = ti->private;
3986 struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev);
3987
3988 if (!q->merge_bvec_fn)
3989 return max_size;
3990
3991 bvm->bi_bdev = tc->pool_dev->bdev;
3992 bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
3993
3994 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3995}
3996
3544static int thin_iterate_devices(struct dm_target *ti, 3997static int thin_iterate_devices(struct dm_target *ti,
3545 iterate_devices_callout_fn fn, void *data) 3998 iterate_devices_callout_fn fn, void *data)
3546{ 3999{
@@ -3565,7 +4018,7 @@ static int thin_iterate_devices(struct dm_target *ti,
3565 4018
3566static struct target_type thin_target = { 4019static struct target_type thin_target = {
3567 .name = "thin", 4020 .name = "thin",
3568 .version = {1, 13, 0}, 4021 .version = {1, 14, 0},
3569 .module = THIS_MODULE, 4022 .module = THIS_MODULE,
3570 .ctr = thin_ctr, 4023 .ctr = thin_ctr,
3571 .dtr = thin_dtr, 4024 .dtr = thin_dtr,
@@ -3575,6 +4028,7 @@ static struct target_type thin_target = {
3575 .presuspend = thin_presuspend, 4028 .presuspend = thin_presuspend,
3576 .postsuspend = thin_postsuspend, 4029 .postsuspend = thin_postsuspend,
3577 .status = thin_status, 4030 .status = thin_status,
4031 .merge = thin_merge,
3578 .iterate_devices = thin_iterate_devices, 4032 .iterate_devices = thin_iterate_devices,
3579}; 4033};
3580 4034
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 58f3927fd7cc..8f37ed215b19 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -19,6 +19,7 @@
19#include <linux/idr.h> 19#include <linux/idr.h>
20#include <linux/hdreg.h> 20#include <linux/hdreg.h>
21#include <linux/delay.h> 21#include <linux/delay.h>
22#include <linux/wait.h>
22 23
23#include <trace/events/block.h> 24#include <trace/events/block.h>
24 25
@@ -117,6 +118,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
117#define DMF_NOFLUSH_SUSPENDING 5 118#define DMF_NOFLUSH_SUSPENDING 5
118#define DMF_MERGE_IS_OPTIONAL 6 119#define DMF_MERGE_IS_OPTIONAL 6
119#define DMF_DEFERRED_REMOVE 7 120#define DMF_DEFERRED_REMOVE 7
121#define DMF_SUSPENDED_INTERNALLY 8
120 122
121/* 123/*
122 * A dummy definition to make RCU happy. 124 * A dummy definition to make RCU happy.
@@ -140,7 +142,7 @@ struct mapped_device {
140 * Use dm_get_live_table{_fast} or take suspend_lock for 142 * Use dm_get_live_table{_fast} or take suspend_lock for
141 * dereference. 143 * dereference.
142 */ 144 */
143 struct dm_table *map; 145 struct dm_table __rcu *map;
144 146
145 struct list_head table_devices; 147 struct list_head table_devices;
146 struct mutex table_devices_lock; 148 struct mutex table_devices_lock;
@@ -525,14 +527,15 @@ retry:
525 goto out; 527 goto out;
526 528
527 tgt = dm_table_get_target(map, 0); 529 tgt = dm_table_get_target(map, 0);
530 if (!tgt->type->ioctl)
531 goto out;
528 532
529 if (dm_suspended_md(md)) { 533 if (dm_suspended_md(md)) {
530 r = -EAGAIN; 534 r = -EAGAIN;
531 goto out; 535 goto out;
532 } 536 }
533 537
534 if (tgt->type->ioctl) 538 r = tgt->type->ioctl(tgt, cmd, arg);
535 r = tgt->type->ioctl(tgt, cmd, arg);
536 539
537out: 540out:
538 dm_put_live_table(md, srcu_idx); 541 dm_put_live_table(md, srcu_idx);
@@ -1607,9 +1610,9 @@ static int dm_merge_bvec(struct request_queue *q,
1607 * Find maximum amount of I/O that won't need splitting 1610 * Find maximum amount of I/O that won't need splitting
1608 */ 1611 */
1609 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1612 max_sectors = min(max_io_len(bvm->bi_sector, ti),
1610 (sector_t) BIO_MAX_SECTORS); 1613 (sector_t) queue_max_sectors(q));
1611 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1614 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1612 if (max_size < 0) 1615 if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
1613 max_size = 0; 1616 max_size = 0;
1614 1617
1615 /* 1618 /*
@@ -1621,10 +1624,10 @@ static int dm_merge_bvec(struct request_queue *q,
1621 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1624 max_size = ti->type->merge(ti, bvm, biovec, max_size);
1622 /* 1625 /*
1623 * If the target doesn't support merge method and some of the devices 1626 * If the target doesn't support merge method and some of the devices
1624 * provided their merge_bvec method (we know this by looking at 1627 * provided their merge_bvec method (we know this by looking for the
1625 * queue_max_hw_sectors), then we can't allow bios with multiple vector 1628 * max_hw_sectors that dm_set_device_limits may set), then we can't
1626 * entries. So always set max_size to 0, and the code below allows 1629 * allow bios with multiple vector entries. So always set max_size
1627 * just one page. 1630 * to 0, and the code below allows just one page.
1628 */ 1631 */
1629 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1632 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1630 max_size = 0; 1633 max_size = 0;
@@ -2332,7 +2335,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2332 2335
2333 merge_is_optional = dm_table_merge_is_optional(t); 2336 merge_is_optional = dm_table_merge_is_optional(t);
2334 2337
2335 old_map = md->map; 2338 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2336 rcu_assign_pointer(md->map, t); 2339 rcu_assign_pointer(md->map, t);
2337 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2340 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2338 2341
@@ -2341,7 +2344,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2341 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2344 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2342 else 2345 else
2343 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2346 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2344 dm_sync_table(md); 2347 if (old_map)
2348 dm_sync_table(md);
2345 2349
2346 return old_map; 2350 return old_map;
2347} 2351}
@@ -2351,7 +2355,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2351 */ 2355 */
2352static struct dm_table *__unbind(struct mapped_device *md) 2356static struct dm_table *__unbind(struct mapped_device *md)
2353{ 2357{
2354 struct dm_table *map = md->map; 2358 struct dm_table *map = rcu_dereference_protected(md->map, 1);
2355 2359
2356 if (!map) 2360 if (!map)
2357 return NULL; 2361 return NULL;
@@ -2716,36 +2720,18 @@ static void unlock_fs(struct mapped_device *md)
2716} 2720}
2717 2721
2718/* 2722/*
2719 * We need to be able to change a mapping table under a mounted 2723 * If __dm_suspend returns 0, the device is completely quiescent
2720 * filesystem. For example we might want to move some data in 2724 * now. There is no request-processing activity. All new requests
2721 * the background. Before the table can be swapped with 2725 * are being added to md->deferred list.
2722 * dm_bind_table, dm_suspend must be called to flush any in
2723 * flight bios and ensure that any further io gets deferred.
2724 */
2725/*
2726 * Suspend mechanism in request-based dm.
2727 * 2726 *
2728 * 1. Flush all I/Os by lock_fs() if needed. 2727 * Caller must hold md->suspend_lock
2729 * 2. Stop dispatching any I/O by stopping the request_queue.
2730 * 3. Wait for all in-flight I/Os to be completed or requeued.
2731 *
2732 * To abort suspend, start the request_queue.
2733 */ 2728 */
2734int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2729static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2730 unsigned suspend_flags, int interruptible)
2735{ 2731{
2736 struct dm_table *map = NULL; 2732 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2737 int r = 0; 2733 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2738 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; 2734 int r;
2739 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
2740
2741 mutex_lock(&md->suspend_lock);
2742
2743 if (dm_suspended_md(md)) {
2744 r = -EINVAL;
2745 goto out_unlock;
2746 }
2747
2748 map = md->map;
2749 2735
2750 /* 2736 /*
2751 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2737 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2754,7 +2740,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2754 if (noflush) 2740 if (noflush)
2755 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2741 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2756 2742
2757 /* This does not get reverted if there's an error later. */ 2743 /*
2744 * This gets reverted if there's an error later and the targets
2745 * provide the .presuspend_undo hook.
2746 */
2758 dm_table_presuspend_targets(map); 2747 dm_table_presuspend_targets(map);
2759 2748
2760 /* 2749 /*
@@ -2765,8 +2754,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2765 */ 2754 */
2766 if (!noflush && do_lockfs) { 2755 if (!noflush && do_lockfs) {
2767 r = lock_fs(md); 2756 r = lock_fs(md);
2768 if (r) 2757 if (r) {
2769 goto out_unlock; 2758 dm_table_presuspend_undo_targets(map);
2759 return r;
2760 }
2770 } 2761 }
2771 2762
2772 /* 2763 /*
@@ -2782,7 +2773,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2782 * flush_workqueue(md->wq). 2773 * flush_workqueue(md->wq).
2783 */ 2774 */
2784 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2775 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2785 synchronize_srcu(&md->io_barrier); 2776 if (map)
2777 synchronize_srcu(&md->io_barrier);
2786 2778
2787 /* 2779 /*
2788 * Stop md->queue before flushing md->wq in case request-based 2780 * Stop md->queue before flushing md->wq in case request-based
@@ -2798,11 +2790,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2798 * We call dm_wait_for_completion to wait for all existing requests 2790 * We call dm_wait_for_completion to wait for all existing requests
2799 * to finish. 2791 * to finish.
2800 */ 2792 */
2801 r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); 2793 r = dm_wait_for_completion(md, interruptible);
2802 2794
2803 if (noflush) 2795 if (noflush)
2804 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2796 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2805 synchronize_srcu(&md->io_barrier); 2797 if (map)
2798 synchronize_srcu(&md->io_barrier);
2806 2799
2807 /* were we interrupted ? */ 2800 /* were we interrupted ? */
2808 if (r < 0) { 2801 if (r < 0) {
@@ -2812,14 +2805,56 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2812 start_queue(md->queue); 2805 start_queue(md->queue);
2813 2806
2814 unlock_fs(md); 2807 unlock_fs(md);
2815 goto out_unlock; /* pushback list is already flushed, so skip flush */ 2808 dm_table_presuspend_undo_targets(map);
2809 /* pushback list is already flushed, so skip flush */
2816 } 2810 }
2817 2811
2818 /* 2812 return r;
2819 * If dm_wait_for_completion returned 0, the device is completely 2813}
2820 * quiescent now. There is no request-processing activity. All new 2814
2821 * requests are being added to md->deferred list. 2815/*
2822 */ 2816 * We need to be able to change a mapping table under a mounted
2817 * filesystem. For example we might want to move some data in
2818 * the background. Before the table can be swapped with
2819 * dm_bind_table, dm_suspend must be called to flush any in
2820 * flight bios and ensure that any further io gets deferred.
2821 */
2822/*
2823 * Suspend mechanism in request-based dm.
2824 *
2825 * 1. Flush all I/Os by lock_fs() if needed.
2826 * 2. Stop dispatching any I/O by stopping the request_queue.
2827 * 3. Wait for all in-flight I/Os to be completed or requeued.
2828 *
2829 * To abort suspend, start the request_queue.
2830 */
2831int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2832{
2833 struct dm_table *map = NULL;
2834 int r = 0;
2835
2836retry:
2837 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2838
2839 if (dm_suspended_md(md)) {
2840 r = -EINVAL;
2841 goto out_unlock;
2842 }
2843
2844 if (dm_suspended_internally_md(md)) {
2845 /* already internally suspended, wait for internal resume */
2846 mutex_unlock(&md->suspend_lock);
2847 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2848 if (r)
2849 return r;
2850 goto retry;
2851 }
2852
2853 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2854
2855 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
2856 if (r)
2857 goto out_unlock;
2823 2858
2824 set_bit(DMF_SUSPENDED, &md->flags); 2859 set_bit(DMF_SUSPENDED, &md->flags);
2825 2860
@@ -2830,22 +2865,13 @@ out_unlock:
2830 return r; 2865 return r;
2831} 2866}
2832 2867
2833int dm_resume(struct mapped_device *md) 2868static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2834{ 2869{
2835 int r = -EINVAL; 2870 if (map) {
2836 struct dm_table *map = NULL; 2871 int r = dm_table_resume_targets(map);
2837 2872 if (r)
2838 mutex_lock(&md->suspend_lock); 2873 return r;
2839 if (!dm_suspended_md(md)) 2874 }
2840 goto out;
2841
2842 map = md->map;
2843 if (!map || !dm_table_get_size(map))
2844 goto out;
2845
2846 r = dm_table_resume_targets(map);
2847 if (r)
2848 goto out;
2849 2875
2850 dm_queue_flush(md); 2876 dm_queue_flush(md);
2851 2877
@@ -2859,6 +2885,37 @@ int dm_resume(struct mapped_device *md)
2859 2885
2860 unlock_fs(md); 2886 unlock_fs(md);
2861 2887
2888 return 0;
2889}
2890
2891int dm_resume(struct mapped_device *md)
2892{
2893 int r = -EINVAL;
2894 struct dm_table *map = NULL;
2895
2896retry:
2897 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2898
2899 if (!dm_suspended_md(md))
2900 goto out;
2901
2902 if (dm_suspended_internally_md(md)) {
2903 /* already internally suspended, wait for internal resume */
2904 mutex_unlock(&md->suspend_lock);
2905 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2906 if (r)
2907 return r;
2908 goto retry;
2909 }
2910
2911 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2912 if (!map || !dm_table_get_size(map))
2913 goto out;
2914
2915 r = __dm_resume(md, map);
2916 if (r)
2917 goto out;
2918
2862 clear_bit(DMF_SUSPENDED, &md->flags); 2919 clear_bit(DMF_SUSPENDED, &md->flags);
2863 2920
2864 r = 0; 2921 r = 0;
@@ -2872,15 +2929,80 @@ out:
2872 * Internal suspend/resume works like userspace-driven suspend. It waits 2929 * Internal suspend/resume works like userspace-driven suspend. It waits
2873 * until all bios finish and prevents issuing new bios to the target drivers. 2930 * until all bios finish and prevents issuing new bios to the target drivers.
2874 * It may be used only from the kernel. 2931 * It may be used only from the kernel.
2875 *
2876 * Internal suspend holds md->suspend_lock, which prevents interaction with
2877 * userspace-driven suspend.
2878 */ 2932 */
2879 2933
2880void dm_internal_suspend(struct mapped_device *md) 2934static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2881{ 2935{
2882 mutex_lock(&md->suspend_lock); 2936 struct dm_table *map = NULL;
2937
2938 if (dm_suspended_internally_md(md))
2939 return; /* nested internal suspend */
2940
2941 if (dm_suspended_md(md)) {
2942 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2943 return; /* nest suspend */
2944 }
2945
2946 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2947
2948 /*
2949 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2950 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend
2951 * would require changing .presuspend to return an error -- avoid this
2952 * until there is a need for more elaborate variants of internal suspend.
2953 */
2954 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE);
2955
2956 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2957
2958 dm_table_postsuspend_targets(map);
2959}
2960
2961static void __dm_internal_resume(struct mapped_device *md)
2962{
2963 if (!dm_suspended_internally_md(md))
2964 return; /* resume from nested internal suspend */
2965
2883 if (dm_suspended_md(md)) 2966 if (dm_suspended_md(md))
2967 goto done; /* resume from nested suspend */
2968
2969 /*
2970 * NOTE: existing callers don't need to call dm_table_resume_targets
2971 * (which may fail -- so best to avoid it for now by passing NULL map)
2972 */
2973 (void) __dm_resume(md, NULL);
2974
2975done:
2976 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2977 smp_mb__after_atomic();
2978 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2979}
2980
2981void dm_internal_suspend_noflush(struct mapped_device *md)
2982{
2983 mutex_lock(&md->suspend_lock);
2984 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2985 mutex_unlock(&md->suspend_lock);
2986}
2987EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2988
2989void dm_internal_resume(struct mapped_device *md)
2990{
2991 mutex_lock(&md->suspend_lock);
2992 __dm_internal_resume(md);
2993 mutex_unlock(&md->suspend_lock);
2994}
2995EXPORT_SYMBOL_GPL(dm_internal_resume);
2996
2997/*
2998 * Fast variants of internal suspend/resume hold md->suspend_lock,
2999 * which prevents interaction with userspace-driven suspend.
3000 */
3001
3002void dm_internal_suspend_fast(struct mapped_device *md)
3003{
3004 mutex_lock(&md->suspend_lock);
3005 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2884 return; 3006 return;
2885 3007
2886 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 3008 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
@@ -2889,9 +3011,9 @@ void dm_internal_suspend(struct mapped_device *md)
2889 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 3011 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2890} 3012}
2891 3013
2892void dm_internal_resume(struct mapped_device *md) 3014void dm_internal_resume_fast(struct mapped_device *md)
2893{ 3015{
2894 if (dm_suspended_md(md)) 3016 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2895 goto done; 3017 goto done;
2896 3018
2897 dm_queue_flush(md); 3019 dm_queue_flush(md);
@@ -2977,6 +3099,11 @@ int dm_suspended_md(struct mapped_device *md)
2977 return test_bit(DMF_SUSPENDED, &md->flags); 3099 return test_bit(DMF_SUSPENDED, &md->flags);
2978} 3100}
2979 3101
3102int dm_suspended_internally_md(struct mapped_device *md)
3103{
3104 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3105}
3106
2980int dm_test_deferred_remove_flag(struct mapped_device *md) 3107int dm_test_deferred_remove_flag(struct mapped_device *md)
2981{ 3108{
2982 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); 3109 return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
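Both dm_suspend() and dm_resume() now follow the same retry shape: take suspend_lock with mutex_lock_nested(), and if the device is internally suspended, drop the lock, wait_on_bit() for DMF_SUSPENDED_INTERNALLY to clear, and start over; __dm_internal_resume() clears that bit and wakes waiters via wake_up_bit() after smp_mb__after_atomic(). The userspace model below captures only that control flow, with a mutex and condition variable standing in for the bit-wait machinery; every name in it is a simplified stand-in, not the kernel API.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t suspend_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t flag_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t flag_cv = PTHREAD_COND_INITIALIZER;
static bool suspended_internally;
static bool suspended;

/* stands in for wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, ...) */
static void wait_for_internal_resume(void)
{
	pthread_mutex_lock(&flag_lock);
	while (suspended_internally)
		pthread_cond_wait(&flag_cv, &flag_lock);
	pthread_mutex_unlock(&flag_lock);
}

static int dm_suspend_model(void)
{
retry:
	pthread_mutex_lock(&suspend_lock);

	if (suspended) {
		pthread_mutex_unlock(&suspend_lock);
		return -1;			/* -EINVAL in the kernel */
	}

	if (suspended_internally) {
		/* already internally suspended, wait for internal resume */
		pthread_mutex_unlock(&suspend_lock);
		wait_for_internal_resume();
		goto retry;
	}

	suspended = true;			/* __dm_suspend() elided */
	pthread_mutex_unlock(&suspend_lock);
	return 0;
}

/* stands in for clear_bit() + smp_mb__after_atomic() + wake_up_bit() */
static void dm_internal_resume_model(void)
{
	pthread_mutex_lock(&flag_lock);
	suspended_internally = false;
	pthread_cond_broadcast(&flag_cv);
	pthread_mutex_unlock(&flag_lock);
}

int main(void)
{
	dm_internal_resume_model();		/* flag already clear; just wakes waiters */
	printf("dm_suspend_model() -> %d\n", dm_suspend_model());
	return 0;
}

The SINGLE_DEPTH_NESTING lock class is presumably what lets the pool target internally suspend its thin devices while the pool itself is mid-suspend without tripping lockdep; the model above ignores that detail.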
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 988c7fb7b145..84b0f9e4ba6c 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -65,6 +65,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
65 struct queue_limits *limits); 65 struct queue_limits *limits);
66struct list_head *dm_table_get_devices(struct dm_table *t); 66struct list_head *dm_table_get_devices(struct dm_table *t);
67void dm_table_presuspend_targets(struct dm_table *t); 67void dm_table_presuspend_targets(struct dm_table *t);
68void dm_table_presuspend_undo_targets(struct dm_table *t);
68void dm_table_postsuspend_targets(struct dm_table *t); 69void dm_table_postsuspend_targets(struct dm_table *t);
69int dm_table_resume_targets(struct dm_table *t); 70int dm_table_resume_targets(struct dm_table *t);
70int dm_table_any_congested(struct dm_table *t, int bdi_bits); 71int dm_table_any_congested(struct dm_table *t, int bdi_bits);
@@ -129,6 +130,15 @@ int dm_deleting_md(struct mapped_device *md);
129int dm_suspended_md(struct mapped_device *md); 130int dm_suspended_md(struct mapped_device *md);
130 131
131/* 132/*
133 * Internal suspend and resume methods.
134 */
135int dm_suspended_internally_md(struct mapped_device *md);
136void dm_internal_suspend_fast(struct mapped_device *md);
137void dm_internal_resume_fast(struct mapped_device *md);
138void dm_internal_suspend_noflush(struct mapped_device *md);
139void dm_internal_resume(struct mapped_device *md);
140
141/*
132 * Test if the device is scheduled for deferred remove. 142 * Test if the device is scheduled for deferred remove.
133 */ 143 */
134int dm_test_deferred_remove_flag(struct mapped_device *md); 144int dm_test_deferred_remove_flag(struct mapped_device *md);
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 1d75b1dc1e2e..e64b61ad0ef3 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -645,8 +645,10 @@ static int array_resize(struct dm_array_info *info, dm_block_t root,
645 int r; 645 int r;
646 struct resize resize; 646 struct resize resize;
647 647
648 if (old_size == new_size) 648 if (old_size == new_size) {
649 *new_root = root;
649 return 0; 650 return 0;
651 }
650 652
651 resize.info = info; 653 resize.info = info;
652 resize.root = root; 654 resize.root = root;
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 786b689bdfc7..e8a904298887 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -564,7 +564,9 @@ static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count
564{ 564{
565 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); 565 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
566 566
567 return smm->ll.nr_blocks; 567 *count = smm->ll.nr_blocks;
568
569 return 0;
568} 570}
569 571
570static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count) 572static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
@@ -581,7 +583,9 @@ static int sm_bootstrap_get_count(struct dm_space_map *sm, dm_block_t b,
581{ 583{
582 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); 584 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
583 585
584 return b < smm->begin ? 1 : 0; 586 *result = (b < smm->begin) ? 1 : 0;
587
588 return 0;
585} 589}
586 590
587static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm, 591static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm,
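The two sm_bootstrap fixes above correct callbacks that returned the looked-up value directly instead of following the space-map getter convention: store the result through the out parameter and return 0 (or a negative errno). A tiny illustration of that convention with simplified names follows; nothing in it is the kernel API.

#include <stdio.h>
#include <stdint.h>

/*
 * Broken shape: the value is returned directly and the out parameter is never
 * written, so a caller that checks "r < 0 ? error : use *count" reads an
 * unset count.
 */
static int get_nr_blocks_broken(uint64_t nr_blocks, uint64_t *count)
{
	(void) count;
	return (int) nr_blocks;
}

/* Fixed shape: write the out parameter, return 0 for success. */
static int get_nr_blocks_fixed(uint64_t nr_blocks, uint64_t *count)
{
	*count = nr_blocks;
	return 0;
}

int main(void)
{
	uint64_t count = 0;
	int r = get_nr_blocks_fixed(1024, &count);

	printf("fixed:  r=%d count=%llu\n", r, (unsigned long long) count);
	printf("broken: r=%d (count never written)\n",
	       get_nr_blocks_broken(1024, &count));
	return 0;
}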
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index 3bc30a0ae3d6..9cb797d800cf 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -10,6 +10,8 @@
10#include "dm-persistent-data-internal.h" 10#include "dm-persistent-data-internal.h"
11 11
12#include <linux/export.h> 12#include <linux/export.h>
13#include <linux/mutex.h>
14#include <linux/hash.h>
13#include <linux/slab.h> 15#include <linux/slab.h>
14#include <linux/device-mapper.h> 16#include <linux/device-mapper.h>
15 17
@@ -17,6 +19,61 @@
17 19
18/*----------------------------------------------------------------*/ 20/*----------------------------------------------------------------*/
19 21
22#define PREFETCH_SIZE 128
23#define PREFETCH_BITS 7
24#define PREFETCH_SENTINEL ((dm_block_t) -1ULL)
25
26struct prefetch_set {
27 struct mutex lock;
28 dm_block_t blocks[PREFETCH_SIZE];
29};
30
31static unsigned prefetch_hash(dm_block_t b)
32{
33 return hash_64(b, PREFETCH_BITS);
34}
35
36static void prefetch_wipe(struct prefetch_set *p)
37{
38 unsigned i;
39 for (i = 0; i < PREFETCH_SIZE; i++)
40 p->blocks[i] = PREFETCH_SENTINEL;
41}
42
43static void prefetch_init(struct prefetch_set *p)
44{
45 mutex_init(&p->lock);
46 prefetch_wipe(p);
47}
48
49static void prefetch_add(struct prefetch_set *p, dm_block_t b)
50{
51 unsigned h = prefetch_hash(b);
52
53 mutex_lock(&p->lock);
54 if (p->blocks[h] == PREFETCH_SENTINEL)
55 p->blocks[h] = b;
56
57 mutex_unlock(&p->lock);
58}
59
60static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm)
61{
62 unsigned i;
63
64 mutex_lock(&p->lock);
65
66 for (i = 0; i < PREFETCH_SIZE; i++)
67 if (p->blocks[i] != PREFETCH_SENTINEL) {
68 dm_bm_prefetch(bm, p->blocks[i]);
69 p->blocks[i] = PREFETCH_SENTINEL;
70 }
71
72 mutex_unlock(&p->lock);
73}
74
75/*----------------------------------------------------------------*/
76
20struct shadow_info { 77struct shadow_info {
21 struct hlist_node hlist; 78 struct hlist_node hlist;
22 dm_block_t where; 79 dm_block_t where;
@@ -37,6 +94,8 @@ struct dm_transaction_manager {
37 94
38 spinlock_t lock; 95 spinlock_t lock;
39 struct hlist_head buckets[DM_HASH_SIZE]; 96 struct hlist_head buckets[DM_HASH_SIZE];
97
98 struct prefetch_set prefetches;
40}; 99};
41 100
42/*----------------------------------------------------------------*/ 101/*----------------------------------------------------------------*/
@@ -117,6 +176,8 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
117 for (i = 0; i < DM_HASH_SIZE; i++) 176 for (i = 0; i < DM_HASH_SIZE; i++)
118 INIT_HLIST_HEAD(tm->buckets + i); 177 INIT_HLIST_HEAD(tm->buckets + i);
119 178
179 prefetch_init(&tm->prefetches);
180
120 return tm; 181 return tm;
121} 182}
122 183
@@ -268,8 +329,14 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
268 struct dm_block_validator *v, 329 struct dm_block_validator *v,
269 struct dm_block **blk) 330 struct dm_block **blk)
270{ 331{
271 if (tm->is_clone) 332 if (tm->is_clone) {
272 return dm_bm_read_try_lock(tm->real->bm, b, v, blk); 333 int r = dm_bm_read_try_lock(tm->real->bm, b, v, blk);
334
335 if (r == -EWOULDBLOCK)
336 prefetch_add(&tm->real->prefetches, b);
337
338 return r;
339 }
273 340
274 return dm_bm_read_lock(tm->bm, b, v, blk); 341 return dm_bm_read_lock(tm->bm, b, v, blk);
275} 342}
@@ -317,6 +384,12 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
317 return tm->bm; 384 return tm->bm;
318} 385}
319 386
387void dm_tm_issue_prefetches(struct dm_transaction_manager *tm)
388{
389 prefetch_issue(&tm->prefetches, tm->bm);
390}
391EXPORT_SYMBOL_GPL(dm_tm_issue_prefetches);
392
320/*----------------------------------------------------------------*/ 393/*----------------------------------------------------------------*/
321 394
322static int dm_tm_create_internal(struct dm_block_manager *bm, 395static int dm_tm_create_internal(struct dm_block_manager *bm,
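Tying this file's pieces together: when dm_tm_read_lock() is called on a non-blocking clone and dm_bm_read_try_lock() returns -EWOULDBLOCK, the wanted block number is parked in the prefetch set; a later dm_tm_issue_prefetches() (reached from dm-thin's worker through dm_pool_issue_prefetches() in the hunks above) turns the recorded misses into dm_bm_prefetch() calls and wipes the set. The program below is a simplified userspace model of that record-now, prefetch-later pattern; the hashed fixed-size slots and sentinel mirror struct prefetch_set, but everything here is a stand-in, not kernel code.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define PREFETCH_SIZE 128
#define PREFETCH_SENTINEL ((uint64_t) -1)

static uint64_t slots[PREFETCH_SIZE];

/* multiplicative hash keeping the top 7 bits, giving an index 0..127 */
static unsigned prefetch_hash(uint64_t b)
{
	return (unsigned) ((b * 11400714819323198485ULL) >> 57);
}

static void prefetch_wipe(void)
{
	memset(slots, 0xff, sizeof(slots));	/* every slot becomes the sentinel */
}

static void prefetch_add(uint64_t b)
{
	unsigned h = prefetch_hash(b);

	/*
	 * A block hashing to an occupied slot is simply dropped; a missed
	 * prefetch only costs a little extra latency later.
	 */
	if (slots[h] == PREFETCH_SENTINEL)
		slots[h] = b;
}

static void prefetch_issue(void)
{
	unsigned i;

	for (i = 0; i < PREFETCH_SIZE; i++)
		if (slots[i] != PREFETCH_SENTINEL) {
			printf("prefetch block %llu\n",
			       (unsigned long long) slots[i]);
			slots[i] = PREFETCH_SENTINEL;
		}
}

int main(void)
{
	prefetch_wipe();
	prefetch_add(42);	/* a read-try-lock that would return EWOULDBLOCK */
	prefetch_add(42);	/* duplicates collapse into the same slot */
	prefetch_add(7);
	prefetch_issue();	/* the worker issues the misses and clears the set */
	return 0;
}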
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index 2772ed2a781a..2e0d4d66fb1b 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -109,6 +109,13 @@ int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
109struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm); 109struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
110 110
111/* 111/*
112 * If you're using a non-blocking clone the tm will build up a list of
113 * requested blocks that weren't in core. This call will request those
114 * blocks to be prefetched.
115 */
116void dm_tm_issue_prefetches(struct dm_transaction_manager *tm);
117
118/*
112 * A little utility that ties the knot by producing a transaction manager 119 * A little utility that ties the knot by producing a transaction manager
113 * that has a space map managed by the transaction manager... 120 * that has a space map managed by the transaction manager...
114 * 121 *