author     Linus Torvalds <torvalds@linux-foundation.org>   2014-12-09 00:10:03 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-12-09 00:10:03 -0500
commit     140dfc9299c33bbfc9350fa061f5ab65cb83df13 (patch)
tree       09508691964e277f4835d30f7b9c3962e8cac596 /drivers/md
parent     f94784bdb114439eb3a5e62343826887bbf3f37c (diff)
parent     1a71d6ffe18c0d0f03fc8531949cc8ed41d702ee (diff)
Merge tag 'dm-3.19-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:

 - Significant DM thin-provisioning performance improvements to meet
   performance requirements that were requested by the Gluster
   distributed filesystem. Specifically, dm-thinp now takes care to
   aggregate IO that will be issued to the same thinp block before
   issuing IO to the underlying devices. This really helps improve
   performance on HW RAID6 devices that have a writeback cache because
   it avoids read-modify-write (RMW) cycles in the HW RAID controller.

 - Some stable fixes: fix a leak in DM bufio if integrity profiles were
   enabled, use memzero_explicit in DM crypt to avoid any potential for
   information leak, and a DM cache fix to properly mark a cache block
   dirty if it was promoted to the cache via the overwrite optimization.

 - A few simple DM persistent data library fixes.

 - DM cache multiqueue policy block promotion improvements.

 - DM cache discard improvements that take advantage of range
   (multiblock) discard support in the DM bio-prison. This allows for
   much more efficient bulk discard processing (e.g. when mkfs.xfs
   discards the entire device).

 - Some small optimizations in DM core and RCU dereference cleanups.

 - DM core changes to the suspend/resume code to introduce the new
   internal suspend/resume interface that the DM thin-pool target now
   uses to suspend/resume active thin devices when the thin-pool must
   suspend/resume. This avoids forcing userspace to track all active
   thin volumes in a thin-pool when the thin-pool is suspended for the
   purposes of metadata or data space resize.

* tag 'dm-3.19-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (49 commits)
  dm crypt: use memzero_explicit for on-stack buffer
  dm space map metadata: fix sm_bootstrap_get_count()
  dm space map metadata: fix sm_bootstrap_get_nr_blocks()
  dm bufio: fix memleak when using a dm_buffer's inline bio
  dm cache: fix spurious cell_defer when dealing with partial block at end of device
  dm cache: dirty flag was mistakenly being cleared when promoting via overwrite
  dm cache: only use overwrite optimisation for promotion when in writeback mode
  dm cache: discard block size must be a multiple of cache block size
  dm cache: fix a harmless race when working out if a block is discarded
  dm cache: when reloading a discard bitset allow for a different discard block size
  dm cache: fix some issues with the new discard range support
  dm array: if resizing the array is a noop set the new root to the old one
  dm: use rcu_dereference_protected instead of rcu_dereference
  dm thin: fix pool_io_hints to avoid looking at max_hw_sectors
  dm thin: suspend/resume active thin devices when reloading thin-pool
  dm: enhance internal suspend and resume interface
  dm thin: do not allow thin device activation while pool is suspended
  dm: add presuspend_undo hook to target_type
  dm: return earlier from dm_blk_ioctl if target doesn't implement .ioctl
  dm thin: remove stale 'trim' message in block comment above pool_message
  ...
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm-bio-prison.c                           186
-rw-r--r--  drivers/md/dm-bio-prison.h                            28
-rw-r--r--  drivers/md/dm-bufio.c                                226
-rw-r--r--  drivers/md/dm-cache-block-types.h                     11
-rw-r--r--  drivers/md/dm-cache-metadata.c                        34
-rw-r--r--  drivers/md/dm-cache-metadata.h                         6
-rw-r--r--  drivers/md/dm-cache-policy-mq.c                       82
-rw-r--r--  drivers/md/dm-cache-target.c                         378
-rw-r--r--  drivers/md/dm-crypt.c                                  2
-rw-r--r--  drivers/md/dm-ioctl.c                                  5
-rw-r--r--  drivers/md/dm-stats.c                                  2
-rw-r--r--  drivers/md/dm-table.c                                 36
-rw-r--r--  drivers/md/dm-thin-metadata.c                         35
-rw-r--r--  drivers/md/dm-thin-metadata.h                          9
-rw-r--r--  drivers/md/dm-thin.c                                 760
-rw-r--r--  drivers/md/dm.c                                      273
-rw-r--r--  drivers/md/dm.h                                       10
-rw-r--r--  drivers/md/persistent-data/dm-array.c                  4
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c     8
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.c   77
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.h    7
21 files changed, 1610 insertions, 569 deletions
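
As background for the dm-bio-prison changes below: a cell key now describes a half-open range of blocks, [block_begin, block_end), and two keys are detained in the same cell whenever their ranges overlap. This is what lets a single multiblock discard collide with any in-flight IO inside its range. The following is a minimal, self-contained userspace sketch of that comparison (simplified types, illustrative only; the in-tree version is cmp_keys() in drivers/md/dm-bio-prison.c, shown in the diff):

    #include <stdint.h>
    #include <stdio.h>

    struct cell_key {                /* simplified stand-in for struct dm_cell_key */
            int virtual;             /* virtual (thin) vs. physical (data) block   */
            uint64_t dev;
            uint64_t block_begin;    /* inclusive */
            uint64_t block_end;      /* exclusive */
    };

    /* Same ordering as the kernel's cmp_keys(): 0 means "ranges overlap". */
    static int cmp_keys(const struct cell_key *lhs, const struct cell_key *rhs)
    {
            if (lhs->virtual != rhs->virtual)
                    return lhs->virtual < rhs->virtual ? -1 : 1;
            if (lhs->dev != rhs->dev)
                    return lhs->dev < rhs->dev ? -1 : 1;
            if (lhs->block_end <= rhs->block_begin)
                    return -1;
            if (lhs->block_begin >= rhs->block_end)
                    return 1;
            return 0;                /* overlap: same cell */
    }

    int main(void)
    {
            struct cell_key discard = { 0, 1, 100, 116 };  /* 16-block discard    */
            struct cell_key write   = { 0, 1, 107, 108 };  /* single-block write  */

            printf("%d\n", cmp_keys(&discard, &write));    /* prints 0: they collide */
            return 0;
    }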
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index f752d12081ff..be065300e93c 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -14,68 +14,38 @@
14 14
15/*----------------------------------------------------------------*/ 15/*----------------------------------------------------------------*/
16 16
17struct bucket { 17#define MIN_CELLS 1024
18 spinlock_t lock;
19 struct hlist_head cells;
20};
21 18
22struct dm_bio_prison { 19struct dm_bio_prison {
20 spinlock_t lock;
23 mempool_t *cell_pool; 21 mempool_t *cell_pool;
24 22 struct rb_root cells;
25 unsigned nr_buckets;
26 unsigned hash_mask;
27 struct bucket *buckets;
28}; 23};
29 24
30/*----------------------------------------------------------------*/
31
32static uint32_t calc_nr_buckets(unsigned nr_cells)
33{
34 uint32_t n = 128;
35
36 nr_cells /= 4;
37 nr_cells = min(nr_cells, 8192u);
38
39 while (n < nr_cells)
40 n <<= 1;
41
42 return n;
43}
44
45static struct kmem_cache *_cell_cache; 25static struct kmem_cache *_cell_cache;
46 26
47static void init_bucket(struct bucket *b) 27/*----------------------------------------------------------------*/
48{
49 spin_lock_init(&b->lock);
50 INIT_HLIST_HEAD(&b->cells);
51}
52 28
53/* 29/*
54 * @nr_cells should be the number of cells you want in use _concurrently_. 30 * @nr_cells should be the number of cells you want in use _concurrently_.
55 * Don't confuse it with the number of distinct keys. 31 * Don't confuse it with the number of distinct keys.
56 */ 32 */
57struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells) 33struct dm_bio_prison *dm_bio_prison_create(void)
58{ 34{
59 unsigned i; 35 struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
60 uint32_t nr_buckets = calc_nr_buckets(nr_cells);
61 size_t len = sizeof(struct dm_bio_prison) +
62 (sizeof(struct bucket) * nr_buckets);
63 struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL);
64 36
65 if (!prison) 37 if (!prison)
66 return NULL; 38 return NULL;
67 39
68 prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache); 40 spin_lock_init(&prison->lock);
41
42 prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
69 if (!prison->cell_pool) { 43 if (!prison->cell_pool) {
70 kfree(prison); 44 kfree(prison);
71 return NULL; 45 return NULL;
72 } 46 }
73 47
74 prison->nr_buckets = nr_buckets; 48 prison->cells = RB_ROOT;
75 prison->hash_mask = nr_buckets - 1;
76 prison->buckets = (struct bucket *) (prison + 1);
77 for (i = 0; i < nr_buckets; i++)
78 init_bucket(prison->buckets + i);
79 49
80 return prison; 50 return prison;
81} 51}
@@ -101,68 +71,73 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
101} 71}
102EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell); 72EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
103 73
104static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key) 74static void __setup_new_cell(struct dm_cell_key *key,
75 struct bio *holder,
76 struct dm_bio_prison_cell *cell)
105{ 77{
106 const unsigned long BIG_PRIME = 4294967291UL; 78 memcpy(&cell->key, key, sizeof(cell->key));
107 uint64_t hash = key->block * BIG_PRIME; 79 cell->holder = holder;
108 80 bio_list_init(&cell->bios);
109 return (uint32_t) (hash & prison->hash_mask);
110} 81}
111 82
112static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs) 83static int cmp_keys(struct dm_cell_key *lhs,
84 struct dm_cell_key *rhs)
113{ 85{
114 return (lhs->virtual == rhs->virtual) && 86 if (lhs->virtual < rhs->virtual)
115 (lhs->dev == rhs->dev) && 87 return -1;
116 (lhs->block == rhs->block);
117}
118 88
119static struct bucket *get_bucket(struct dm_bio_prison *prison, 89 if (lhs->virtual > rhs->virtual)
120 struct dm_cell_key *key) 90 return 1;
121{
122 return prison->buckets + hash_key(prison, key);
123}
124 91
125static struct dm_bio_prison_cell *__search_bucket(struct bucket *b, 92 if (lhs->dev < rhs->dev)
126 struct dm_cell_key *key) 93 return -1;
127{
128 struct dm_bio_prison_cell *cell;
129 94
130 hlist_for_each_entry(cell, &b->cells, list) 95 if (lhs->dev > rhs->dev)
131 if (keys_equal(&cell->key, key)) 96 return 1;
132 return cell;
133 97
134 return NULL; 98 if (lhs->block_end <= rhs->block_begin)
135} 99 return -1;
136 100
137static void __setup_new_cell(struct bucket *b, 101 if (lhs->block_begin >= rhs->block_end)
138 struct dm_cell_key *key, 102 return 1;
139 struct bio *holder, 103
140 struct dm_bio_prison_cell *cell) 104 return 0;
141{
142 memcpy(&cell->key, key, sizeof(cell->key));
143 cell->holder = holder;
144 bio_list_init(&cell->bios);
145 hlist_add_head(&cell->list, &b->cells);
146} 105}
147 106
148static int __bio_detain(struct bucket *b, 107static int __bio_detain(struct dm_bio_prison *prison,
149 struct dm_cell_key *key, 108 struct dm_cell_key *key,
150 struct bio *inmate, 109 struct bio *inmate,
151 struct dm_bio_prison_cell *cell_prealloc, 110 struct dm_bio_prison_cell *cell_prealloc,
152 struct dm_bio_prison_cell **cell_result) 111 struct dm_bio_prison_cell **cell_result)
153{ 112{
154 struct dm_bio_prison_cell *cell; 113 int r;
155 114 struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
156 cell = __search_bucket(b, key); 115
157 if (cell) { 116 while (*new) {
158 if (inmate) 117 struct dm_bio_prison_cell *cell =
159 bio_list_add(&cell->bios, inmate); 118 container_of(*new, struct dm_bio_prison_cell, node);
160 *cell_result = cell; 119
161 return 1; 120 r = cmp_keys(key, &cell->key);
121
122 parent = *new;
123 if (r < 0)
124 new = &((*new)->rb_left);
125 else if (r > 0)
126 new = &((*new)->rb_right);
127 else {
128 if (inmate)
129 bio_list_add(&cell->bios, inmate);
130 *cell_result = cell;
131 return 1;
132 }
162 } 133 }
163 134
164 __setup_new_cell(b, key, inmate, cell_prealloc); 135 __setup_new_cell(key, inmate, cell_prealloc);
165 *cell_result = cell_prealloc; 136 *cell_result = cell_prealloc;
137
138 rb_link_node(&cell_prealloc->node, parent, new);
139 rb_insert_color(&cell_prealloc->node, &prison->cells);
140
166 return 0; 141 return 0;
167} 142}
168 143
@@ -174,11 +149,10 @@ static int bio_detain(struct dm_bio_prison *prison,
174{ 149{
175 int r; 150 int r;
176 unsigned long flags; 151 unsigned long flags;
177 struct bucket *b = get_bucket(prison, key);
178 152
179 spin_lock_irqsave(&b->lock, flags); 153 spin_lock_irqsave(&prison->lock, flags);
180 r = __bio_detain(b, key, inmate, cell_prealloc, cell_result); 154 r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
181 spin_unlock_irqrestore(&b->lock, flags); 155 spin_unlock_irqrestore(&prison->lock, flags);
182 156
183 return r; 157 return r;
184} 158}
@@ -205,10 +179,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell);
205/* 179/*
206 * @inmates must have been initialised prior to this call 180 * @inmates must have been initialised prior to this call
207 */ 181 */
208static void __cell_release(struct dm_bio_prison_cell *cell, 182static void __cell_release(struct dm_bio_prison *prison,
183 struct dm_bio_prison_cell *cell,
209 struct bio_list *inmates) 184 struct bio_list *inmates)
210{ 185{
211 hlist_del(&cell->list); 186 rb_erase(&cell->node, &prison->cells);
212 187
213 if (inmates) { 188 if (inmates) {
214 if (cell->holder) 189 if (cell->holder)
@@ -222,21 +197,21 @@ void dm_cell_release(struct dm_bio_prison *prison,
222 struct bio_list *bios) 197 struct bio_list *bios)
223{ 198{
224 unsigned long flags; 199 unsigned long flags;
225 struct bucket *b = get_bucket(prison, &cell->key);
226 200
227 spin_lock_irqsave(&b->lock, flags); 201 spin_lock_irqsave(&prison->lock, flags);
228 __cell_release(cell, bios); 202 __cell_release(prison, cell, bios);
229 spin_unlock_irqrestore(&b->lock, flags); 203 spin_unlock_irqrestore(&prison->lock, flags);
230} 204}
231EXPORT_SYMBOL_GPL(dm_cell_release); 205EXPORT_SYMBOL_GPL(dm_cell_release);
232 206
233/* 207/*
234 * Sometimes we don't want the holder, just the additional bios. 208 * Sometimes we don't want the holder, just the additional bios.
235 */ 209 */
236static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, 210static void __cell_release_no_holder(struct dm_bio_prison *prison,
211 struct dm_bio_prison_cell *cell,
237 struct bio_list *inmates) 212 struct bio_list *inmates)
238{ 213{
239 hlist_del(&cell->list); 214 rb_erase(&cell->node, &prison->cells);
240 bio_list_merge(inmates, &cell->bios); 215 bio_list_merge(inmates, &cell->bios);
241} 216}
242 217
@@ -245,11 +220,10 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
245 struct bio_list *inmates) 220 struct bio_list *inmates)
246{ 221{
247 unsigned long flags; 222 unsigned long flags;
248 struct bucket *b = get_bucket(prison, &cell->key);
249 223
250 spin_lock_irqsave(&b->lock, flags); 224 spin_lock_irqsave(&prison->lock, flags);
251 __cell_release_no_holder(cell, inmates); 225 __cell_release_no_holder(prison, cell, inmates);
252 spin_unlock_irqrestore(&b->lock, flags); 226 spin_unlock_irqrestore(&prison->lock, flags);
253} 227}
254EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); 228EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
255 229
@@ -267,6 +241,20 @@ void dm_cell_error(struct dm_bio_prison *prison,
267} 241}
268EXPORT_SYMBOL_GPL(dm_cell_error); 242EXPORT_SYMBOL_GPL(dm_cell_error);
269 243
244void dm_cell_visit_release(struct dm_bio_prison *prison,
245 void (*visit_fn)(void *, struct dm_bio_prison_cell *),
246 void *context,
247 struct dm_bio_prison_cell *cell)
248{
249 unsigned long flags;
250
251 spin_lock_irqsave(&prison->lock, flags);
252 visit_fn(context, cell);
253 rb_erase(&cell->node, &prison->cells);
254 spin_unlock_irqrestore(&prison->lock, flags);
255}
256EXPORT_SYMBOL_GPL(dm_cell_visit_release);
257
270/*----------------------------------------------------------------*/ 258/*----------------------------------------------------------------*/
271 259
272#define DEFERRED_SET_SIZE 64 260#define DEFERRED_SET_SIZE 64
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 6805a142b750..74cf01144b1f 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -10,8 +10,8 @@
10#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */ 10#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */
11#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */ 11#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */
12 12
13#include <linux/list.h>
14#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/rbtree.h>
15 15
16/*----------------------------------------------------------------*/ 16/*----------------------------------------------------------------*/
17 17
@@ -23,11 +23,14 @@
23 */ 23 */
24struct dm_bio_prison; 24struct dm_bio_prison;
25 25
26/* FIXME: this needs to be more abstract */ 26/*
27 * Keys define a range of blocks within either a virtual or physical
28 * device.
29 */
27struct dm_cell_key { 30struct dm_cell_key {
28 int virtual; 31 int virtual;
29 dm_thin_id dev; 32 dm_thin_id dev;
30 dm_block_t block; 33 dm_block_t block_begin, block_end;
31}; 34};
32 35
33/* 36/*
@@ -35,13 +38,15 @@ struct dm_cell_key {
35 * themselves. 38 * themselves.
36 */ 39 */
37struct dm_bio_prison_cell { 40struct dm_bio_prison_cell {
38 struct hlist_node list; 41 struct list_head user_list; /* for client use */
42 struct rb_node node;
43
39 struct dm_cell_key key; 44 struct dm_cell_key key;
40 struct bio *holder; 45 struct bio *holder;
41 struct bio_list bios; 46 struct bio_list bios;
42}; 47};
43 48
44struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells); 49struct dm_bio_prison *dm_bio_prison_create(void);
45void dm_bio_prison_destroy(struct dm_bio_prison *prison); 50void dm_bio_prison_destroy(struct dm_bio_prison *prison);
46 51
47/* 52/*
@@ -57,7 +62,7 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
57 struct dm_bio_prison_cell *cell); 62 struct dm_bio_prison_cell *cell);
58 63
59/* 64/*
60 * Creates, or retrieves a cell for the given key. 65 * Creates, or retrieves a cell that overlaps the given key.
61 * 66 *
62 * Returns 1 if pre-existing cell returned, zero if new cell created using 67 * Returns 1 if pre-existing cell returned, zero if new cell created using
63 * @cell_prealloc. 68 * @cell_prealloc.
@@ -68,7 +73,8 @@ int dm_get_cell(struct dm_bio_prison *prison,
68 struct dm_bio_prison_cell **cell_result); 73 struct dm_bio_prison_cell **cell_result);
69 74
70/* 75/*
71 * An atomic op that combines retrieving a cell, and adding a bio to it. 76 * An atomic op that combines retrieving or creating a cell, and adding a
77 * bio to it.
72 * 78 *
73 * Returns 1 if the cell was already held, 0 if @inmate is the new holder. 79 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
74 */ 80 */
@@ -87,6 +93,14 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
87void dm_cell_error(struct dm_bio_prison *prison, 93void dm_cell_error(struct dm_bio_prison *prison,
88 struct dm_bio_prison_cell *cell, int error); 94 struct dm_bio_prison_cell *cell, int error);
89 95
96/*
97 * Visits the cell and then releases. Guarantees no new inmates are
98 * inserted between the visit and release.
99 */
100void dm_cell_visit_release(struct dm_bio_prison *prison,
101 void (*visit_fn)(void *, struct dm_bio_prison_cell *),
102 void *context, struct dm_bio_prison_cell *cell);
103
90/*----------------------------------------------------------------*/ 104/*----------------------------------------------------------------*/
91 105
92/* 106/*
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index afe79719ea32..c33b49792b87 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -14,6 +14,7 @@
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/shrinker.h> 15#include <linux/shrinker.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/rbtree.h>
17 18
18#define DM_MSG_PREFIX "bufio" 19#define DM_MSG_PREFIX "bufio"
19 20
@@ -34,26 +35,23 @@
34/* 35/*
35 * Check buffer ages in this interval (seconds) 36 * Check buffer ages in this interval (seconds)
36 */ 37 */
37#define DM_BUFIO_WORK_TIMER_SECS 10 38#define DM_BUFIO_WORK_TIMER_SECS 30
38 39
39/* 40/*
40 * Free buffers when they are older than this (seconds) 41 * Free buffers when they are older than this (seconds)
41 */ 42 */
42#define DM_BUFIO_DEFAULT_AGE_SECS 60 43#define DM_BUFIO_DEFAULT_AGE_SECS 300
43 44
44/* 45/*
45 * The number of bvec entries that are embedded directly in the buffer. 46 * The nr of bytes of cached data to keep around.
46 * If the chunk size is larger, dm-io is used to do the io.
47 */ 47 */
48#define DM_BUFIO_INLINE_VECS 16 48#define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024)
49 49
50/* 50/*
51 * Buffer hash 51 * The number of bvec entries that are embedded directly in the buffer.
52 * If the chunk size is larger, dm-io is used to do the io.
52 */ 53 */
53#define DM_BUFIO_HASH_BITS 20 54#define DM_BUFIO_INLINE_VECS 16
54#define DM_BUFIO_HASH(block) \
55 ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
56 ((1 << DM_BUFIO_HASH_BITS) - 1))
57 55
58/* 56/*
59 * Don't try to use kmem_cache_alloc for blocks larger than this. 57 * Don't try to use kmem_cache_alloc for blocks larger than this.
@@ -106,7 +104,7 @@ struct dm_bufio_client {
106 104
107 unsigned minimum_buffers; 105 unsigned minimum_buffers;
108 106
109 struct hlist_head *cache_hash; 107 struct rb_root buffer_tree;
110 wait_queue_head_t free_buffer_wait; 108 wait_queue_head_t free_buffer_wait;
111 109
112 int async_write_error; 110 int async_write_error;
@@ -135,7 +133,7 @@ enum data_mode {
135}; 133};
136 134
137struct dm_buffer { 135struct dm_buffer {
138 struct hlist_node hash_list; 136 struct rb_node node;
139 struct list_head lru_list; 137 struct list_head lru_list;
140 sector_t block; 138 sector_t block;
141 void *data; 139 void *data;
@@ -223,6 +221,7 @@ static DEFINE_SPINLOCK(param_spinlock);
223 * Buffers are freed after this timeout 221 * Buffers are freed after this timeout
224 */ 222 */
225static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; 223static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
224static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
226 225
227static unsigned long dm_bufio_peak_allocated; 226static unsigned long dm_bufio_peak_allocated;
228static unsigned long dm_bufio_allocated_kmem_cache; 227static unsigned long dm_bufio_allocated_kmem_cache;
@@ -253,6 +252,53 @@ static LIST_HEAD(dm_bufio_all_clients);
253 */ 252 */
254static DEFINE_MUTEX(dm_bufio_clients_lock); 253static DEFINE_MUTEX(dm_bufio_clients_lock);
255 254
255/*----------------------------------------------------------------
256 * A red/black tree acts as an index for all the buffers.
257 *--------------------------------------------------------------*/
258static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
259{
260 struct rb_node *n = c->buffer_tree.rb_node;
261 struct dm_buffer *b;
262
263 while (n) {
264 b = container_of(n, struct dm_buffer, node);
265
266 if (b->block == block)
267 return b;
268
269 n = (b->block < block) ? n->rb_left : n->rb_right;
270 }
271
272 return NULL;
273}
274
275static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
276{
277 struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
278 struct dm_buffer *found;
279
280 while (*new) {
281 found = container_of(*new, struct dm_buffer, node);
282
283 if (found->block == b->block) {
284 BUG_ON(found != b);
285 return;
286 }
287
288 parent = *new;
289 new = (found->block < b->block) ?
290 &((*new)->rb_left) : &((*new)->rb_right);
291 }
292
293 rb_link_node(&b->node, parent, new);
294 rb_insert_color(&b->node, &c->buffer_tree);
295}
296
297static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
298{
299 rb_erase(&b->node, &c->buffer_tree);
300}
301
256/*----------------------------------------------------------------*/ 302/*----------------------------------------------------------------*/
257 303
258static void adjust_total_allocated(enum data_mode data_mode, long diff) 304static void adjust_total_allocated(enum data_mode data_mode, long diff)
@@ -434,7 +480,7 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
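
Note that both __find() and __insert() above descend left when the node's block is *smaller* than the block being looked up, i.e. the tree is ordered in the mirror image of the usual convention; that is harmless as long as the two helpers agree, which they do. For comparison, the same lookup written with the conventional "smaller keys to the left" ordering (illustrative only, not a proposed change):

    /* Conventional ascending-order lookup over the same rb_root. */
    static struct dm_buffer *find_buffer(struct rb_root *root, sector_t block)
    {
            struct rb_node *n = root->rb_node;

            while (n) {
                    struct dm_buffer *b = container_of(n, struct dm_buffer, node);

                    if (block == b->block)
                            return b;

                    n = block < b->block ? n->rb_left : n->rb_right;
            }

            return NULL;
    }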
434 b->block = block; 480 b->block = block;
435 b->list_mode = dirty; 481 b->list_mode = dirty;
436 list_add(&b->lru_list, &c->lru[dirty]); 482 list_add(&b->lru_list, &c->lru[dirty]);
437 hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]); 483 __insert(b->c, b);
438 b->last_accessed = jiffies; 484 b->last_accessed = jiffies;
439} 485}
440 486
@@ -448,7 +494,7 @@ static void __unlink_buffer(struct dm_buffer *b)
448 BUG_ON(!c->n_buffers[b->list_mode]); 494 BUG_ON(!c->n_buffers[b->list_mode]);
449 495
450 c->n_buffers[b->list_mode]--; 496 c->n_buffers[b->list_mode]--;
451 hlist_del(&b->hash_list); 497 __remove(b->c, b);
452 list_del(&b->lru_list); 498 list_del(&b->lru_list);
453} 499}
454 500
@@ -532,6 +578,19 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
532 end_io(&b->bio, r); 578 end_io(&b->bio, r);
533} 579}
534 580
581static void inline_endio(struct bio *bio, int error)
582{
583 bio_end_io_t *end_fn = bio->bi_private;
584
585 /*
586 * Reset the bio to free any attached resources
587 * (e.g. bio integrity profiles).
588 */
589 bio_reset(bio);
590
591 end_fn(bio, error);
592}
593
535static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, 594static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
536 bio_end_io_t *end_io) 595 bio_end_io_t *end_io)
537{ 596{
@@ -543,7 +602,12 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
543 b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; 602 b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
544 b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits; 603 b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
545 b->bio.bi_bdev = b->c->bdev; 604 b->bio.bi_bdev = b->c->bdev;
546 b->bio.bi_end_io = end_io; 605 b->bio.bi_end_io = inline_endio;
606 /*
607 * Use of .bi_private isn't a problem here because
608 * the dm_buffer's inline bio is local to bufio.
609 */
610 b->bio.bi_private = end_io;
547 611
548 /* 612 /*
549 * We assume that if len >= PAGE_SIZE ptr is page-aligned. 613 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
@@ -887,23 +951,6 @@ static void __check_watermark(struct dm_bufio_client *c,
887 __write_dirty_buffers_async(c, 1, write_list); 951 __write_dirty_buffers_async(c, 1, write_list);
888} 952}
889 953
890/*
891 * Find a buffer in the hash.
892 */
893static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
894{
895 struct dm_buffer *b;
896
897 hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
898 hash_list) {
899 dm_bufio_cond_resched();
900 if (b->block == block)
901 return b;
902 }
903
904 return NULL;
905}
906
907/*---------------------------------------------------------------- 954/*----------------------------------------------------------------
908 * Getting a buffer 955 * Getting a buffer
909 *--------------------------------------------------------------*/ 956 *--------------------------------------------------------------*/
@@ -1433,45 +1480,52 @@ static void drop_buffers(struct dm_bufio_client *c)
1433} 1480}
1434 1481
1435/* 1482/*
1436 * Test if the buffer is unused and too old, and commit it. 1483 * We may not be able to evict this buffer if IO pending or the client
1484 * is still using it. Caller is expected to know buffer is too old.
1485 *
1437 * And if GFP_NOFS is used, we must not do any I/O because we hold 1486 * And if GFP_NOFS is used, we must not do any I/O because we hold
1438 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets 1487 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
1439 * rerouted to different bufio client. 1488 * rerouted to different bufio client.
1440 */ 1489 */
1441static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp, 1490static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
1442 unsigned long max_jiffies)
1443{ 1491{
1444 if (jiffies - b->last_accessed < max_jiffies)
1445 return 0;
1446
1447 if (!(gfp & __GFP_FS)) { 1492 if (!(gfp & __GFP_FS)) {
1448 if (test_bit(B_READING, &b->state) || 1493 if (test_bit(B_READING, &b->state) ||
1449 test_bit(B_WRITING, &b->state) || 1494 test_bit(B_WRITING, &b->state) ||
1450 test_bit(B_DIRTY, &b->state)) 1495 test_bit(B_DIRTY, &b->state))
1451 return 0; 1496 return false;
1452 } 1497 }
1453 1498
1454 if (b->hold_count) 1499 if (b->hold_count)
1455 return 0; 1500 return false;
1456 1501
1457 __make_buffer_clean(b); 1502 __make_buffer_clean(b);
1458 __unlink_buffer(b); 1503 __unlink_buffer(b);
1459 __free_buffer_wake(b); 1504 __free_buffer_wake(b);
1460 1505
1461 return 1; 1506 return true;
1462} 1507}
1463 1508
1464static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, 1509static unsigned get_retain_buffers(struct dm_bufio_client *c)
1465 gfp_t gfp_mask) 1510{
1511 unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
1512 return retain_bytes / c->block_size;
1513}
1514
1515static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
1516 gfp_t gfp_mask)
1466{ 1517{
1467 int l; 1518 int l;
1468 struct dm_buffer *b, *tmp; 1519 struct dm_buffer *b, *tmp;
1469 long freed = 0; 1520 unsigned long freed = 0;
1521 unsigned long count = nr_to_scan;
1522 unsigned retain_target = get_retain_buffers(c);
1470 1523
1471 for (l = 0; l < LIST_SIZE; l++) { 1524 for (l = 0; l < LIST_SIZE; l++) {
1472 list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { 1525 list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
1473 freed += __cleanup_old_buffer(b, gfp_mask, 0); 1526 if (__try_evict_buffer(b, gfp_mask))
1474 if (!--nr_to_scan) 1527 freed++;
1528 if (!--nr_to_scan || ((count - freed) <= retain_target))
1475 return freed; 1529 return freed;
1476 dm_bufio_cond_resched(); 1530 dm_bufio_cond_resched();
1477 } 1531 }
@@ -1533,11 +1587,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
1533 r = -ENOMEM; 1587 r = -ENOMEM;
1534 goto bad_client; 1588 goto bad_client;
1535 } 1589 }
1536 c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS); 1590 c->buffer_tree = RB_ROOT;
1537 if (!c->cache_hash) {
1538 r = -ENOMEM;
1539 goto bad_hash;
1540 }
1541 1591
1542 c->bdev = bdev; 1592 c->bdev = bdev;
1543 c->block_size = block_size; 1593 c->block_size = block_size;
@@ -1556,9 +1606,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
1556 c->n_buffers[i] = 0; 1606 c->n_buffers[i] = 0;
1557 } 1607 }
1558 1608
1559 for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
1560 INIT_HLIST_HEAD(&c->cache_hash[i]);
1561
1562 mutex_init(&c->lock); 1609 mutex_init(&c->lock);
1563 INIT_LIST_HEAD(&c->reserved_buffers); 1610 INIT_LIST_HEAD(&c->reserved_buffers);
1564 c->need_reserved_buffers = reserved_buffers; 1611 c->need_reserved_buffers = reserved_buffers;
@@ -1632,8 +1679,6 @@ bad_cache:
1632 } 1679 }
1633 dm_io_client_destroy(c->dm_io); 1680 dm_io_client_destroy(c->dm_io);
1634bad_dm_io: 1681bad_dm_io:
1635 vfree(c->cache_hash);
1636bad_hash:
1637 kfree(c); 1682 kfree(c);
1638bad_client: 1683bad_client:
1639 return ERR_PTR(r); 1684 return ERR_PTR(r);
@@ -1660,9 +1705,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
1660 1705
1661 mutex_unlock(&dm_bufio_clients_lock); 1706 mutex_unlock(&dm_bufio_clients_lock);
1662 1707
1663 for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) 1708 BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
1664 BUG_ON(!hlist_empty(&c->cache_hash[i]));
1665
1666 BUG_ON(c->need_reserved_buffers); 1709 BUG_ON(c->need_reserved_buffers);
1667 1710
1668 while (!list_empty(&c->reserved_buffers)) { 1711 while (!list_empty(&c->reserved_buffers)) {
@@ -1680,36 +1723,60 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
1680 BUG_ON(c->n_buffers[i]); 1723 BUG_ON(c->n_buffers[i]);
1681 1724
1682 dm_io_client_destroy(c->dm_io); 1725 dm_io_client_destroy(c->dm_io);
1683 vfree(c->cache_hash);
1684 kfree(c); 1726 kfree(c);
1685} 1727}
1686EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); 1728EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
1687 1729
1688static void cleanup_old_buffers(void) 1730static unsigned get_max_age_hz(void)
1689{ 1731{
1690 unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age); 1732 unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);
1691 struct dm_bufio_client *c;
1692 1733
1693 if (max_age > ULONG_MAX / HZ) 1734 if (max_age > UINT_MAX / HZ)
1694 max_age = ULONG_MAX / HZ; 1735 max_age = UINT_MAX / HZ;
1695 1736
1696 mutex_lock(&dm_bufio_clients_lock); 1737 return max_age * HZ;
1697 list_for_each_entry(c, &dm_bufio_all_clients, client_list) { 1738}
1698 if (!dm_bufio_trylock(c))
1699 continue;
1700 1739
1701 while (!list_empty(&c->lru[LIST_CLEAN])) { 1740static bool older_than(struct dm_buffer *b, unsigned long age_hz)
1702 struct dm_buffer *b; 1741{
1703 b = list_entry(c->lru[LIST_CLEAN].prev, 1742 return (jiffies - b->last_accessed) >= age_hz;
1704 struct dm_buffer, lru_list); 1743}
1705 if (!__cleanup_old_buffer(b, 0, max_age * HZ)) 1744
1706 break; 1745static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
1707 dm_bufio_cond_resched(); 1746{
1708 } 1747 struct dm_buffer *b, *tmp;
1748 unsigned retain_target = get_retain_buffers(c);
1749 unsigned count;
1750
1751 dm_bufio_lock(c);
1752
1753 count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1754 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
1755 if (count <= retain_target)
1756 break;
1757
1758 if (!older_than(b, age_hz))
1759 break;
1760
1761 if (__try_evict_buffer(b, 0))
1762 count--;
1709 1763
1710 dm_bufio_unlock(c);
1711 dm_bufio_cond_resched(); 1764 dm_bufio_cond_resched();
1712 } 1765 }
1766
1767 dm_bufio_unlock(c);
1768}
1769
1770static void cleanup_old_buffers(void)
1771{
1772 unsigned long max_age_hz = get_max_age_hz();
1773 struct dm_bufio_client *c;
1774
1775 mutex_lock(&dm_bufio_clients_lock);
1776
1777 list_for_each_entry(c, &dm_bufio_all_clients, client_list)
1778 __evict_old_buffers(c, max_age_hz);
1779
1713 mutex_unlock(&dm_bufio_clients_lock); 1780 mutex_unlock(&dm_bufio_clients_lock);
1714} 1781}
1715 1782
@@ -1834,6 +1901,9 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
1834module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR); 1901module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
1835MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); 1902MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
1836 1903
1904module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
1905MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
1906
1837module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR); 1907module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
1838MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory"); 1908MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
1839 1909
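
The bufio changes above also retune background cleanup: the ageing timer now fires every 30 seconds instead of 10, buffers count as old after 300 seconds instead of 60, and both the shrinker and the periodic eviction stop once a client is down to retain_bytes / block_size buffers. A small illustrative calculation (userspace arithmetic only, mirroring get_retain_buffers()):

    #include <stdio.h>

    /* How many buffers eviction will leave alone for a given block size. */
    static unsigned retain_target(unsigned retain_bytes, unsigned block_size)
    {
            return retain_bytes / block_size;
    }

    int main(void)
    {
            /* new default of 256 KiB with 4 KiB metadata blocks -> 64 buffers */
            printf("%u\n", retain_target(256 * 1024, 4096));
            return 0;
    }

If the new parameter behaves like the other dm-bufio module parameters, the floor should be tunable at runtime via something like /sys/module/dm_bufio/parameters/retain_bytes (path assumed from standard module_param sysfs behaviour, not verified against this series).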
diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h
index aac0e2df06be..bed4ad4e1b7c 100644
--- a/drivers/md/dm-cache-block-types.h
+++ b/drivers/md/dm-cache-block-types.h
@@ -19,6 +19,7 @@
19 19
20typedef dm_block_t __bitwise__ dm_oblock_t; 20typedef dm_block_t __bitwise__ dm_oblock_t;
21typedef uint32_t __bitwise__ dm_cblock_t; 21typedef uint32_t __bitwise__ dm_cblock_t;
22typedef dm_block_t __bitwise__ dm_dblock_t;
22 23
23static inline dm_oblock_t to_oblock(dm_block_t b) 24static inline dm_oblock_t to_oblock(dm_block_t b)
24{ 25{
@@ -40,4 +41,14 @@ static inline uint32_t from_cblock(dm_cblock_t b)
40 return (__force uint32_t) b; 41 return (__force uint32_t) b;
41} 42}
42 43
44static inline dm_dblock_t to_dblock(dm_block_t b)
45{
46 return (__force dm_dblock_t) b;
47}
48
49static inline dm_block_t from_dblock(dm_dblock_t b)
50{
51 return (__force dm_block_t) b;
52}
53
43#endif /* DM_CACHE_BLOCK_TYPES_H */ 54#endif /* DM_CACHE_BLOCK_TYPES_H */
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 06709257adde..9fc616c2755e 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -109,7 +109,7 @@ struct dm_cache_metadata {
109 dm_block_t discard_root; 109 dm_block_t discard_root;
110 110
111 sector_t discard_block_size; 111 sector_t discard_block_size;
112 dm_oblock_t discard_nr_blocks; 112 dm_dblock_t discard_nr_blocks;
113 113
114 sector_t data_block_size; 114 sector_t data_block_size;
115 dm_cblock_t cache_blocks; 115 dm_cblock_t cache_blocks;
@@ -329,7 +329,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
329 disk_super->hint_root = cpu_to_le64(cmd->hint_root); 329 disk_super->hint_root = cpu_to_le64(cmd->hint_root);
330 disk_super->discard_root = cpu_to_le64(cmd->discard_root); 330 disk_super->discard_root = cpu_to_le64(cmd->discard_root);
331 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); 331 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
332 disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks)); 332 disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
333 disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE); 333 disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE);
334 disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); 334 disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
335 disk_super->cache_blocks = cpu_to_le32(0); 335 disk_super->cache_blocks = cpu_to_le32(0);
@@ -528,7 +528,7 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,
528 cmd->hint_root = le64_to_cpu(disk_super->hint_root); 528 cmd->hint_root = le64_to_cpu(disk_super->hint_root);
529 cmd->discard_root = le64_to_cpu(disk_super->discard_root); 529 cmd->discard_root = le64_to_cpu(disk_super->discard_root);
530 cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size); 530 cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size);
531 cmd->discard_nr_blocks = to_oblock(le64_to_cpu(disk_super->discard_nr_blocks)); 531 cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks));
532 cmd->data_block_size = le32_to_cpu(disk_super->data_block_size); 532 cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
533 cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); 533 cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
534 strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); 534 strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
@@ -626,7 +626,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
626 disk_super->hint_root = cpu_to_le64(cmd->hint_root); 626 disk_super->hint_root = cpu_to_le64(cmd->hint_root);
627 disk_super->discard_root = cpu_to_le64(cmd->discard_root); 627 disk_super->discard_root = cpu_to_le64(cmd->discard_root);
628 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); 628 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
629 disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks)); 629 disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
630 disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); 630 disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
631 strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); 631 strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
632 disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); 632 disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
@@ -797,15 +797,15 @@ out:
797 797
798int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, 798int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
799 sector_t discard_block_size, 799 sector_t discard_block_size,
800 dm_oblock_t new_nr_entries) 800 dm_dblock_t new_nr_entries)
801{ 801{
802 int r; 802 int r;
803 803
804 down_write(&cmd->root_lock); 804 down_write(&cmd->root_lock);
805 r = dm_bitset_resize(&cmd->discard_info, 805 r = dm_bitset_resize(&cmd->discard_info,
806 cmd->discard_root, 806 cmd->discard_root,
807 from_oblock(cmd->discard_nr_blocks), 807 from_dblock(cmd->discard_nr_blocks),
808 from_oblock(new_nr_entries), 808 from_dblock(new_nr_entries),
809 false, &cmd->discard_root); 809 false, &cmd->discard_root);
810 if (!r) { 810 if (!r) {
811 cmd->discard_block_size = discard_block_size; 811 cmd->discard_block_size = discard_block_size;
@@ -818,28 +818,28 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
818 return r; 818 return r;
819} 819}
820 820
821static int __set_discard(struct dm_cache_metadata *cmd, dm_oblock_t b) 821static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
822{ 822{
823 return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root, 823 return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root,
824 from_oblock(b), &cmd->discard_root); 824 from_dblock(b), &cmd->discard_root);
825} 825}
826 826
827static int __clear_discard(struct dm_cache_metadata *cmd, dm_oblock_t b) 827static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
828{ 828{
829 return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root, 829 return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root,
830 from_oblock(b), &cmd->discard_root); 830 from_dblock(b), &cmd->discard_root);
831} 831}
832 832
833static int __is_discarded(struct dm_cache_metadata *cmd, dm_oblock_t b, 833static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
834 bool *is_discarded) 834 bool *is_discarded)
835{ 835{
836 return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root, 836 return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
837 from_oblock(b), &cmd->discard_root, 837 from_dblock(b), &cmd->discard_root,
838 is_discarded); 838 is_discarded);
839} 839}
840 840
841static int __discard(struct dm_cache_metadata *cmd, 841static int __discard(struct dm_cache_metadata *cmd,
842 dm_oblock_t dblock, bool discard) 842 dm_dblock_t dblock, bool discard)
843{ 843{
844 int r; 844 int r;
845 845
@@ -852,7 +852,7 @@ static int __discard(struct dm_cache_metadata *cmd,
852} 852}
853 853
854int dm_cache_set_discard(struct dm_cache_metadata *cmd, 854int dm_cache_set_discard(struct dm_cache_metadata *cmd,
855 dm_oblock_t dblock, bool discard) 855 dm_dblock_t dblock, bool discard)
856{ 856{
857 int r; 857 int r;
858 858
@@ -870,8 +870,8 @@ static int __load_discards(struct dm_cache_metadata *cmd,
870 dm_block_t b; 870 dm_block_t b;
871 bool discard; 871 bool discard;
872 872
873 for (b = 0; b < from_oblock(cmd->discard_nr_blocks); b++) { 873 for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
874 dm_oblock_t dblock = to_oblock(b); 874 dm_dblock_t dblock = to_dblock(b);
875 875
876 if (cmd->clean_when_opened) { 876 if (cmd->clean_when_opened) {
877 r = __is_discarded(cmd, dblock, &discard); 877 r = __is_discarded(cmd, dblock, &discard);
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 7383c90ccdb8..4ecc403be283 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -70,14 +70,14 @@ dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd);
70 70
71int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, 71int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
72 sector_t discard_block_size, 72 sector_t discard_block_size,
73 dm_oblock_t new_nr_entries); 73 dm_dblock_t new_nr_entries);
74 74
75typedef int (*load_discard_fn)(void *context, sector_t discard_block_size, 75typedef int (*load_discard_fn)(void *context, sector_t discard_block_size,
76 dm_oblock_t dblock, bool discarded); 76 dm_dblock_t dblock, bool discarded);
77int dm_cache_load_discards(struct dm_cache_metadata *cmd, 77int dm_cache_load_discards(struct dm_cache_metadata *cmd,
78 load_discard_fn fn, void *context); 78 load_discard_fn fn, void *context);
79 79
80int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_oblock_t dblock, bool discard); 80int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard);
81 81
82int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock); 82int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock);
83int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock); 83int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock);
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 0e385e40909e..13f547a4eeb6 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -181,24 +181,30 @@ static void queue_shift_down(struct queue *q)
181 * Gives us the oldest entry of the lowest popoulated level. If the first 181 * Gives us the oldest entry of the lowest popoulated level. If the first
182 * level is emptied then we shift down one level. 182 * level is emptied then we shift down one level.
183 */ 183 */
184static struct list_head *queue_pop(struct queue *q) 184static struct list_head *queue_peek(struct queue *q)
185{ 185{
186 unsigned level; 186 unsigned level;
187 struct list_head *r;
188 187
189 for (level = 0; level < NR_QUEUE_LEVELS; level++) 188 for (level = 0; level < NR_QUEUE_LEVELS; level++)
190 if (!list_empty(q->qs + level)) { 189 if (!list_empty(q->qs + level))
191 r = q->qs[level].next; 190 return q->qs[level].next;
192 list_del(r);
193 191
194 /* have we just emptied the bottom level? */ 192 return NULL;
195 if (level == 0 && list_empty(q->qs)) 193}
196 queue_shift_down(q);
197 194
198 return r; 195static struct list_head *queue_pop(struct queue *q)
199 } 196{
197 struct list_head *r = queue_peek(q);
200 198
201 return NULL; 199 if (r) {
200 list_del(r);
201
202 /* have we just emptied the bottom level? */
203 if (list_empty(q->qs))
204 queue_shift_down(q);
205 }
206
207 return r;
202} 208}
203 209
204static struct list_head *list_pop(struct list_head *lh) 210static struct list_head *list_pop(struct list_head *lh)
@@ -383,13 +389,6 @@ struct mq_policy {
383 unsigned generation; 389 unsigned generation;
384 unsigned generation_period; /* in lookups (will probably change) */ 390 unsigned generation_period; /* in lookups (will probably change) */
385 391
386 /*
387 * Entries in the pre_cache whose hit count passes the promotion
388 * threshold move to the cache proper. Working out the correct
389 * value for the promotion_threshold is crucial to this policy.
390 */
391 unsigned promote_threshold;
392
393 unsigned discard_promote_adjustment; 392 unsigned discard_promote_adjustment;
394 unsigned read_promote_adjustment; 393 unsigned read_promote_adjustment;
395 unsigned write_promote_adjustment; 394 unsigned write_promote_adjustment;
@@ -406,6 +405,7 @@ struct mq_policy {
406#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1 405#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
407#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4 406#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
408#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8 407#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
408#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128
409 409
410/*----------------------------------------------------------------*/ 410/*----------------------------------------------------------------*/
411 411
@@ -518,6 +518,12 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q)
518 return e; 518 return e;
519} 519}
520 520
521static struct entry *peek(struct queue *q)
522{
523 struct list_head *h = queue_peek(q);
524 return h ? container_of(h, struct entry, list) : NULL;
525}
526
521/* 527/*
522 * Has this entry already been updated? 528 * Has this entry already been updated?
523 */ 529 */
@@ -570,10 +576,6 @@ static void check_generation(struct mq_policy *mq)
570 break; 576 break;
571 } 577 }
572 } 578 }
573
574 mq->promote_threshold = nr ? total / nr : 1;
575 if (mq->promote_threshold * nr < total)
576 mq->promote_threshold++;
577 } 579 }
578} 580}
579 581
@@ -641,6 +643,30 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
641} 643}
642 644
643/* 645/*
646 * Entries in the pre_cache whose hit count passes the promotion
647 * threshold move to the cache proper. Working out the correct
648 * value for the promotion_threshold is crucial to this policy.
649 */
650static unsigned promote_threshold(struct mq_policy *mq)
651{
652 struct entry *e;
653
654 if (any_free_cblocks(mq))
655 return 0;
656
657 e = peek(&mq->cache_clean);
658 if (e)
659 return e->hit_count;
660
661 e = peek(&mq->cache_dirty);
662 if (e)
663 return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD;
664
665 /* This should never happen */
666 return 0;
667}
668
669/*
644 * We modify the basic promotion_threshold depending on the specific io. 670 * We modify the basic promotion_threshold depending on the specific io.
645 * 671 *
646 * If the origin block has been discarded then there's no cost to copy it 672 * If the origin block has been discarded then there's no cost to copy it
@@ -653,7 +679,7 @@ static unsigned adjusted_promote_threshold(struct mq_policy *mq,
653 bool discarded_oblock, int data_dir) 679 bool discarded_oblock, int data_dir)
654{ 680{
655 if (data_dir == READ) 681 if (data_dir == READ)
656 return mq->promote_threshold + mq->read_promote_adjustment; 682 return promote_threshold(mq) + mq->read_promote_adjustment;
657 683
658 if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { 684 if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
659 /* 685 /*
@@ -663,7 +689,7 @@ static unsigned adjusted_promote_threshold(struct mq_policy *mq,
663 return mq->discard_promote_adjustment; 689 return mq->discard_promote_adjustment;
664 } 690 }
665 691
666 return mq->promote_threshold + mq->write_promote_adjustment; 692 return promote_threshold(mq) + mq->write_promote_adjustment;
667} 693}
668 694
669static bool should_promote(struct mq_policy *mq, struct entry *e, 695static bool should_promote(struct mq_policy *mq, struct entry *e,
@@ -839,7 +865,8 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock,
839 if (e && in_cache(mq, e)) 865 if (e && in_cache(mq, e))
840 r = cache_entry_found(mq, e, result); 866 r = cache_entry_found(mq, e, result);
841 867
842 else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) 868 else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] &&
869 iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
843 result->op = POLICY_MISS; 870 result->op = POLICY_MISS;
844 871
845 else if (e) 872 else if (e)
@@ -1230,7 +1257,6 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1230 mq->tick = 0; 1257 mq->tick = 0;
1231 mq->hit_count = 0; 1258 mq->hit_count = 0;
1232 mq->generation = 0; 1259 mq->generation = 0;
1233 mq->promote_threshold = 0;
1234 mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT; 1260 mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
1235 mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT; 1261 mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
1236 mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT; 1262 mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
@@ -1265,7 +1291,7 @@ bad_pre_cache_init:
1265 1291
1266static struct dm_cache_policy_type mq_policy_type = { 1292static struct dm_cache_policy_type mq_policy_type = {
1267 .name = "mq", 1293 .name = "mq",
1268 .version = {1, 2, 0}, 1294 .version = {1, 3, 0},
1269 .hint_size = 4, 1295 .hint_size = 4,
1270 .owner = THIS_MODULE, 1296 .owner = THIS_MODULE,
1271 .create = mq_create 1297 .create = mq_create
@@ -1273,7 +1299,7 @@ static struct dm_cache_policy_type mq_policy_type = {
1273 1299
1274static struct dm_cache_policy_type default_policy_type = { 1300static struct dm_cache_policy_type default_policy_type = {
1275 .name = "default", 1301 .name = "default",
1276 .version = {1, 2, 0}, 1302 .version = {1, 3, 0},
1277 .hint_size = 4, 1303 .hint_size = 4,
1278 .owner = THIS_MODULE, 1304 .owner = THIS_MODULE,
1279 .create = mq_create, 1305 .create = mq_create,
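
The mq policy change above replaces the periodically recomputed promote_threshold field with an on-demand promote_threshold(): if the cache has free blocks the threshold is zero, otherwise it is the hit count of the coldest clean cache entry (or, failing that, the coldest dirty entry plus a 128-hit penalty to discourage demoting dirty blocks). Illustrative arithmetic only, with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned coldest_clean_hits = 10;   /* hit count of peek(&mq->cache_clean) */
            unsigned read_adjust  = 4;          /* DEFAULT_READ_PROMOTE_ADJUSTMENT     */
            unsigned write_adjust = 8;          /* DEFAULT_WRITE_PROMOTE_ADJUSTMENT    */

            /* a pre-cache entry is promoted once its own hit count reaches these */
            printf("read miss threshold:  %u\n", coldest_clean_hits + read_adjust);
            printf("write miss threshold: %u\n", coldest_clean_hits + write_adjust);
            return 0;
    }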
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 7130505c2425..1e96d7889f51 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -95,7 +95,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
95 95
96/*----------------------------------------------------------------*/ 96/*----------------------------------------------------------------*/
97 97
98#define PRISON_CELLS 1024
99#define MIGRATION_POOL_SIZE 128 98#define MIGRATION_POOL_SIZE 128
100#define COMMIT_PERIOD HZ 99#define COMMIT_PERIOD HZ
101#define MIGRATION_COUNT_WINDOW 10 100#define MIGRATION_COUNT_WINDOW 10
@@ -237,8 +236,9 @@ struct cache {
237 /* 236 /*
238 * origin_blocks entries, discarded if set. 237 * origin_blocks entries, discarded if set.
239 */ 238 */
240 dm_oblock_t discard_nr_blocks; 239 dm_dblock_t discard_nr_blocks;
241 unsigned long *discard_bitset; 240 unsigned long *discard_bitset;
241 uint32_t discard_block_size; /* a power of 2 times sectors per block */
242 242
243 /* 243 /*
244 * Rather than reconstructing the table line for the status we just 244 * Rather than reconstructing the table line for the status we just
@@ -310,6 +310,7 @@ struct dm_cache_migration {
310 dm_cblock_t cblock; 310 dm_cblock_t cblock;
311 311
312 bool err:1; 312 bool err:1;
313 bool discard:1;
313 bool writeback:1; 314 bool writeback:1;
314 bool demote:1; 315 bool demote:1;
315 bool promote:1; 316 bool promote:1;
@@ -433,11 +434,12 @@ static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cel
433 434
434/*----------------------------------------------------------------*/ 435/*----------------------------------------------------------------*/
435 436
436static void build_key(dm_oblock_t oblock, struct dm_cell_key *key) 437static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
437{ 438{
438 key->virtual = 0; 439 key->virtual = 0;
439 key->dev = 0; 440 key->dev = 0;
440 key->block = from_oblock(oblock); 441 key->block_begin = from_oblock(begin);
442 key->block_end = from_oblock(end);
441} 443}
442 444
443/* 445/*
@@ -447,15 +449,15 @@ static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
447 */ 449 */
448typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); 450typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
449 451
450static int bio_detain(struct cache *cache, dm_oblock_t oblock, 452static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
451 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 453 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
452 cell_free_fn free_fn, void *free_context, 454 cell_free_fn free_fn, void *free_context,
453 struct dm_bio_prison_cell **cell_result) 455 struct dm_bio_prison_cell **cell_result)
454{ 456{
455 int r; 457 int r;
456 struct dm_cell_key key; 458 struct dm_cell_key key;
457 459
458 build_key(oblock, &key); 460 build_key(oblock_begin, oblock_end, &key);
459 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 461 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
460 if (r) 462 if (r)
461 free_fn(free_context, cell_prealloc); 463 free_fn(free_context, cell_prealloc);
@@ -463,6 +465,16 @@ static int bio_detain(struct cache *cache, dm_oblock_t oblock,
463 return r; 465 return r;
464} 466}
465 467
468static int bio_detain(struct cache *cache, dm_oblock_t oblock,
469 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
470 cell_free_fn free_fn, void *free_context,
471 struct dm_bio_prison_cell **cell_result)
472{
473 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
474 return bio_detain_range(cache, oblock, end, bio,
475 cell_prealloc, free_fn, free_context, cell_result);
476}
477
466static int get_cell(struct cache *cache, 478static int get_cell(struct cache *cache,
467 dm_oblock_t oblock, 479 dm_oblock_t oblock,
468 struct prealloc *structs, 480 struct prealloc *structs,
@@ -474,7 +486,7 @@ static int get_cell(struct cache *cache,
474 486
475 cell_prealloc = prealloc_get_cell(structs); 487 cell_prealloc = prealloc_get_cell(structs);
476 488
477 build_key(oblock, &key); 489 build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
478 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 490 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
479 if (r) 491 if (r)
480 prealloc_put_cell(structs, cell_prealloc); 492 prealloc_put_cell(structs, cell_prealloc);
@@ -524,33 +536,57 @@ static dm_block_t block_div(dm_block_t b, uint32_t n)
524 return b; 536 return b;
525} 537}
526 538
527static void set_discard(struct cache *cache, dm_oblock_t b) 539static dm_block_t oblocks_per_dblock(struct cache *cache)
540{
541 dm_block_t oblocks = cache->discard_block_size;
542
543 if (block_size_is_power_of_two(cache))
544 oblocks >>= cache->sectors_per_block_shift;
545 else
546 oblocks = block_div(oblocks, cache->sectors_per_block);
547
548 return oblocks;
549}
550
551static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
552{
553 return to_dblock(block_div(from_oblock(oblock),
554 oblocks_per_dblock(cache)));
555}
556
557static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
558{
559 return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
560}
561
562static void set_discard(struct cache *cache, dm_dblock_t b)
528{ 563{
529 unsigned long flags; 564 unsigned long flags;
530 565
566 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
531 atomic_inc(&cache->stats.discard_count); 567 atomic_inc(&cache->stats.discard_count);
532 568
533 spin_lock_irqsave(&cache->lock, flags); 569 spin_lock_irqsave(&cache->lock, flags);
534 set_bit(from_oblock(b), cache->discard_bitset); 570 set_bit(from_dblock(b), cache->discard_bitset);
535 spin_unlock_irqrestore(&cache->lock, flags); 571 spin_unlock_irqrestore(&cache->lock, flags);
536} 572}
537 573
538static void clear_discard(struct cache *cache, dm_oblock_t b) 574static void clear_discard(struct cache *cache, dm_dblock_t b)
539{ 575{
540 unsigned long flags; 576 unsigned long flags;
541 577
542 spin_lock_irqsave(&cache->lock, flags); 578 spin_lock_irqsave(&cache->lock, flags);
543 clear_bit(from_oblock(b), cache->discard_bitset); 579 clear_bit(from_dblock(b), cache->discard_bitset);
544 spin_unlock_irqrestore(&cache->lock, flags); 580 spin_unlock_irqrestore(&cache->lock, flags);
545} 581}
546 582
547static bool is_discarded(struct cache *cache, dm_oblock_t b) 583static bool is_discarded(struct cache *cache, dm_dblock_t b)
548{ 584{
549 int r; 585 int r;
550 unsigned long flags; 586 unsigned long flags;
551 587
552 spin_lock_irqsave(&cache->lock, flags); 588 spin_lock_irqsave(&cache->lock, flags);
553 r = test_bit(from_oblock(b), cache->discard_bitset); 589 r = test_bit(from_dblock(b), cache->discard_bitset);
554 spin_unlock_irqrestore(&cache->lock, flags); 590 spin_unlock_irqrestore(&cache->lock, flags);
555 591
556 return r; 592 return r;
@@ -562,7 +598,8 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
562 unsigned long flags; 598 unsigned long flags;
563 599
564 spin_lock_irqsave(&cache->lock, flags); 600 spin_lock_irqsave(&cache->lock, flags);
565 r = test_bit(from_oblock(b), cache->discard_bitset); 601 r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
602 cache->discard_bitset);
566 spin_unlock_irqrestore(&cache->lock, flags); 603 spin_unlock_irqrestore(&cache->lock, flags);
567 604
568 return r; 605 return r;
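
For orientation (not from the patch itself): one discard-bitset bit now covers a whole discard block, which spans several cache-block-sized origin blocks, so the helpers above convert an origin block number to a discard block number before touching the bitset. A user-space sketch of that conversion with made-up example sizes; the kernel takes a shift fast path instead of the division when the cache block size is a power of two:

#include <stdint.h>
#include <stdio.h>

/* Example geometry in 512-byte sectors (made-up values). */
#define DISCARD_BLOCK_SECTORS 2048ull
#define CACHE_BLOCK_SECTORS    128ull

/* How many origin (cache-block-sized) blocks one discard block covers. */
static uint64_t oblocks_per_dblock(void)
{
	/* The kernel replaces this division with a shift for power-of-two sizes. */
	return DISCARD_BLOCK_SECTORS / CACHE_BLOCK_SECTORS;
}

static uint64_t oblock_to_dblock(uint64_t oblock)
{
	return oblock / oblocks_per_dblock();
}

static uint64_t dblock_to_oblock(uint64_t dblock)
{
	return dblock * oblocks_per_dblock();
}

int main(void)
{
	printf("oblocks per dblock: %llu\n", (unsigned long long)oblocks_per_dblock());
	printf("origin block 100 lives in discard block %llu\n",
	       (unsigned long long)oblock_to_dblock(100));
	printf("discard block 6 starts at origin block %llu\n",
	       (unsigned long long)dblock_to_oblock(6));
	return 0;
}
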
@@ -687,7 +724,7 @@ static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
687 check_if_tick_bio_needed(cache, bio); 724 check_if_tick_bio_needed(cache, bio);
688 remap_to_origin(cache, bio); 725 remap_to_origin(cache, bio);
689 if (bio_data_dir(bio) == WRITE) 726 if (bio_data_dir(bio) == WRITE)
690 clear_discard(cache, oblock); 727 clear_discard(cache, oblock_to_dblock(cache, oblock));
691} 728}
692 729
693static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 730static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
@@ -697,7 +734,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
697 remap_to_cache(cache, bio, cblock); 734 remap_to_cache(cache, bio, cblock);
698 if (bio_data_dir(bio) == WRITE) { 735 if (bio_data_dir(bio) == WRITE) {
699 set_dirty(cache, oblock, cblock); 736 set_dirty(cache, oblock, cblock);
700 clear_discard(cache, oblock); 737 clear_discard(cache, oblock_to_dblock(cache, oblock));
701 } 738 }
702} 739}
703 740
@@ -951,10 +988,14 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
951 } 988 }
952 989
953 } else { 990 } else {
954 clear_dirty(cache, mg->new_oblock, mg->cblock); 991 if (mg->requeue_holder) {
955 if (mg->requeue_holder) 992 clear_dirty(cache, mg->new_oblock, mg->cblock);
956 cell_defer(cache, mg->new_ocell, true); 993 cell_defer(cache, mg->new_ocell, true);
957 else { 994 } else {
995 /*
996 * The block was promoted via an overwrite, so it's dirty.
997 */
998 set_dirty(cache, mg->new_oblock, mg->cblock);
958 bio_endio(mg->new_ocell->holder, 0); 999 bio_endio(mg->new_ocell->holder, 0);
959 cell_defer(cache, mg->new_ocell, false); 1000 cell_defer(cache, mg->new_ocell, false);
960 } 1001 }
@@ -978,7 +1019,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
978 wake_worker(cache); 1019 wake_worker(cache);
979} 1020}
980 1021
981static void issue_copy_real(struct dm_cache_migration *mg) 1022static void issue_copy(struct dm_cache_migration *mg)
982{ 1023{
983 int r; 1024 int r;
984 struct dm_io_region o_region, c_region; 1025 struct dm_io_region o_region, c_region;
@@ -1057,11 +1098,46 @@ static void avoid_copy(struct dm_cache_migration *mg)
1057 migration_success_pre_commit(mg); 1098 migration_success_pre_commit(mg);
1058} 1099}
1059 1100
1060static void issue_copy(struct dm_cache_migration *mg) 1101static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1102 dm_dblock_t *b, dm_dblock_t *e)
1103{
1104 sector_t sb = bio->bi_iter.bi_sector;
1105 sector_t se = bio_end_sector(bio);
1106
1107 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1108
1109 if (se - sb < cache->discard_block_size)
1110 *e = *b;
1111 else
1112 *e = to_dblock(block_div(se, cache->discard_block_size));
1113}
1114
1115static void issue_discard(struct dm_cache_migration *mg)
1116{
1117 dm_dblock_t b, e;
1118 struct bio *bio = mg->new_ocell->holder;
1119
1120 calc_discard_block_range(mg->cache, bio, &b, &e);
1121 while (b != e) {
1122 set_discard(mg->cache, b);
1123 b = to_dblock(from_dblock(b) + 1);
1124 }
1125
1126 bio_endio(bio, 0);
1127 cell_defer(mg->cache, mg->new_ocell, false);
1128 free_migration(mg);
1129}
1130
1131static void issue_copy_or_discard(struct dm_cache_migration *mg)
1061{ 1132{
1062 bool avoid; 1133 bool avoid;
1063 struct cache *cache = mg->cache; 1134 struct cache *cache = mg->cache;
1064 1135
1136 if (mg->discard) {
1137 issue_discard(mg);
1138 return;
1139 }
1140
1065 if (mg->writeback || mg->demote) 1141 if (mg->writeback || mg->demote)
1066 avoid = !is_dirty(cache, mg->cblock) || 1142 avoid = !is_dirty(cache, mg->cblock) ||
1067 is_discarded_oblock(cache, mg->old_oblock); 1143 is_discarded_oblock(cache, mg->old_oblock);
@@ -1070,13 +1146,14 @@ static void issue_copy(struct dm_cache_migration *mg)
1070 1146
1071 avoid = is_discarded_oblock(cache, mg->new_oblock); 1147 avoid = is_discarded_oblock(cache, mg->new_oblock);
1072 1148
1073 if (!avoid && bio_writes_complete_block(cache, bio)) { 1149 if (writeback_mode(&cache->features) &&
1150 !avoid && bio_writes_complete_block(cache, bio)) {
1074 issue_overwrite(mg, bio); 1151 issue_overwrite(mg, bio);
1075 return; 1152 return;
1076 } 1153 }
1077 } 1154 }
1078 1155
1079 avoid ? avoid_copy(mg) : issue_copy_real(mg); 1156 avoid ? avoid_copy(mg) : issue_copy(mg);
1080} 1157}
1081 1158
1082static void complete_migration(struct dm_cache_migration *mg) 1159static void complete_migration(struct dm_cache_migration *mg)
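
A stand-alone sketch of the rounding done by calc_discard_block_range() above: the bio's start sector is rounded up and its end sector rounded down, so only discard blocks completely covered by the bio get marked, and a bio shorter than one discard block marks nothing. The block size here is an arbitrary example value:

#include <stdint.h>
#include <stdio.h>

#define DISCARD_BLOCK_SECTORS 512ull	/* example value */

static void calc_discard_block_range(uint64_t bi_sector, uint64_t nr_sectors,
				     uint64_t *b, uint64_t *e)
{
	uint64_t sb = bi_sector;
	uint64_t se = bi_sector + nr_sectors;

	/* Round the start up to the next discard block boundary... */
	*b = (sb + DISCARD_BLOCK_SECTORS - 1) / DISCARD_BLOCK_SECTORS;

	/* ...and the end down; bios shorter than one block cover nothing. */
	if (se - sb < DISCARD_BLOCK_SECTORS)
		*e = *b;
	else
		*e = se / DISCARD_BLOCK_SECTORS;
}

int main(void)
{
	uint64_t b, e;

	calc_discard_block_range(100, 2000, &b, &e);
	printf("sectors [100, 2100) -> discard blocks [%llu, %llu)\n",
	       (unsigned long long)b, (unsigned long long)e);
	return 0;
}
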
@@ -1161,6 +1238,7 @@ static void promote(struct cache *cache, struct prealloc *structs,
1161 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1238 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1162 1239
1163 mg->err = false; 1240 mg->err = false;
1241 mg->discard = false;
1164 mg->writeback = false; 1242 mg->writeback = false;
1165 mg->demote = false; 1243 mg->demote = false;
1166 mg->promote = true; 1244 mg->promote = true;
@@ -1184,6 +1262,7 @@ static void writeback(struct cache *cache, struct prealloc *structs,
1184 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1262 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1185 1263
1186 mg->err = false; 1264 mg->err = false;
1265 mg->discard = false;
1187 mg->writeback = true; 1266 mg->writeback = true;
1188 mg->demote = false; 1267 mg->demote = false;
1189 mg->promote = false; 1268 mg->promote = false;
@@ -1209,6 +1288,7 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1209 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1288 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1210 1289
1211 mg->err = false; 1290 mg->err = false;
1291 mg->discard = false;
1212 mg->writeback = false; 1292 mg->writeback = false;
1213 mg->demote = true; 1293 mg->demote = true;
1214 mg->promote = true; 1294 mg->promote = true;
@@ -1237,6 +1317,7 @@ static void invalidate(struct cache *cache, struct prealloc *structs,
1237 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1317 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1238 1318
1239 mg->err = false; 1319 mg->err = false;
1320 mg->discard = false;
1240 mg->writeback = false; 1321 mg->writeback = false;
1241 mg->demote = true; 1322 mg->demote = true;
1242 mg->promote = false; 1323 mg->promote = false;
@@ -1253,6 +1334,26 @@ static void invalidate(struct cache *cache, struct prealloc *structs,
1253 quiesce_migration(mg); 1334 quiesce_migration(mg);
1254} 1335}
1255 1336
1337static void discard(struct cache *cache, struct prealloc *structs,
1338 struct dm_bio_prison_cell *cell)
1339{
1340 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1341
1342 mg->err = false;
1343 mg->discard = true;
1344 mg->writeback = false;
1345 mg->demote = false;
1346 mg->promote = false;
1347 mg->requeue_holder = false;
1348 mg->invalidate = false;
1349 mg->cache = cache;
1350 mg->old_ocell = NULL;
1351 mg->new_ocell = cell;
1352 mg->start_jiffies = jiffies;
1353
1354 quiesce_migration(mg);
1355}
1356
1256/*---------------------------------------------------------------- 1357/*----------------------------------------------------------------
1257 * bio processing 1358 * bio processing
1258 *--------------------------------------------------------------*/ 1359 *--------------------------------------------------------------*/
@@ -1286,31 +1387,27 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
1286 issue(cache, bio); 1387 issue(cache, bio);
1287} 1388}
1288 1389
1289/* 1390static void process_discard_bio(struct cache *cache, struct prealloc *structs,
1290 * People generally discard large parts of a device, eg, the whole device 1391 struct bio *bio)
1291 * when formatting. Splitting these large discards up into cache block
1292 * sized ios and then quiescing (always necessary for discard) takes too
1293 * long.
1294 *
1295 * We keep it simple, and allow any size of discard to come in, and just
1296 * mark off blocks on the discard bitset. No passdown occurs!
1297 *
1298 * To implement passdown we need to change the bio_prison such that a cell
1299 * can have a key that spans many blocks.
1300 */
1301static void process_discard_bio(struct cache *cache, struct bio *bio)
1302{ 1392{
1303 dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector, 1393 int r;
1304 cache->sectors_per_block); 1394 dm_dblock_t b, e;
1305 dm_block_t end_block = bio_end_sector(bio); 1395 struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1306 dm_block_t b;
1307 1396
1308 end_block = block_div(end_block, cache->sectors_per_block); 1397 calc_discard_block_range(cache, bio, &b, &e);
1398 if (b == e) {
1399 bio_endio(bio, 0);
1400 return;
1401 }
1309 1402
1310 for (b = start_block; b < end_block; b++) 1403 cell_prealloc = prealloc_get_cell(structs);
1311 set_discard(cache, to_oblock(b)); 1404 r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
1405 (cell_free_fn) prealloc_put_cell,
1406 structs, &new_ocell);
1407 if (r > 0)
1408 return;
1312 1409
1313 bio_endio(bio, 0); 1410 discard(cache, structs, new_ocell);
1314} 1411}
1315 1412
1316static bool spare_migration_bandwidth(struct cache *cache) 1413static bool spare_migration_bandwidth(struct cache *cache)
@@ -1340,9 +1437,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
1340 dm_oblock_t block = get_bio_block(cache, bio); 1437 dm_oblock_t block = get_bio_block(cache, bio);
1341 struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; 1438 struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1342 struct policy_result lookup_result; 1439 struct policy_result lookup_result;
1343 bool discarded_block = is_discarded_oblock(cache, block);
1344 bool passthrough = passthrough_mode(&cache->features); 1440 bool passthrough = passthrough_mode(&cache->features);
1345 bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); 1441 bool discarded_block, can_migrate;
1346 1442
1347 /* 1443 /*
1348 * Check to see if that block is currently migrating. 1444 * Check to see if that block is currently migrating.
@@ -1354,6 +1450,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
1354 if (r > 0) 1450 if (r > 0)
1355 return; 1451 return;
1356 1452
1453 discarded_block = is_discarded_oblock(cache, block);
1454 can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
1455
1357 r = policy_map(cache->policy, block, true, can_migrate, discarded_block, 1456 r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1358 bio, &lookup_result); 1457 bio, &lookup_result);
1359 1458
@@ -1500,7 +1599,7 @@ static void process_deferred_bios(struct cache *cache)
1500 if (bio->bi_rw & REQ_FLUSH) 1599 if (bio->bi_rw & REQ_FLUSH)
1501 process_flush_bio(cache, bio); 1600 process_flush_bio(cache, bio);
1502 else if (bio->bi_rw & REQ_DISCARD) 1601 else if (bio->bi_rw & REQ_DISCARD)
1503 process_discard_bio(cache, bio); 1602 process_discard_bio(cache, &structs, bio);
1504 else 1603 else
1505 process_bio(cache, &structs, bio); 1604 process_bio(cache, &structs, bio);
1506 } 1605 }
@@ -1715,7 +1814,7 @@ static void do_worker(struct work_struct *ws)
1715 process_invalidation_requests(cache); 1814 process_invalidation_requests(cache);
1716 } 1815 }
1717 1816
1718 process_migrations(cache, &cache->quiesced_migrations, issue_copy); 1817 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
1719 process_migrations(cache, &cache->completed_migrations, complete_migration); 1818 process_migrations(cache, &cache->completed_migrations, complete_migration);
1720 1819
1721 if (commit_if_needed(cache)) { 1820 if (commit_if_needed(cache)) {
@@ -2180,6 +2279,45 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2180 return 0; 2279 return 0;
2181} 2280}
2182 2281
2282/*
2283 * We want the discard block size to be at least the size of the cache
2284 * block size and have no more than 2^14 discard blocks across the origin.
2285 */
2286#define MAX_DISCARD_BLOCKS (1 << 14)
2287
2288static bool too_many_discard_blocks(sector_t discard_block_size,
2289 sector_t origin_size)
2290{
2291 (void) sector_div(origin_size, discard_block_size);
2292
2293 return origin_size > MAX_DISCARD_BLOCKS;
2294}
2295
2296static sector_t calculate_discard_block_size(sector_t cache_block_size,
2297 sector_t origin_size)
2298{
2299 sector_t discard_block_size = cache_block_size;
2300
2301 if (origin_size)
2302 while (too_many_discard_blocks(discard_block_size, origin_size))
2303 discard_block_size *= 2;
2304
2305 return discard_block_size;
2306}
2307
2308static void set_cache_size(struct cache *cache, dm_cblock_t size)
2309{
2310 dm_block_t nr_blocks = from_cblock(size);
2311
2312 if (nr_blocks > (1 << 20) && cache->cache_size != size)
2313 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2314 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2315 "Please consider increasing the cache block size to reduce the overall cache block count.",
2316 (unsigned long long) nr_blocks);
2317
2318 cache->cache_size = size;
2319}
2320
2183#define DEFAULT_MIGRATION_THRESHOLD 2048 2321#define DEFAULT_MIGRATION_THRESHOLD 2048
2184 2322
2185static int cache_create(struct cache_args *ca, struct cache **result) 2323static int cache_create(struct cache_args *ca, struct cache **result)
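
A user-space sketch of the policy in calculate_discard_block_size() above: start from the cache block size and keep doubling until the origin is described by no more than 2^14 discard blocks, which keeps the discard bitset small for large origins. The device sizes in main() are illustrative only:

#include <stdint.h>
#include <stdio.h>

#define MAX_DISCARD_BLOCKS (1u << 14)

static int too_many_discard_blocks(uint64_t discard_block_size, uint64_t origin_size)
{
	return origin_size / discard_block_size > MAX_DISCARD_BLOCKS;
}

static uint64_t calculate_discard_block_size(uint64_t cache_block_size,
					     uint64_t origin_size)
{
	uint64_t discard_block_size = cache_block_size;

	if (origin_size)
		while (too_many_discard_blocks(discard_block_size, origin_size))
			discard_block_size *= 2;

	return discard_block_size;
}

int main(void)
{
	/* 1 TiB origin in 512-byte sectors, 512-sector (256 KiB) cache blocks. */
	uint64_t origin_sectors = 1ull << 31;
	uint64_t cache_block_sectors = 512;

	printf("discard block size: %llu sectors\n",
	       (unsigned long long)calculate_discard_block_size(cache_block_sectors,
								origin_sectors));
	return 0;
}
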
@@ -2204,8 +2342,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2204 ti->num_discard_bios = 1; 2342 ti->num_discard_bios = 1;
2205 ti->discards_supported = true; 2343 ti->discards_supported = true;
2206 ti->discard_zeroes_data_unsupported = true; 2344 ti->discard_zeroes_data_unsupported = true;
2207 /* Discard bios must be split on a block boundary */ 2345 ti->split_discard_bios = false;
2208 ti->split_discard_bios = true;
2209 2346
2210 cache->features = ca->features; 2347 cache->features = ca->features;
2211 ti->per_bio_data_size = get_per_bio_data_size(cache); 2348 ti->per_bio_data_size = get_per_bio_data_size(cache);
@@ -2235,10 +2372,10 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2235 2372
2236 cache->sectors_per_block_shift = -1; 2373 cache->sectors_per_block_shift = -1;
2237 cache_size = block_div(cache_size, ca->block_size); 2374 cache_size = block_div(cache_size, ca->block_size);
2238 cache->cache_size = to_cblock(cache_size); 2375 set_cache_size(cache, to_cblock(cache_size));
2239 } else { 2376 } else {
2240 cache->sectors_per_block_shift = __ffs(ca->block_size); 2377 cache->sectors_per_block_shift = __ffs(ca->block_size);
2241 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift); 2378 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2242 } 2379 }
2243 2380
2244 r = create_cache_policy(cache, ca, error); 2381 r = create_cache_policy(cache, ca, error);
@@ -2303,13 +2440,17 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2303 } 2440 }
2304 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2441 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2305 2442
2306 cache->discard_nr_blocks = cache->origin_blocks; 2443 cache->discard_block_size =
2307 cache->discard_bitset = alloc_bitset(from_oblock(cache->discard_nr_blocks)); 2444 calculate_discard_block_size(cache->sectors_per_block,
2445 cache->origin_sectors);
2446 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2447 cache->discard_block_size));
2448 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2308 if (!cache->discard_bitset) { 2449 if (!cache->discard_bitset) {
2309 *error = "could not allocate discard bitset"; 2450 *error = "could not allocate discard bitset";
2310 goto bad; 2451 goto bad;
2311 } 2452 }
2312 clear_bitset(cache->discard_bitset, from_oblock(cache->discard_nr_blocks)); 2453 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2313 2454
2314 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2455 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2315 if (IS_ERR(cache->copier)) { 2456 if (IS_ERR(cache->copier)) {
@@ -2327,7 +2468,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2327 INIT_DELAYED_WORK(&cache->waker, do_waker); 2468 INIT_DELAYED_WORK(&cache->waker, do_waker);
2328 cache->last_commit_jiffies = jiffies; 2469 cache->last_commit_jiffies = jiffies;
2329 2470
2330 cache->prison = dm_bio_prison_create(PRISON_CELLS); 2471 cache->prison = dm_bio_prison_create();
2331 if (!cache->prison) { 2472 if (!cache->prison) {
2332 *error = "could not create bio prison"; 2473 *error = "could not create bio prison";
2333 goto bad; 2474 goto bad;
@@ -2549,11 +2690,11 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
2549static int cache_map(struct dm_target *ti, struct bio *bio) 2690static int cache_map(struct dm_target *ti, struct bio *bio)
2550{ 2691{
2551 int r; 2692 int r;
2552 struct dm_bio_prison_cell *cell; 2693 struct dm_bio_prison_cell *cell = NULL;
2553 struct cache *cache = ti->private; 2694 struct cache *cache = ti->private;
2554 2695
2555 r = __cache_map(cache, bio, &cell); 2696 r = __cache_map(cache, bio, &cell);
2556 if (r == DM_MAPIO_REMAPPED) { 2697 if (r == DM_MAPIO_REMAPPED && cell) {
2557 inc_ds(cache, bio, cell); 2698 inc_ds(cache, bio, cell);
2558 cell_defer(cache, cell, false); 2699 cell_defer(cache, cell, false);
2559 } 2700 }
@@ -2599,16 +2740,16 @@ static int write_discard_bitset(struct cache *cache)
2599{ 2740{
2600 unsigned i, r; 2741 unsigned i, r;
2601 2742
2602 r = dm_cache_discard_bitset_resize(cache->cmd, cache->sectors_per_block, 2743 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2603 cache->origin_blocks); 2744 cache->discard_nr_blocks);
2604 if (r) { 2745 if (r) {
2605 DMERR("could not resize on-disk discard bitset"); 2746 DMERR("could not resize on-disk discard bitset");
2606 return r; 2747 return r;
2607 } 2748 }
2608 2749
2609 for (i = 0; i < from_oblock(cache->discard_nr_blocks); i++) { 2750 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2610 r = dm_cache_set_discard(cache->cmd, to_oblock(i), 2751 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2611 is_discarded(cache, to_oblock(i))); 2752 is_discarded(cache, to_dblock(i)));
2612 if (r) 2753 if (r)
2613 return r; 2754 return r;
2614 } 2755 }
@@ -2680,15 +2821,86 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2680 return 0; 2821 return 0;
2681} 2822}
2682 2823
2824/*
2825 * The discard block size in the on disk metadata is not
2826 * necessarily the same as we're currently using. So we have to
2827 * be careful to only set the discarded attribute if we know it
2828 * covers a complete block of the new size.
2829 */
2830struct discard_load_info {
2831 struct cache *cache;
2832
2833 /*
2834 * These blocks are sized using the on disk dblock size, rather
2835 * than the current one.
2836 */
2837 dm_block_t block_size;
2838 dm_block_t discard_begin, discard_end;
2839};
2840
2841static void discard_load_info_init(struct cache *cache,
2842 struct discard_load_info *li)
2843{
2844 li->cache = cache;
2845 li->discard_begin = li->discard_end = 0;
2846}
2847
2848static void set_discard_range(struct discard_load_info *li)
2849{
2850 sector_t b, e;
2851
2852 if (li->discard_begin == li->discard_end)
2853 return;
2854
2855 /*
2856 * Convert to sectors.
2857 */
2858 b = li->discard_begin * li->block_size;
2859 e = li->discard_end * li->block_size;
2860
2861 /*
2862 * Then convert back to the current dblock size.
2863 */
2864 b = dm_sector_div_up(b, li->cache->discard_block_size);
2865 sector_div(e, li->cache->discard_block_size);
2866
2867 /*
2868 * The origin may have shrunk, so we need to check we're still in
2869 * bounds.
2870 */
2871 if (e > from_dblock(li->cache->discard_nr_blocks))
2872 e = from_dblock(li->cache->discard_nr_blocks);
2873
2874 for (; b < e; b++)
2875 set_discard(li->cache, to_dblock(b));
2876}
2877
2683static int load_discard(void *context, sector_t discard_block_size, 2878static int load_discard(void *context, sector_t discard_block_size,
2684 dm_oblock_t oblock, bool discard) 2879 dm_dblock_t dblock, bool discard)
2685{ 2880{
2686 struct cache *cache = context; 2881 struct discard_load_info *li = context;
2687 2882
2688 if (discard) 2883 li->block_size = discard_block_size;
2689 set_discard(cache, oblock); 2884
2690 else 2885 if (discard) {
2691 clear_discard(cache, oblock); 2886 if (from_dblock(dblock) == li->discard_end)
2887 /*
2888 * We're already in a discard range, just extend it.
2889 */
2890 li->discard_end = li->discard_end + 1ULL;
2891
2892 else {
2893 /*
2894 * Emit the old range and start a new one.
2895 */
2896 set_discard_range(li);
2897 li->discard_begin = from_dblock(dblock);
2898 li->discard_end = li->discard_begin + 1ULL;
2899 }
2900 } else {
2901 set_discard_range(li);
2902 li->discard_begin = li->discard_end = 0;
2903 }
2692 2904
2693 return 0; 2905 return 0;
2694} 2906}
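
A compact sketch of the reload path above: discarded blocks come back from the metadata in whatever discard block size was in force when they were written, get coalesced into runs, and each run is converted to sectors and then rounded inward to the current discard block size, so a partially covered new block is never marked discarded. The sizes below are example values and the bitset is a plain array:

#include <stdint.h>
#include <stdio.h>

#define OLD_DBLOCK_SECTORS 256ull	/* block size recorded in the metadata (example) */
#define NEW_DBLOCK_SECTORS 512ull	/* block size currently in use (example) */
#define NEW_NR_DBLOCKS     16ull

static unsigned char discard_bitset[NEW_NR_DBLOCKS];

/* Re-mark a run of old-sized dblocks [begin, end) against the new block size. */
static void set_discard_range(uint64_t begin, uint64_t end)
{
	uint64_t b, e;

	if (begin == end)
		return;

	/* Convert the run to sectors... */
	b = begin * OLD_DBLOCK_SECTORS;
	e = end * OLD_DBLOCK_SECTORS;

	/* ...then round inward to the current discard block size. */
	b = (b + NEW_DBLOCK_SECTORS - 1) / NEW_DBLOCK_SECTORS;
	e = e / NEW_DBLOCK_SECTORS;

	if (e > NEW_NR_DBLOCKS)		/* the origin may have shrunk */
		e = NEW_NR_DBLOCKS;

	for (; b < e; b++)
		discard_bitset[b] = 1;
}

int main(void)
{
	/* Pretend the metadata said old-sized dblocks 4..11 were discarded. */
	set_discard_range(4, 12);

	for (uint64_t i = 0; i < NEW_NR_DBLOCKS; i++)
		printf("%d", discard_bitset[i]);
	printf("\n");
	return 0;
}
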
@@ -2730,7 +2942,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
2730 return r; 2942 return r;
2731 } 2943 }
2732 2944
2733 cache->cache_size = new_size; 2945 set_cache_size(cache, new_size);
2734 2946
2735 return 0; 2947 return 0;
2736} 2948}
@@ -2772,11 +2984,22 @@ static int cache_preresume(struct dm_target *ti)
2772 } 2984 }
2773 2985
2774 if (!cache->loaded_discards) { 2986 if (!cache->loaded_discards) {
2775 r = dm_cache_load_discards(cache->cmd, load_discard, cache); 2987 struct discard_load_info li;
2988
2989 /*
2990 * The discard bitset could have been resized, or the
2991 * discard block size changed. To be safe we start by
2992 * setting every dblock to not discarded.
2993 */
2994 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2995
2996 discard_load_info_init(cache, &li);
2997 r = dm_cache_load_discards(cache->cmd, load_discard, &li);
2776 if (r) { 2998 if (r) {
2777 DMERR("could not load origin discards"); 2999 DMERR("could not load origin discards");
2778 return r; 3000 return r;
2779 } 3001 }
3002 set_discard_range(&li);
2780 3003
2781 cache->loaded_discards = true; 3004 cache->loaded_discards = true;
2782 } 3005 }
@@ -3079,8 +3302,9 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3079 /* 3302 /*
3080 * FIXME: these limits may be incompatible with the cache device 3303 * FIXME: these limits may be incompatible with the cache device
3081 */ 3304 */
3082 limits->max_discard_sectors = cache->sectors_per_block; 3305 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3083 limits->discard_granularity = cache->sectors_per_block << SECTOR_SHIFT; 3306 cache->origin_sectors);
3307 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3084} 3308}
3085 3309
3086static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3310static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -3104,7 +3328,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3104 3328
3105static struct target_type cache_target = { 3329static struct target_type cache_target = {
3106 .name = "cache", 3330 .name = "cache",
3107 .version = {1, 5, 0}, 3331 .version = {1, 6, 0},
3108 .module = THIS_MODULE, 3332 .module = THIS_MODULE,
3109 .ctr = cache_ctr, 3333 .ctr = cache_ctr,
3110 .dtr = cache_dtr, 3334 .dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index fc93b9330af4..08981be7baa1 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -705,7 +705,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc,
705 for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) 705 for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++)
706 crypto_xor(data + i * 8, buf, 8); 706 crypto_xor(data + i * 8, buf, 8);
707out: 707out:
708 memset(buf, 0, sizeof(buf)); 708 memzero_explicit(buf, sizeof(buf));
709 return r; 709 return r;
710} 710}
711 711
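
Background for the one-line dm-crypt change above: a memset() of a stack buffer that is never read again is a dead store the compiler may delete, so the whitening scratch data could be left on the stack; memzero_explicit() forces the clear to happen. The snippet below is an illustrative user-space equivalent; use_secret() and its constants are made up, and the barrier uses GCC/Clang inline asm much as the kernel helper does:

#include <stddef.h>
#include <string.h>

/*
 * User-space stand-in for the kernel's memzero_explicit(): the empty asm
 * statement tells the compiler the buffer is "used" after the memset, so
 * the clear cannot be discarded as a dead store. GCC/Clang syntax.
 */
static void memzero_explicit_like(void *s, size_t count)
{
	memset(s, 0, count);
	__asm__ __volatile__("" : : "r" (s) : "memory");
}

/* Hypothetical example: use a short-lived secret held on the stack. */
static void use_secret(unsigned char *out, size_t len)
{
	unsigned char secret[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };

	for (size_t i = 0; i < len; i++)
		out[i] ^= secret[i % sizeof(secret)];

	/* A plain memset() here could legally be optimised away. */
	memzero_explicit_like(secret, sizeof(secret));
}

int main(void)
{
	unsigned char buf[16] = { 0 };

	use_secret(buf, sizeof(buf));
	return 0;
}
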
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 0be9381365d7..73f791bb9ea4 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -684,11 +684,14 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
684 int srcu_idx; 684 int srcu_idx;
685 685
686 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | 686 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
687 DM_ACTIVE_PRESENT_FLAG); 687 DM_ACTIVE_PRESENT_FLAG | DM_INTERNAL_SUSPEND_FLAG);
688 688
689 if (dm_suspended_md(md)) 689 if (dm_suspended_md(md))
690 param->flags |= DM_SUSPEND_FLAG; 690 param->flags |= DM_SUSPEND_FLAG;
691 691
692 if (dm_suspended_internally_md(md))
693 param->flags |= DM_INTERNAL_SUSPEND_FLAG;
694
692 if (dm_test_deferred_remove_flag(md)) 695 if (dm_test_deferred_remove_flag(md))
693 param->flags |= DM_DEFERRED_REMOVE; 696 param->flags |= DM_DEFERRED_REMOVE;
694 697
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 87f86c77b094..f478a4c96d2f 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -824,7 +824,7 @@ static int message_stats_create(struct mapped_device *md,
824 return 1; 824 return 1;
825 825
826 id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data, 826 id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
827 dm_internal_suspend, dm_internal_resume, md); 827 dm_internal_suspend_fast, dm_internal_resume_fast, md);
828 if (id < 0) 828 if (id < 0)
829 return id; 829 return id;
830 830
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index b2bd1ebf4562..3afae9e062f8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1521,18 +1521,32 @@ fmode_t dm_table_get_mode(struct dm_table *t)
1521} 1521}
1522EXPORT_SYMBOL(dm_table_get_mode); 1522EXPORT_SYMBOL(dm_table_get_mode);
1523 1523
1524static void suspend_targets(struct dm_table *t, unsigned postsuspend) 1524enum suspend_mode {
1525 PRESUSPEND,
1526 PRESUSPEND_UNDO,
1527 POSTSUSPEND,
1528};
1529
1530static void suspend_targets(struct dm_table *t, enum suspend_mode mode)
1525{ 1531{
1526 int i = t->num_targets; 1532 int i = t->num_targets;
1527 struct dm_target *ti = t->targets; 1533 struct dm_target *ti = t->targets;
1528 1534
1529 while (i--) { 1535 while (i--) {
1530 if (postsuspend) { 1536 switch (mode) {
1537 case PRESUSPEND:
1538 if (ti->type->presuspend)
1539 ti->type->presuspend(ti);
1540 break;
1541 case PRESUSPEND_UNDO:
1542 if (ti->type->presuspend_undo)
1543 ti->type->presuspend_undo(ti);
1544 break;
1545 case POSTSUSPEND:
1531 if (ti->type->postsuspend) 1546 if (ti->type->postsuspend)
1532 ti->type->postsuspend(ti); 1547 ti->type->postsuspend(ti);
1533 } else if (ti->type->presuspend) 1548 break;
1534 ti->type->presuspend(ti); 1549 }
1535
1536 ti++; 1550 ti++;
1537 } 1551 }
1538} 1552}
@@ -1542,7 +1556,15 @@ void dm_table_presuspend_targets(struct dm_table *t)
1542 if (!t) 1556 if (!t)
1543 return; 1557 return;
1544 1558
1545 suspend_targets(t, 0); 1559 suspend_targets(t, PRESUSPEND);
1560}
1561
1562void dm_table_presuspend_undo_targets(struct dm_table *t)
1563{
1564 if (!t)
1565 return;
1566
1567 suspend_targets(t, PRESUSPEND_UNDO);
1546} 1568}
1547 1569
1548void dm_table_postsuspend_targets(struct dm_table *t) 1570void dm_table_postsuspend_targets(struct dm_table *t)
@@ -1550,7 +1572,7 @@ void dm_table_postsuspend_targets(struct dm_table *t)
1550 if (!t) 1572 if (!t)
1551 return; 1573 return;
1552 1574
1553 suspend_targets(t, 1); 1575 suspend_targets(t, POSTSUSPEND);
1554} 1576}
1555 1577
1556int dm_table_resume_targets(struct dm_table *t) 1578int dm_table_resume_targets(struct dm_table *t)
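
A rough sketch of why the PRESUSPEND_UNDO mode above exists: DM core can now call presuspend on every target and, if a later step of the suspend fails or an internal suspend needs unwinding, walk the targets again calling presuspend_undo to roll them back. The dispatch below is a simplified user-space model; freeze_device() is a made-up stand-in for the step that can fail:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for struct dm_target / target_type hooks. */
struct target {
	const char *name;
	void (*presuspend)(struct target *t);
	void (*presuspend_undo)(struct target *t);
	void (*postsuspend)(struct target *t);
};

static void noisy_presuspend(struct target *t)      { printf("%s: presuspend\n", t->name); }
static void noisy_presuspend_undo(struct target *t) { printf("%s: presuspend undo\n", t->name); }
static void noisy_postsuspend(struct target *t)     { printf("%s: postsuspend\n", t->name); }

/* Hypothetical step that may fail between presuspend and postsuspend. */
static bool freeze_device(void) { return false; }

static void suspend_all(struct target *targets, int n)
{
	for (int i = 0; i < n; i++)
		if (targets[i].presuspend)
			targets[i].presuspend(&targets[i]);

	if (!freeze_device()) {
		/* Roll back: this is what the new PRESUSPEND_UNDO mode enables. */
		for (int i = 0; i < n; i++)
			if (targets[i].presuspend_undo)
				targets[i].presuspend_undo(&targets[i]);
		return;
	}

	for (int i = 0; i < n; i++)
		if (targets[i].postsuspend)
			targets[i].postsuspend(&targets[i]);
}

int main(void)
{
	struct target t[] = {
		{ "demo-target", noisy_presuspend, noisy_presuspend_undo, noisy_postsuspend },
	};

	suspend_all(t, 1);
	return 0;
}
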
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index e9d33ad59df5..43adbb863f5a 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1384,42 +1384,38 @@ static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1384} 1384}
1385 1385
1386int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, 1386int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
1387 int can_block, struct dm_thin_lookup_result *result) 1387 int can_issue_io, struct dm_thin_lookup_result *result)
1388{ 1388{
1389 int r = -EINVAL; 1389 int r;
1390 uint64_t block_time = 0;
1391 __le64 value; 1390 __le64 value;
1392 struct dm_pool_metadata *pmd = td->pmd; 1391 struct dm_pool_metadata *pmd = td->pmd;
1393 dm_block_t keys[2] = { td->id, block }; 1392 dm_block_t keys[2] = { td->id, block };
1394 struct dm_btree_info *info; 1393 struct dm_btree_info *info;
1395 1394
1396 if (can_block) {
1397 down_read(&pmd->root_lock);
1398 info = &pmd->info;
1399 } else if (down_read_trylock(&pmd->root_lock))
1400 info = &pmd->nb_info;
1401 else
1402 return -EWOULDBLOCK;
1403
1404 if (pmd->fail_io) 1395 if (pmd->fail_io)
1405 goto out; 1396 return -EINVAL;
1406 1397
1407 r = dm_btree_lookup(info, pmd->root, keys, &value); 1398 down_read(&pmd->root_lock);
1408 if (!r)
1409 block_time = le64_to_cpu(value);
1410 1399
1411out: 1400 if (can_issue_io) {
1412 up_read(&pmd->root_lock); 1401 info = &pmd->info;
1402 } else
1403 info = &pmd->nb_info;
1413 1404
1405 r = dm_btree_lookup(info, pmd->root, keys, &value);
1414 if (!r) { 1406 if (!r) {
1407 uint64_t block_time = 0;
1415 dm_block_t exception_block; 1408 dm_block_t exception_block;
1416 uint32_t exception_time; 1409 uint32_t exception_time;
1410
1411 block_time = le64_to_cpu(value);
1417 unpack_block_time(block_time, &exception_block, 1412 unpack_block_time(block_time, &exception_block,
1418 &exception_time); 1413 &exception_time);
1419 result->block = exception_block; 1414 result->block = exception_block;
1420 result->shared = __snapshotted_since(td, exception_time); 1415 result->shared = __snapshotted_since(td, exception_time);
1421 } 1416 }
1422 1417
1418 up_read(&pmd->root_lock);
1423 return r; 1419 return r;
1424} 1420}
1425 1421
@@ -1813,3 +1809,8 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
1813 1809
1814 return needs_check; 1810 return needs_check;
1815} 1811}
1812
1813void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
1814{
1815 dm_tm_issue_prefetches(pmd->tm);
1816}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index e3c857db195a..921d15ee56a0 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -139,12 +139,12 @@ struct dm_thin_lookup_result {
139 139
140/* 140/*
141 * Returns: 141 * Returns:
142 * -EWOULDBLOCK iff @can_block is set and would block. 142 * -EWOULDBLOCK iff @can_issue_io is set and would issue IO
143 * -ENODATA iff that mapping is not present. 143 * -ENODATA iff that mapping is not present.
144 * 0 success 144 * 0 success
145 */ 145 */
146int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, 146int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
147 int can_block, struct dm_thin_lookup_result *result); 147 int can_issue_io, struct dm_thin_lookup_result *result);
148 148
149/* 149/*
150 * Obtain an unused block. 150 * Obtain an unused block.
@@ -213,6 +213,11 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
213int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd); 213int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
214bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd); 214bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
215 215
216/*
217 * Issue any prefetches that may be useful.
218 */
219void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd);
220
216/*----------------------------------------------------------------*/ 221/*----------------------------------------------------------------*/
217 222
218#endif 223#endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 0f86d802b533..8735543eacdb 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -11,11 +11,13 @@
11#include <linux/device-mapper.h> 11#include <linux/device-mapper.h>
12#include <linux/dm-io.h> 12#include <linux/dm-io.h>
13#include <linux/dm-kcopyd.h> 13#include <linux/dm-kcopyd.h>
14#include <linux/log2.h>
14#include <linux/list.h> 15#include <linux/list.h>
15#include <linux/rculist.h> 16#include <linux/rculist.h>
16#include <linux/init.h> 17#include <linux/init.h>
17#include <linux/module.h> 18#include <linux/module.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/sort.h>
19#include <linux/rbtree.h> 21#include <linux/rbtree.h>
20 22
21#define DM_MSG_PREFIX "thin" 23#define DM_MSG_PREFIX "thin"
@@ -25,7 +27,6 @@
25 */ 27 */
26#define ENDIO_HOOK_POOL_SIZE 1024 28#define ENDIO_HOOK_POOL_SIZE 1024
27#define MAPPING_POOL_SIZE 1024 29#define MAPPING_POOL_SIZE 1024
28#define PRISON_CELLS 1024
29#define COMMIT_PERIOD HZ 30#define COMMIT_PERIOD HZ
30#define NO_SPACE_TIMEOUT_SECS 60 31#define NO_SPACE_TIMEOUT_SECS 60
31 32
@@ -114,7 +115,8 @@ static void build_data_key(struct dm_thin_device *td,
114{ 115{
115 key->virtual = 0; 116 key->virtual = 0;
116 key->dev = dm_thin_dev_id(td); 117 key->dev = dm_thin_dev_id(td);
117 key->block = b; 118 key->block_begin = b;
119 key->block_end = b + 1ULL;
118} 120}
119 121
120static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, 122static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
@@ -122,7 +124,55 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
122{ 124{
123 key->virtual = 1; 125 key->virtual = 1;
124 key->dev = dm_thin_dev_id(td); 126 key->dev = dm_thin_dev_id(td);
125 key->block = b; 127 key->block_begin = b;
128 key->block_end = b + 1ULL;
129}
130
131/*----------------------------------------------------------------*/
132
133#define THROTTLE_THRESHOLD (1 * HZ)
134
135struct throttle {
136 struct rw_semaphore lock;
137 unsigned long threshold;
138 bool throttle_applied;
139};
140
141static void throttle_init(struct throttle *t)
142{
143 init_rwsem(&t->lock);
144 t->throttle_applied = false;
145}
146
147static void throttle_work_start(struct throttle *t)
148{
149 t->threshold = jiffies + THROTTLE_THRESHOLD;
150}
151
152static void throttle_work_update(struct throttle *t)
153{
154 if (!t->throttle_applied && jiffies > t->threshold) {
155 down_write(&t->lock);
156 t->throttle_applied = true;
157 }
158}
159
160static void throttle_work_complete(struct throttle *t)
161{
162 if (t->throttle_applied) {
163 t->throttle_applied = false;
164 up_write(&t->lock);
165 }
166}
167
168static void throttle_lock(struct throttle *t)
169{
170 down_read(&t->lock);
171}
172
173static void throttle_unlock(struct throttle *t)
174{
175 up_read(&t->lock);
126} 176}
127 177
128/*----------------------------------------------------------------*/ 178/*----------------------------------------------------------------*/
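
For readers skimming the new throttle above: the worker brackets its main loop with throttle_work_start/update/complete, and the IO submission path brackets itself with throttle_lock/unlock. Once the worker has been busy longer than THROTTLE_THRESHOLD it takes the write side of the rwsem, stalling new submitters (who only take the read side) until the worker finishes. A pthread-based user-space sketch of the same shape, using wall-clock seconds instead of jiffies:

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

#define THROTTLE_THRESHOLD_SEC 1

struct throttle_demo {
	pthread_rwlock_t lock;
	time_t threshold;
	bool throttle_applied;
};

static void throttle_init(struct throttle_demo *t)
{
	pthread_rwlock_init(&t->lock, NULL);
	t->throttle_applied = false;
}

/* Worker side: called at the start of a work cycle, periodically, and at the end. */
static void throttle_work_start(struct throttle_demo *t)
{
	t->threshold = time(NULL) + THROTTLE_THRESHOLD_SEC;
}

static void throttle_work_update(struct throttle_demo *t)
{
	if (!t->throttle_applied && time(NULL) > t->threshold) {
		pthread_rwlock_wrlock(&t->lock);	/* stall new submitters */
		t->throttle_applied = true;
	}
}

static void throttle_work_complete(struct throttle_demo *t)
{
	if (t->throttle_applied) {
		t->throttle_applied = false;
		pthread_rwlock_unlock(&t->lock);
	}
}

/* IO submission side: cheap shared lock, only blocks while the worker throttles. */
static void throttle_lock(struct throttle_demo *t)   { pthread_rwlock_rdlock(&t->lock); }
static void throttle_unlock(struct throttle_demo *t) { pthread_rwlock_unlock(&t->lock); }

int main(void)
{
	struct throttle_demo t;

	throttle_init(&t);
	throttle_lock(&t);		/* IO path */
	throttle_unlock(&t);
	throttle_work_start(&t);	/* worker cycle */
	throttle_work_update(&t);
	throttle_work_complete(&t);
	return 0;
}
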
@@ -155,8 +205,11 @@ struct pool_features {
155 205
156struct thin_c; 206struct thin_c;
157typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); 207typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
208typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
158typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); 209typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
159 210
211#define CELL_SORT_ARRAY_SIZE 8192
212
160struct pool { 213struct pool {
161 struct list_head list; 214 struct list_head list;
162 struct dm_target *ti; /* Only set if a pool target is bound */ 215 struct dm_target *ti; /* Only set if a pool target is bound */
@@ -171,11 +224,13 @@ struct pool {
171 224
172 struct pool_features pf; 225 struct pool_features pf;
173 bool low_water_triggered:1; /* A dm event has been sent */ 226 bool low_water_triggered:1; /* A dm event has been sent */
227 bool suspended:1;
174 228
175 struct dm_bio_prison *prison; 229 struct dm_bio_prison *prison;
176 struct dm_kcopyd_client *copier; 230 struct dm_kcopyd_client *copier;
177 231
178 struct workqueue_struct *wq; 232 struct workqueue_struct *wq;
233 struct throttle throttle;
179 struct work_struct worker; 234 struct work_struct worker;
180 struct delayed_work waker; 235 struct delayed_work waker;
181 struct delayed_work no_space_timeout; 236 struct delayed_work no_space_timeout;
@@ -198,8 +253,13 @@ struct pool {
198 process_bio_fn process_bio; 253 process_bio_fn process_bio;
199 process_bio_fn process_discard; 254 process_bio_fn process_discard;
200 255
256 process_cell_fn process_cell;
257 process_cell_fn process_discard_cell;
258
201 process_mapping_fn process_prepared_mapping; 259 process_mapping_fn process_prepared_mapping;
202 process_mapping_fn process_prepared_discard; 260 process_mapping_fn process_prepared_discard;
261
262 struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE];
203}; 263};
204 264
205static enum pool_mode get_pool_mode(struct pool *pool); 265static enum pool_mode get_pool_mode(struct pool *pool);
@@ -232,8 +292,11 @@ struct thin_c {
232 292
233 struct pool *pool; 293 struct pool *pool;
234 struct dm_thin_device *td; 294 struct dm_thin_device *td;
295 struct mapped_device *thin_md;
296
235 bool requeue_mode:1; 297 bool requeue_mode:1;
236 spinlock_t lock; 298 spinlock_t lock;
299 struct list_head deferred_cells;
237 struct bio_list deferred_bio_list; 300 struct bio_list deferred_bio_list;
238 struct bio_list retry_on_resume_list; 301 struct bio_list retry_on_resume_list;
239 struct rb_root sort_bio_list; /* sorted list of deferred bios */ 302 struct rb_root sort_bio_list; /* sorted list of deferred bios */
@@ -290,6 +353,15 @@ static void cell_release(struct pool *pool,
290 dm_bio_prison_free_cell(pool->prison, cell); 353 dm_bio_prison_free_cell(pool->prison, cell);
291} 354}
292 355
356static void cell_visit_release(struct pool *pool,
357 void (*fn)(void *, struct dm_bio_prison_cell *),
358 void *context,
359 struct dm_bio_prison_cell *cell)
360{
361 dm_cell_visit_release(pool->prison, fn, context, cell);
362 dm_bio_prison_free_cell(pool->prison, cell);
363}
364
293static void cell_release_no_holder(struct pool *pool, 365static void cell_release_no_holder(struct pool *pool,
294 struct dm_bio_prison_cell *cell, 366 struct dm_bio_prison_cell *cell,
295 struct bio_list *bios) 367 struct bio_list *bios)
@@ -298,19 +370,6 @@ static void cell_release_no_holder(struct pool *pool,
298 dm_bio_prison_free_cell(pool->prison, cell); 370 dm_bio_prison_free_cell(pool->prison, cell);
299} 371}
300 372
301static void cell_defer_no_holder_no_free(struct thin_c *tc,
302 struct dm_bio_prison_cell *cell)
303{
304 struct pool *pool = tc->pool;
305 unsigned long flags;
306
307 spin_lock_irqsave(&tc->lock, flags);
308 dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list);
309 spin_unlock_irqrestore(&tc->lock, flags);
310
311 wake_worker(pool);
312}
313
314static void cell_error_with_code(struct pool *pool, 373static void cell_error_with_code(struct pool *pool,
315 struct dm_bio_prison_cell *cell, int error_code) 374 struct dm_bio_prison_cell *cell, int error_code)
316{ 375{
@@ -323,6 +382,16 @@ static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
323 cell_error_with_code(pool, cell, -EIO); 382 cell_error_with_code(pool, cell, -EIO);
324} 383}
325 384
385static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
386{
387 cell_error_with_code(pool, cell, 0);
388}
389
390static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
391{
392 cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE);
393}
394
326/*----------------------------------------------------------------*/ 395/*----------------------------------------------------------------*/
327 396
328/* 397/*
@@ -393,44 +462,65 @@ struct dm_thin_endio_hook {
393 struct rb_node rb_node; 462 struct rb_node rb_node;
394}; 463};
395 464
396static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) 465static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
466{
467 bio_list_merge(bios, master);
468 bio_list_init(master);
469}
470
471static void error_bio_list(struct bio_list *bios, int error)
397{ 472{
398 struct bio *bio; 473 struct bio *bio;
474
475 while ((bio = bio_list_pop(bios)))
476 bio_endio(bio, error);
477}
478
479static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
480{
399 struct bio_list bios; 481 struct bio_list bios;
400 unsigned long flags; 482 unsigned long flags;
401 483
402 bio_list_init(&bios); 484 bio_list_init(&bios);
403 485
404 spin_lock_irqsave(&tc->lock, flags); 486 spin_lock_irqsave(&tc->lock, flags);
405 bio_list_merge(&bios, master); 487 __merge_bio_list(&bios, master);
406 bio_list_init(master);
407 spin_unlock_irqrestore(&tc->lock, flags); 488 spin_unlock_irqrestore(&tc->lock, flags);
408 489
409 while ((bio = bio_list_pop(&bios))) 490 error_bio_list(&bios, error);
410 bio_endio(bio, DM_ENDIO_REQUEUE);
411} 491}
412 492
413static void requeue_io(struct thin_c *tc) 493static void requeue_deferred_cells(struct thin_c *tc)
414{ 494{
415 requeue_bio_list(tc, &tc->deferred_bio_list); 495 struct pool *pool = tc->pool;
416 requeue_bio_list(tc, &tc->retry_on_resume_list); 496 unsigned long flags;
497 struct list_head cells;
498 struct dm_bio_prison_cell *cell, *tmp;
499
500 INIT_LIST_HEAD(&cells);
501
502 spin_lock_irqsave(&tc->lock, flags);
503 list_splice_init(&tc->deferred_cells, &cells);
504 spin_unlock_irqrestore(&tc->lock, flags);
505
506 list_for_each_entry_safe(cell, tmp, &cells, user_list)
507 cell_requeue(pool, cell);
417} 508}
418 509
419static void error_thin_retry_list(struct thin_c *tc) 510static void requeue_io(struct thin_c *tc)
420{ 511{
421 struct bio *bio;
422 unsigned long flags;
423 struct bio_list bios; 512 struct bio_list bios;
513 unsigned long flags;
424 514
425 bio_list_init(&bios); 515 bio_list_init(&bios);
426 516
427 spin_lock_irqsave(&tc->lock, flags); 517 spin_lock_irqsave(&tc->lock, flags);
428 bio_list_merge(&bios, &tc->retry_on_resume_list); 518 __merge_bio_list(&bios, &tc->deferred_bio_list);
429 bio_list_init(&tc->retry_on_resume_list); 519 __merge_bio_list(&bios, &tc->retry_on_resume_list);
430 spin_unlock_irqrestore(&tc->lock, flags); 520 spin_unlock_irqrestore(&tc->lock, flags);
431 521
432 while ((bio = bio_list_pop(&bios))) 522 error_bio_list(&bios, DM_ENDIO_REQUEUE);
433 bio_io_error(bio); 523 requeue_deferred_cells(tc);
434} 524}
435 525
436static void error_retry_list(struct pool *pool) 526static void error_retry_list(struct pool *pool)
@@ -439,7 +529,7 @@ static void error_retry_list(struct pool *pool)
439 529
440 rcu_read_lock(); 530 rcu_read_lock();
441 list_for_each_entry_rcu(tc, &pool->active_thins, list) 531 list_for_each_entry_rcu(tc, &pool->active_thins, list)
442 error_thin_retry_list(tc); 532 error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO);
443 rcu_read_unlock(); 533 rcu_read_unlock();
444} 534}
445 535
@@ -629,33 +719,75 @@ static void overwrite_endio(struct bio *bio, int err)
629 */ 719 */
630 720
631/* 721/*
632 * This sends the bios in the cell back to the deferred_bios list. 722 * This sends the bios in the cell, except the original holder, back
723 * to the deferred_bios list.
633 */ 724 */
634static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell) 725static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
635{ 726{
636 struct pool *pool = tc->pool; 727 struct pool *pool = tc->pool;
637 unsigned long flags; 728 unsigned long flags;
638 729
639 spin_lock_irqsave(&tc->lock, flags); 730 spin_lock_irqsave(&tc->lock, flags);
640 cell_release(pool, cell, &tc->deferred_bio_list); 731 cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
641 spin_unlock_irqrestore(&tc->lock, flags); 732 spin_unlock_irqrestore(&tc->lock, flags);
642 733
643 wake_worker(pool); 734 wake_worker(pool);
644} 735}
645 736
646/* 737static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
647 * Same as cell_defer above, except it omits the original holder of the cell. 738
648 */ 739struct remap_info {
649static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) 740 struct thin_c *tc;
741 struct bio_list defer_bios;
742 struct bio_list issue_bios;
743};
744
745static void __inc_remap_and_issue_cell(void *context,
746 struct dm_bio_prison_cell *cell)
650{ 747{
651 struct pool *pool = tc->pool; 748 struct remap_info *info = context;
652 unsigned long flags; 749 struct bio *bio;
653 750
654 spin_lock_irqsave(&tc->lock, flags); 751 while ((bio = bio_list_pop(&cell->bios))) {
655 cell_release_no_holder(pool, cell, &tc->deferred_bio_list); 752 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))
656 spin_unlock_irqrestore(&tc->lock, flags); 753 bio_list_add(&info->defer_bios, bio);
754 else {
755 inc_all_io_entry(info->tc->pool, bio);
657 756
658 wake_worker(pool); 757 /*
758 * We can't issue the bios with the bio prison lock
759 * held, so we add them to a list to issue on
760 * return from this function.
761 */
762 bio_list_add(&info->issue_bios, bio);
763 }
764 }
765}
766
767static void inc_remap_and_issue_cell(struct thin_c *tc,
768 struct dm_bio_prison_cell *cell,
769 dm_block_t block)
770{
771 struct bio *bio;
772 struct remap_info info;
773
774 info.tc = tc;
775 bio_list_init(&info.defer_bios);
776 bio_list_init(&info.issue_bios);
777
778 /*
779 * We have to be careful to inc any bios we're about to issue
780 * before the cell is released, and avoid a race with new bios
781 * being added to the cell.
782 */
783 cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
784 &info, cell);
785
786 while ((bio = bio_list_pop(&info.defer_bios)))
787 thin_defer_bio(tc, bio);
788
789 while ((bio = bio_list_pop(&info.issue_bios)))
790 remap_and_issue(info.tc, bio, block);
659} 791}
660 792
661static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) 793static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
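
A sketch of the visit-then-release pattern introduced above (cell_visit_release plus the remap_info lists): while the prison lock is held, the callback only sorts the cell's bios into a defer list and an issue list, bumping IO counts as it goes; the remapping and issuing happen after the cell has been released, so no IO is ever submitted under the lock. Everything below is simplified user-space scaffolding, not the real bio-prison API:

#include <stdbool.h>
#include <stdio.h>

#define MAX_BIOS 8

/* Toy stand-ins: a "bio" is just a flag saying whether it needs special handling. */
struct bio { bool needs_defer; };

struct bio_list { struct bio *items[MAX_BIOS]; int n; };

static void bio_list_add(struct bio_list *l, struct bio *b) { l->items[l->n++] = b; }

struct cell { struct bio_list bios; };

struct remap_info {
	struct bio_list defer_bios;	/* handed back to the worker later */
	struct bio_list issue_bios;	/* remapped and issued after release */
};

/* Runs "under the prison lock": only classifies, never issues. */
static void visit_cell(struct cell *cell, struct remap_info *info)
{
	for (int i = 0; i < cell->bios.n; i++) {
		struct bio *b = cell->bios.items[i];

		if (b->needs_defer)
			bio_list_add(&info->defer_bios, b);
		else
			bio_list_add(&info->issue_bios, b);	/* count now, issue later */
	}
	cell->bios.n = 0;	/* cell released */
}

int main(void)
{
	struct cell c = { 0 };			/* empty cell */
	struct remap_info info = { 0 };		/* both lists start empty */
	struct bio a = { .needs_defer = false }, b = { .needs_defer = true };

	bio_list_add(&c.bios, &a);
	bio_list_add(&c.bios, &b);

	visit_cell(&c, &info);		/* classification under the (imaginary) lock */

	/* Outside the lock: issue one, defer the other. */
	printf("issue %d, defer %d\n", info.issue_bios.n, info.defer_bios.n);
	return 0;
}
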
@@ -706,10 +838,13 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
706 * the bios in the cell. 838 * the bios in the cell.
707 */ 839 */
708 if (bio) { 840 if (bio) {
709 cell_defer_no_holder(tc, m->cell); 841 inc_remap_and_issue_cell(tc, m->cell, m->data_block);
710 bio_endio(bio, 0); 842 bio_endio(bio, 0);
711 } else 843 } else {
712 cell_defer(tc, m->cell); 844 inc_all_io_entry(tc->pool, m->cell->holder);
845 remap_and_issue(tc, m->cell->holder, m->data_block);
846 inc_remap_and_issue_cell(tc, m->cell, m->data_block);
847 }
713 848
714out: 849out:
715 list_del(&m->list); 850 list_del(&m->list);
@@ -842,6 +977,20 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
842 } 977 }
843} 978}
844 979
980static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
981 dm_block_t data_block,
982 struct dm_thin_new_mapping *m)
983{
984 struct pool *pool = tc->pool;
985 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
986
987 h->overwrite_mapping = m;
988 m->bio = bio;
989 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
990 inc_all_io_entry(pool, bio);
991 remap_and_issue(tc, bio, data_block);
992}
993
845/* 994/*
846 * A partial copy also needs to zero the uncopied region. 995 * A partial copy also needs to zero the uncopied region.
847 */ 996 */
@@ -876,15 +1025,9 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
876 * If the whole block of data is being overwritten, we can issue the 1025 * If the whole block of data is being overwritten, we can issue the
877 * bio immediately. Otherwise we use kcopyd to clone the data first. 1026 * bio immediately. Otherwise we use kcopyd to clone the data first.
878 */ 1027 */
879 if (io_overwrites_block(pool, bio)) { 1028 if (io_overwrites_block(pool, bio))
880 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1029 remap_and_issue_overwrite(tc, bio, data_dest, m);
881 1030 else {
882 h->overwrite_mapping = m;
883 m->bio = bio;
884 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
885 inc_all_io_entry(pool, bio);
886 remap_and_issue(tc, bio, data_dest);
887 } else {
888 struct dm_io_region from, to; 1031 struct dm_io_region from, to;
889 1032
890 from.bdev = origin->bdev; 1033 from.bdev = origin->bdev;
@@ -953,16 +1096,10 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
953 if (!pool->pf.zero_new_blocks) 1096 if (!pool->pf.zero_new_blocks)
954 process_prepared_mapping(m); 1097 process_prepared_mapping(m);
955 1098
956 else if (io_overwrites_block(pool, bio)) { 1099 else if (io_overwrites_block(pool, bio))
957 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1100 remap_and_issue_overwrite(tc, bio, data_block, m);
958
959 h->overwrite_mapping = m;
960 m->bio = bio;
961 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
962 inc_all_io_entry(pool, bio);
963 remap_and_issue(tc, bio, data_block);
964 1101
965 } else 1102 else
966 ll_zero(tc, m, 1103 ll_zero(tc, m,
967 data_block * pool->sectors_per_block, 1104 data_block * pool->sectors_per_block,
968 (data_block + 1) * pool->sectors_per_block); 1105 (data_block + 1) * pool->sectors_per_block);
@@ -1134,29 +1271,25 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
1134 bio_list_init(&bios); 1271 bio_list_init(&bios);
1135 cell_release(pool, cell, &bios); 1272 cell_release(pool, cell, &bios);
1136 1273
1137 error = should_error_unserviceable_bio(pool); 1274 while ((bio = bio_list_pop(&bios)))
1138 if (error) 1275 retry_on_resume(bio);
1139 while ((bio = bio_list_pop(&bios)))
1140 bio_endio(bio, error);
1141 else
1142 while ((bio = bio_list_pop(&bios)))
1143 retry_on_resume(bio);
1144} 1276}
1145 1277
1146static void process_discard(struct thin_c *tc, struct bio *bio) 1278static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1147{ 1279{
1148 int r; 1280 int r;
1149 unsigned long flags; 1281 struct bio *bio = cell->holder;
1150 struct pool *pool = tc->pool; 1282 struct pool *pool = tc->pool;
1151 struct dm_bio_prison_cell *cell, *cell2; 1283 struct dm_bio_prison_cell *cell2;
1152 struct dm_cell_key key, key2; 1284 struct dm_cell_key key2;
1153 dm_block_t block = get_bio_block(tc, bio); 1285 dm_block_t block = get_bio_block(tc, bio);
1154 struct dm_thin_lookup_result lookup_result; 1286 struct dm_thin_lookup_result lookup_result;
1155 struct dm_thin_new_mapping *m; 1287 struct dm_thin_new_mapping *m;
1156 1288
1157 build_virtual_key(tc->td, block, &key); 1289 if (tc->requeue_mode) {
1158 if (bio_detain(tc->pool, &key, bio, &cell)) 1290 cell_requeue(pool, cell);
1159 return; 1291 return;
1292 }
1160 1293
1161 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1294 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1162 switch (r) { 1295 switch (r) {
@@ -1187,12 +1320,9 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
1187 m->cell2 = cell2; 1320 m->cell2 = cell2;
1188 m->bio = bio; 1321 m->bio = bio;
1189 1322
1190 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { 1323 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1191 spin_lock_irqsave(&pool->lock, flags); 1324 pool->process_prepared_discard(m);
1192 list_add_tail(&m->list, &pool->prepared_discards); 1325
1193 spin_unlock_irqrestore(&pool->lock, flags);
1194 wake_worker(pool);
1195 }
1196 } else { 1326 } else {
1197 inc_all_io_entry(pool, bio); 1327 inc_all_io_entry(pool, bio);
1198 cell_defer_no_holder(tc, cell); 1328 cell_defer_no_holder(tc, cell);
@@ -1227,6 +1357,19 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
1227 } 1357 }
1228} 1358}
1229 1359
1360static void process_discard_bio(struct thin_c *tc, struct bio *bio)
1361{
1362 struct dm_bio_prison_cell *cell;
1363 struct dm_cell_key key;
1364 dm_block_t block = get_bio_block(tc, bio);
1365
1366 build_virtual_key(tc->td, block, &key);
1367 if (bio_detain(tc->pool, &key, bio, &cell))
1368 return;
1369
1370 process_discard_cell(tc, cell);
1371}
1372
1230static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1373static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1231 struct dm_cell_key *key, 1374 struct dm_cell_key *key,
1232 struct dm_thin_lookup_result *lookup_result, 1375 struct dm_thin_lookup_result *lookup_result,
@@ -1255,11 +1398,53 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1255 } 1398 }
1256} 1399}
1257 1400
1401static void __remap_and_issue_shared_cell(void *context,
1402 struct dm_bio_prison_cell *cell)
1403{
1404 struct remap_info *info = context;
1405 struct bio *bio;
1406
1407 while ((bio = bio_list_pop(&cell->bios))) {
1408 if ((bio_data_dir(bio) == WRITE) ||
1409 (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)))
1410 bio_list_add(&info->defer_bios, bio);
1411 else {
1412 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1413
1414 h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
1415 inc_all_io_entry(info->tc->pool, bio);
1416 bio_list_add(&info->issue_bios, bio);
1417 }
1418 }
1419}
1420
1421static void remap_and_issue_shared_cell(struct thin_c *tc,
1422 struct dm_bio_prison_cell *cell,
1423 dm_block_t block)
1424{
1425 struct bio *bio;
1426 struct remap_info info;
1427
1428 info.tc = tc;
1429 bio_list_init(&info.defer_bios);
1430 bio_list_init(&info.issue_bios);
1431
1432 cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
1433 &info, cell);
1434
1435 while ((bio = bio_list_pop(&info.defer_bios)))
1436 thin_defer_bio(tc, bio);
1437
1438 while ((bio = bio_list_pop(&info.issue_bios)))
1439 remap_and_issue(tc, bio, block);
1440}
1441
1258static void process_shared_bio(struct thin_c *tc, struct bio *bio, 1442static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1259 dm_block_t block, 1443 dm_block_t block,
1260 struct dm_thin_lookup_result *lookup_result) 1444 struct dm_thin_lookup_result *lookup_result,
1445 struct dm_bio_prison_cell *virt_cell)
1261{ 1446{
1262 struct dm_bio_prison_cell *cell; 1447 struct dm_bio_prison_cell *data_cell;
1263 struct pool *pool = tc->pool; 1448 struct pool *pool = tc->pool;
1264 struct dm_cell_key key; 1449 struct dm_cell_key key;
1265 1450
@@ -1268,19 +1453,23 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1268 * of being broken so we have nothing further to do here. 1453 * of being broken so we have nothing further to do here.
1269 */ 1454 */
1270 build_data_key(tc->td, lookup_result->block, &key); 1455 build_data_key(tc->td, lookup_result->block, &key);
1271 if (bio_detain(pool, &key, bio, &cell)) 1456 if (bio_detain(pool, &key, bio, &data_cell)) {
1457 cell_defer_no_holder(tc, virt_cell);
1272 return; 1458 return;
1459 }
1273 1460
1274 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) 1461 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
1275 break_sharing(tc, bio, block, &key, lookup_result, cell); 1462 break_sharing(tc, bio, block, &key, lookup_result, data_cell);
1276 else { 1463 cell_defer_no_holder(tc, virt_cell);
1464 } else {
1277 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1465 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1278 1466
1279 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds); 1467 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1280 inc_all_io_entry(pool, bio); 1468 inc_all_io_entry(pool, bio);
1281 cell_defer_no_holder(tc, cell);
1282
1283 remap_and_issue(tc, bio, lookup_result->block); 1469 remap_and_issue(tc, bio, lookup_result->block);
1470
1471 remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
1472 remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
1284 } 1473 }
1285} 1474}
1286 1475
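
process_shared_bio now receives the virtual-block cell from its caller and tries to detain a second, data-block cell; whenever the second detain fails, or the bio takes the break_sharing path, the virtual cell is released so its occupants are not stranded. A small pthread-based sketch of that acquire-second, release-first-on-failure ordering (an analogue of the idea only, not the bio-prison API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t virt_cell = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t data_cell = PTHREAD_MUTEX_INITIALIZER;

static int process_shared(void)
{
        pthread_mutex_lock(&virt_cell);         /* held by the caller in the driver */

        if (pthread_mutex_trylock(&data_cell)) {
                /* analogue of bio_detain() failing: back out the first cell */
                pthread_mutex_unlock(&virt_cell);
                return -1;
        }

        /* ... work that needs both blocks quiesced ... */

        pthread_mutex_unlock(&data_cell);
        pthread_mutex_unlock(&virt_cell);
        return 0;
}

int main(void)
{
        printf("process_shared -> %d\n", process_shared());
        return 0;
}
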
@@ -1333,34 +1522,28 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1333 } 1522 }
1334} 1523}
1335 1524
1336static void process_bio(struct thin_c *tc, struct bio *bio) 1525static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1337{ 1526{
1338 int r; 1527 int r;
1339 struct pool *pool = tc->pool; 1528 struct pool *pool = tc->pool;
1529 struct bio *bio = cell->holder;
1340 dm_block_t block = get_bio_block(tc, bio); 1530 dm_block_t block = get_bio_block(tc, bio);
1341 struct dm_bio_prison_cell *cell;
1342 struct dm_cell_key key;
1343 struct dm_thin_lookup_result lookup_result; 1531 struct dm_thin_lookup_result lookup_result;
1344 1532
1345 /* 1533 if (tc->requeue_mode) {
1346 * If cell is already occupied, then the block is already 1534 cell_requeue(pool, cell);
1347 * being provisioned so we have nothing further to do here.
1348 */
1349 build_virtual_key(tc->td, block, &key);
1350 if (bio_detain(pool, &key, bio, &cell))
1351 return; 1535 return;
1536 }
1352 1537
1353 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1538 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1354 switch (r) { 1539 switch (r) {
1355 case 0: 1540 case 0:
1356 if (lookup_result.shared) { 1541 if (lookup_result.shared)
1357 process_shared_bio(tc, bio, block, &lookup_result); 1542 process_shared_bio(tc, bio, block, &lookup_result, cell);
1358 cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */ 1543 else {
1359 } else {
1360 inc_all_io_entry(pool, bio); 1544 inc_all_io_entry(pool, bio);
1361 cell_defer_no_holder(tc, cell);
1362
1363 remap_and_issue(tc, bio, lookup_result.block); 1545 remap_and_issue(tc, bio, lookup_result.block);
1546 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
1364 } 1547 }
1365 break; 1548 break;
1366 1549
@@ -1394,7 +1577,26 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
1394 } 1577 }
1395} 1578}
1396 1579
1397static void process_bio_read_only(struct thin_c *tc, struct bio *bio) 1580static void process_bio(struct thin_c *tc, struct bio *bio)
1581{
1582 struct pool *pool = tc->pool;
1583 dm_block_t block = get_bio_block(tc, bio);
1584 struct dm_bio_prison_cell *cell;
1585 struct dm_cell_key key;
1586
1587 /*
1588 * If cell is already occupied, then the block is already
1589 * being provisioned so we have nothing further to do here.
1590 */
1591 build_virtual_key(tc->td, block, &key);
1592 if (bio_detain(pool, &key, bio, &cell))
1593 return;
1594
1595 process_cell(tc, cell);
1596}
1597
1598static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
1599 struct dm_bio_prison_cell *cell)
1398{ 1600{
1399 int r; 1601 int r;
1400 int rw = bio_data_dir(bio); 1602 int rw = bio_data_dir(bio);
@@ -1404,15 +1606,21 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1404 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1606 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1405 switch (r) { 1607 switch (r) {
1406 case 0: 1608 case 0:
1407 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) 1609 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
1408 handle_unserviceable_bio(tc->pool, bio); 1610 handle_unserviceable_bio(tc->pool, bio);
1409 else { 1611 if (cell)
1612 cell_defer_no_holder(tc, cell);
1613 } else {
1410 inc_all_io_entry(tc->pool, bio); 1614 inc_all_io_entry(tc->pool, bio);
1411 remap_and_issue(tc, bio, lookup_result.block); 1615 remap_and_issue(tc, bio, lookup_result.block);
1616 if (cell)
1617 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
1412 } 1618 }
1413 break; 1619 break;
1414 1620
1415 case -ENODATA: 1621 case -ENODATA:
1622 if (cell)
1623 cell_defer_no_holder(tc, cell);
1416 if (rw != READ) { 1624 if (rw != READ) {
1417 handle_unserviceable_bio(tc->pool, bio); 1625 handle_unserviceable_bio(tc->pool, bio);
1418 break; 1626 break;
@@ -1431,11 +1639,23 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1431 default: 1639 default:
1432 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", 1640 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1433 __func__, r); 1641 __func__, r);
1642 if (cell)
1643 cell_defer_no_holder(tc, cell);
1434 bio_io_error(bio); 1644 bio_io_error(bio);
1435 break; 1645 break;
1436 } 1646 }
1437} 1647}
1438 1648
1649static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1650{
1651 __process_bio_read_only(tc, bio, NULL);
1652}
1653
1654static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1655{
1656 __process_bio_read_only(tc, cell->holder, cell);
1657}
1658
1439static void process_bio_success(struct thin_c *tc, struct bio *bio) 1659static void process_bio_success(struct thin_c *tc, struct bio *bio)
1440{ 1660{
1441 bio_endio(bio, 0); 1661 bio_endio(bio, 0);
@@ -1446,6 +1666,16 @@ static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1446 bio_io_error(bio); 1666 bio_io_error(bio);
1447} 1667}
1448 1668
1669static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1670{
1671 cell_success(tc->pool, cell);
1672}
1673
1674static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1675{
1676 cell_error(tc->pool, cell);
1677}
1678
1449/* 1679/*
1450 * FIXME: should we also commit due to size of transaction, measured in 1680 * FIXME: should we also commit due to size of transaction, measured in
1451 * metadata blocks? 1681 * metadata blocks?
@@ -1527,9 +1757,10 @@ static void process_thin_deferred_bios(struct thin_c *tc)
1527 struct bio *bio; 1757 struct bio *bio;
1528 struct bio_list bios; 1758 struct bio_list bios;
1529 struct blk_plug plug; 1759 struct blk_plug plug;
1760 unsigned count = 0;
1530 1761
1531 if (tc->requeue_mode) { 1762 if (tc->requeue_mode) {
1532 requeue_bio_list(tc, &tc->deferred_bio_list); 1763 error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE);
1533 return; 1764 return;
1534 } 1765 }
1535 1766
@@ -1568,10 +1799,97 @@ static void process_thin_deferred_bios(struct thin_c *tc)
1568 pool->process_discard(tc, bio); 1799 pool->process_discard(tc, bio);
1569 else 1800 else
1570 pool->process_bio(tc, bio); 1801 pool->process_bio(tc, bio);
1802
1803 if ((count++ & 127) == 0) {
1804 throttle_work_update(&pool->throttle);
1805 dm_pool_issue_prefetches(pool->pmd);
1806 }
1571 } 1807 }
1572 blk_finish_plug(&plug); 1808 blk_finish_plug(&plug);
1573} 1809}
1574 1810
1811static int cmp_cells(const void *lhs, const void *rhs)
1812{
1813 struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs);
1814 struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs);
1815
1816 BUG_ON(!lhs_cell->holder);
1817 BUG_ON(!rhs_cell->holder);
1818
1819 if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
1820 return -1;
1821
1822 if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
1823 return 1;
1824
1825 return 0;
1826}
1827
1828static unsigned sort_cells(struct pool *pool, struct list_head *cells)
1829{
1830 unsigned count = 0;
1831 struct dm_bio_prison_cell *cell, *tmp;
1832
1833 list_for_each_entry_safe(cell, tmp, cells, user_list) {
1834 if (count >= CELL_SORT_ARRAY_SIZE)
1835 break;
1836
1837 pool->cell_sort_array[count++] = cell;
1838 list_del(&cell->user_list);
1839 }
1840
1841 sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
1842
1843 return count;
1844}
1845
1846static void process_thin_deferred_cells(struct thin_c *tc)
1847{
1848 struct pool *pool = tc->pool;
1849 unsigned long flags;
1850 struct list_head cells;
1851 struct dm_bio_prison_cell *cell;
1852 unsigned i, j, count;
1853
1854 INIT_LIST_HEAD(&cells);
1855
1856 spin_lock_irqsave(&tc->lock, flags);
1857 list_splice_init(&tc->deferred_cells, &cells);
1858 spin_unlock_irqrestore(&tc->lock, flags);
1859
1860 if (list_empty(&cells))
1861 return;
1862
1863 do {
1864 count = sort_cells(tc->pool, &cells);
1865
1866 for (i = 0; i < count; i++) {
1867 cell = pool->cell_sort_array[i];
1868 BUG_ON(!cell->holder);
1869
1870 /*
1871 * If we've got no free new_mapping structs, and processing
1872 * this bio might require one, we pause until there are some
1873 * prepared mappings to process.
1874 */
1875 if (ensure_next_mapping(pool)) {
1876 for (j = i; j < count; j++)
1877 list_add(&pool->cell_sort_array[j]->user_list, &cells);
1878
1879 spin_lock_irqsave(&tc->lock, flags);
1880 list_splice(&cells, &tc->deferred_cells);
1881 spin_unlock_irqrestore(&tc->lock, flags);
1882 return;
1883 }
1884
1885 if (cell->holder->bi_rw & REQ_DISCARD)
1886 pool->process_discard_cell(tc, cell);
1887 else
1888 pool->process_cell(tc, cell);
1889 }
1890 } while (!list_empty(&cells));
1891}
1892
1575static void thin_get(struct thin_c *tc); 1893static void thin_get(struct thin_c *tc);
1576static void thin_put(struct thin_c *tc); 1894static void thin_put(struct thin_c *tc);
1577 1895
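
process_thin_deferred_cells pulls up to CELL_SORT_ARRAY_SIZE cells off the per-thin list, sorts them by the holder bio's start sector, and processes them in that order; the bio path likewise refreshes the throttle and issues metadata prefetches every 128 bios ((count++ & 127) == 0). A standalone sketch of the comparator-over-pointers sort, using qsort in place of the kernel's sort():

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct cell { uint64_t sector; };       /* stands in for cell->holder->bi_iter.bi_sector */

static int cmp_cells(const void *lhs, const void *rhs)
{
        const struct cell *l = *(const struct cell *const *)lhs;
        const struct cell *r = *(const struct cell *const *)rhs;

        if (l->sector < r->sector)
                return -1;
        if (l->sector > r->sector)
                return 1;
        return 0;
}

int main(void)
{
        struct cell a = { 4096 }, b = { 128 }, c = { 2048 };
        struct cell *sort_array[] = { &a, &b, &c };
        unsigned i;

        qsort(sort_array, 3, sizeof(sort_array[0]), cmp_cells);

        for (i = 0; i < 3; i++)
                printf("%llu\n", (unsigned long long)sort_array[i]->sector);
        return 0;
}
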
@@ -1620,6 +1938,7 @@ static void process_deferred_bios(struct pool *pool)
1620 1938
1621 tc = get_first_thin(pool); 1939 tc = get_first_thin(pool);
1622 while (tc) { 1940 while (tc) {
1941 process_thin_deferred_cells(tc);
1623 process_thin_deferred_bios(tc); 1942 process_thin_deferred_bios(tc);
1624 tc = get_next_thin(pool, tc); 1943 tc = get_next_thin(pool, tc);
1625 } 1944 }
@@ -1653,9 +1972,15 @@ static void do_worker(struct work_struct *ws)
1653{ 1972{
1654 struct pool *pool = container_of(ws, struct pool, worker); 1973 struct pool *pool = container_of(ws, struct pool, worker);
1655 1974
1975 throttle_work_start(&pool->throttle);
1976 dm_pool_issue_prefetches(pool->pmd);
1977 throttle_work_update(&pool->throttle);
1656 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); 1978 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1979 throttle_work_update(&pool->throttle);
1657 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); 1980 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1981 throttle_work_update(&pool->throttle);
1658 process_deferred_bios(pool); 1982 process_deferred_bios(pool);
1983 throttle_work_complete(&pool->throttle);
1659} 1984}
1660 1985
1661/* 1986/*
@@ -1792,6 +2117,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1792 dm_pool_metadata_read_only(pool->pmd); 2117 dm_pool_metadata_read_only(pool->pmd);
1793 pool->process_bio = process_bio_fail; 2118 pool->process_bio = process_bio_fail;
1794 pool->process_discard = process_bio_fail; 2119 pool->process_discard = process_bio_fail;
2120 pool->process_cell = process_cell_fail;
2121 pool->process_discard_cell = process_cell_fail;
1795 pool->process_prepared_mapping = process_prepared_mapping_fail; 2122 pool->process_prepared_mapping = process_prepared_mapping_fail;
1796 pool->process_prepared_discard = process_prepared_discard_fail; 2123 pool->process_prepared_discard = process_prepared_discard_fail;
1797 2124
@@ -1804,6 +2131,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1804 dm_pool_metadata_read_only(pool->pmd); 2131 dm_pool_metadata_read_only(pool->pmd);
1805 pool->process_bio = process_bio_read_only; 2132 pool->process_bio = process_bio_read_only;
1806 pool->process_discard = process_bio_success; 2133 pool->process_discard = process_bio_success;
2134 pool->process_cell = process_cell_read_only;
2135 pool->process_discard_cell = process_cell_success;
1807 pool->process_prepared_mapping = process_prepared_mapping_fail; 2136 pool->process_prepared_mapping = process_prepared_mapping_fail;
1808 pool->process_prepared_discard = process_prepared_discard_passdown; 2137 pool->process_prepared_discard = process_prepared_discard_passdown;
1809 2138
@@ -1822,7 +2151,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1822 if (old_mode != new_mode) 2151 if (old_mode != new_mode)
1823 notify_of_pool_mode_change(pool, "out-of-data-space"); 2152 notify_of_pool_mode_change(pool, "out-of-data-space");
1824 pool->process_bio = process_bio_read_only; 2153 pool->process_bio = process_bio_read_only;
1825 pool->process_discard = process_discard; 2154 pool->process_discard = process_discard_bio;
2155 pool->process_cell = process_cell_read_only;
2156 pool->process_discard_cell = process_discard_cell;
1826 pool->process_prepared_mapping = process_prepared_mapping; 2157 pool->process_prepared_mapping = process_prepared_mapping;
1827 pool->process_prepared_discard = process_prepared_discard_passdown; 2158 pool->process_prepared_discard = process_prepared_discard_passdown;
1828 2159
@@ -1835,7 +2166,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1835 notify_of_pool_mode_change(pool, "write"); 2166 notify_of_pool_mode_change(pool, "write");
1836 dm_pool_metadata_read_write(pool->pmd); 2167 dm_pool_metadata_read_write(pool->pmd);
1837 pool->process_bio = process_bio; 2168 pool->process_bio = process_bio;
1838 pool->process_discard = process_discard; 2169 pool->process_discard = process_discard_bio;
2170 pool->process_cell = process_cell;
2171 pool->process_discard_cell = process_discard_cell;
1839 pool->process_prepared_mapping = process_prepared_mapping; 2172 pool->process_prepared_mapping = process_prepared_mapping;
1840 pool->process_prepared_discard = process_prepared_discard; 2173 pool->process_prepared_discard = process_prepared_discard;
1841 break; 2174 break;
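
set_pool_mode installs a consistent set of handlers (process_bio, process_cell, process_discard_cell, and so on) for each pool mode, so the hot IO path only calls through the pointers instead of testing the mode. A compact sketch of that strategy-table idea, with made-up mode names and handlers:

#include <stdio.h>

struct request { int dummy; };

typedef void (*handler_fn)(struct request *);

static void handle_rw(struct request *r)        { (void)r; printf("read-write path\n"); }
static void handle_read_only(struct request *r) { (void)r; printf("read-only path\n"); }
static void handle_fail(struct request *r)      { (void)r; printf("failing IO\n"); }

struct pool_ops {
        handler_fn process_bio;
        handler_fn process_discard;
};

enum mode { MODE_WRITE, MODE_READ_ONLY, MODE_FAIL };

static struct pool_ops ops;

static void set_mode(enum mode m)
{
        switch (m) {
        case MODE_WRITE:
                ops.process_bio = handle_rw;
                ops.process_discard = handle_rw;
                break;
        case MODE_READ_ONLY:
                ops.process_bio = handle_read_only;
                ops.process_discard = handle_read_only;
                break;
        case MODE_FAIL:
                ops.process_bio = handle_fail;
                ops.process_discard = handle_fail;
                break;
        }
}

int main(void)
{
        struct request r;

        set_mode(MODE_READ_ONLY);
        ops.process_bio(&r);            /* dispatches without checking the mode */
        return 0;
}
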
@@ -1895,6 +2228,29 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1895 wake_worker(pool); 2228 wake_worker(pool);
1896} 2229}
1897 2230
2231static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
2232{
2233 struct pool *pool = tc->pool;
2234
2235 throttle_lock(&pool->throttle);
2236 thin_defer_bio(tc, bio);
2237 throttle_unlock(&pool->throttle);
2238}
2239
2240static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2241{
2242 unsigned long flags;
2243 struct pool *pool = tc->pool;
2244
2245 throttle_lock(&pool->throttle);
2246 spin_lock_irqsave(&tc->lock, flags);
2247 list_add_tail(&cell->user_list, &tc->deferred_cells);
2248 spin_unlock_irqrestore(&tc->lock, flags);
2249 throttle_unlock(&pool->throttle);
2250
2251 wake_worker(pool);
2252}
2253
1898static void thin_hook_bio(struct thin_c *tc, struct bio *bio) 2254static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
1899{ 2255{
1900 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 2256 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
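
thin_defer_bio_with_throttle and thin_defer_cell wrap the deferral in throttle_lock/throttle_unlock so the worker can hold submitters back when it falls behind. A userspace analogue using a pthread rwlock (shared side for submitters, exclusive side for the worker); this only sketches the idea, not the driver's throttle implementation:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t throttle = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static int queued;

static void defer_work(void)
{
        pthread_rwlock_rdlock(&throttle);       /* cheap in the common case */
        pthread_mutex_lock(&queue_lock);
        queued++;
        pthread_mutex_unlock(&queue_lock);
        pthread_rwlock_unlock(&throttle);
}

static void worker_backpressure(void)
{
        /* the worker takes the throttle exclusively: submitters block here
         * until the backlog has been drained */
        pthread_rwlock_wrlock(&throttle);
        printf("draining %d deferred items\n", queued);
        queued = 0;
        pthread_rwlock_unlock(&throttle);
}

int main(void)
{
        defer_work();
        defer_work();
        worker_backpressure();
        return 0;
}
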
@@ -1915,8 +2271,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1915 dm_block_t block = get_bio_block(tc, bio); 2271 dm_block_t block = get_bio_block(tc, bio);
1916 struct dm_thin_device *td = tc->td; 2272 struct dm_thin_device *td = tc->td;
1917 struct dm_thin_lookup_result result; 2273 struct dm_thin_lookup_result result;
1918 struct dm_bio_prison_cell cell1, cell2; 2274 struct dm_bio_prison_cell *virt_cell, *data_cell;
1919 struct dm_bio_prison_cell *cell_result;
1920 struct dm_cell_key key; 2275 struct dm_cell_key key;
1921 2276
1922 thin_hook_bio(tc, bio); 2277 thin_hook_bio(tc, bio);
@@ -1932,7 +2287,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1932 } 2287 }
1933 2288
1934 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { 2289 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1935 thin_defer_bio(tc, bio); 2290 thin_defer_bio_with_throttle(tc, bio);
1936 return DM_MAPIO_SUBMITTED; 2291 return DM_MAPIO_SUBMITTED;
1937 } 2292 }
1938 2293
@@ -1941,7 +2296,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1941 * there's a race with discard. 2296 * there's a race with discard.
1942 */ 2297 */
1943 build_virtual_key(tc->td, block, &key); 2298 build_virtual_key(tc->td, block, &key);
1944 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) 2299 if (bio_detain(tc->pool, &key, bio, &virt_cell))
1945 return DM_MAPIO_SUBMITTED; 2300 return DM_MAPIO_SUBMITTED;
1946 2301
1947 r = dm_thin_find_block(td, block, 0, &result); 2302 r = dm_thin_find_block(td, block, 0, &result);
@@ -1966,20 +2321,19 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1966 * More distant ancestors are irrelevant. The 2321 * More distant ancestors are irrelevant. The
1967 * shared flag will be set in their case. 2322 * shared flag will be set in their case.
1968 */ 2323 */
1969 thin_defer_bio(tc, bio); 2324 thin_defer_cell(tc, virt_cell);
1970 cell_defer_no_holder_no_free(tc, &cell1);
1971 return DM_MAPIO_SUBMITTED; 2325 return DM_MAPIO_SUBMITTED;
1972 } 2326 }
1973 2327
1974 build_data_key(tc->td, result.block, &key); 2328 build_data_key(tc->td, result.block, &key);
1975 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) { 2329 if (bio_detain(tc->pool, &key, bio, &data_cell)) {
1976 cell_defer_no_holder_no_free(tc, &cell1); 2330 cell_defer_no_holder(tc, virt_cell);
1977 return DM_MAPIO_SUBMITTED; 2331 return DM_MAPIO_SUBMITTED;
1978 } 2332 }
1979 2333
1980 inc_all_io_entry(tc->pool, bio); 2334 inc_all_io_entry(tc->pool, bio);
1981 cell_defer_no_holder_no_free(tc, &cell2); 2335 cell_defer_no_holder(tc, data_cell);
1982 cell_defer_no_holder_no_free(tc, &cell1); 2336 cell_defer_no_holder(tc, virt_cell);
1983 2337
1984 remap(tc, bio, result.block); 2338 remap(tc, bio, result.block);
1985 return DM_MAPIO_REMAPPED; 2339 return DM_MAPIO_REMAPPED;
@@ -1991,18 +2345,13 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1991 * of doing so. 2345 * of doing so.
1992 */ 2346 */
1993 handle_unserviceable_bio(tc->pool, bio); 2347 handle_unserviceable_bio(tc->pool, bio);
1994 cell_defer_no_holder_no_free(tc, &cell1); 2348 cell_defer_no_holder(tc, virt_cell);
1995 return DM_MAPIO_SUBMITTED; 2349 return DM_MAPIO_SUBMITTED;
1996 } 2350 }
1997 /* fall through */ 2351 /* fall through */
1998 2352
1999 case -EWOULDBLOCK: 2353 case -EWOULDBLOCK:
2000 /* 2354 thin_defer_cell(tc, virt_cell);
2001 * In future, the failed dm_thin_find_block above could
2002 * provide the hint to load the metadata into cache.
2003 */
2004 thin_defer_bio(tc, bio);
2005 cell_defer_no_holder_no_free(tc, &cell1);
2006 return DM_MAPIO_SUBMITTED; 2355 return DM_MAPIO_SUBMITTED;
2007 2356
2008 default: 2357 default:
@@ -2012,7 +2361,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
2012 * pool is switched to fail-io mode. 2361 * pool is switched to fail-io mode.
2013 */ 2362 */
2014 bio_io_error(bio); 2363 bio_io_error(bio);
2015 cell_defer_no_holder_no_free(tc, &cell1); 2364 cell_defer_no_holder(tc, virt_cell);
2016 return DM_MAPIO_SUBMITTED; 2365 return DM_MAPIO_SUBMITTED;
2017 } 2366 }
2018} 2367}
@@ -2193,7 +2542,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
2193 pool->sectors_per_block_shift = __ffs(block_size); 2542 pool->sectors_per_block_shift = __ffs(block_size);
2194 pool->low_water_blocks = 0; 2543 pool->low_water_blocks = 0;
2195 pool_features_init(&pool->pf); 2544 pool_features_init(&pool->pf);
2196 pool->prison = dm_bio_prison_create(PRISON_CELLS); 2545 pool->prison = dm_bio_prison_create();
2197 if (!pool->prison) { 2546 if (!pool->prison) {
2198 *error = "Error creating pool's bio prison"; 2547 *error = "Error creating pool's bio prison";
2199 err_p = ERR_PTR(-ENOMEM); 2548 err_p = ERR_PTR(-ENOMEM);
@@ -2219,6 +2568,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
2219 goto bad_wq; 2568 goto bad_wq;
2220 } 2569 }
2221 2570
2571 throttle_init(&pool->throttle);
2222 INIT_WORK(&pool->worker, do_worker); 2572 INIT_WORK(&pool->worker, do_worker);
2223 INIT_DELAYED_WORK(&pool->waker, do_waker); 2573 INIT_DELAYED_WORK(&pool->waker, do_waker);
2224 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); 2574 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
@@ -2228,6 +2578,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
2228 INIT_LIST_HEAD(&pool->prepared_discards); 2578 INIT_LIST_HEAD(&pool->prepared_discards);
2229 INIT_LIST_HEAD(&pool->active_thins); 2579 INIT_LIST_HEAD(&pool->active_thins);
2230 pool->low_water_triggered = false; 2580 pool->low_water_triggered = false;
2581 pool->suspended = true;
2231 2582
2232 pool->shared_read_ds = dm_deferred_set_create(); 2583 pool->shared_read_ds = dm_deferred_set_create();
2233 if (!pool->shared_read_ds) { 2584 if (!pool->shared_read_ds) {
@@ -2764,20 +3115,77 @@ static int pool_preresume(struct dm_target *ti)
2764 return 0; 3115 return 0;
2765} 3116}
2766 3117
3118static void pool_suspend_active_thins(struct pool *pool)
3119{
3120 struct thin_c *tc;
3121
3122 /* Suspend all active thin devices */
3123 tc = get_first_thin(pool);
3124 while (tc) {
3125 dm_internal_suspend_noflush(tc->thin_md);
3126 tc = get_next_thin(pool, tc);
3127 }
3128}
3129
3130static void pool_resume_active_thins(struct pool *pool)
3131{
3132 struct thin_c *tc;
3133
3134 /* Resume all active thin devices */
3135 tc = get_first_thin(pool);
3136 while (tc) {
3137 dm_internal_resume(tc->thin_md);
3138 tc = get_next_thin(pool, tc);
3139 }
3140}
3141
2767static void pool_resume(struct dm_target *ti) 3142static void pool_resume(struct dm_target *ti)
2768{ 3143{
2769 struct pool_c *pt = ti->private; 3144 struct pool_c *pt = ti->private;
2770 struct pool *pool = pt->pool; 3145 struct pool *pool = pt->pool;
2771 unsigned long flags; 3146 unsigned long flags;
2772 3147
3148 /*
3149 * Must requeue active_thins' bios and then resume
3150 * active_thins _before_ clearing 'suspend' flag.
3151 */
3152 requeue_bios(pool);
3153 pool_resume_active_thins(pool);
3154
2773 spin_lock_irqsave(&pool->lock, flags); 3155 spin_lock_irqsave(&pool->lock, flags);
2774 pool->low_water_triggered = false; 3156 pool->low_water_triggered = false;
3157 pool->suspended = false;
2775 spin_unlock_irqrestore(&pool->lock, flags); 3158 spin_unlock_irqrestore(&pool->lock, flags);
2776 requeue_bios(pool);
2777 3159
2778 do_waker(&pool->waker.work); 3160 do_waker(&pool->waker.work);
2779} 3161}
2780 3162
3163static void pool_presuspend(struct dm_target *ti)
3164{
3165 struct pool_c *pt = ti->private;
3166 struct pool *pool = pt->pool;
3167 unsigned long flags;
3168
3169 spin_lock_irqsave(&pool->lock, flags);
3170 pool->suspended = true;
3171 spin_unlock_irqrestore(&pool->lock, flags);
3172
3173 pool_suspend_active_thins(pool);
3174}
3175
3176static void pool_presuspend_undo(struct dm_target *ti)
3177{
3178 struct pool_c *pt = ti->private;
3179 struct pool *pool = pt->pool;
3180 unsigned long flags;
3181
3182 pool_resume_active_thins(pool);
3183
3184 spin_lock_irqsave(&pool->lock, flags);
3185 pool->suspended = false;
3186 spin_unlock_irqrestore(&pool->lock, flags);
3187}
3188
2781static void pool_postsuspend(struct dm_target *ti) 3189static void pool_postsuspend(struct dm_target *ti)
2782{ 3190{
2783 struct pool_c *pt = ti->private; 3191 struct pool_c *pt = ti->private;
@@ -2949,7 +3357,6 @@ static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct
2949 * create_thin <dev_id> 3357 * create_thin <dev_id>
2950 * create_snap <dev_id> <origin_id> 3358 * create_snap <dev_id> <origin_id>
2951 * delete <dev_id> 3359 * delete <dev_id>
2952 * trim <dev_id> <new_size_in_sectors>
2953 * set_transaction_id <current_trans_id> <new_trans_id> 3360 * set_transaction_id <current_trans_id> <new_trans_id>
2954 * reserve_metadata_snap 3361 * reserve_metadata_snap
2955 * release_metadata_snap 3362 * release_metadata_snap
@@ -3177,15 +3584,35 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
3177{ 3584{
3178 struct pool_c *pt = ti->private; 3585 struct pool_c *pt = ti->private;
3179 struct pool *pool = pt->pool; 3586 struct pool *pool = pt->pool;
3180 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3587 sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3588
3589 /*
3590 * If max_sectors is smaller than pool->sectors_per_block adjust it
3591 * to the highest possible power-of-2 factor of pool->sectors_per_block.
3592 * This is especially beneficial when the pool's data device is a RAID
3593 * device that has a full stripe width that matches pool->sectors_per_block
3594 * -- because even though partial RAID stripe-sized IOs will be issued to a
3595 * single RAID stripe; when aggregated they will end on a full RAID stripe
3596 * boundary.. which avoids additional partial RAID stripe writes cascading
3597 */
3598 if (limits->max_sectors < pool->sectors_per_block) {
3599 while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
3600 if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
3601 limits->max_sectors--;
3602 limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
3603 }
3604 }
3181 3605
3182 /* 3606 /*
3183 * If the system-determined stacked limits are compatible with the 3607 * If the system-determined stacked limits are compatible with the
3184 * pool's blocksize (io_opt is a factor) do not override them. 3608 * pool's blocksize (io_opt is a factor) do not override them.
3185 */ 3609 */
3186 if (io_opt_sectors < pool->sectors_per_block || 3610 if (io_opt_sectors < pool->sectors_per_block ||
3187 do_div(io_opt_sectors, pool->sectors_per_block)) { 3611 !is_factor(io_opt_sectors, pool->sectors_per_block)) {
3188 blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); 3612 if (is_factor(pool->sectors_per_block, limits->max_sectors))
3613 blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
3614 else
3615 blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
3189 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 3616 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
3190 } 3617 }
3191 3618
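
The new pool_io_hints logic shrinks limits->max_sectors to the largest power-of-two factor of pool->sectors_per_block, so bios split on max_sectors still aggregate into whole thinp blocks (and whole RAID stripes). A standalone sketch with made-up sizes -- a 384-sector block and a 256-sector queue limit -- which the loop reduces to 128:

#include <stdio.h>

static unsigned rounddown_pow_of_two(unsigned n)
{
        unsigned p = 1;

        while (p * 2 <= n)
                p *= 2;
        return p;
}

static int is_factor(unsigned block_size, unsigned n)
{
        return !(block_size % n);
}

int main(void)
{
        unsigned sectors_per_block = 384;       /* hypothetical thinp block size */
        unsigned max_sectors = 256;             /* hypothetical queue limit */

        if (max_sectors < sectors_per_block) {
                while (!is_factor(sectors_per_block, max_sectors)) {
                        /* if already a power of two, step below it first */
                        if ((max_sectors & (max_sectors - 1)) == 0)
                                max_sectors--;
                        max_sectors = rounddown_pow_of_two(max_sectors);
                }
        }

        printf("adjusted max_sectors = %u\n", max_sectors);     /* prints 128 */
        return 0;
}
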
@@ -3214,11 +3641,13 @@ static struct target_type pool_target = {
3214 .name = "thin-pool", 3641 .name = "thin-pool",
3215 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 3642 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
3216 DM_TARGET_IMMUTABLE, 3643 DM_TARGET_IMMUTABLE,
3217 .version = {1, 13, 0}, 3644 .version = {1, 14, 0},
3218 .module = THIS_MODULE, 3645 .module = THIS_MODULE,
3219 .ctr = pool_ctr, 3646 .ctr = pool_ctr,
3220 .dtr = pool_dtr, 3647 .dtr = pool_dtr,
3221 .map = pool_map, 3648 .map = pool_map,
3649 .presuspend = pool_presuspend,
3650 .presuspend_undo = pool_presuspend_undo,
3222 .postsuspend = pool_postsuspend, 3651 .postsuspend = pool_postsuspend,
3223 .preresume = pool_preresume, 3652 .preresume = pool_preresume,
3224 .resume = pool_resume, 3653 .resume = pool_resume,
@@ -3248,14 +3677,14 @@ static void thin_dtr(struct dm_target *ti)
3248 struct thin_c *tc = ti->private; 3677 struct thin_c *tc = ti->private;
3249 unsigned long flags; 3678 unsigned long flags;
3250 3679
3251 thin_put(tc);
3252 wait_for_completion(&tc->can_destroy);
3253
3254 spin_lock_irqsave(&tc->pool->lock, flags); 3680 spin_lock_irqsave(&tc->pool->lock, flags);
3255 list_del_rcu(&tc->list); 3681 list_del_rcu(&tc->list);
3256 spin_unlock_irqrestore(&tc->pool->lock, flags); 3682 spin_unlock_irqrestore(&tc->pool->lock, flags);
3257 synchronize_rcu(); 3683 synchronize_rcu();
3258 3684
3685 thin_put(tc);
3686 wait_for_completion(&tc->can_destroy);
3687
3259 mutex_lock(&dm_thin_pool_table.mutex); 3688 mutex_lock(&dm_thin_pool_table.mutex);
3260 3689
3261 __pool_dec(tc->pool); 3690 __pool_dec(tc->pool);
@@ -3302,7 +3731,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3302 r = -ENOMEM; 3731 r = -ENOMEM;
3303 goto out_unlock; 3732 goto out_unlock;
3304 } 3733 }
3734 tc->thin_md = dm_table_get_md(ti->table);
3305 spin_lock_init(&tc->lock); 3735 spin_lock_init(&tc->lock);
3736 INIT_LIST_HEAD(&tc->deferred_cells);
3306 bio_list_init(&tc->deferred_bio_list); 3737 bio_list_init(&tc->deferred_bio_list);
3307 bio_list_init(&tc->retry_on_resume_list); 3738 bio_list_init(&tc->retry_on_resume_list);
3308 tc->sort_bio_list = RB_ROOT; 3739 tc->sort_bio_list = RB_ROOT;
@@ -3347,18 +3778,18 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3347 if (get_pool_mode(tc->pool) == PM_FAIL) { 3778 if (get_pool_mode(tc->pool) == PM_FAIL) {
3348 ti->error = "Couldn't open thin device, Pool is in fail mode"; 3779 ti->error = "Couldn't open thin device, Pool is in fail mode";
3349 r = -EINVAL; 3780 r = -EINVAL;
3350 goto bad_thin_open; 3781 goto bad_pool;
3351 } 3782 }
3352 3783
3353 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); 3784 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
3354 if (r) { 3785 if (r) {
3355 ti->error = "Couldn't open thin internal device"; 3786 ti->error = "Couldn't open thin internal device";
3356 goto bad_thin_open; 3787 goto bad_pool;
3357 } 3788 }
3358 3789
3359 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); 3790 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
3360 if (r) 3791 if (r)
3361 goto bad_target_max_io_len; 3792 goto bad;
3362 3793
3363 ti->num_flush_bios = 1; 3794 ti->num_flush_bios = 1;
3364 ti->flush_supported = true; 3795 ti->flush_supported = true;
@@ -3373,14 +3804,16 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3373 ti->split_discard_bios = true; 3804 ti->split_discard_bios = true;
3374 } 3805 }
3375 3806
3376 dm_put(pool_md);
3377
3378 mutex_unlock(&dm_thin_pool_table.mutex); 3807 mutex_unlock(&dm_thin_pool_table.mutex);
3379 3808
3380 atomic_set(&tc->refcount, 1);
3381 init_completion(&tc->can_destroy);
3382
3383 spin_lock_irqsave(&tc->pool->lock, flags); 3809 spin_lock_irqsave(&tc->pool->lock, flags);
3810 if (tc->pool->suspended) {
3811 spin_unlock_irqrestore(&tc->pool->lock, flags);
3812 mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
3813 ti->error = "Unable to activate thin device while pool is suspended";
3814 r = -EINVAL;
3815 goto bad;
3816 }
3384 list_add_tail_rcu(&tc->list, &tc->pool->active_thins); 3817 list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
3385 spin_unlock_irqrestore(&tc->pool->lock, flags); 3818 spin_unlock_irqrestore(&tc->pool->lock, flags);
3386 /* 3819 /*
@@ -3391,11 +3824,16 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3391 */ 3824 */
3392 synchronize_rcu(); 3825 synchronize_rcu();
3393 3826
3827 dm_put(pool_md);
3828
3829 atomic_set(&tc->refcount, 1);
3830 init_completion(&tc->can_destroy);
3831
3394 return 0; 3832 return 0;
3395 3833
3396bad_target_max_io_len: 3834bad:
3397 dm_pool_close_thin_device(tc->td); 3835 dm_pool_close_thin_device(tc->td);
3398bad_thin_open: 3836bad_pool:
3399 __pool_dec(tc->pool); 3837 __pool_dec(tc->pool);
3400bad_pool_lookup: 3838bad_pool_lookup:
3401 dm_put(pool_md); 3839 dm_put(pool_md);
@@ -3541,6 +3979,21 @@ err:
3541 DMEMIT("Error"); 3979 DMEMIT("Error");
3542} 3980}
3543 3981
3982static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
3983 struct bio_vec *biovec, int max_size)
3984{
3985 struct thin_c *tc = ti->private;
3986 struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev);
3987
3988 if (!q->merge_bvec_fn)
3989 return max_size;
3990
3991 bvm->bi_bdev = tc->pool_dev->bdev;
3992 bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
3993
3994 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3995}
3996
3544static int thin_iterate_devices(struct dm_target *ti, 3997static int thin_iterate_devices(struct dm_target *ti,
3545 iterate_devices_callout_fn fn, void *data) 3998 iterate_devices_callout_fn fn, void *data)
3546{ 3999{
@@ -3565,7 +4018,7 @@ static int thin_iterate_devices(struct dm_target *ti,
3565 4018
3566static struct target_type thin_target = { 4019static struct target_type thin_target = {
3567 .name = "thin", 4020 .name = "thin",
3568 .version = {1, 13, 0}, 4021 .version = {1, 14, 0},
3569 .module = THIS_MODULE, 4022 .module = THIS_MODULE,
3570 .ctr = thin_ctr, 4023 .ctr = thin_ctr,
3571 .dtr = thin_dtr, 4024 .dtr = thin_dtr,
@@ -3575,6 +4028,7 @@ static struct target_type thin_target = {
3575 .presuspend = thin_presuspend, 4028 .presuspend = thin_presuspend,
3576 .postsuspend = thin_postsuspend, 4029 .postsuspend = thin_postsuspend,
3577 .status = thin_status, 4030 .status = thin_status,
4031 .merge = thin_merge,
3578 .iterate_devices = thin_iterate_devices, 4032 .iterate_devices = thin_iterate_devices,
3579}; 4033};
3580 4034
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 58f3927fd7cc..8f37ed215b19 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -19,6 +19,7 @@
19#include <linux/idr.h> 19#include <linux/idr.h>
20#include <linux/hdreg.h> 20#include <linux/hdreg.h>
21#include <linux/delay.h> 21#include <linux/delay.h>
22#include <linux/wait.h>
22 23
23#include <trace/events/block.h> 24#include <trace/events/block.h>
24 25
@@ -117,6 +118,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
117#define DMF_NOFLUSH_SUSPENDING 5 118#define DMF_NOFLUSH_SUSPENDING 5
118#define DMF_MERGE_IS_OPTIONAL 6 119#define DMF_MERGE_IS_OPTIONAL 6
119#define DMF_DEFERRED_REMOVE 7 120#define DMF_DEFERRED_REMOVE 7
121#define DMF_SUSPENDED_INTERNALLY 8
120 122
121/* 123/*
122 * A dummy definition to make RCU happy. 124 * A dummy definition to make RCU happy.
@@ -140,7 +142,7 @@ struct mapped_device {
140 * Use dm_get_live_table{_fast} or take suspend_lock for 142 * Use dm_get_live_table{_fast} or take suspend_lock for
141 * dereference. 143 * dereference.
142 */ 144 */
143 struct dm_table *map; 145 struct dm_table __rcu *map;
144 146
145 struct list_head table_devices; 147 struct list_head table_devices;
146 struct mutex table_devices_lock; 148 struct mutex table_devices_lock;
@@ -525,14 +527,15 @@ retry:
525 goto out; 527 goto out;
526 528
527 tgt = dm_table_get_target(map, 0); 529 tgt = dm_table_get_target(map, 0);
530 if (!tgt->type->ioctl)
531 goto out;
528 532
529 if (dm_suspended_md(md)) { 533 if (dm_suspended_md(md)) {
530 r = -EAGAIN; 534 r = -EAGAIN;
531 goto out; 535 goto out;
532 } 536 }
533 537
534 if (tgt->type->ioctl) 538 r = tgt->type->ioctl(tgt, cmd, arg);
535 r = tgt->type->ioctl(tgt, cmd, arg);
536 539
537out: 540out:
538 dm_put_live_table(md, srcu_idx); 541 dm_put_live_table(md, srcu_idx);
@@ -1607,9 +1610,9 @@ static int dm_merge_bvec(struct request_queue *q,
1607 * Find maximum amount of I/O that won't need splitting 1610 * Find maximum amount of I/O that won't need splitting
1608 */ 1611 */
1609 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1612 max_sectors = min(max_io_len(bvm->bi_sector, ti),
1610 (sector_t) BIO_MAX_SECTORS); 1613 (sector_t) queue_max_sectors(q));
1611 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1614 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1612 if (max_size < 0) 1615 if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
1613 max_size = 0; 1616 max_size = 0;
1614 1617
1615 /* 1618 /*
@@ -1621,10 +1624,10 @@ static int dm_merge_bvec(struct request_queue *q,
1621 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1624 max_size = ti->type->merge(ti, bvm, biovec, max_size);
1622 /* 1625 /*
1623 * If the target doesn't support merge method and some of the devices 1626 * If the target doesn't support merge method and some of the devices
1624 * provided their merge_bvec method (we know this by looking at 1627 * provided their merge_bvec method (we know this by looking for the
1625 * queue_max_hw_sectors), then we can't allow bios with multiple vector 1628 * max_hw_sectors that dm_set_device_limits may set), then we can't
1626 * entries. So always set max_size to 0, and the code below allows 1629 * allow bios with multiple vector entries. So always set max_size
1627 * just one page. 1630 * to 0, and the code below allows just one page.
1628 */ 1631 */
1629 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1632 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1630 max_size = 0; 1633 max_size = 0;
@@ -2332,7 +2335,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2332 2335
2333 merge_is_optional = dm_table_merge_is_optional(t); 2336 merge_is_optional = dm_table_merge_is_optional(t);
2334 2337
2335 old_map = md->map; 2338 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2336 rcu_assign_pointer(md->map, t); 2339 rcu_assign_pointer(md->map, t);
2337 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2340 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2338 2341
@@ -2341,7 +2344,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2341 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2344 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2342 else 2345 else
2343 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2346 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2344 dm_sync_table(md); 2347 if (old_map)
2348 dm_sync_table(md);
2345 2349
2346 return old_map; 2350 return old_map;
2347} 2351}
@@ -2351,7 +2355,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2351 */ 2355 */
2352static struct dm_table *__unbind(struct mapped_device *md) 2356static struct dm_table *__unbind(struct mapped_device *md)
2353{ 2357{
2354 struct dm_table *map = md->map; 2358 struct dm_table *map = rcu_dereference_protected(md->map, 1);
2355 2359
2356 if (!map) 2360 if (!map)
2357 return NULL; 2361 return NULL;
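
__bind and __unbind now use rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)) for updater-side access to the RCU-managed table pointer. A much-simplified C11-atomics analogue of that split between lock-free readers and a lock-protected updater (this is not the kernel RCU API; the lock-held flag stands in for lockdep):

#include <stdatomic.h>
#include <pthread.h>
#include <assert.h>
#include <stdio.h>

struct table { int size; };

static _Atomic(struct table *) live_map;
static pthread_mutex_t suspend_lock = PTHREAD_MUTEX_INITIALIZER;
static int suspend_lock_held;           /* stand-in for lockdep_is_held() */

static struct table *deref_protected(void)
{
        assert(suspend_lock_held);      /* caller must hold suspend_lock */
        return atomic_load_explicit(&live_map, memory_order_relaxed);
}

static struct table *bind_table(struct table *t)
{
        struct table *old;

        pthread_mutex_lock(&suspend_lock);
        suspend_lock_held = 1;
        old = deref_protected();
        atomic_store_explicit(&live_map, t, memory_order_release);
        suspend_lock_held = 0;
        pthread_mutex_unlock(&suspend_lock);
        return old;
}

int main(void)
{
        struct table t1 = { 100 }, t2 = { 200 };

        bind_table(&t1);
        printf("old map: %p\n", (void *)bind_table(&t2));

        /* fast-path readers would use an acquire load */
        printf("live size: %d\n",
               atomic_load_explicit(&live_map, memory_order_acquire)->size);
        return 0;
}
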
@@ -2716,36 +2720,18 @@ static void unlock_fs(struct mapped_device *md)
2716} 2720}
2717 2721
2718/* 2722/*
2719 * We need to be able to change a mapping table under a mounted 2723 * If __dm_suspend returns 0, the device is completely quiescent
2720 * filesystem. For example we might want to move some data in 2724 * now. There is no request-processing activity. All new requests
2721 * the background. Before the table can be swapped with 2725 * are being added to md->deferred list.
2722 * dm_bind_table, dm_suspend must be called to flush any in
2723 * flight bios and ensure that any further io gets deferred.
2724 */
2725/*
2726 * Suspend mechanism in request-based dm.
2727 * 2726 *
2728 * 1. Flush all I/Os by lock_fs() if needed. 2727 * Caller must hold md->suspend_lock
2729 * 2. Stop dispatching any I/O by stopping the request_queue.
2730 * 3. Wait for all in-flight I/Os to be completed or requeued.
2731 *
2732 * To abort suspend, start the request_queue.
2733 */ 2728 */
2734int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2729static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2730 unsigned suspend_flags, int interruptible)
2735{ 2731{
2736 struct dm_table *map = NULL; 2732 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2737 int r = 0; 2733 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2738 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; 2734 int r;
2739 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
2740
2741 mutex_lock(&md->suspend_lock);
2742
2743 if (dm_suspended_md(md)) {
2744 r = -EINVAL;
2745 goto out_unlock;
2746 }
2747
2748 map = md->map;
2749 2735
2750 /* 2736 /*
2751 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2737 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2754,7 +2740,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2754 if (noflush) 2740 if (noflush)
2755 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2741 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2756 2742
2757 /* This does not get reverted if there's an error later. */ 2743 /*
2744 * This gets reverted if there's an error later and the targets
2745 * provide the .presuspend_undo hook.
2746 */
2758 dm_table_presuspend_targets(map); 2747 dm_table_presuspend_targets(map);
2759 2748
2760 /* 2749 /*
@@ -2765,8 +2754,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2765 */ 2754 */
2766 if (!noflush && do_lockfs) { 2755 if (!noflush && do_lockfs) {
2767 r = lock_fs(md); 2756 r = lock_fs(md);
2768 if (r) 2757 if (r) {
2769 goto out_unlock; 2758 dm_table_presuspend_undo_targets(map);
2759 return r;
2760 }
2770 } 2761 }
2771 2762
2772 /* 2763 /*
@@ -2782,7 +2773,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2782 * flush_workqueue(md->wq). 2773 * flush_workqueue(md->wq).
2783 */ 2774 */
2784 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2775 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2785 synchronize_srcu(&md->io_barrier); 2776 if (map)
2777 synchronize_srcu(&md->io_barrier);
2786 2778
2787 /* 2779 /*
2788 * Stop md->queue before flushing md->wq in case request-based 2780 * Stop md->queue before flushing md->wq in case request-based
@@ -2798,11 +2790,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2798 * We call dm_wait_for_completion to wait for all existing requests 2790 * We call dm_wait_for_completion to wait for all existing requests
2799 * to finish. 2791 * to finish.
2800 */ 2792 */
2801 r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); 2793 r = dm_wait_for_completion(md, interruptible);
2802 2794
2803 if (noflush) 2795 if (noflush)
2804 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2796 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2805 synchronize_srcu(&md->io_barrier); 2797 if (map)
2798 synchronize_srcu(&md->io_barrier);
2806 2799
2807 /* were we interrupted ? */ 2800 /* were we interrupted ? */
2808 if (r < 0) { 2801 if (r < 0) {
@@ -2812,14 +2805,56 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2812 start_queue(md->queue); 2805 start_queue(md->queue);
2813 2806
2814 unlock_fs(md); 2807 unlock_fs(md);
2815 goto out_unlock; /* pushback list is already flushed, so skip flush */ 2808 dm_table_presuspend_undo_targets(map);
2809 /* pushback list is already flushed, so skip flush */
2816 } 2810 }
2817 2811
2818 /* 2812 return r;
2819 * If dm_wait_for_completion returned 0, the device is completely 2813}
2820 * quiescent now. There is no request-processing activity. All new 2814
2821 * requests are being added to md->deferred list. 2815/*
2822 */ 2816 * We need to be able to change a mapping table under a mounted
2817 * filesystem. For example we might want to move some data in
2818 * the background. Before the table can be swapped with
2819 * dm_bind_table, dm_suspend must be called to flush any in
2820 * flight bios and ensure that any further io gets deferred.
2821 */
2822/*
2823 * Suspend mechanism in request-based dm.
2824 *
2825 * 1. Flush all I/Os by lock_fs() if needed.
2826 * 2. Stop dispatching any I/O by stopping the request_queue.
2827 * 3. Wait for all in-flight I/Os to be completed or requeued.
2828 *
2829 * To abort suspend, start the request_queue.
2830 */
2831int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2832{
2833 struct dm_table *map = NULL;
2834 int r = 0;
2835
2836retry:
2837 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2838
2839 if (dm_suspended_md(md)) {
2840 r = -EINVAL;
2841 goto out_unlock;
2842 }
2843
2844 if (dm_suspended_internally_md(md)) {
2845 /* already internally suspended, wait for internal resume */
2846 mutex_unlock(&md->suspend_lock);
2847 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2848 if (r)
2849 return r;
2850 goto retry;
2851 }
2852
2853 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2854
2855 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
2856 if (r)
2857 goto out_unlock;
2823 2858
2824 set_bit(DMF_SUSPENDED, &md->flags); 2859 set_bit(DMF_SUSPENDED, &md->flags);
2825 2860
@@ -2830,22 +2865,13 @@ out_unlock:
2830 return r; 2865 return r;
2831} 2866}
2832 2867
2833int dm_resume(struct mapped_device *md) 2868static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2834{ 2869{
2835 int r = -EINVAL; 2870 if (map) {
2836 struct dm_table *map = NULL; 2871 int r = dm_table_resume_targets(map);
2837 2872 if (r)
2838 mutex_lock(&md->suspend_lock); 2873 return r;
2839 if (!dm_suspended_md(md)) 2874 }
2840 goto out;
2841
2842 map = md->map;
2843 if (!map || !dm_table_get_size(map))
2844 goto out;
2845
2846 r = dm_table_resume_targets(map);
2847 if (r)
2848 goto out;
2849 2875
2850 dm_queue_flush(md); 2876 dm_queue_flush(md);
2851 2877
@@ -2859,6 +2885,37 @@ int dm_resume(struct mapped_device *md)
2859 2885
2860 unlock_fs(md); 2886 unlock_fs(md);
2861 2887
2888 return 0;
2889}
2890
2891int dm_resume(struct mapped_device *md)
2892{
2893 int r = -EINVAL;
2894 struct dm_table *map = NULL;
2895
2896retry:
2897 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2898
2899 if (!dm_suspended_md(md))
2900 goto out;
2901
2902 if (dm_suspended_internally_md(md)) {
2903 /* already internally suspended, wait for internal resume */
2904 mutex_unlock(&md->suspend_lock);
2905 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2906 if (r)
2907 return r;
2908 goto retry;
2909 }
2910
2911 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2912 if (!map || !dm_table_get_size(map))
2913 goto out;
2914
2915 r = __dm_resume(md, map);
2916 if (r)
2917 goto out;
2918
2862 clear_bit(DMF_SUSPENDED, &md->flags); 2919 clear_bit(DMF_SUSPENDED, &md->flags);
2863 2920
2864 r = 0; 2921 r = 0;
@@ -2872,15 +2929,80 @@ out:
2872 * Internal suspend/resume works like userspace-driven suspend. It waits 2929 * Internal suspend/resume works like userspace-driven suspend. It waits
2873 * until all bios finish and prevents issuing new bios to the target drivers. 2930 * until all bios finish and prevents issuing new bios to the target drivers.
2874 * It may be used only from the kernel. 2931 * It may be used only from the kernel.
2875 *
2876 * Internal suspend holds md->suspend_lock, which prevents interaction with
2877 * userspace-driven suspend.
2878 */ 2932 */
2879 2933
2880void dm_internal_suspend(struct mapped_device *md) 2934static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2881{ 2935{
2882 mutex_lock(&md->suspend_lock); 2936 struct dm_table *map = NULL;
2937
2938 if (dm_suspended_internally_md(md))
2939 return; /* nested internal suspend */
2940
2941 if (dm_suspended_md(md)) {
2942 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2943 return; /* nest suspend */
2944 }
2945
2946 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2947
2948 /*
2949 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2950 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend
2951 * would require changing .presuspend to return an error -- avoid this
2952 * until there is a need for more elaborate variants of internal suspend.
2953 */
2954 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE);
2955
2956 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2957
2958 dm_table_postsuspend_targets(map);
2959}
2960
2961static void __dm_internal_resume(struct mapped_device *md)
2962{
2963 if (!dm_suspended_internally_md(md))
2964 return; /* resume from nested internal suspend */
2965
2883 if (dm_suspended_md(md)) 2966 if (dm_suspended_md(md))
2967 goto done; /* resume from nested suspend */
2968
2969 /*
2970 * NOTE: existing callers don't need to call dm_table_resume_targets
2971 * (which may fail -- so best to avoid it for now by passing NULL map)
2972 */
2973 (void) __dm_resume(md, NULL);
2974
2975done:
2976 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2977 smp_mb__after_atomic();
2978 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2979}
2980
2981void dm_internal_suspend_noflush(struct mapped_device *md)
2982{
2983 mutex_lock(&md->suspend_lock);
2984 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2985 mutex_unlock(&md->suspend_lock);
2986}
2987EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2988
2989void dm_internal_resume(struct mapped_device *md)
2990{
2991 mutex_lock(&md->suspend_lock);
2992 __dm_internal_resume(md);
2993 mutex_unlock(&md->suspend_lock);
2994}
2995EXPORT_SYMBOL_GPL(dm_internal_resume);
2996
2997/*
2998 * Fast variants of internal suspend/resume hold md->suspend_lock,
2999 * which prevents interaction with userspace-driven suspend.
3000 */
3001
3002void dm_internal_suspend_fast(struct mapped_device *md)
3003{
3004 mutex_lock(&md->suspend_lock);
3005 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2884 return; 3006 return;
2885 3007
2886 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 3008 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
@@ -2889,9 +3011,9 @@ void dm_internal_suspend(struct mapped_device *md)
2889 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 3011 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2890} 3012}
2891 3013
2892void dm_internal_resume(struct mapped_device *md) 3014void dm_internal_resume_fast(struct mapped_device *md)
2893{ 3015{
2894 if (dm_suspended_md(md)) 3016 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2895 goto done; 3017 goto done;
2896 3018
2897 dm_queue_flush(md); 3019 dm_queue_flush(md);
@@ -2977,6 +3099,11 @@ int dm_suspended_md(struct mapped_device *md)
2977 return test_bit(DMF_SUSPENDED, &md->flags); 3099 return test_bit(DMF_SUSPENDED, &md->flags);
2978} 3100}
2979 3101
3102int dm_suspended_internally_md(struct mapped_device *md)
3103{
3104 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3105}
3106
2980int dm_test_deferred_remove_flag(struct mapped_device *md) 3107int dm_test_deferred_remove_flag(struct mapped_device *md)
2981{ 3108{
2982 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); 3109 return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
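
dm_suspend and dm_resume now notice DMF_SUSPENDED_INTERNALLY, drop suspend_lock, wait_on_bit for the internal suspend to clear, and retry from the top. A pthread condition-variable analogue of that drop-wait-retry control flow (an illustration only, not the kernel primitives):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t suspend_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t internal_cleared = PTHREAD_COND_INITIALIZER;
static int suspended_internally;

static int user_suspend(void)
{
retry:
        pthread_mutex_lock(&suspend_lock);

        if (suspended_internally) {
                /* wait for internal resume, then start over */
                while (suspended_internally)
                        pthread_cond_wait(&internal_cleared, &suspend_lock);
                pthread_mutex_unlock(&suspend_lock);
                goto retry;
        }

        printf("performing userspace-driven suspend\n");
        pthread_mutex_unlock(&suspend_lock);
        return 0;
}

static void internal_resume(void)
{
        pthread_mutex_lock(&suspend_lock);
        suspended_internally = 0;
        pthread_cond_broadcast(&internal_cleared);
        pthread_mutex_unlock(&suspend_lock);
}

int main(void)
{
        internal_resume();      /* no-op here; shows the wake-up side */
        return user_suspend();
}
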
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 988c7fb7b145..84b0f9e4ba6c 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -65,6 +65,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
65 struct queue_limits *limits); 65 struct queue_limits *limits);
66struct list_head *dm_table_get_devices(struct dm_table *t); 66struct list_head *dm_table_get_devices(struct dm_table *t);
67void dm_table_presuspend_targets(struct dm_table *t); 67void dm_table_presuspend_targets(struct dm_table *t);
68void dm_table_presuspend_undo_targets(struct dm_table *t);
68void dm_table_postsuspend_targets(struct dm_table *t); 69void dm_table_postsuspend_targets(struct dm_table *t);
69int dm_table_resume_targets(struct dm_table *t); 70int dm_table_resume_targets(struct dm_table *t);
70int dm_table_any_congested(struct dm_table *t, int bdi_bits); 71int dm_table_any_congested(struct dm_table *t, int bdi_bits);
@@ -129,6 +130,15 @@ int dm_deleting_md(struct mapped_device *md);
129int dm_suspended_md(struct mapped_device *md); 130int dm_suspended_md(struct mapped_device *md);
130 131
131/* 132/*
133 * Internal suspend and resume methods.
134 */
135int dm_suspended_internally_md(struct mapped_device *md);
136void dm_internal_suspend_fast(struct mapped_device *md);
137void dm_internal_resume_fast(struct mapped_device *md);
138void dm_internal_suspend_noflush(struct mapped_device *md);
139void dm_internal_resume(struct mapped_device *md);
140
141/*
132 * Test if the device is scheduled for deferred remove. 142 * Test if the device is scheduled for deferred remove.
133 */ 143 */
134int dm_test_deferred_remove_flag(struct mapped_device *md); 144int dm_test_deferred_remove_flag(struct mapped_device *md);
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 1d75b1dc1e2e..e64b61ad0ef3 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -645,8 +645,10 @@ static int array_resize(struct dm_array_info *info, dm_block_t root,
645 int r; 645 int r;
646 struct resize resize; 646 struct resize resize;
647 647
648 if (old_size == new_size) 648 if (old_size == new_size) {
649 *new_root = root;
649 return 0; 650 return 0;
651 }
650 652
651 resize.info = info; 653 resize.info = info;
652 resize.root = root; 654 resize.root = root;
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 786b689bdfc7..e8a904298887 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -564,7 +564,9 @@ static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count
564{ 564{
565 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); 565 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
566 566
567 return smm->ll.nr_blocks; 567 *count = smm->ll.nr_blocks;
568
569 return 0;
568} 570}
569 571
570static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count) 572static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
@@ -581,7 +583,9 @@ static int sm_bootstrap_get_count(struct dm_space_map *sm, dm_block_t b,
581{ 583{
582 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); 584 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
583 585
584 return b < smm->begin ? 1 : 0; 586 *result = (b < smm->begin) ? 1 : 0;
587
588 return 0;
585} 589}
586 590
587static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm, 591static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm,
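
Both space-map fixes correct the same mistake: these callbacks must return 0 or a negative error and hand the data back through the result pointer, but the old code returned the value itself, which callers then treated as a failure. A tiny sketch of the buggy and fixed shapes, with hypothetical names:

#include <stdio.h>
#include <stdint.h>

/* buggy shape: a block count leaks out as the "error code" */
static int get_count_buggy(uint64_t begin, uint64_t b, uint64_t *result)
{
        (void)result;
        return b < begin ? 1 : 0;       /* caller reads this as an error code */
}

/* fixed shape: 0 means success, the data lands in *result */
static int get_count_fixed(uint64_t begin, uint64_t b, uint64_t *result)
{
        *result = (b < begin) ? 1 : 0;
        return 0;
}

int main(void)
{
        uint64_t count = 0;
        int r = get_count_fixed(100, 42, &count);

        printf("r=%d count=%llu\n", r, (unsigned long long)count);
        (void)get_count_buggy;
        return 0;
}
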
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index 3bc30a0ae3d6..9cb797d800cf 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -10,6 +10,8 @@
10#include "dm-persistent-data-internal.h" 10#include "dm-persistent-data-internal.h"
11 11
12#include <linux/export.h> 12#include <linux/export.h>
13#include <linux/mutex.h>
14#include <linux/hash.h>
13#include <linux/slab.h> 15#include <linux/slab.h>
14#include <linux/device-mapper.h> 16#include <linux/device-mapper.h>
15 17
@@ -17,6 +19,61 @@
17 19
18/*----------------------------------------------------------------*/ 20/*----------------------------------------------------------------*/
19 21
22#define PREFETCH_SIZE 128
23#define PREFETCH_BITS 7
24#define PREFETCH_SENTINEL ((dm_block_t) -1ULL)
25
26struct prefetch_set {
27 struct mutex lock;
28 dm_block_t blocks[PREFETCH_SIZE];
29};
30
31static unsigned prefetch_hash(dm_block_t b)
32{
33 return hash_64(b, PREFETCH_BITS);
34}
35
36static void prefetch_wipe(struct prefetch_set *p)
37{
38 unsigned i;
39 for (i = 0; i < PREFETCH_SIZE; i++)
40 p->blocks[i] = PREFETCH_SENTINEL;
41}
42
43static void prefetch_init(struct prefetch_set *p)
44{
45 mutex_init(&p->lock);
46 prefetch_wipe(p);
47}
48
49static void prefetch_add(struct prefetch_set *p, dm_block_t b)
50{
51 unsigned h = prefetch_hash(b);
52
53 mutex_lock(&p->lock);
54 if (p->blocks[h] == PREFETCH_SENTINEL)
55 p->blocks[h] = b;
56
57 mutex_unlock(&p->lock);
58}
59
60static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm)
61{
62 unsigned i;
63
64 mutex_lock(&p->lock);
65
66 for (i = 0; i < PREFETCH_SIZE; i++)
67 if (p->blocks[i] != PREFETCH_SENTINEL) {
68 dm_bm_prefetch(bm, p->blocks[i]);
69 p->blocks[i] = PREFETCH_SENTINEL;
70 }
71
72 mutex_unlock(&p->lock);
73}
74
75/*----------------------------------------------------------------*/
76
20struct shadow_info { 77struct shadow_info {
21 struct hlist_node hlist; 78 struct hlist_node hlist;
22 dm_block_t where; 79 dm_block_t where;
@@ -37,6 +94,8 @@ struct dm_transaction_manager {
37 94
38 spinlock_t lock; 95 spinlock_t lock;
39 struct hlist_head buckets[DM_HASH_SIZE]; 96 struct hlist_head buckets[DM_HASH_SIZE];
97
98 struct prefetch_set prefetches;
40}; 99};
41 100
42/*----------------------------------------------------------------*/ 101/*----------------------------------------------------------------*/
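
The new prefetch_set is a fixed-size, hash-indexed buffer of block numbers: each block hashes to one slot, a sentinel marks empty slots, and a colliding entry is simply dropped since prefetching is best-effort. A standalone sketch of that structure, using a generic Fibonacci hash in place of the kernel's hash_64():

#include <stdio.h>
#include <stdint.h>

#define PREFETCH_BITS 7
#define PREFETCH_SIZE (1u << PREFETCH_BITS)
#define PREFETCH_SENTINEL UINT64_MAX

static uint64_t slots[PREFETCH_SIZE];

static unsigned prefetch_hash(uint64_t b)
{
        return (unsigned)((b * 11400714819323198485ULL) >> (64 - PREFETCH_BITS));
}

static void prefetch_wipe(void)
{
        unsigned i;

        for (i = 0; i < PREFETCH_SIZE; i++)
                slots[i] = PREFETCH_SENTINEL;
}

static void prefetch_add(uint64_t b)
{
        unsigned h = prefetch_hash(b);

        if (slots[h] == PREFETCH_SENTINEL)
                slots[h] = b;           /* on collision the hint is dropped */
}

static void prefetch_issue(void)
{
        unsigned i;

        for (i = 0; i < PREFETCH_SIZE; i++)
                if (slots[i] != PREFETCH_SENTINEL) {
                        printf("prefetch block %llu\n", (unsigned long long)slots[i]);
                        slots[i] = PREFETCH_SENTINEL;
                }
}

int main(void)
{
        prefetch_wipe();
        prefetch_add(42);
        prefetch_add(42);       /* duplicate collapses into one slot */
        prefetch_add(1017);
        prefetch_issue();
        return 0;
}
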
@@ -117,6 +176,8 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
117 for (i = 0; i < DM_HASH_SIZE; i++) 176 for (i = 0; i < DM_HASH_SIZE; i++)
118 INIT_HLIST_HEAD(tm->buckets + i); 177 INIT_HLIST_HEAD(tm->buckets + i);
119 178
179 prefetch_init(&tm->prefetches);
180
120 return tm; 181 return tm;
121} 182}
122 183
@@ -268,8 +329,14 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
268 struct dm_block_validator *v, 329 struct dm_block_validator *v,
269 struct dm_block **blk) 330 struct dm_block **blk)
270{ 331{
271 if (tm->is_clone) 332 if (tm->is_clone) {
272 return dm_bm_read_try_lock(tm->real->bm, b, v, blk); 333 int r = dm_bm_read_try_lock(tm->real->bm, b, v, blk);
334
335 if (r == -EWOULDBLOCK)
336 prefetch_add(&tm->real->prefetches, b);
337
338 return r;
339 }
273 340
274 return dm_bm_read_lock(tm->bm, b, v, blk); 341 return dm_bm_read_lock(tm->bm, b, v, blk);
275} 342}
@@ -317,6 +384,12 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
317 return tm->bm; 384 return tm->bm;
318} 385}
319 386
387void dm_tm_issue_prefetches(struct dm_transaction_manager *tm)
388{
389 prefetch_issue(&tm->prefetches, tm->bm);
390}
391EXPORT_SYMBOL_GPL(dm_tm_issue_prefetches);
392
320/*----------------------------------------------------------------*/ 393/*----------------------------------------------------------------*/
321 394
322static int dm_tm_create_internal(struct dm_block_manager *bm, 395static int dm_tm_create_internal(struct dm_block_manager *bm,
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index 2772ed2a781a..2e0d4d66fb1b 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -109,6 +109,13 @@ int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
109struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm); 109struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
110 110
111/* 111/*
112 * If you're using a non-blocking clone the tm will build up a list of
113 * requested blocks that weren't in core. This call will request those
114 * blocks to be prefetched.
115 */
116void dm_tm_issue_prefetches(struct dm_transaction_manager *tm);
117
118/*
112 * A little utility that ties the knot by producing a transaction manager 119 * A little utility that ties the knot by producing a transaction manager
113 * that has a space map managed by the transaction manager... 120 * that has a space map managed by the transaction manager...
114 * 121 *