author     Joe Thornber <ejt@redhat.com>       2016-12-15 04:57:31 -0500
committer  Mike Snitzer <snitzer@redhat.com>   2017-03-07 13:28:31 -0500
commit     b29d4986d0da1a27cd35917cdb433672f5c95d7f (patch)
tree       a5d94b86cf1eb759bfef5761015135d747e80561 /drivers/md
parent     742c8fdc31e820503f9267070311d894978d1349 (diff)

dm cache: significant rework to leverage dm-bio-prison-v2

The cache policy interfaces have been updated to work well with the new
bio-prison v2 interface's ability to queue work immediately (promotion,
demotion, etc) -- overriding benefit being reduced latency on processing
IO through the cache.  Previously such work would be left for the DM
cache core to queue on various lists and then process in batches later
-- this caused a serious delay in latency for IO driven by the cache.

The background tracker code was factored out so that all cache policies
can make use of it.

Also, the "cleaner" policy has been removed and is now a variant of the
smq policy that simply disallows migrations.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
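
The shape of the new interface is easiest to see from the caller's side: a
policy queues background work (promotion, demotion, writeback) with a
tracker, the cache core issues that work when it is ready to migrate, and
completes it once the migration finishes.  The following is a minimal
user-space sketch of that queue/issue/complete life cycle; the names
policy_work and btracker_* mirror the ones added below, but the bodies are
simplified stand-ins (plain singly linked lists), not the kernel code.

    /* Minimal user-space model of the background-tracker life cycle.
     * The names mirror those added by this patch; the bodies are simplified
     * stand-ins (singly linked lists instead of list_head plus an rbtree). */
    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    enum policy_op { POLICY_PROMOTE, POLICY_DEMOTE, POLICY_WRITEBACK };

    struct policy_work {
            enum policy_op op;
            unsigned long oblock;           /* origin block the work refers to */
            struct policy_work *next;
    };

    struct background_tracker {
            struct policy_work *queued;     /* queued by the policy, not yet issued */
            struct policy_work *issued;     /* handed to the cache core, in flight */
    };

    static int btracker_queue(struct background_tracker *b, struct policy_work *w)
    {
            w->next = b->queued;
            b->queued = w;
            return 0;
    }

    static int btracker_issue(struct background_tracker *b, struct policy_work **out)
    {
            if (!b->queued)
                    return -ENODATA;        /* same convention as the real tracker */
            *out = b->queued;
            b->queued = (*out)->next;
            (*out)->next = b->issued;
            b->issued = *out;
            return 0;
    }

    static void btracker_complete(struct background_tracker *b, struct policy_work *w)
    {
            struct policy_work **p;

            for (p = &b->issued; *p; p = &(*p)->next) {
                    if (*p == w) {
                            *p = w->next;
                            break;
                    }
            }
            free(w);
    }

    int main(void)
    {
            struct background_tracker bt = { NULL, NULL };
            struct policy_work *w = calloc(1, sizeof(*w));
            struct policy_work *issued;

            w->op = POLICY_WRITEBACK;
            w->oblock = 123;
            btracker_queue(&bt, w);                 /* policy decides work is needed */

            while (btracker_issue(&bt, &issued) != -ENODATA) {
                    printf("issuing op %d for oblock %lu\n", issued->op, issued->oblock);
                    btracker_complete(&bt, issued); /* core reports the migration done */
            }
            return 0;
    }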

Diffstat (limited to 'drivers/md')

-rw-r--r--  drivers/md/Kconfig                           8
-rw-r--r--  drivers/md/Makefile                          5
-rw-r--r--  drivers/md/dm-cache-background-tracker.c   238
-rw-r--r--  drivers/md/dm-cache-background-tracker.h    46
-rw-r--r--  drivers/md/dm-cache-metadata.h               2
-rw-r--r--  drivers/md/dm-cache-policy-cleaner.c       469
-rw-r--r--  drivers/md/dm-cache-policy-internal.h       76
-rw-r--r--  drivers/md/dm-cache-policy-smq.c           821
-rw-r--r--  drivers/md/dm-cache-policy.h               187
-rw-r--r--  drivers/md/dm-cache-target.c              2469

10 files changed, 1922 insertions(+), 2399 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b7767da50c26..982cd0626bc7 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -325,14 +325,6 @@ config DM_CACHE_SMQ
 	  of less memory utilization, improved performance and increased
 	  adaptability in the face of changing workloads.
 
-config DM_CACHE_CLEANER
-	tristate "Cleaner Cache Policy (EXPERIMENTAL)"
-	depends on DM_CACHE
-	default y
-	---help---
-	  A simple cache policy that writes back all data to the
-	  origin. Used when decommissioning a dm-cache.
-
 config DM_ERA
 	tristate "Era target (EXPERIMENTAL)"
 	depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index d378b1db7852..2801b2fb452d 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -13,9 +13,9 @@ dm-log-userspace-y \
 		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
 dm-bio-prison-y	+= dm-bio-prison-v1.o dm-bio-prison-v2.o
 dm-thin-pool-y	+= dm-thin.o dm-thin-metadata.o
-dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
+dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
+		    dm-cache-background-tracker.o
 dm-cache-smq-y	+= dm-cache-policy-smq.o
-dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 dm-era-y	+= dm-era-target.o
 dm-verity-y	+= dm-verity-target.o
 md-mod-y	+= md.o bitmap.o
@@ -57,7 +57,6 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
 obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
 obj-$(CONFIG_DM_CACHE_SMQ)	+= dm-cache-smq.o
-obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
 obj-$(CONFIG_DM_ERA)		+= dm-era.o
 obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o
 
diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c
new file mode 100644
index 000000000000..9b1afdfb13f0
--- /dev/null
+++ b/drivers/md/dm-cache-background-tracker.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (C) 2017 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-background-tracker.h"
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX "dm-background-tracker"
+
+struct bt_work {
+	struct list_head list;
+	struct rb_node node;
+	struct policy_work work;
+};
+
+struct background_tracker {
+	unsigned max_work;
+	atomic_t pending_promotes;
+	atomic_t pending_writebacks;
+	atomic_t pending_demotes;
+
+	struct list_head issued;
+	struct list_head queued;
+	struct rb_root pending;
+
+	struct kmem_cache *work_cache;
+};
+
+struct background_tracker *btracker_create(unsigned max_work)
+{
+	struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL);
+
+	b->max_work = max_work;
+	atomic_set(&b->pending_promotes, 0);
+	atomic_set(&b->pending_writebacks, 0);
+	atomic_set(&b->pending_demotes, 0);
+
+	INIT_LIST_HEAD(&b->issued);
+	INIT_LIST_HEAD(&b->queued);
+
+	b->pending = RB_ROOT;
+	b->work_cache = KMEM_CACHE(bt_work, 0);
+	if (!b->work_cache) {
+		DMERR("couldn't create mempool for background work items");
+		kfree(b);
+		b = NULL;
+	}
+
+	return b;
+}
+EXPORT_SYMBOL_GPL(btracker_create);
+
+void btracker_destroy(struct background_tracker *b)
+{
+	kmem_cache_destroy(b->work_cache);
+	kfree(b);
+}
+EXPORT_SYMBOL_GPL(btracker_destroy);
+
+static int cmp_oblock(dm_oblock_t lhs, dm_oblock_t rhs)
+{
+	if (from_oblock(lhs) < from_oblock(rhs))
+		return -1;
+
+	if (from_oblock(rhs) < from_oblock(lhs))
+		return 1;
+
+	return 0;
+}
+
+static bool __insert_pending(struct background_tracker *b,
+			     struct bt_work *nw)
+{
+	int cmp;
+	struct bt_work *w;
+	struct rb_node **new = &b->pending.rb_node, *parent = NULL;
+
+	while (*new) {
+		w = container_of(*new, struct bt_work, node);
+
+		parent = *new;
+		cmp = cmp_oblock(w->work.oblock, nw->work.oblock);
+		if (cmp < 0)
+			new = &((*new)->rb_left);
+
+		else if (cmp > 0)
+			new = &((*new)->rb_right);
+
+		else
+			/* already present */
+			return false;
+	}
+
+	rb_link_node(&nw->node, parent, new);
+	rb_insert_color(&nw->node, &b->pending);
+
+	return true;
+}
+
+static struct bt_work *__find_pending(struct background_tracker *b,
+				      dm_oblock_t oblock)
+{
+	int cmp;
+	struct bt_work *w;
+	struct rb_node **new = &b->pending.rb_node;
+
+	while (*new) {
+		w = container_of(*new, struct bt_work, node);
+
+		cmp = cmp_oblock(w->work.oblock, oblock);
+		if (cmp < 0)
+			new = &((*new)->rb_left);
+
+		else if (cmp > 0)
+			new = &((*new)->rb_right);
+
+		else
+			break;
+	}
+
+	return *new ? w : NULL;
+}
+
+
+static void update_stats(struct background_tracker *b, struct policy_work *w, int delta)
+{
+	switch (w->op) {
+	case POLICY_PROMOTE:
+		atomic_add(delta, &b->pending_promotes);
+		break;
+
+	case POLICY_DEMOTE:
+		atomic_add(delta, &b->pending_demotes);
+		break;
+
+	case POLICY_WRITEBACK:
+		atomic_add(delta, &b->pending_writebacks);
+		break;
+	}
+}
+
+unsigned btracker_nr_writebacks_queued(struct background_tracker *b)
+{
+	return atomic_read(&b->pending_writebacks);
+}
+EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued);
+
+unsigned btracker_nr_demotions_queued(struct background_tracker *b)
+{
+	return atomic_read(&b->pending_demotes);
+}
+EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued);
+
+static bool max_work_reached(struct background_tracker *b)
+{
+	// FIXME: finish
+	return false;
+}
+
+int btracker_queue(struct background_tracker *b,
+		   struct policy_work *work,
+		   struct policy_work **pwork)
+{
+	struct bt_work *w;
+
+	if (pwork)
+		*pwork = NULL;
+
+	if (max_work_reached(b))
+		return -ENOMEM;
+
+	w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT);
+	if (!w)
+		return -ENOMEM;
+
+	memcpy(&w->work, work, sizeof(*work));
+
+	if (!__insert_pending(b, w)) {
+		/*
+		 * There was a race, we'll just ignore this second
+		 * bit of work for the same oblock.
+		 */
+		kmem_cache_free(b->work_cache, w);
+		return -EINVAL;
+	}
+
+	if (pwork) {
+		*pwork = &w->work;
+		list_add(&w->list, &b->issued);
+	} else
+		list_add(&w->list, &b->queued);
+	update_stats(b, &w->work, 1);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(btracker_queue);
+
+/*
+ * Returns -ENODATA if there's no work.
+ */
+int btracker_issue(struct background_tracker *b, struct policy_work **work)
+{
+	struct bt_work *w;
+
+	if (list_empty(&b->queued))
+		return -ENODATA;
+
+	w = list_first_entry(&b->queued, struct bt_work, list);
+	list_move(&w->list, &b->issued);
+	*work = &w->work;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(btracker_issue);
+
+void btracker_complete(struct background_tracker *b,
+		       struct policy_work *op)
+{
+	struct bt_work *w = container_of(op, struct bt_work, work);
+
+	update_stats(b, &w->work, -1);
+	rb_erase(&w->node, &b->pending);
+	list_del(&w->list);
+	kmem_cache_free(b->work_cache, w);
+}
+EXPORT_SYMBOL_GPL(btracker_complete);
+
+bool btracker_promotion_already_present(struct background_tracker *b,
+					dm_oblock_t oblock)
+{
+	return __find_pending(b, oblock) != NULL;
+}
+EXPORT_SYMBOL_GPL(btracker_promotion_already_present);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-cache-background-tracker.h b/drivers/md/dm-cache-background-tracker.h
new file mode 100644
index 000000000000..27ab90dbc275
--- /dev/null
+++ b/drivers/md/dm-cache-background-tracker.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2017 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_BACKGROUND_WORK_H
+#define DM_CACHE_BACKGROUND_WORK_H
+
+#include <linux/vmalloc.h>
+#include "dm-cache-policy.h"
+
+/*----------------------------------------------------------------*/
+
+struct background_work;
+struct background_tracker;
+
+/*
+ * FIXME: discuss lack of locking in all methods.
+ */
+struct background_tracker *btracker_create(unsigned max_work);
+void btracker_destroy(struct background_tracker *b);
+
+unsigned btracker_nr_writebacks_queued(struct background_tracker *b);
+unsigned btracker_nr_demotions_queued(struct background_tracker *b);
+
+/*
+ * returns -EINVAL iff the work is already queued. -ENOMEM if the work
+ * couldn't be queued for another reason.
+ */
+int btracker_queue(struct background_tracker *b,
+		   struct policy_work *work,
+		   struct policy_work **pwork);
+
+/*
+ * Returns -ENODATA if there's no work.
+ */
+int btracker_issue(struct background_tracker *b, struct policy_work **work);
+void btracker_complete(struct background_tracker *b,
+		       struct policy_work *op);
+bool btracker_promotion_already_present(struct background_tracker *b,
+					dm_oblock_t oblock);
+
+/*----------------------------------------------------------------*/
+
+#endif
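
Taken together, the comments in this header define a small error contract:
btracker_queue() returns -EINVAL when work for the same oblock is already
tracked and -ENOMEM when no more work can be accepted, while
btracker_issue() returns -ENODATA when nothing is queued.  A caller is
expected to treat the duplicate case as harmless and the other two as
back-pressure.  The fragment below is an illustrative, user-space sketch
of that handling only; the tracker call is stubbed out and nothing here is
taken from the kernel sources.

    /* Illustrative handling of the btracker_queue() return codes documented
     * in the header above.  The tracker is stubbed; only the error-handling
     * shape is of interest. */
    #include <errno.h>
    #include <stdio.h>

    struct policy_work { unsigned long oblock; };
    struct background_tracker;                  /* opaque in this sketch */

    /* Stub standing in for the real btracker_queue() declared above. */
    static int btracker_queue(struct background_tracker *b,
                              struct policy_work *work,
                              struct policy_work **pwork)
    {
            (void) b;
            (void) pwork;
            return work->oblock % 2 ? -EINVAL : 0;  /* pretend odd oblocks are duplicates */
    }

    static void queue_background_work(struct background_tracker *b,
                                      struct policy_work *work)
    {
            switch (btracker_queue(b, work, NULL)) {
            case 0:
                    printf("oblock %lu: tracked, will be issued later\n", work->oblock);
                    break;
            case -EINVAL:
                    /* Work for this oblock is already queued or issued; dropping
                     * the duplicate is the expected response, not an error path. */
                    printf("oblock %lu: already tracked, duplicate dropped\n", work->oblock);
                    break;
            case -ENOMEM:
                    /* Tracker is full; back off and let in-flight work complete. */
                    printf("oblock %lu: tracker full, backing off\n", work->oblock);
                    break;
            }
    }

    int main(void)
    {
            struct policy_work a = { .oblock = 42 }, b = { .oblock = 43 };

            queue_background_work(NULL, &a);
            queue_background_work(NULL, &b);
            return 0;
    }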
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 4f07c08cf107..179ed5bf81a3 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -50,6 +50,8 @@
 #define DM_CACHE_FEATURE_COMPAT_RO_SUPP	  0UL
 #define DM_CACHE_FEATURE_INCOMPAT_SUPP	  0UL
 
+struct dm_cache_metadata;
+
 /*
  * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on
  * failure. If reopening then features must match.
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
deleted file mode 100644
index 2e8a8f1d8358..000000000000
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ /dev/null
@@ -1,469 +0,0 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * writeback cache policy supporting flushing out dirty cache blocks.
5 *
6 * This file is released under the GPL.
7 */
8
9#include "dm-cache-policy.h"
10#include "dm.h"
11
12#include <linux/hash.h>
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/vmalloc.h>
16
17/*----------------------------------------------------------------*/
18
19#define DM_MSG_PREFIX "cache cleaner"
20
21/* Cache entry struct. */
22struct wb_cache_entry {
23 struct list_head list;
24 struct hlist_node hlist;
25
26 dm_oblock_t oblock;
27 dm_cblock_t cblock;
28 bool dirty:1;
29 bool pending:1;
30};
31
32struct hash {
33 struct hlist_head *table;
34 dm_block_t hash_bits;
35 unsigned nr_buckets;
36};
37
38struct policy {
39 struct dm_cache_policy policy;
40 spinlock_t lock;
41
42 struct list_head free;
43 struct list_head clean;
44 struct list_head clean_pending;
45 struct list_head dirty;
46
47 /*
48 * We know exactly how many cblocks will be needed,
49 * so we can allocate them up front.
50 */
51 dm_cblock_t cache_size, nr_cblocks_allocated;
52 struct wb_cache_entry *cblocks;
53 struct hash chash;
54};
55
56/*----------------------------------------------------------------------------*/
57
58/*
59 * Low-level functions.
60 */
61static unsigned next_power(unsigned n, unsigned min)
62{
63 return roundup_pow_of_two(max(n, min));
64}
65
66static struct policy *to_policy(struct dm_cache_policy *p)
67{
68 return container_of(p, struct policy, policy);
69}
70
71static struct list_head *list_pop(struct list_head *q)
72{
73 struct list_head *r = q->next;
74
75 list_del(r);
76
77 return r;
78}
79
80/*----------------------------------------------------------------------------*/
81
82/* Allocate/free various resources. */
83static int alloc_hash(struct hash *hash, unsigned elts)
84{
85 hash->nr_buckets = next_power(elts >> 4, 16);
86 hash->hash_bits = __ffs(hash->nr_buckets);
87 hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets);
88
89 return hash->table ? 0 : -ENOMEM;
90}
91
92static void free_hash(struct hash *hash)
93{
94 vfree(hash->table);
95}
96
97static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size)
98{
99 int r = -ENOMEM;
100
101 p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size));
102 if (p->cblocks) {
103 unsigned u = from_cblock(cache_size);
104
105 while (u--)
106 list_add(&p->cblocks[u].list, &p->free);
107
108 p->nr_cblocks_allocated = 0;
109
110 /* Cache entries hash. */
111 r = alloc_hash(&p->chash, from_cblock(cache_size));
112 if (r)
113 vfree(p->cblocks);
114 }
115
116 return r;
117}
118
119static void free_cache_blocks_and_hash(struct policy *p)
120{
121 free_hash(&p->chash);
122 vfree(p->cblocks);
123}
124
125static struct wb_cache_entry *alloc_cache_entry(struct policy *p)
126{
127 struct wb_cache_entry *e;
128
129 BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size));
130
131 e = list_entry(list_pop(&p->free), struct wb_cache_entry, list);
132 p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1);
133
134 return e;
135}
136
137/*----------------------------------------------------------------------------*/
138
139/* Hash functions (lookup, insert, remove). */
140static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock)
141{
142 struct hash *hash = &p->chash;
143 unsigned h = hash_64(from_oblock(oblock), hash->hash_bits);
144 struct wb_cache_entry *cur;
145 struct hlist_head *bucket = &hash->table[h];
146
147 hlist_for_each_entry(cur, bucket, hlist) {
148 if (cur->oblock == oblock) {
149 /* Move upfront bucket for faster access. */
150 hlist_del(&cur->hlist);
151 hlist_add_head(&cur->hlist, bucket);
152 return cur;
153 }
154 }
155
156 return NULL;
157}
158
159static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e)
160{
161 unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits);
162
163 hlist_add_head(&e->hlist, &p->chash.table[h]);
164}
165
166static void remove_cache_hash_entry(struct wb_cache_entry *e)
167{
168 hlist_del(&e->hlist);
169}
170
171/* Public interface (see dm-cache-policy.h */
172static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
173 bool can_block, bool can_migrate, bool discarded_oblock,
174 struct bio *bio, struct policy_locker *locker,
175 struct policy_result *result)
176{
177 struct policy *p = to_policy(pe);
178 struct wb_cache_entry *e;
179 unsigned long flags;
180
181 result->op = POLICY_MISS;
182
183 if (can_block)
184 spin_lock_irqsave(&p->lock, flags);
185
186 else if (!spin_trylock_irqsave(&p->lock, flags))
187 return -EWOULDBLOCK;
188
189 e = lookup_cache_entry(p, oblock);
190 if (e) {
191 result->op = POLICY_HIT;
192 result->cblock = e->cblock;
193
194 }
195
196 spin_unlock_irqrestore(&p->lock, flags);
197
198 return 0;
199}
200
201static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock)
202{
203 int r;
204 struct policy *p = to_policy(pe);
205 struct wb_cache_entry *e;
206 unsigned long flags;
207
208 if (!spin_trylock_irqsave(&p->lock, flags))
209 return -EWOULDBLOCK;
210
211 e = lookup_cache_entry(p, oblock);
212 if (e) {
213 *cblock = e->cblock;
214 r = 0;
215
216 } else
217 r = -ENOENT;
218
219 spin_unlock_irqrestore(&p->lock, flags);
220
221 return r;
222}
223
224static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set)
225{
226 struct policy *p = to_policy(pe);
227 struct wb_cache_entry *e;
228
229 e = lookup_cache_entry(p, oblock);
230 BUG_ON(!e);
231
232 if (set) {
233 if (!e->dirty) {
234 e->dirty = true;
235 list_move(&e->list, &p->dirty);
236 }
237
238 } else {
239 if (e->dirty) {
240 e->pending = false;
241 e->dirty = false;
242 list_move(&e->list, &p->clean);
243 }
244 }
245}
246
247static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
248{
249 struct policy *p = to_policy(pe);
250 unsigned long flags;
251
252 spin_lock_irqsave(&p->lock, flags);
253 __set_clear_dirty(pe, oblock, true);
254 spin_unlock_irqrestore(&p->lock, flags);
255}
256
257static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
258{
259 struct policy *p = to_policy(pe);
260 unsigned long flags;
261
262 spin_lock_irqsave(&p->lock, flags);
263 __set_clear_dirty(pe, oblock, false);
264 spin_unlock_irqrestore(&p->lock, flags);
265}
266
267static void add_cache_entry(struct policy *p, struct wb_cache_entry *e)
268{
269 insert_cache_hash_entry(p, e);
270 if (e->dirty)
271 list_add(&e->list, &p->dirty);
272 else
273 list_add(&e->list, &p->clean);
274}
275
276static int wb_load_mapping(struct dm_cache_policy *pe,
277 dm_oblock_t oblock, dm_cblock_t cblock,
278 uint32_t hint, bool hint_valid)
279{
280 int r;
281 struct policy *p = to_policy(pe);
282 struct wb_cache_entry *e = alloc_cache_entry(p);
283
284 if (e) {
285 e->cblock = cblock;
286 e->oblock = oblock;
287 e->dirty = false; /* blocks default to clean */
288 add_cache_entry(p, e);
289 r = 0;
290
291 } else
292 r = -ENOMEM;
293
294 return r;
295}
296
297static void wb_destroy(struct dm_cache_policy *pe)
298{
299 struct policy *p = to_policy(pe);
300
301 free_cache_blocks_and_hash(p);
302 kfree(p);
303}
304
305static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock)
306{
307 struct wb_cache_entry *r = lookup_cache_entry(p, oblock);
308
309 BUG_ON(!r);
310
311 remove_cache_hash_entry(r);
312 list_del(&r->list);
313
314 return r;
315}
316
317static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock)
318{
319 struct policy *p = to_policy(pe);
320 struct wb_cache_entry *e;
321 unsigned long flags;
322
323 spin_lock_irqsave(&p->lock, flags);
324 e = __wb_force_remove_mapping(p, oblock);
325 list_add_tail(&e->list, &p->free);
326 BUG_ON(!from_cblock(p->nr_cblocks_allocated));
327 p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1);
328 spin_unlock_irqrestore(&p->lock, flags);
329}
330
331static void wb_force_mapping(struct dm_cache_policy *pe,
332 dm_oblock_t current_oblock, dm_oblock_t oblock)
333{
334 struct policy *p = to_policy(pe);
335 struct wb_cache_entry *e;
336 unsigned long flags;
337
338 spin_lock_irqsave(&p->lock, flags);
339 e = __wb_force_remove_mapping(p, current_oblock);
340 e->oblock = oblock;
341 add_cache_entry(p, e);
342 spin_unlock_irqrestore(&p->lock, flags);
343}
344
345static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
346{
347 struct list_head *l;
348 struct wb_cache_entry *r;
349
350 if (list_empty(&p->dirty))
351 return NULL;
352
353 l = list_pop(&p->dirty);
354 r = container_of(l, struct wb_cache_entry, list);
355 list_add(l, &p->clean_pending);
356
357 return r;
358}
359
360static int wb_writeback_work(struct dm_cache_policy *pe,
361 dm_oblock_t *oblock,
362 dm_cblock_t *cblock,
363 bool critical_only)
364{
365 int r = -ENOENT;
366 struct policy *p = to_policy(pe);
367 struct wb_cache_entry *e;
368 unsigned long flags;
369
370 spin_lock_irqsave(&p->lock, flags);
371
372 e = get_next_dirty_entry(p);
373 if (e) {
374 *oblock = e->oblock;
375 *cblock = e->cblock;
376 r = 0;
377 }
378
379 spin_unlock_irqrestore(&p->lock, flags);
380
381 return r;
382}
383
384static dm_cblock_t wb_residency(struct dm_cache_policy *pe)
385{
386 return to_policy(pe)->nr_cblocks_allocated;
387}
388
389/* Init the policy plugin interface function pointers. */
390static void init_policy_functions(struct policy *p)
391{
392 p->policy.destroy = wb_destroy;
393 p->policy.map = wb_map;
394 p->policy.lookup = wb_lookup;
395 p->policy.set_dirty = wb_set_dirty;
396 p->policy.clear_dirty = wb_clear_dirty;
397 p->policy.load_mapping = wb_load_mapping;
398 p->policy.get_hint = NULL;
399 p->policy.remove_mapping = wb_remove_mapping;
400 p->policy.writeback_work = wb_writeback_work;
401 p->policy.force_mapping = wb_force_mapping;
402 p->policy.residency = wb_residency;
403 p->policy.tick = NULL;
404}
405
406static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
407 sector_t origin_size,
408 sector_t cache_block_size)
409{
410 int r;
411 struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL);
412
413 if (!p)
414 return NULL;
415
416 init_policy_functions(p);
417 INIT_LIST_HEAD(&p->free);
418 INIT_LIST_HEAD(&p->clean);
419 INIT_LIST_HEAD(&p->clean_pending);
420 INIT_LIST_HEAD(&p->dirty);
421
422 p->cache_size = cache_size;
423 spin_lock_init(&p->lock);
424
425 /* Allocate cache entry structs and add them to free list. */
426 r = alloc_cache_blocks_with_hash(p, cache_size);
427 if (!r)
428 return &p->policy;
429
430 kfree(p);
431
432 return NULL;
433}
434/*----------------------------------------------------------------------------*/
435
436static struct dm_cache_policy_type wb_policy_type = {
437 .name = "cleaner",
438 .version = {1, 0, 0},
439 .hint_size = 4,
440 .owner = THIS_MODULE,
441 .create = wb_create
442};
443
444static int __init wb_init(void)
445{
446 int r = dm_cache_policy_register(&wb_policy_type);
447
448 if (r < 0)
449 DMERR("register failed %d", r);
450 else
451 DMINFO("version %u.%u.%u loaded",
452 wb_policy_type.version[0],
453 wb_policy_type.version[1],
454 wb_policy_type.version[2]);
455
456 return r;
457}
458
459static void __exit wb_exit(void)
460{
461 dm_cache_policy_unregister(&wb_policy_type);
462}
463
464module_init(wb_init);
465module_exit(wb_exit);
466
467MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
468MODULE_LICENSE("GPL");
469MODULE_DESCRIPTION("cleaner cache policy");
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 808ee0e2b2c4..56f0a23f698c 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -12,70 +12,65 @@
12 12
13/*----------------------------------------------------------------*/ 13/*----------------------------------------------------------------*/
14 14
15/* 15static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
16 * Little inline functions that simplify calling the policy methods. 16 int data_dir, bool fast_copy, bool *background_queued)
17 */
18static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
19 bool can_block, bool can_migrate, bool discarded_oblock,
20 struct bio *bio, struct policy_locker *locker,
21 struct policy_result *result)
22{ 17{
23 return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result); 18 return p->lookup(p, oblock, cblock, data_dir, fast_copy, background_queued);
24} 19}
25 20
26static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) 21static inline int policy_lookup_with_work(struct dm_cache_policy *p,
22 dm_oblock_t oblock, dm_cblock_t *cblock,
23 int data_dir, bool fast_copy,
24 struct policy_work **work)
27{ 25{
28 BUG_ON(!p->lookup); 26 if (!p->lookup_with_work) {
29 return p->lookup(p, oblock, cblock); 27 *work = NULL;
30} 28 return p->lookup(p, oblock, cblock, data_dir, fast_copy, NULL);
29 }
31 30
32static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 31 return p->lookup_with_work(p, oblock, cblock, data_dir, fast_copy, work);
33{
34 if (p->set_dirty)
35 p->set_dirty(p, oblock);
36} 32}
37 33
38static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 34static inline int policy_get_background_work(struct dm_cache_policy *p,
35 bool idle, struct policy_work **result)
39{ 36{
40 if (p->clear_dirty) 37 return p->get_background_work(p, idle, result);
41 p->clear_dirty(p, oblock);
42} 38}
43 39
44static inline int policy_load_mapping(struct dm_cache_policy *p, 40static inline void policy_complete_background_work(struct dm_cache_policy *p,
45 dm_oblock_t oblock, dm_cblock_t cblock, 41 struct policy_work *work,
46 uint32_t hint, bool hint_valid) 42 bool success)
47{ 43{
48 return p->load_mapping(p, oblock, cblock, hint, hint_valid); 44 return p->complete_background_work(p, work, success);
49} 45}
50 46
51static inline uint32_t policy_get_hint(struct dm_cache_policy *p, 47static inline void policy_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
52 dm_cblock_t cblock)
53{ 48{
54 return p->get_hint ? p->get_hint(p, cblock) : 0; 49 p->set_dirty(p, cblock);
55} 50}
56 51
57static inline int policy_writeback_work(struct dm_cache_policy *p, 52static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
58 dm_oblock_t *oblock,
59 dm_cblock_t *cblock,
60 bool critical_only)
61{ 53{
62 return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT; 54 p->clear_dirty(p, cblock);
63} 55}
64 56
65static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) 57static inline int policy_load_mapping(struct dm_cache_policy *p,
58 dm_oblock_t oblock, dm_cblock_t cblock,
59 bool dirty, uint32_t hint, bool hint_valid)
66{ 60{
67 p->remove_mapping(p, oblock); 61 return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid);
68} 62}
69 63
70static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) 64static inline int policy_invalidate_mapping(struct dm_cache_policy *p,
65 dm_cblock_t cblock)
71{ 66{
72 return p->remove_cblock(p, cblock); 67 return p->invalidate_mapping(p, cblock);
73} 68}
74 69
75static inline void policy_force_mapping(struct dm_cache_policy *p, 70static inline uint32_t policy_get_hint(struct dm_cache_policy *p,
76 dm_oblock_t current_oblock, dm_oblock_t new_oblock) 71 dm_cblock_t cblock)
77{ 72{
78 return p->force_mapping(p, current_oblock, new_oblock); 73 return p->get_hint ? p->get_hint(p, cblock) : 0;
79} 74}
80 75
81static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) 76static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
@@ -107,6 +102,11 @@ static inline int policy_set_config_value(struct dm_cache_policy *p,
107 return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL; 102 return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL;
108} 103}
109 104
105static inline void policy_allow_migrations(struct dm_cache_policy *p, bool allow)
106{
107 return p->allow_migrations(p, allow);
108}
109
110/*----------------------------------------------------------------*/ 110/*----------------------------------------------------------------*/
111 111
112/* 112/*
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index f19c6930a67c..74436dc2122f 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -4,8 +4,9 @@
4 * This file is released under the GPL. 4 * This file is released under the GPL.
5 */ 5 */
6 6
7#include "dm-cache-policy.h" 7#include "dm-cache-background-tracker.h"
8#include "dm-cache-policy-internal.h" 8#include "dm-cache-policy-internal.h"
9#include "dm-cache-policy.h"
9#include "dm.h" 10#include "dm.h"
10 11
11#include <linux/hash.h> 12#include <linux/hash.h>
@@ -38,10 +39,11 @@ struct entry {
38 unsigned hash_next:28; 39 unsigned hash_next:28;
39 unsigned prev:28; 40 unsigned prev:28;
40 unsigned next:28; 41 unsigned next:28;
41 unsigned level:7; 42 unsigned level:6;
42 bool dirty:1; 43 bool dirty:1;
43 bool allocated:1; 44 bool allocated:1;
44 bool sentinel:1; 45 bool sentinel:1;
46 bool pending_work:1;
45 47
46 dm_oblock_t oblock; 48 dm_oblock_t oblock;
47}; 49};
@@ -279,14 +281,28 @@ static unsigned q_size(struct queue *q)
279 */ 281 */
280static void q_push(struct queue *q, struct entry *e) 282static void q_push(struct queue *q, struct entry *e)
281{ 283{
284 BUG_ON(e->pending_work);
285
282 if (!e->sentinel) 286 if (!e->sentinel)
283 q->nr_elts++; 287 q->nr_elts++;
284 288
285 l_add_tail(q->es, q->qs + e->level, e); 289 l_add_tail(q->es, q->qs + e->level, e);
286} 290}
287 291
292static void q_push_front(struct queue *q, struct entry *e)
293{
294 BUG_ON(e->pending_work);
295
296 if (!e->sentinel)
297 q->nr_elts++;
298
299 l_add_head(q->es, q->qs + e->level, e);
300}
301
288static void q_push_before(struct queue *q, struct entry *old, struct entry *e) 302static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
289{ 303{
304 BUG_ON(e->pending_work);
305
290 if (!e->sentinel) 306 if (!e->sentinel)
291 q->nr_elts++; 307 q->nr_elts++;
292 308
@@ -336,19 +352,6 @@ static struct entry *q_pop(struct queue *q)
336} 352}
337 353
338/* 354/*
339 * Pops an entry from a level that is not past a sentinel.
340 */
341static struct entry *q_pop_old(struct queue *q, unsigned max_level)
342{
343 struct entry *e = q_peek(q, max_level, false);
344
345 if (e)
346 q_del(q, e);
347
348 return e;
349}
350
351/*
352 * This function assumes there is a non-sentinel entry to pop. It's only 355 * This function assumes there is a non-sentinel entry to pop. It's only
353 * used by redistribute, so we know this is true. It also doesn't adjust 356 * used by redistribute, so we know this is true. It also doesn't adjust
354 * the q->nr_elts count. 357 * the q->nr_elts count.
@@ -446,45 +449,49 @@ static void q_redistribute(struct queue *q)
446 break; 449 break;
447 450
448 e->level = level + 1u; 451 e->level = level + 1u;
449 l_add_head(q->es, l_above, e); 452 l_add_tail(q->es, l_above, e);
450 } 453 }
451 } 454 }
452} 455}
453 456
454static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels) 457static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels,
458 struct entry *s1, struct entry *s2)
455{ 459{
456 struct entry *de; 460 struct entry *de;
457 unsigned new_level; 461 unsigned sentinels_passed = 0;
458 462 unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels);
459 q_del(q, e);
460 463
464 /* try and find an entry to swap with */
461 if (extra_levels && (e->level < q->nr_levels - 1u)) { 465 if (extra_levels && (e->level < q->nr_levels - 1u)) {
462 new_level = min(q->nr_levels - 1u, e->level + extra_levels); 466 for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
463 for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) { 467 sentinels_passed++;
464 if (de->sentinel)
465 continue;
466 468
469 if (de) {
467 q_del(q, de); 470 q_del(q, de);
468 de->level = e->level; 471 de->level = e->level;
472 if (s1) {
473 switch (sentinels_passed) {
474 case 0:
475 q_push_before(q, s1, de);
476 break;
477
478 case 1:
479 q_push_before(q, s2, de);
480 break;
469 481
470 if (dest) 482 default:
471 q_push_before(q, dest, de); 483 q_push(q, de);
472 else 484 }
485 } else
473 q_push(q, de); 486 q_push(q, de);
474 break;
475 } 487 }
476
477 e->level = new_level;
478 } 488 }
479 489
490 q_del(q, e);
491 e->level = new_level;
480 q_push(q, e); 492 q_push(q, e);
481} 493}
482 494
483static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
484{
485 q_requeue_before(q, NULL, e, extra_levels);
486}
487
488/*----------------------------------------------------------------*/ 495/*----------------------------------------------------------------*/
489 496
490#define FP_SHIFT 8 497#define FP_SHIFT 8
@@ -550,7 +557,7 @@ static enum performance stats_assess(struct stats *s)
550 557
551/*----------------------------------------------------------------*/ 558/*----------------------------------------------------------------*/
552 559
553struct hash_table { 560struct smq_hash_table {
554 struct entry_space *es; 561 struct entry_space *es;
555 unsigned long long hash_bits; 562 unsigned long long hash_bits;
556 unsigned *buckets; 563 unsigned *buckets;
@@ -560,7 +567,7 @@ struct hash_table {
560 * All cache entries are stored in a chained hash table. To save space we 567 * All cache entries are stored in a chained hash table. To save space we
561 * use indexing again, and only store indexes to the next entry. 568 * use indexing again, and only store indexes to the next entry.
562 */ 569 */
563static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries) 570static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries)
564{ 571{
565 unsigned i, nr_buckets; 572 unsigned i, nr_buckets;
566 573
@@ -578,34 +585,34 @@ static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_ent
578 return 0; 585 return 0;
579} 586}
580 587
581static void h_exit(struct hash_table *ht) 588static void h_exit(struct smq_hash_table *ht)
582{ 589{
583 vfree(ht->buckets); 590 vfree(ht->buckets);
584} 591}
585 592
586static struct entry *h_head(struct hash_table *ht, unsigned bucket) 593static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket)
587{ 594{
588 return to_entry(ht->es, ht->buckets[bucket]); 595 return to_entry(ht->es, ht->buckets[bucket]);
589} 596}
590 597
591static struct entry *h_next(struct hash_table *ht, struct entry *e) 598static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
592{ 599{
593 return to_entry(ht->es, e->hash_next); 600 return to_entry(ht->es, e->hash_next);
594} 601}
595 602
596static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e) 603static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e)
597{ 604{
598 e->hash_next = ht->buckets[bucket]; 605 e->hash_next = ht->buckets[bucket];
599 ht->buckets[bucket] = to_index(ht->es, e); 606 ht->buckets[bucket] = to_index(ht->es, e);
600} 607}
601 608
602static void h_insert(struct hash_table *ht, struct entry *e) 609static void h_insert(struct smq_hash_table *ht, struct entry *e)
603{ 610{
604 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); 611 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
605 __h_insert(ht, h, e); 612 __h_insert(ht, h, e);
606} 613}
607 614
608static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock, 615static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock,
609 struct entry **prev) 616 struct entry **prev)
610{ 617{
611 struct entry *e; 618 struct entry *e;
@@ -621,7 +628,7 @@ static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t o
621 return NULL; 628 return NULL;
622} 629}
623 630
624static void __h_unlink(struct hash_table *ht, unsigned h, 631static void __h_unlink(struct smq_hash_table *ht, unsigned h,
625 struct entry *e, struct entry *prev) 632 struct entry *e, struct entry *prev)
626{ 633{
627 if (prev) 634 if (prev)
@@ -633,7 +640,7 @@ static void __h_unlink(struct hash_table *ht, unsigned h,
633/* 640/*
634 * Also moves each entry to the front of the bucket. 641 * Also moves each entry to the front of the bucket.
635 */ 642 */
636static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) 643static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
637{ 644{
638 struct entry *e, *prev; 645 struct entry *e, *prev;
639 unsigned h = hash_64(from_oblock(oblock), ht->hash_bits); 646 unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
@@ -651,7 +658,7 @@ static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
651 return e; 658 return e;
652} 659}
653 660
654static void h_remove(struct hash_table *ht, struct entry *e) 661static void h_remove(struct smq_hash_table *ht, struct entry *e)
655{ 662{
656 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); 663 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
657 struct entry *prev; 664 struct entry *prev;
@@ -699,7 +706,10 @@ static void init_entry(struct entry *e)
699 e->next = INDEXER_NULL; 706 e->next = INDEXER_NULL;
700 e->prev = INDEXER_NULL; 707 e->prev = INDEXER_NULL;
701 e->level = 0u; 708 e->level = 0u;
709 e->dirty = true; /* FIXME: audit */
702 e->allocated = true; 710 e->allocated = true;
711 e->sentinel = false;
712 e->pending_work = false;
703} 713}
704 714
705static struct entry *alloc_entry(struct entry_alloc *ea) 715static struct entry *alloc_entry(struct entry_alloc *ea)
@@ -762,11 +772,11 @@ static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
762#define NR_HOTSPOT_LEVELS 64u 772#define NR_HOTSPOT_LEVELS 64u
763#define NR_CACHE_LEVELS 64u 773#define NR_CACHE_LEVELS 64u
764 774
765#define WRITEBACK_PERIOD (10 * HZ) 775#define WRITEBACK_PERIOD (10ul * HZ)
766#define DEMOTE_PERIOD (60 * HZ) 776#define DEMOTE_PERIOD (60ul * HZ)
767 777
768#define HOTSPOT_UPDATE_PERIOD (HZ) 778#define HOTSPOT_UPDATE_PERIOD (HZ)
769#define CACHE_UPDATE_PERIOD (10u * HZ) 779#define CACHE_UPDATE_PERIOD (60ul * HZ)
770 780
771struct smq_policy { 781struct smq_policy {
772 struct dm_cache_policy policy; 782 struct dm_cache_policy policy;
@@ -814,8 +824,8 @@ struct smq_policy {
814 * The hash tables allows us to quickly find an entry by origin 824 * The hash tables allows us to quickly find an entry by origin
815 * block. 825 * block.
816 */ 826 */
817 struct hash_table table; 827 struct smq_hash_table table;
818 struct hash_table hotspot_table; 828 struct smq_hash_table hotspot_table;
819 829
820 bool current_writeback_sentinels; 830 bool current_writeback_sentinels;
821 unsigned long next_writeback_period; 831 unsigned long next_writeback_period;
@@ -828,6 +838,10 @@ struct smq_policy {
828 838
829 unsigned long next_hotspot_period; 839 unsigned long next_hotspot_period;
830 unsigned long next_cache_period; 840 unsigned long next_cache_period;
841
842 struct background_tracker *bg_work;
843
844 bool migrations_allowed;
831}; 845};
832 846
833/*----------------------------------------------------------------*/ 847/*----------------------------------------------------------------*/
@@ -876,15 +890,15 @@ static void __update_demote_sentinels(struct smq_policy *mq)
876static void update_sentinels(struct smq_policy *mq) 890static void update_sentinels(struct smq_policy *mq)
877{ 891{
878 if (time_after(jiffies, mq->next_writeback_period)) { 892 if (time_after(jiffies, mq->next_writeback_period)) {
879 __update_writeback_sentinels(mq);
880 mq->next_writeback_period = jiffies + WRITEBACK_PERIOD; 893 mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
881 mq->current_writeback_sentinels = !mq->current_writeback_sentinels; 894 mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
895 __update_writeback_sentinels(mq);
882 } 896 }
883 897
884 if (time_after(jiffies, mq->next_demote_period)) { 898 if (time_after(jiffies, mq->next_demote_period)) {
885 __update_demote_sentinels(mq);
886 mq->next_demote_period = jiffies + DEMOTE_PERIOD; 899 mq->next_demote_period = jiffies + DEMOTE_PERIOD;
887 mq->current_demote_sentinels = !mq->current_demote_sentinels; 900 mq->current_demote_sentinels = !mq->current_demote_sentinels;
901 __update_demote_sentinels(mq);
888 } 902 }
889} 903}
890 904
@@ -920,55 +934,40 @@ static void sentinels_init(struct smq_policy *mq)
920 934
921/*----------------------------------------------------------------*/ 935/*----------------------------------------------------------------*/
922 936
923/* 937static void del_queue(struct smq_policy *mq, struct entry *e)
924 * These methods tie together the dirty queue, clean queue and hash table.
925 */
926static void push_new(struct smq_policy *mq, struct entry *e)
927{ 938{
928 struct queue *q = e->dirty ? &mq->dirty : &mq->clean; 939 q_del(e->dirty ? &mq->dirty : &mq->clean, e);
929 h_insert(&mq->table, e);
930 q_push(q, e);
931} 940}
932 941
933static void push(struct smq_policy *mq, struct entry *e) 942static void push_queue(struct smq_policy *mq, struct entry *e)
934{ 943{
935 struct entry *sentinel; 944 if (e->dirty)
936 945 q_push(&mq->dirty, e);
937 h_insert(&mq->table, e); 946 else
938 947 q_push(&mq->clean, e);
939 /*
940 * Punch this into the queue just in front of the sentinel, to
941 * ensure it's cleaned straight away.
942 */
943 if (e->dirty) {
944 sentinel = writeback_sentinel(mq, e->level);
945 q_push_before(&mq->dirty, sentinel, e);
946 } else {
947 sentinel = demote_sentinel(mq, e->level);
948 q_push_before(&mq->clean, sentinel, e);
949 }
950} 948}
951 949
952/* 950// !h, !q, a -> h, q, a
953 * Removes an entry from cache. Removes from the hash table. 951static void push(struct smq_policy *mq, struct entry *e)
954 */
955static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
956{ 952{
957 q_del(q, e); 953 h_insert(&mq->table, e);
958 h_remove(&mq->table, e); 954 if (!e->pending_work)
955 push_queue(mq, e);
959} 956}
960 957
961static void del(struct smq_policy *mq, struct entry *e) 958static void push_queue_front(struct smq_policy *mq, struct entry *e)
962{ 959{
963 __del(mq, e->dirty ? &mq->dirty : &mq->clean, e); 960 if (e->dirty)
961 q_push_front(&mq->dirty, e);
962 else
963 q_push_front(&mq->clean, e);
964} 964}
965 965
966static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level) 966static void push_front(struct smq_policy *mq, struct entry *e)
967{ 967{
968 struct entry *e = q_pop_old(q, max_level); 968 h_insert(&mq->table, e);
969 if (e) 969 if (!e->pending_work)
970 h_remove(&mq->table, e); 970 push_queue_front(mq, e);
971 return e;
972} 971}
973 972
974static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) 973static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
@@ -978,16 +977,21 @@ static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
978 977
979static void requeue(struct smq_policy *mq, struct entry *e) 978static void requeue(struct smq_policy *mq, struct entry *e)
980{ 979{
981 struct entry *sentinel; 980 /*
981 * Pending work has temporarily been taken out of the queues.
982 */
983 if (e->pending_work)
984 return;
982 985
983 if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) { 986 if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
984 if (e->dirty) { 987 if (!e->dirty) {
985 sentinel = writeback_sentinel(mq, e->level); 988 q_requeue(&mq->clean, e, 1u, NULL, NULL);
986 q_requeue_before(&mq->dirty, sentinel, e, 1u); 989 return;
987 } else {
988 sentinel = demote_sentinel(mq, e->level);
989 q_requeue_before(&mq->clean, sentinel, e, 1u);
990 } 990 }
991
992 q_requeue(&mq->dirty, e, 1u,
993 get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
994 get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
991 } 995 }
992} 996}
993 997
@@ -1026,6 +1030,8 @@ static void update_promote_levels(struct smq_policy *mq)
1026 unsigned threshold_level = allocator_empty(&mq->cache_alloc) ? 1030 unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
1027 default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u); 1031 default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
1028 1032
1033 threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);
1034
1029 /* 1035 /*
1030 * If the hotspot queue is performing badly then we have little 1036 * If the hotspot queue is performing badly then we have little
1031 * confidence that we know which blocks to promote. So we cut down 1037 * confidence that we know which blocks to promote. So we cut down
@@ -1045,7 +1051,7 @@ static void update_promote_levels(struct smq_policy *mq)
1045 } 1051 }
1046 1052
1047 mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level; 1053 mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
1048 mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u; 1054 mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
1049} 1055}
1050 1056
1051/* 1057/*
@@ -1095,34 +1101,142 @@ static void end_cache_period(struct smq_policy *mq)
1095 } 1101 }
1096} 1102}
1097 1103
1098static int demote_cblock(struct smq_policy *mq, 1104/*----------------------------------------------------------------*/
1099 struct policy_locker *locker, 1105
1100 dm_oblock_t *oblock) 1106/*
1107 * Targets are given as a percentage.
1108 */
1109#define CLEAN_TARGET 25u
1110#define FREE_TARGET 25u
1111
1112static unsigned percent_to_target(struct smq_policy *mq, unsigned p)
1101{ 1113{
1102 struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false); 1114 return from_cblock(mq->cache_size) * p / 100u;
1103 if (!demoted) 1115}
1104 /* 1116
1105 * We could get a block from mq->dirty, but that 1117static bool clean_target_met(struct smq_policy *mq, bool idle)
1106 * would add extra latency to the triggering bio as it 1118{
1107 * waits for the writeback. Better to not promote this 1119 /*
1108 * time and hope there's a clean block next time this block 1120 * Cache entries may not be populated. So we cannot rely on the
1109 * is hit. 1121 * size of the clean queue.
1110 */ 1122 */
1111 return -ENOSPC; 1123 unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
1112 1124
1113 if (locker->fn(locker, demoted->oblock)) 1125 if (idle)
1114 /* 1126 /*
1115 * We couldn't lock this block. 1127 * We'd like to clean everything.
1116 */ 1128 */
1117 return -EBUSY; 1129 return q_size(&mq->dirty) == 0u;
1130 else
1131 return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >=
1132 percent_to_target(mq, CLEAN_TARGET);
1133}
1118 1134
1119 del(mq, demoted); 1135static bool free_target_met(struct smq_policy *mq, bool idle)
1120 *oblock = demoted->oblock; 1136{
1121 free_entry(&mq->cache_alloc, demoted); 1137 unsigned nr_free = from_cblock(mq->cache_size) -
1138 mq->cache_alloc.nr_allocated;
1122 1139
1123 return 0; 1140 if (idle)
1141 return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
1142 percent_to_target(mq, FREE_TARGET);
1143 else
1144 return true;
1124} 1145}
1125 1146
1147/*----------------------------------------------------------------*/
1148
1149static void mark_pending(struct smq_policy *mq, struct entry *e)
1150{
1151 BUG_ON(e->sentinel);
1152 BUG_ON(!e->allocated);
1153 BUG_ON(e->pending_work);
1154 e->pending_work = true;
1155}
1156
1157static void clear_pending(struct smq_policy *mq, struct entry *e)
1158{
1159 BUG_ON(!e->pending_work);
1160 e->pending_work = false;
1161}
1162
1163static void queue_writeback(struct smq_policy *mq)
1164{
1165 int r;
1166 struct policy_work work;
1167 struct entry *e;
1168
1169 e = q_peek(&mq->dirty, mq->dirty.nr_levels, false);
1170 if (e) {
1171 mark_pending(mq, e);
1172 q_del(&mq->dirty, e);
1173
1174 work.op = POLICY_WRITEBACK;
1175 work.oblock = e->oblock;
1176 work.cblock = infer_cblock(mq, e);
1177
1178 r = btracker_queue(mq->bg_work, &work, NULL);
1179 WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race.
1180 }
1181}
1182
1183static void queue_demotion(struct smq_policy *mq)
1184{
1185 struct policy_work work;
1186 struct entry *e;
1187
1188 if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed)))
1189 return;
1190
1191 e = q_peek(&mq->clean, mq->clean.nr_levels, true);
1192 if (!e) {
1193 if (!clean_target_met(mq, false))
1194 queue_writeback(mq);
1195 return;
1196 }
1197
1198 mark_pending(mq, e);
1199 q_del(&mq->clean, e);
1200
1201 work.op = POLICY_DEMOTE;
1202 work.oblock = e->oblock;
1203 work.cblock = infer_cblock(mq, e);
1204 btracker_queue(mq->bg_work, &work, NULL);
1205}
1206
1207static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
1208 struct policy_work **workp)
1209{
1210 struct entry *e;
1211 struct policy_work work;
1212
1213 if (!mq->migrations_allowed)
1214 return;
1215
1216 if (allocator_empty(&mq->cache_alloc)) {
1217 if (!free_target_met(mq, false))
1218 queue_demotion(mq);
1219 return;
1220 }
1221
1222 if (btracker_promotion_already_present(mq->bg_work, oblock))
1223 return;
1224
1225 /*
1226 * We allocate the entry now to reserve the cblock. If the
1227 * background work is aborted we must remember to free it.
1228 */
1229 e = alloc_entry(&mq->cache_alloc);
1230 BUG_ON(!e);
1231 e->pending_work = true;
1232 work.op = POLICY_PROMOTE;
1233 work.oblock = oblock;
1234 work.cblock = infer_cblock(mq, e);
1235 btracker_queue(mq->bg_work, &work, workp);
1236}
1237
1238/*----------------------------------------------------------------*/
1239
1126enum promote_result { 1240enum promote_result {
1127 PROMOTE_NOT, 1241 PROMOTE_NOT,
1128 PROMOTE_TEMPORARY, 1242 PROMOTE_TEMPORARY,
@@ -1137,49 +1251,18 @@ static enum promote_result maybe_promote(bool promote)
1137 return promote ? PROMOTE_PERMANENT : PROMOTE_NOT; 1251 return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
1138} 1252}
1139 1253
1140static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio, 1254static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
1141 bool fast_promote) 1255 int data_dir, bool fast_promote)
1142{ 1256{
1143 if (bio_data_dir(bio) == WRITE) { 1257 if (data_dir == WRITE) {
1144 if (!allocator_empty(&mq->cache_alloc) && fast_promote) 1258 if (!allocator_empty(&mq->cache_alloc) && fast_promote)
1145 return PROMOTE_TEMPORARY; 1259 return PROMOTE_TEMPORARY;
1146 1260
1147 else 1261 return maybe_promote(hs_e->level >= mq->write_promote_level);
1148 return maybe_promote(hs_e->level >= mq->write_promote_level);
1149 } else 1262 } else
1150 return maybe_promote(hs_e->level >= mq->read_promote_level); 1263 return maybe_promote(hs_e->level >= mq->read_promote_level);
1151} 1264}
1152 1265
1153static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
1154 struct policy_locker *locker,
1155 struct policy_result *result, enum promote_result pr)
1156{
1157 int r;
1158 struct entry *e;
1159
1160 if (allocator_empty(&mq->cache_alloc)) {
1161 result->op = POLICY_REPLACE;
1162 r = demote_cblock(mq, locker, &result->old_oblock);
1163 if (r) {
1164 result->op = POLICY_MISS;
1165 return;
1166 }
1167
1168 } else
1169 result->op = POLICY_NEW;
1170
1171 e = alloc_entry(&mq->cache_alloc);
1172 BUG_ON(!e);
1173 e->oblock = oblock;
1174
1175 if (pr == PROMOTE_TEMPORARY)
1176 push(mq, e);
1177 else
1178 push_new(mq, e);
1179
1180 result->cblock = infer_cblock(mq, e);
1181}
1182
1183static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) 1266static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
1184{ 1267{
1185 sector_t r = from_oblock(b); 1268 sector_t r = from_oblock(b);
@@ -1187,7 +1270,7 @@ static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
1187 return to_oblock(r); 1270 return to_oblock(r);
1188} 1271}
1189 1272
1190static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio) 1273static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
1191{ 1274{
1192 unsigned hi; 1275 unsigned hi;
1193 dm_oblock_t hb = to_hblock(mq, b); 1276 dm_oblock_t hb = to_hblock(mq, b);
@@ -1199,7 +1282,8 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b,
1199 hi = get_index(&mq->hotspot_alloc, e); 1282 hi = get_index(&mq->hotspot_alloc, e);
1200 q_requeue(&mq->hotspot, e, 1283 q_requeue(&mq->hotspot, e,
1201 test_and_set_bit(hi, mq->hotspot_hit_bits) ? 1284 test_and_set_bit(hi, mq->hotspot_hit_bits) ?
1202 0u : mq->hotspot_level_jump); 1285 0u : mq->hotspot_level_jump,
1286 NULL, NULL);
1203 1287
1204 } else { 1288 } else {
1205 stats_miss(&mq->hotspot_stats); 1289 stats_miss(&mq->hotspot_stats);
@@ -1225,47 +1309,6 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b,
1225 return e; 1309 return e;
1226} 1310}
1227 1311
1228/*
1229 * Looks the oblock up in the hash table, then decides whether to put in
1230 * pre_cache, or cache etc.
1231 */
1232static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
1233 bool can_migrate, bool fast_promote,
1234 struct policy_locker *locker, struct policy_result *result)
1235{
1236 struct entry *e, *hs_e;
1237 enum promote_result pr;
1238
1239 hs_e = update_hotspot_queue(mq, oblock, bio);
1240
1241 e = h_lookup(&mq->table, oblock);
1242 if (e) {
1243 stats_level_accessed(&mq->cache_stats, e->level);
1244
1245 requeue(mq, e);
1246 result->op = POLICY_HIT;
1247 result->cblock = infer_cblock(mq, e);
1248
1249 } else {
1250 stats_miss(&mq->cache_stats);
1251
1252 pr = should_promote(mq, hs_e, bio, fast_promote);
1253 if (pr == PROMOTE_NOT)
1254 result->op = POLICY_MISS;
1255
1256 else {
1257 if (!can_migrate) {
1258 result->op = POLICY_MISS;
1259 return -EWOULDBLOCK;
1260 }
1261
1262 insert_in_cache(mq, oblock, locker, result, pr);
1263 }
1264 }
1265
1266 return 0;
1267}
1268
1269/*----------------------------------------------------------------*/ 1312/*----------------------------------------------------------------*/
1270 1313
1271/* 1314/*
@@ -1282,6 +1325,7 @@ static void smq_destroy(struct dm_cache_policy *p)
1282{ 1325{
1283 struct smq_policy *mq = to_smq_policy(p); 1326 struct smq_policy *mq = to_smq_policy(p);
1284 1327
1328 btracker_destroy(mq->bg_work);
1285 h_exit(&mq->hotspot_table); 1329 h_exit(&mq->hotspot_table);
1286 h_exit(&mq->table); 1330 h_exit(&mq->table);
1287 free_bitset(mq->hotspot_hit_bits); 1331 free_bitset(mq->hotspot_hit_bits);
@@ -1290,234 +1334,247 @@ static void smq_destroy(struct dm_cache_policy *p)
1290 kfree(mq); 1334 kfree(mq);
1291} 1335}
1292 1336
1293static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock, 1337/*----------------------------------------------------------------*/
1294 bool can_block, bool can_migrate, bool fast_promote,
1295 struct bio *bio, struct policy_locker *locker,
1296 struct policy_result *result)
1297{
1298 int r;
1299 unsigned long flags;
1300 struct smq_policy *mq = to_smq_policy(p);
1301
1302 result->op = POLICY_MISS;
1303
1304 spin_lock_irqsave(&mq->lock, flags);
1305 r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
1306 spin_unlock_irqrestore(&mq->lock, flags);
1307
1308 return r;
1309}
1310 1338
1311static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) 1339static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
1340 int data_dir, bool fast_copy,
1341 struct policy_work **work, bool *background_work)
1312{ 1342{
1313 int r; 1343 struct entry *e, *hs_e;
1314 unsigned long flags; 1344 enum promote_result pr;
1315 struct smq_policy *mq = to_smq_policy(p); 1345
1316 struct entry *e; 1346 *background_work = false;
1317 1347
1318 spin_lock_irqsave(&mq->lock, flags);
1319 e = h_lookup(&mq->table, oblock); 1348 e = h_lookup(&mq->table, oblock);
1320 if (e) { 1349 if (e) {
1350 stats_level_accessed(&mq->cache_stats, e->level);
1351
1352 requeue(mq, e);
1321 *cblock = infer_cblock(mq, e); 1353 *cblock = infer_cblock(mq, e);
1322 r = 0; 1354 return 0;
1323 } else
1324 r = -ENOENT;
1325 spin_unlock_irqrestore(&mq->lock, flags);
1326 1355
1327 return r; 1356 } else {
1328} 1357 stats_miss(&mq->cache_stats);
1329 1358
1330static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set) 1359 /*
1331{ 1360 * The hotspot queue only gets updated with misses.
1332 struct entry *e; 1361 */
1362 hs_e = update_hotspot_queue(mq, oblock);
1333 1363
1334 e = h_lookup(&mq->table, oblock); 1364 pr = should_promote(mq, hs_e, data_dir, fast_copy);
1335 BUG_ON(!e); 1365 if (pr != PROMOTE_NOT) {
1366 queue_promotion(mq, oblock, work);
1367 *background_work = true;
1368 }
1336 1369
1337 del(mq, e); 1370 return -ENOENT;
1338 e->dirty = set; 1371 }
1339 push(mq, e);
1340} 1372}
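
For orientation, a minimal caller-side sketch of the lookup contract implemented by __lookup() above: a hit fills *cblock and returns 0, a miss returns -ENOENT, and the background flag reports whether a promotion was quietly queued on the background tracker. The sketch calls straight through the ops table declared in dm-cache-policy.h further down; example_map_read() itself is hypothetical and not part of the patch.

/* --- illustrative sketch, not part of the patch --- */
#include "dm-cache-policy.h"

static void example_map_read(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	dm_cblock_t cblock;
	bool background_queued;
	int r;

	r = p->lookup(p, oblock, &cblock, READ, false, &background_queued);
	if (!r) {
		/* hit: remap the bio to cblock on the fast device */
	} else if (r == -ENOENT) {
		/*
		 * miss: remap to the origin; if background_queued is true a
		 * promotion is now waiting on the background tracker.
		 */
	}
}
/* --- end sketch --- */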
1341 1373
1342static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 1374static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
1375 int data_dir, bool fast_copy,
1376 bool *background_work)
1343{ 1377{
1378 int r;
1344 unsigned long flags; 1379 unsigned long flags;
1345 struct smq_policy *mq = to_smq_policy(p); 1380 struct smq_policy *mq = to_smq_policy(p);
1346 1381
1347 spin_lock_irqsave(&mq->lock, flags); 1382 spin_lock_irqsave(&mq->lock, flags);
1348 __smq_set_clear_dirty(mq, oblock, true); 1383 r = __lookup(mq, oblock, cblock,
1384 data_dir, fast_copy,
1385 NULL, background_work);
1349 spin_unlock_irqrestore(&mq->lock, flags); 1386 spin_unlock_irqrestore(&mq->lock, flags);
1387
1388 return r;
1350} 1389}
1351 1390
1352static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 1391static int smq_lookup_with_work(struct dm_cache_policy *p,
1392 dm_oblock_t oblock, dm_cblock_t *cblock,
1393 int data_dir, bool fast_copy,
1394 struct policy_work **work)
1353{ 1395{
1354 struct smq_policy *mq = to_smq_policy(p); 1396 int r;
1397 bool background_queued;
1355 unsigned long flags; 1398 unsigned long flags;
1399 struct smq_policy *mq = to_smq_policy(p);
1356 1400
1357 spin_lock_irqsave(&mq->lock, flags); 1401 spin_lock_irqsave(&mq->lock, flags);
1358 __smq_set_clear_dirty(mq, oblock, false); 1402 r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
1359 spin_unlock_irqrestore(&mq->lock, flags); 1403 spin_unlock_irqrestore(&mq->lock, flags);
1360}
1361 1404
1362static unsigned random_level(dm_cblock_t cblock) 1405 return r;
1363{
1364 return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
1365} 1406}
1366 1407
1367static int smq_load_mapping(struct dm_cache_policy *p, 1408static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
1368 dm_oblock_t oblock, dm_cblock_t cblock, 1409 struct policy_work **result)
1369 uint32_t hint, bool hint_valid)
1370{ 1410{
1411 int r;
1412 unsigned long flags;
1371 struct smq_policy *mq = to_smq_policy(p); 1413 struct smq_policy *mq = to_smq_policy(p);
1372 struct entry *e;
1373 1414
1374 e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); 1415 spin_lock_irqsave(&mq->lock, flags);
1375 e->oblock = oblock; 1416 r = btracker_issue(mq->bg_work, result);
1376 e->dirty = false; /* this gets corrected in a minute */ 1417 if (r == -ENODATA) {
1377 e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); 1418 /* find some writeback work to do */
1378 push(mq, e); 1419 if (mq->migrations_allowed && !free_target_met(mq, idle))
1379 1420 queue_demotion(mq);
1380 return 0;
1381}
1382 1421
1383static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock) 1422 else if (!clean_target_met(mq, idle))
1384{ 1423 queue_writeback(mq);
1385 struct smq_policy *mq = to_smq_policy(p);
1386 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1387 1424
1388 if (!e->allocated) 1425 r = btracker_issue(mq->bg_work, result);
1389 return 0; 1426 }
1427 spin_unlock_irqrestore(&mq->lock, flags);
1390 1428
1391 return e->level; 1429 return r;
1392} 1430}
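
smq_get_background_work() first drains the background tracker; only when that returns -ENODATA does it try to generate new work, queueing a demotion if migrations are allowed and the free-space target is not met, otherwise a writeback if the clean target is not met. Below is a hedged sketch of the consuming side; example_start_migration() is a hypothetical placeholder for whatever performs the copy, and the loop shape is an assumption rather than code from this patch.

/* --- illustrative sketch, not part of the patch --- */
extern void example_start_migration(struct policy_work *op);	/* hypothetical */

static void example_issue_background_work(struct dm_cache_policy *p, bool idle)
{
	struct policy_work *op;
	int r;

	for (;;) {
		r = p->get_background_work(p, idle, &op);
		if (r == -ENODATA)
			break;			/* nothing to do right now */
		if (r)
			break;			/* unexpected error */

		/*
		 * op->op is POLICY_PROMOTE, POLICY_DEMOTE or POLICY_WRITEBACK;
		 * op->oblock / op->cblock identify the blocks involved.
		 */
		example_start_migration(op);
	}
}
/* --- end sketch --- */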
1393 1431
1394static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock) 1432/*
1395{ 1433 * We need to clear any pending work flags that have been set, and in the
1396 struct entry *e; 1434 * case of promotion free the entry for the destination cblock.
1435 */
1436static void __complete_background_work(struct smq_policy *mq,
1437 struct policy_work *work,
1438 bool success)
1439{
1440 struct entry *e = get_entry(&mq->cache_alloc,
1441 from_cblock(work->cblock));
1442
1443 switch (work->op) {
1444 case POLICY_PROMOTE:
1445 // !h, !q, a
1446 clear_pending(mq, e);
1447 if (success) {
1448 e->oblock = work->oblock;
1449 push(mq, e);
1450 // h, q, a
1451 } else {
1452 free_entry(&mq->cache_alloc, e);
1453 // !h, !q, !a
1454 }
1455 break;
1397 1456
1398 e = h_lookup(&mq->table, oblock); 1457 case POLICY_DEMOTE:
1399 BUG_ON(!e); 1458 // h, !q, a
1459 if (success) {
1460 h_remove(&mq->table, e);
1461 free_entry(&mq->cache_alloc, e);
1462 // !h, !q, !a
1463 } else {
1464 clear_pending(mq, e);
1465 push_queue(mq, e);
1466 // h, q, a
1467 }
1468 break;
1400 1469
1401 del(mq, e); 1470 case POLICY_WRITEBACK:
1402 free_entry(&mq->cache_alloc, e); 1471 // h, !q, a
1472 clear_pending(mq, e);
1473 push_queue(mq, e);
1474 // h, q, a
1475 break;
1476 }
1477
1478 btracker_complete(mq->bg_work, work);
1403} 1479}
1404 1480
1405static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) 1481static void smq_complete_background_work(struct dm_cache_policy *p,
1482 struct policy_work *work,
1483 bool success)
1406{ 1484{
1407 struct smq_policy *mq = to_smq_policy(p);
1408 unsigned long flags; 1485 unsigned long flags;
1486 struct smq_policy *mq = to_smq_policy(p);
1409 1487
1410 spin_lock_irqsave(&mq->lock, flags); 1488 spin_lock_irqsave(&mq->lock, flags);
1411 __remove_mapping(mq, oblock); 1489 __complete_background_work(mq, work, success);
1412 spin_unlock_irqrestore(&mq->lock, flags); 1490 spin_unlock_irqrestore(&mq->lock, flags);
1413} 1491}
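
The // h, q, a annotations above track whether an entry is hashed, queued and allocated before and after each transition. From the core target's side the contract is simply: once the copy or writeback finishes, hand back the exact policy_work pointer that was issued, flagging success or failure. A minimal, hypothetical sketch (struct example_migration is not from this patch, only the final call mirrors the interface):

/* --- illustrative sketch, not part of the patch --- */
struct example_migration {
	struct dm_cache_policy *policy;
	struct policy_work *op;
};

static void example_copy_complete(struct example_migration *mg, int copy_err)
{
	/*
	 * success == false after a failed promotion frees the destination
	 * cblock; after a failed demotion or a writeback the entry is simply
	 * requeued, as handled in __complete_background_work() above.
	 */
	mg->policy->complete_background_work(mg->policy, mg->op, !copy_err);
}
/* --- end sketch --- */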
1414 1492
1415static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock) 1493// in_hash(oblock) -> in_hash(oblock)
1494static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
1416{ 1495{
1417 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); 1496 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1418 1497
1419 if (!e || !e->allocated) 1498 if (e->pending_work)
1420 return -ENODATA; 1499 e->dirty = set;
1421 1500 else {
1422 del(mq, e); 1501 del_queue(mq, e);
1423 free_entry(&mq->cache_alloc, e); 1502 e->dirty = set;
1424 1503 push_queue(mq, e);
1425 return 0; 1504 }
1426} 1505}
1427 1506
1428static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) 1507static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1429{ 1508{
1430 int r;
1431 unsigned long flags; 1509 unsigned long flags;
1432 struct smq_policy *mq = to_smq_policy(p); 1510 struct smq_policy *mq = to_smq_policy(p);
1433 1511
1434 spin_lock_irqsave(&mq->lock, flags); 1512 spin_lock_irqsave(&mq->lock, flags);
1435 r = __remove_cblock(mq, cblock); 1513 __smq_set_clear_dirty(mq, cblock, true);
1436 spin_unlock_irqrestore(&mq->lock, flags); 1514 spin_unlock_irqrestore(&mq->lock, flags);
1437
1438 return r;
1439} 1515}
1440 1516
1441 1517static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1442#define CLEAN_TARGET_CRITICAL 5u /* percent */
1443
1444static bool clean_target_met(struct smq_policy *mq, bool critical)
1445{ 1518{
1446 if (critical) { 1519 struct smq_policy *mq = to_smq_policy(p);
1447 /* 1520 unsigned long flags;
1448 * Cache entries may not be populated. So we cannot rely on the
1449 * size of the clean queue.
1450 */
1451 unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
1452 unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
1453 1521
1454 return nr_clean >= target; 1522 spin_lock_irqsave(&mq->lock, flags);
1455 } else 1523 __smq_set_clear_dirty(mq, cblock, false);
1456 return !q_size(&mq->dirty); 1524 spin_unlock_irqrestore(&mq->lock, flags);
1457} 1525}
1458 1526
1459static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock, 1527static unsigned random_level(dm_cblock_t cblock)
1460 dm_cblock_t *cblock, bool critical_only)
1461{ 1528{
1462 struct entry *e = NULL; 1529 return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
1463 bool target_met = clean_target_met(mq, critical_only); 1530}
1464
1465 if (critical_only)
1466 /*
1467 * Always try and keep the bottom level clean.
1468 */
1469 e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
1470 1531
1471 else 1532static int smq_load_mapping(struct dm_cache_policy *p,
1472 e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels); 1533 dm_oblock_t oblock, dm_cblock_t cblock,
1534 bool dirty, uint32_t hint, bool hint_valid)
1535{
1536 struct smq_policy *mq = to_smq_policy(p);
1537 struct entry *e;
1473 1538
1474 if (!e) 1539 e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
1475 return -ENODATA; 1540 e->oblock = oblock;
1541 e->dirty = dirty;
1542 e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
1543 e->pending_work = false;
1476 1544
1477 *oblock = e->oblock; 1545 /*
1478 *cblock = infer_cblock(mq, e); 1546 * When we load mappings we push ahead of both sentinels in order to
1479 e->dirty = false; 1547 * allow demotions and cleaning to occur immediately.
1480 push_new(mq, e); 1548 */
1549 push_front(mq, e);
1481 1550
1482 return 0; 1551 return 0;
1483} 1552}
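
load_mapping() now receives the dirty bit straight from the metadata instead of defaulting it and correcting it afterwards (the old "this gets corrected in a minute" dance). A sketch of a metadata-walk callback feeding it follows; the callback signature is an assumption for illustration, only the load_mapping() call mirrors the new interface.

/* --- illustrative sketch, not part of the patch --- */
static int example_load_mapping(void *context, dm_oblock_t oblock,
				dm_cblock_t cblock, bool dirty,
				uint32_t hint, bool hint_valid)
{
	struct dm_cache_policy *p = context;

	/* the dirty bit flows straight into the policy entry */
	return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid);
}
/* --- end sketch --- */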
1484 1553
1485static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, 1554static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
1486 dm_cblock_t *cblock, bool critical_only)
1487{ 1555{
1488 int r;
1489 unsigned long flags;
1490 struct smq_policy *mq = to_smq_policy(p); 1556 struct smq_policy *mq = to_smq_policy(p);
1557 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1491 1558
1492 spin_lock_irqsave(&mq->lock, flags); 1559 if (!e->allocated)
1493 r = __smq_writeback_work(mq, oblock, cblock, critical_only); 1560 return -ENODATA;
1494 spin_unlock_irqrestore(&mq->lock, flags);
1495
1496 return r;
1497}
1498
1499static void __force_mapping(struct smq_policy *mq,
1500 dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1501{
1502 struct entry *e = h_lookup(&mq->table, current_oblock);
1503 1561
1504 if (e) { 1562 // FIXME: what if this block has pending background work?
1505 del(mq, e); 1563 del_queue(mq, e);
1506 e->oblock = new_oblock; 1564 h_remove(&mq->table, e);
1507 e->dirty = true; 1565 free_entry(&mq->cache_alloc, e);
1508 push(mq, e); 1566 return 0;
1509 }
1510} 1567}
1511 1568
1512static void smq_force_mapping(struct dm_cache_policy *p, 1569static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
1513 dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1514{ 1570{
1515 unsigned long flags;
1516 struct smq_policy *mq = to_smq_policy(p); 1571 struct smq_policy *mq = to_smq_policy(p);
1572 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1517 1573
1518 spin_lock_irqsave(&mq->lock, flags); 1574 if (!e->allocated)
1519 __force_mapping(mq, current_oblock, new_oblock); 1575 return 0;
1520 spin_unlock_irqrestore(&mq->lock, flags); 1576
1577 return e->level;
1521} 1578}
1522 1579
1523static dm_cblock_t smq_residency(struct dm_cache_policy *p) 1580static dm_cblock_t smq_residency(struct dm_cache_policy *p)
@@ -1546,6 +1603,12 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block)
1546 spin_unlock_irqrestore(&mq->lock, flags); 1603 spin_unlock_irqrestore(&mq->lock, flags);
1547} 1604}
1548 1605
1606static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
1607{
1608 struct smq_policy *mq = to_smq_policy(p);
1609 mq->migrations_allowed = allow;
1610}
1611
1549/* 1612/*
1550 * smq has no config values, but the old mq policy did. To avoid breaking 1613 * smq has no config values, but the old mq policy did. To avoid breaking
1551 * software we continue to accept these configurables for the mq policy, 1614 * software we continue to accept these configurables for the mq policy,
@@ -1590,18 +1653,18 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
1590static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) 1653static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
1591{ 1654{
1592 mq->policy.destroy = smq_destroy; 1655 mq->policy.destroy = smq_destroy;
1593 mq->policy.map = smq_map;
1594 mq->policy.lookup = smq_lookup; 1656 mq->policy.lookup = smq_lookup;
1657 mq->policy.lookup_with_work = smq_lookup_with_work;
1658 mq->policy.get_background_work = smq_get_background_work;
1659 mq->policy.complete_background_work = smq_complete_background_work;
1595 mq->policy.set_dirty = smq_set_dirty; 1660 mq->policy.set_dirty = smq_set_dirty;
1596 mq->policy.clear_dirty = smq_clear_dirty; 1661 mq->policy.clear_dirty = smq_clear_dirty;
1597 mq->policy.load_mapping = smq_load_mapping; 1662 mq->policy.load_mapping = smq_load_mapping;
1663 mq->policy.invalidate_mapping = smq_invalidate_mapping;
1598 mq->policy.get_hint = smq_get_hint; 1664 mq->policy.get_hint = smq_get_hint;
1599 mq->policy.remove_mapping = smq_remove_mapping;
1600 mq->policy.remove_cblock = smq_remove_cblock;
1601 mq->policy.writeback_work = smq_writeback_work;
1602 mq->policy.force_mapping = smq_force_mapping;
1603 mq->policy.residency = smq_residency; 1665 mq->policy.residency = smq_residency;
1604 mq->policy.tick = smq_tick; 1666 mq->policy.tick = smq_tick;
1667 mq->policy.allow_migrations = smq_allow_migrations;
1605 1668
1606 if (mimic_mq) { 1669 if (mimic_mq) {
1607 mq->policy.set_config_value = mq_set_config_value; 1670 mq->policy.set_config_value = mq_set_config_value;
@@ -1633,7 +1696,8 @@ static void calc_hotspot_params(sector_t origin_size,
1633static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, 1696static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
1634 sector_t origin_size, 1697 sector_t origin_size,
1635 sector_t cache_block_size, 1698 sector_t cache_block_size,
1636 bool mimic_mq) 1699 bool mimic_mq,
1700 bool migrations_allowed)
1637{ 1701{
1638 unsigned i; 1702 unsigned i;
1639 unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; 1703 unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
@@ -1658,11 +1722,11 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
1658 } 1722 }
1659 1723
1660 init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue); 1724 init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
1661 for (i = 0; i < nr_sentinels_per_queue; i++) 1725 for (i = 0; i < nr_sentinels_per_queue; i++)
1662 get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true; 1726 get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
1663 1727
1664 init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels); 1728 init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
1665 for (i = 0; i < nr_sentinels_per_queue; i++) 1729 for (i = 0; i < nr_sentinels_per_queue; i++)
1666 get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true; 1730 get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
1667 1731
1668 init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels, 1732 init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
@@ -1715,8 +1779,16 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
1715 mq->next_hotspot_period = jiffies; 1779 mq->next_hotspot_period = jiffies;
1716 mq->next_cache_period = jiffies; 1780 mq->next_cache_period = jiffies;
1717 1781
1782 mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */
1783 if (!mq->bg_work)
1784 goto bad_btracker;
1785
1786 mq->migrations_allowed = migrations_allowed;
1787
1718 return &mq->policy; 1788 return &mq->policy;
1719 1789
1790bad_btracker:
1791 h_exit(&mq->hotspot_table);
1720bad_alloc_hotspot_table: 1792bad_alloc_hotspot_table:
1721 h_exit(&mq->table); 1793 h_exit(&mq->table);
1722bad_alloc_table: 1794bad_alloc_table:
@@ -1735,21 +1807,28 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
1735 sector_t origin_size, 1807 sector_t origin_size,
1736 sector_t cache_block_size) 1808 sector_t cache_block_size)
1737{ 1809{
1738 return __smq_create(cache_size, origin_size, cache_block_size, false); 1810 return __smq_create(cache_size, origin_size, cache_block_size, false, true);
1739} 1811}
1740 1812
1741static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, 1813static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1742 sector_t origin_size, 1814 sector_t origin_size,
1743 sector_t cache_block_size) 1815 sector_t cache_block_size)
1744{ 1816{
1745 return __smq_create(cache_size, origin_size, cache_block_size, true); 1817 return __smq_create(cache_size, origin_size, cache_block_size, true, true);
1818}
1819
1820static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
1821 sector_t origin_size,
1822 sector_t cache_block_size)
1823{
1824 return __smq_create(cache_size, origin_size, cache_block_size, false, false);
1746} 1825}
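
With the standalone cleaner module gone, "cleaner" is simply smq created with migrations_allowed set to false, and the same knob is reachable at runtime through the allow_migrations() method registered above. A hypothetical helper showing that runtime side (example_pause_migrations() is not from this patch):

/* --- illustrative sketch, not part of the patch --- */
static void example_pause_migrations(struct dm_cache_policy *p, bool pause)
{
	if (p->allow_migrations)
		p->allow_migrations(p, !pause);
}
/* --- end sketch --- */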
1747 1826
1748/*----------------------------------------------------------------*/ 1827/*----------------------------------------------------------------*/
1749 1828
1750static struct dm_cache_policy_type smq_policy_type = { 1829static struct dm_cache_policy_type smq_policy_type = {
1751 .name = "smq", 1830 .name = "smq",
1752 .version = {1, 5, 0}, 1831 .version = {2, 0, 0},
1753 .hint_size = 4, 1832 .hint_size = 4,
1754 .owner = THIS_MODULE, 1833 .owner = THIS_MODULE,
1755 .create = smq_create 1834 .create = smq_create
@@ -1757,15 +1836,23 @@ static struct dm_cache_policy_type smq_policy_type = {
1757 1836
1758static struct dm_cache_policy_type mq_policy_type = { 1837static struct dm_cache_policy_type mq_policy_type = {
1759 .name = "mq", 1838 .name = "mq",
1760 .version = {1, 5, 0}, 1839 .version = {2, 0, 0},
1761 .hint_size = 4, 1840 .hint_size = 4,
1762 .owner = THIS_MODULE, 1841 .owner = THIS_MODULE,
1763 .create = mq_create, 1842 .create = mq_create,
1764}; 1843};
1765 1844
1845static struct dm_cache_policy_type cleaner_policy_type = {
1846 .name = "cleaner",
1847 .version = {2, 0, 0},
1848 .hint_size = 4,
1849 .owner = THIS_MODULE,
1850 .create = cleaner_create,
1851};
1852
1766static struct dm_cache_policy_type default_policy_type = { 1853static struct dm_cache_policy_type default_policy_type = {
1767 .name = "default", 1854 .name = "default",
1768 .version = {1, 5, 0}, 1855 .version = {2, 0, 0},
1769 .hint_size = 4, 1856 .hint_size = 4,
1770 .owner = THIS_MODULE, 1857 .owner = THIS_MODULE,
1771 .create = smq_create, 1858 .create = smq_create,
@@ -1785,23 +1872,36 @@ static int __init smq_init(void)
1785 r = dm_cache_policy_register(&mq_policy_type); 1872 r = dm_cache_policy_register(&mq_policy_type);
1786 if (r) { 1873 if (r) {
1787 DMERR("register failed (as mq) %d", r); 1874 DMERR("register failed (as mq) %d", r);
1788 dm_cache_policy_unregister(&smq_policy_type); 1875 goto out_mq;
1789 return -ENOMEM; 1876 }
1877
1878 r = dm_cache_policy_register(&cleaner_policy_type);
1879 if (r) {
1880 DMERR("register failed (as cleaner) %d", r);
1881 goto out_cleaner;
1790 } 1882 }
1791 1883
1792 r = dm_cache_policy_register(&default_policy_type); 1884 r = dm_cache_policy_register(&default_policy_type);
1793 if (r) { 1885 if (r) {
1794 DMERR("register failed (as default) %d", r); 1886 DMERR("register failed (as default) %d", r);
1795 dm_cache_policy_unregister(&mq_policy_type); 1887 goto out_default;
1796 dm_cache_policy_unregister(&smq_policy_type);
1797 return -ENOMEM;
1798 } 1888 }
1799 1889
1800 return 0; 1890 return 0;
1891
1892out_default:
1893 dm_cache_policy_unregister(&cleaner_policy_type);
1894out_cleaner:
1895 dm_cache_policy_unregister(&mq_policy_type);
1896out_mq:
1897 dm_cache_policy_unregister(&smq_policy_type);
1898
1899 return -ENOMEM;
1801} 1900}
1802 1901
1803static void __exit smq_exit(void) 1902static void __exit smq_exit(void)
1804{ 1903{
1904 dm_cache_policy_unregister(&cleaner_policy_type);
1805 dm_cache_policy_unregister(&smq_policy_type); 1905 dm_cache_policy_unregister(&smq_policy_type);
1806 dm_cache_policy_unregister(&mq_policy_type); 1906 dm_cache_policy_unregister(&mq_policy_type);
1807 dm_cache_policy_unregister(&default_policy_type); 1907 dm_cache_policy_unregister(&default_policy_type);
@@ -1816,3 +1916,4 @@ MODULE_DESCRIPTION("smq cache policy");
1816 1916
1817MODULE_ALIAS("dm-cache-default"); 1917MODULE_ALIAS("dm-cache-default");
1818MODULE_ALIAS("dm-cache-mq"); 1918MODULE_ALIAS("dm-cache-mq");
1919MODULE_ALIAS("dm-cache-cleaner");
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index aa10b1493f34..c05fc3436cef 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -13,183 +13,100 @@
13 13
14/*----------------------------------------------------------------*/ 14/*----------------------------------------------------------------*/
15 15
16/* FIXME: make it clear which methods are optional. Get debug policy to
17 * double check this at start.
18 */
19
20/* 16/*
21 * The cache policy makes the important decisions about which blocks get to 17 * The cache policy makes the important decisions about which blocks get to
22 * live on the faster cache device. 18 * live on the faster cache device.
23 *
24 * When the core target has to remap a bio it calls the 'map' method of the
25 * policy. This returns an instruction telling the core target what to do.
26 *
27 * POLICY_HIT:
28 * That block is in the cache. Remap to the cache and carry on.
29 *
30 * POLICY_MISS:
31 * This block is on the origin device. Remap and carry on.
32 *
33 * POLICY_NEW:
34 * This block is currently on the origin device, but the policy wants to
35 * move it. The core should:
36 *
37 * - hold any further io to this origin block
38 * - copy the origin to the given cache block
39 * - release all the held blocks
40 * - remap the original block to the cache
41 *
42 * POLICY_REPLACE:
43 * This block is currently on the origin device. The policy wants to
44 * move it to the cache, with the added complication that the destination
45 * cache block needs a writeback first. The core should:
46 *
47 * - hold any further io to this origin block
48 * - hold any further io to the origin block that's being written back
49 * - writeback
50 * - copy new block to cache
51 * - release held blocks
52 * - remap bio to cache and reissue.
53 *
54 * Should the core run into trouble while processing a POLICY_NEW or
55 * POLICY_REPLACE instruction it will roll back the policy's mapping using
56 * remove_mapping() or force_mapping(). These methods must not fail. This
57 * approach avoids having transactional semantics in the policy (ie, the
58 * core informing the policy when a migration is complete), and hence makes
59 * it easier to write new policies.
60 *
61 * In general policy methods should never block, except in the case of the
62 * map function when can_migrate is set. So be careful to implement using
63 * bounded, preallocated memory.
64 */ 19 */
65enum policy_operation { 20enum policy_operation {
66 POLICY_HIT, 21 POLICY_PROMOTE,
67 POLICY_MISS, 22 POLICY_DEMOTE,
68 POLICY_NEW, 23 POLICY_WRITEBACK
69 POLICY_REPLACE
70};
71
72/*
73 * When issuing a POLICY_REPLACE the policy needs to make a callback to
74 * lock the block being demoted. This doesn't need to occur during a
75 * writeback operation since the block remains in the cache.
76 */
77struct policy_locker;
78typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock);
79
80struct policy_locker {
81 policy_lock_fn fn;
82}; 24};
83 25
84/* 26/*
85 * This is the instruction passed back to the core target. 27 * This is the instruction passed back to the core target.
86 */ 28 */
87struct policy_result { 29struct policy_work {
88 enum policy_operation op; 30 enum policy_operation op;
89 dm_oblock_t old_oblock; /* POLICY_REPLACE */ 31 dm_oblock_t oblock;
90 dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */ 32 dm_cblock_t cblock;
91}; 33};
92 34
93/* 35/*
94 * The cache policy object. Just a bunch of methods. It is envisaged that 36 * The cache policy object. It is envisaged that this structure will be
95 * this structure will be embedded in a bigger, policy specific structure 37 * embedded in a bigger, policy specific structure (ie. use container_of()).
96 * (ie. use container_of()).
97 */ 38 */
98struct dm_cache_policy { 39struct dm_cache_policy {
99
100 /*
101 * FIXME: make it clear which methods are optional, and which may
102 * block.
103 */
104
105 /* 40 /*
106 * Destroys this object. 41 * Destroys this object.
107 */ 42 */
108 void (*destroy)(struct dm_cache_policy *p); 43 void (*destroy)(struct dm_cache_policy *p);
109 44
110 /* 45 /*
111 * See large comment above. 46 * Find the location of a block.
112 *
113 * oblock - the origin block we're interested in.
114 *
115 * can_block - indicates whether the current thread is allowed to
116 * block. -EWOULDBLOCK returned if it can't and would.
117 *
118 * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE
119 * instructions. If denied and the policy would have
120 * returned one of these instructions it should
121 * return -EWOULDBLOCK.
122 * 47 *
123 * discarded_oblock - indicates whether the whole origin block is 48 * Must not block.
124 * in a discarded state (FIXME: better to tell the
125 * policy about this sooner, so it can recycle that
126 * cache block if it wants.)
127 * bio - the bio that triggered this call.
128 * result - gets filled in with the instruction.
129 * 49 *
130 * May only return 0, or -EWOULDBLOCK (if !can_migrate) 50 * Returns 0 if in cache (cblock will be set), -ENOENT if not, < 0 for
51 * other errors (-EWOULDBLOCK would be typical). data_dir should be
52 * READ or WRITE. fast_copy should be set if migrating this block would
53 * be 'cheap' somehow (eg, discarded data). background_queued will be set
54 * if a migration has just been queued.
131 */ 55 */
132 int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock, 56 int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
133 bool can_block, bool can_migrate, bool discarded_oblock, 57 int data_dir, bool fast_copy, bool *background_queued);
134 struct bio *bio, struct policy_locker *locker,
135 struct policy_result *result);
136 58
137 /* 59 /*
138 * Sometimes we want to see if a block is in the cache, without 60 * Sometimes the core target can optimise a migration, eg, the
139 * triggering any update of stats. (ie. it's not a real hit). 61 * block may be discarded, or the bio may cover an entire block.
140 * 62 * In order to optimise it needs the migration immediately though
141 * Must not block. 63 * so it knows to do something different with the bio.
142 * 64 *
143 * Returns 0 if in cache, -ENOENT if not, < 0 for other errors 65 * This method is optional (policy-internal will fallback to using
144 * (-EWOULDBLOCK would be typical). 66 * lookup).
145 */ 67 */
146 int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); 68 int (*lookup_with_work)(struct dm_cache_policy *p,
147 69 dm_oblock_t oblock, dm_cblock_t *cblock,
148 void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); 70 int data_dir, bool fast_copy,
149 void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); 71 struct policy_work **work);
150 72
151 /* 73 /*
152 * Called when a cache target is first created. Used to load a 74 * Retrieves background work. Returns -ENODATA when there's no
153 * mapping from the metadata device into the policy. 75 * background work.
154 */ 76 */
155 int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, 77 int (*get_background_work)(struct dm_cache_policy *p, bool idle,
156 dm_cblock_t cblock, uint32_t hint, bool hint_valid); 78 struct policy_work **result);
157 79
158 /* 80 /*
159 * Gets the hint for a given cblock. Called in a single threaded 81 * You must pass in the same work pointer that you were given, not
160 * context. So no locking required. 82 * a copy.
161 */ 83 */
162 uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); 84 void (*complete_background_work)(struct dm_cache_policy *p,
85 struct policy_work *work,
86 bool success);
87
88 void (*set_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock);
89 void (*clear_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock);
163 90
164 /* 91 /*
165 * Override functions used on the error paths of the core target. 92 * Called when a cache target is first created. Used to load a
166 * They must succeed. 93 * mapping from the metadata device into the policy.
167 */ 94 */
168 void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock); 95 int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
169 void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, 96 dm_cblock_t cblock, bool dirty,
170 dm_oblock_t new_oblock); 97 uint32_t hint, bool hint_valid);
171 98
172 /* 99 /*
173 * This is called via the invalidate_cblocks message. It is 100 * Drops the mapping, irrespective of whether it's clean or dirty.
174 * possible the particular cblock has already been removed due to a 101 * Returns -ENODATA if cblock is not mapped.
175 * write io in passthrough mode. In which case this should return
176 * -ENODATA.
177 */ 102 */
178 int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock); 103 int (*invalidate_mapping)(struct dm_cache_policy *p, dm_cblock_t cblock);
179 104
180 /* 105 /*
181 * Provide a dirty block to be written back by the core target. If 106 * Gets the hint for a given cblock. Called in a single threaded
182 * critical_only is set then the policy should only provide work if 107 * context. So no locking required.
183 * it urgently needs it.
184 *
185 * Returns:
186 *
187 * 0 and @cblock,@oblock: block to write back provided
188 *
189 * -ENODATA: no dirty blocks available
190 */ 108 */
191 int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock, 109 uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock);
192 bool critical_only);
193 110
194 /* 111 /*
195 * How full is the cache? 112 * How full is the cache?
@@ -202,6 +119,8 @@ struct dm_cache_policy {
202 * queue merging has occurred). To stop the policy being fooled by 119 * queue merging has occurred). To stop the policy being fooled by
203 * these, the core target sends regular tick() calls to the policy. 120 * these, the core target sends regular tick() calls to the policy.
204 * The policy should only count an entry as hit once per tick. 121 * The policy should only count an entry as hit once per tick.
122 *
123 * This method is optional.
205 */ 124 */
206 void (*tick)(struct dm_cache_policy *p, bool can_block); 125 void (*tick)(struct dm_cache_policy *p, bool can_block);
207 126
@@ -213,6 +132,8 @@ struct dm_cache_policy {
213 int (*set_config_value)(struct dm_cache_policy *p, 132 int (*set_config_value)(struct dm_cache_policy *p,
214 const char *key, const char *value); 133 const char *key, const char *value);
215 134
135 void (*allow_migrations)(struct dm_cache_policy *p, bool allow);
136
216 /* 137 /*
217 * Book keeping ptr for the policy register, not for general use. 138 * Book keeping ptr for the policy register, not for general use.
218 */ 139 */
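
lookup_with_work() is documented above as optional, with policy-internal expected to fall back to plain lookup() when a policy leaves it NULL. That wrapper is not part of this excerpt, so the following is only an assumed sketch of what such a fallback could look like, not the real helper.

/* --- illustrative sketch, not part of the patch --- */
static inline int example_policy_lookup_with_work(struct dm_cache_policy *p,
						  dm_oblock_t oblock,
						  dm_cblock_t *cblock,
						  int data_dir, bool fast_copy,
						  struct policy_work **work)
{
	bool background_queued;

	if (p->lookup_with_work)
		return p->lookup_with_work(p, oblock, cblock, data_dir,
					   fast_copy, work);

	*work = NULL;
	return p->lookup(p, oblock, cblock, data_dir, fast_copy,
			 &background_queued);
}
/* --- end sketch --- */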
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 2eaa414e1509..b7de289a10bb 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -5,7 +5,7 @@
5 */ 5 */
6 6
7#include "dm.h" 7#include "dm.h"
8#include "dm-bio-prison-v1.h" 8#include "dm-bio-prison-v2.h"
9#include "dm-bio-record.h" 9#include "dm-bio-record.h"
10#include "dm-cache-metadata.h" 10#include "dm-cache-metadata.h"
11 11
@@ -15,6 +15,7 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/mempool.h> 16#include <linux/mempool.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rwsem.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
19#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
20 21
@@ -25,7 +26,18 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
25 26
26/*----------------------------------------------------------------*/ 27/*----------------------------------------------------------------*/
27 28
28#define IOT_RESOLUTION 4 29/*
30 * Glossary:
31 *
32 * oblock: index of an origin block
33 * cblock: index of a cache block
34 * promotion: movement of a block from origin to cache
35 * demotion: movement of a block from cache to origin
36 * migration: movement of a block between the origin and cache device,
37 * either direction
38 */
39
40/*----------------------------------------------------------------*/
29 41
30struct io_tracker { 42struct io_tracker {
31 spinlock_t lock; 43 spinlock_t lock;
@@ -99,19 +111,178 @@ static void iot_io_end(struct io_tracker *iot, sector_t len)
99/*----------------------------------------------------------------*/ 111/*----------------------------------------------------------------*/
100 112
101/* 113/*
102 * Glossary: 114 * Represents a chunk of future work. 'input' allows continuations to pass
103 * 115 * values between themselves, typically error values.
104 * oblock: index of an origin block
105 * cblock: index of a cache block
106 * promotion: movement of a block from origin to cache
107 * demotion: movement of a block from cache to origin
108 * migration: movement of a block between the origin and cache device,
109 * either direction
110 */ 116 */
117struct continuation {
118 struct work_struct ws;
119 int input;
120};
121
122static inline void init_continuation(struct continuation *k,
123 void (*fn)(struct work_struct *))
124{
125 INIT_WORK(&k->ws, fn);
126 k->input = 0;
127}
128
129static inline void queue_continuation(struct workqueue_struct *wq,
130 struct continuation *k)
131{
132 queue_work(wq, &k->ws);
133}
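
A continuation is nothing more than a work_struct plus an error slot: whoever queues it stores the result of the previous step in k->input, and the work function recovers its enclosing object with container_of(). A hypothetical user of the pattern, in the context of this file (struct example_op and both functions are not from this patch):

/* --- illustrative sketch, not part of the patch --- */
struct example_op {
	struct continuation k;
	struct cache *cache;
};

static void example_op_complete(struct work_struct *ws)
{
	struct example_op *op = container_of(ws, struct example_op, k.ws);

	if (op->k.input)	/* error passed in by the step that queued us */
		DMERR("previous step failed: %d", op->k.input);
	/* ... finish up, release resources ... */
}

static void example_op_start(struct cache *cache, struct example_op *op)
{
	op->cache = cache;
	init_continuation(&op->k, example_op_complete);
	queue_continuation(cache->wq, &op->k);
}
/* --- end sketch --- */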
111 134
112/*----------------------------------------------------------------*/ 135/*----------------------------------------------------------------*/
113 136
114/* 137/*
138 * The batcher collects together pieces of work that need a particular
139 * operation to occur before they can proceed (typically a commit).
140 */
141struct batcher {
142 /*
143 * The operation that everyone is waiting for.
144 */
145 int (*commit_op)(void *context);
146 void *commit_context;
147
148 /*
149 * This is how bios should be issued once the commit op is complete
150 * (accounted_request).
151 */
152 void (*issue_op)(struct bio *bio, void *context);
153 void *issue_context;
154
155 /*
156 * Queued work gets put on here after commit.
157 */
158 struct workqueue_struct *wq;
159
160 spinlock_t lock;
161 struct list_head work_items;
162 struct bio_list bios;
163 struct work_struct commit_work;
164
165 bool commit_scheduled;
166};
167
168static void __commit(struct work_struct *_ws)
169{
170 struct batcher *b = container_of(_ws, struct batcher, commit_work);
171
172 int r;
173 unsigned long flags;
174 struct list_head work_items;
175 struct work_struct *ws, *tmp;
176 struct continuation *k;
177 struct bio *bio;
178 struct bio_list bios;
179
180 INIT_LIST_HEAD(&work_items);
181 bio_list_init(&bios);
182
183 /*
184 * We have to grab these before the commit_op to avoid a race
185 * condition.
186 */
187 spin_lock_irqsave(&b->lock, flags);
188 list_splice_init(&b->work_items, &work_items);
189 bio_list_merge(&bios, &b->bios);
190 bio_list_init(&b->bios);
191 b->commit_scheduled = false;
192 spin_unlock_irqrestore(&b->lock, flags);
193
194 r = b->commit_op(b->commit_context);
195
196 list_for_each_entry_safe(ws, tmp, &work_items, entry) {
197 k = container_of(ws, struct continuation, ws);
198 k->input = r;
199 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
200 queue_work(b->wq, ws);
201 }
202
203 while ((bio = bio_list_pop(&bios))) {
204 if (r) {
205 bio->bi_error = r;
206 bio_endio(bio);
207 } else
208 b->issue_op(bio, b->issue_context);
209 }
210}
211
212static void batcher_init(struct batcher *b,
213 int (*commit_op)(void *),
214 void *commit_context,
215 void (*issue_op)(struct bio *bio, void *),
216 void *issue_context,
217 struct workqueue_struct *wq)
218{
219 b->commit_op = commit_op;
220 b->commit_context = commit_context;
221 b->issue_op = issue_op;
222 b->issue_context = issue_context;
223 b->wq = wq;
224
225 spin_lock_init(&b->lock);
226 INIT_LIST_HEAD(&b->work_items);
227 bio_list_init(&b->bios);
228 INIT_WORK(&b->commit_work, __commit);
229 b->commit_scheduled = false;
230}
231
232static void async_commit(struct batcher *b)
233{
234 queue_work(b->wq, &b->commit_work);
235}
236
237static void continue_after_commit(struct batcher *b, struct continuation *k)
238{
239 unsigned long flags;
240 bool commit_scheduled;
241
242 spin_lock_irqsave(&b->lock, flags);
243 commit_scheduled = b->commit_scheduled;
244 list_add_tail(&k->ws.entry, &b->work_items);
245 spin_unlock_irqrestore(&b->lock, flags);
246
247 if (commit_scheduled)
248 async_commit(b);
249}
250
251/*
252 * Bios are errored if commit failed.
253 */
254static void issue_after_commit(struct batcher *b, struct bio *bio)
255{
256 unsigned long flags;
257 bool commit_scheduled;
258
259 spin_lock_irqsave(&b->lock, flags);
260 commit_scheduled = b->commit_scheduled;
261 bio_list_add(&b->bios, bio);
262 spin_unlock_irqrestore(&b->lock, flags);
263
264 if (commit_scheduled)
265 async_commit(b);
266}
267
268/*
269 * Call this if some urgent work is waiting for the commit to complete.
270 */
271static void schedule_commit(struct batcher *b)
272{
273 bool immediate;
274 unsigned long flags;
275
276 spin_lock_irqsave(&b->lock, flags);
277 immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
278 b->commit_scheduled = true;
279 spin_unlock_irqrestore(&b->lock, flags);
280
281 if (immediate)
282 async_commit(b);
283}
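
Putting the batcher together: continuations and bios accumulate under b->lock, schedule_commit() records that a commit is wanted, and __commit() performs one commit on the workqueue and then releases everything that was waiting on it. The wiring below is a hedged sketch: example_commit_op(), example_setup_committer() and example_handle_flush() are hypothetical, while issue_op() and the cache->committer field appear later in this patch.

/* --- illustrative sketch, not part of the patch --- */
static int example_commit_op(void *context)
{
	struct cache *cache = context;

	/* a real commit op would write out the dm-cache metadata transaction */
	return cache ? 0 : -EINVAL;
}

static void example_setup_committer(struct cache *cache)
{
	batcher_init(&cache->committer, example_commit_op, cache,
		     issue_op, cache, cache->wq);
}

/* A flush bio can then be parked until the next commit completes: */
static void example_handle_flush(struct cache *cache, struct bio *bio)
{
	issue_after_commit(&cache->committer, bio);
	schedule_commit(&cache->committer);
}
/* --- end sketch --- */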
284
285/*
115 * There are a couple of places where we let a bio run, but want to do some 286 * There are a couple of places where we let a bio run, but want to do some
116 * work before calling its endio function. We do this by temporarily 287 * work before calling its endio function. We do this by temporarily
117 * changing the endio fn. 288 * changing the endio fn.
@@ -189,31 +360,13 @@ struct cache_stats {
189 atomic_t write_miss; 360 atomic_t write_miss;
190 atomic_t demotion; 361 atomic_t demotion;
191 atomic_t promotion; 362 atomic_t promotion;
363 atomic_t writeback;
192 atomic_t copies_avoided; 364 atomic_t copies_avoided;
193 atomic_t cache_cell_clash; 365 atomic_t cache_cell_clash;
194 atomic_t commit_count; 366 atomic_t commit_count;
195 atomic_t discard_count; 367 atomic_t discard_count;
196}; 368};
197 369
198/*
199 * Defines a range of cblocks, begin to (end - 1) are in the range. end is
200 * the one-past-the-end value.
201 */
202struct cblock_range {
203 dm_cblock_t begin;
204 dm_cblock_t end;
205};
206
207struct invalidation_request {
208 struct list_head list;
209 struct cblock_range *cblocks;
210
211 atomic_t complete;
212 int err;
213
214 wait_queue_head_t result_wait;
215};
216
217struct cache { 370struct cache {
218 struct dm_target *ti; 371 struct dm_target *ti;
219 struct dm_target_callbacks callbacks; 372 struct dm_target_callbacks callbacks;
@@ -255,11 +408,7 @@ struct cache {
255 spinlock_t lock; 408 spinlock_t lock;
256 struct list_head deferred_cells; 409 struct list_head deferred_cells;
257 struct bio_list deferred_bios; 410 struct bio_list deferred_bios;
258 struct bio_list deferred_flush_bios;
259 struct bio_list deferred_writethrough_bios; 411 struct bio_list deferred_writethrough_bios;
260 struct list_head quiesced_migrations;
261 struct list_head completed_migrations;
262 struct list_head need_commit_migrations;
263 sector_t migration_threshold; 412 sector_t migration_threshold;
264 wait_queue_head_t migration_wait; 413 wait_queue_head_t migration_wait;
265 atomic_t nr_allocated_migrations; 414 atomic_t nr_allocated_migrations;
@@ -270,9 +419,7 @@ struct cache {
270 */ 419 */
271 atomic_t nr_io_migrations; 420 atomic_t nr_io_migrations;
272 421
273 wait_queue_head_t quiescing_wait; 422 struct rw_semaphore quiesce_lock;
274 atomic_t quiescing;
275 atomic_t quiescing_ack;
276 423
277 /* 424 /*
278 * cache_size entries, dirty if set 425 * cache_size entries, dirty if set
@@ -296,13 +443,11 @@ struct cache {
296 443
297 struct dm_kcopyd_client *copier; 444 struct dm_kcopyd_client *copier;
298 struct workqueue_struct *wq; 445 struct workqueue_struct *wq;
299 struct work_struct worker; 446 struct work_struct deferred_bio_worker;
300 447 struct work_struct deferred_writethrough_worker;
448 struct work_struct migration_worker;
301 struct delayed_work waker; 449 struct delayed_work waker;
302 unsigned long last_commit_jiffies; 450 struct dm_bio_prison_v2 *prison;
303
304 struct dm_bio_prison *prison;
305 struct dm_deferred_set *all_io_ds;
306 451
307 mempool_t *migration_pool; 452 mempool_t *migration_pool;
308 453
@@ -330,12 +475,17 @@ struct cache {
330 struct list_head invalidation_requests; 475 struct list_head invalidation_requests;
331 476
332 struct io_tracker origin_tracker; 477 struct io_tracker origin_tracker;
478
479 struct work_struct commit_ws;
480 struct batcher committer;
481
482 struct rw_semaphore background_work_lock;
333}; 483};
334 484
335struct per_bio_data { 485struct per_bio_data {
336 bool tick:1; 486 bool tick:1;
337 unsigned req_nr:2; 487 unsigned req_nr:2;
338 struct dm_deferred_entry *all_io_entry; 488 struct dm_bio_prison_cell_v2 *cell;
339 struct dm_hook_info hook_info; 489 struct dm_hook_info hook_info;
340 sector_t len; 490 sector_t len;
341 491
@@ -350,55 +500,64 @@ struct per_bio_data {
350}; 500};
351 501
352struct dm_cache_migration { 502struct dm_cache_migration {
353 struct list_head list; 503 struct continuation k;
354 struct cache *cache; 504 struct cache *cache;
355 505
356 unsigned long start_jiffies; 506 struct policy_work *op;
357 dm_oblock_t old_oblock; 507 struct bio *overwrite_bio;
358 dm_oblock_t new_oblock; 508 struct dm_bio_prison_cell_v2 *cell;
359 dm_cblock_t cblock;
360
361 bool err:1;
362 bool discard:1;
363 bool writeback:1;
364 bool demote:1;
365 bool promote:1;
366 bool requeue_holder:1;
367 bool invalidate:1;
368 509
369 struct dm_bio_prison_cell *old_ocell; 510 dm_cblock_t invalidate_cblock;
370 struct dm_bio_prison_cell *new_ocell; 511 dm_oblock_t invalidate_oblock;
371}; 512};
372 513
373/* 514/*----------------------------------------------------------------*/
374 * Processing a bio in the worker thread may require these memory 515
375 * allocations. We prealloc to avoid deadlocks (the same worker thread 516static bool writethrough_mode(struct cache_features *f)
376 * frees them back to the mempool). 517{
377 */ 518 return f->io_mode == CM_IO_WRITETHROUGH;
378struct prealloc { 519}
379 struct dm_cache_migration *mg;
380 struct dm_bio_prison_cell *cell1;
381 struct dm_bio_prison_cell *cell2;
382};
383 520
384static enum cache_metadata_mode get_cache_mode(struct cache *cache); 521static bool writeback_mode(struct cache_features *f)
522{
523 return f->io_mode == CM_IO_WRITEBACK;
524}
385 525
386static void wake_worker(struct cache *cache) 526static inline bool passthrough_mode(struct cache_features *f)
387{ 527{
388 queue_work(cache->wq, &cache->worker); 528 return unlikely(f->io_mode == CM_IO_PASSTHROUGH);
389} 529}
390 530
391/*----------------------------------------------------------------*/ 531/*----------------------------------------------------------------*/
392 532
393static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) 533static void wake_deferred_bio_worker(struct cache *cache)
394{ 534{
395 /* FIXME: change to use a local slab. */ 535 queue_work(cache->wq, &cache->deferred_bio_worker);
396 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
397} 536}
398 537
399static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) 538static void wake_deferred_writethrough_worker(struct cache *cache)
400{ 539{
401 dm_bio_prison_free_cell(cache->prison, cell); 540 queue_work(cache->wq, &cache->deferred_writethrough_worker);
541}
542
543static void wake_migration_worker(struct cache *cache)
544{
545 if (passthrough_mode(&cache->features))
546 return;
547
548 queue_work(cache->wq, &cache->migration_worker);
549}
550
551/*----------------------------------------------------------------*/
552
553static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
554{
555 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
556}
557
558static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
559{
560 dm_bio_prison_free_cell_v2(cache->prison, cell);
402} 561}
403 562
404static struct dm_cache_migration *alloc_migration(struct cache *cache) 563static struct dm_cache_migration *alloc_migration(struct cache *cache)
@@ -424,146 +583,127 @@ static void free_migration(struct dm_cache_migration *mg)
424 mempool_free(mg, cache->migration_pool); 583 mempool_free(mg, cache->migration_pool);
425} 584}
426 585
427static int prealloc_data_structs(struct cache *cache, struct prealloc *p) 586/*----------------------------------------------------------------*/
428{
429 if (!p->mg) {
430 p->mg = alloc_migration(cache);
431 if (!p->mg)
432 return -ENOMEM;
433 }
434
435 if (!p->cell1) {
436 p->cell1 = alloc_prison_cell(cache);
437 if (!p->cell1)
438 return -ENOMEM;
439 }
440
441 if (!p->cell2) {
442 p->cell2 = alloc_prison_cell(cache);
443 if (!p->cell2)
444 return -ENOMEM;
445 }
446
447 return 0;
448}
449 587
450static void prealloc_free_structs(struct cache *cache, struct prealloc *p) 588static inline dm_oblock_t oblock_succ(dm_oblock_t b)
451{ 589{
452 if (p->cell2) 590 return to_oblock(from_oblock(b) + 1ull);
453 free_prison_cell(cache, p->cell2);
454
455 if (p->cell1)
456 free_prison_cell(cache, p->cell1);
457
458 if (p->mg)
459 free_migration(p->mg);
460} 591}
461 592
462static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) 593static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
463{ 594{
464 struct dm_cache_migration *mg = p->mg; 595 key->virtual = 0;
465 596 key->dev = 0;
466 BUG_ON(!mg); 597 key->block_begin = from_oblock(begin);
467 p->mg = NULL; 598 key->block_end = from_oblock(end);
468
469 return mg;
470} 599}
471 600
472/* 601/*
473 * You must have a cell within the prealloc struct to return. If not this 602 * We have two lock levels. Level 0, which is used to prevent WRITEs, and
474 * function will BUG() rather than returning NULL. 603 * level 1 which prevents *both* READs and WRITEs.
475 */ 604 */
476static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) 605#define WRITE_LOCK_LEVEL 0
606#define READ_WRITE_LOCK_LEVEL 1
607
608static unsigned lock_level(struct bio *bio)
477{ 609{
478 struct dm_bio_prison_cell *r = NULL; 610 return bio_data_dir(bio) == WRITE ?
611 WRITE_LOCK_LEVEL :
612 READ_WRITE_LOCK_LEVEL;
613}
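
Under bio-prison v2 a cell is held shared at a level rather than simply detained per bio: WRITE bios take level 0 and READ bios level 1, so an exclusive lock at level 0 stalls writers while still letting readers through, and a level 1 lock fences both. A minimal sketch of the shared side, mirroring bio_detain_shared() further down (example_get_shared() itself is hypothetical):

/* --- illustrative sketch, not part of the patch --- */
static bool example_get_shared(struct cache *cache, dm_oblock_t oblock,
			       struct bio *bio,
			       struct dm_bio_prison_cell_v2 *prealloc,
			       struct dm_bio_prison_cell_v2 **cell)
{
	struct dm_cell_key_v2 key;

	build_key(oblock, oblock_succ(oblock), &key);

	/*
	 * Returns false when the bio cannot be admitted because the cell is
	 * exclusively locked against it; the caller must then park the bio.
	 */
	return dm_cell_get_v2(cache->prison, &key, lock_level(bio),
			      bio, prealloc, cell);
}
/* --- end sketch --- */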
479 614
480 if (p->cell1) { 615/*----------------------------------------------------------------
481 r = p->cell1; 616 * Per bio data
482 p->cell1 = NULL; 617 *--------------------------------------------------------------*/
483 618
484 } else if (p->cell2) { 619/*
485 r = p->cell2; 620 * If using writeback, leave out struct per_bio_data's writethrough fields.
486 p->cell2 = NULL; 621 */
487 } else 622#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
488 BUG(); 623#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
489 624
490 return r; 625static size_t get_per_bio_data_size(struct cache *cache)
626{
627 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
491} 628}
492 629
493/* 630static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
494 * You can't have more than two cells in a prealloc struct. BUG() will be
495 * called if you try and overfill.
496 */
497static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
498{ 631{
499 if (!p->cell2) 632 struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
500 p->cell2 = cell; 633 BUG_ON(!pb);
634 return pb;
635}
501 636
502 else if (!p->cell1) 637static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
503 p->cell1 = cell; 638{
639 struct per_bio_data *pb = get_per_bio_data(bio, data_size);
504 640
505 else 641 pb->tick = false;
506 BUG(); 642 pb->req_nr = dm_bio_get_target_bio_nr(bio);
643 pb->cell = NULL;
644 pb->len = 0;
645
646 return pb;
507} 647}
508 648
509/*----------------------------------------------------------------*/ 649/*----------------------------------------------------------------*/
510 650
511static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) 651static void defer_bio(struct cache *cache, struct bio *bio)
512{ 652{
513 key->virtual = 0; 653 unsigned long flags;
514 key->dev = 0;
515 key->block_begin = from_oblock(begin);
516 key->block_end = from_oblock(end);
517}
518 654
519/* 655 spin_lock_irqsave(&cache->lock, flags);
520 * The caller hands in a preallocated cell, and a free function for it. 656 bio_list_add(&cache->deferred_bios, bio);
521 * The cell will be freed if there's an error, or if it wasn't used because 657 spin_unlock_irqrestore(&cache->lock, flags);
522 * a cell with that key already exists.
523 */
524typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
525 658
526static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, 659 wake_deferred_bio_worker(cache);
527 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 660}
528 cell_free_fn free_fn, void *free_context, 661
529 struct dm_bio_prison_cell **cell_result) 662static void defer_bios(struct cache *cache, struct bio_list *bios)
530{ 663{
531 int r; 664 unsigned long flags;
532 struct dm_cell_key key;
533 665
534 build_key(oblock_begin, oblock_end, &key); 666 spin_lock_irqsave(&cache->lock, flags);
535 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 667 bio_list_merge(&cache->deferred_bios, bios);
536 if (r) 668 bio_list_init(bios);
537 free_fn(free_context, cell_prealloc); 669 spin_unlock_irqrestore(&cache->lock, flags);
538 670
539 return r; 671 wake_deferred_bio_worker(cache);
540} 672}
541 673
542static int bio_detain(struct cache *cache, dm_oblock_t oblock, 674/*----------------------------------------------------------------*/
543 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 675
544 cell_free_fn free_fn, void *free_context, 676static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
545 struct dm_bio_prison_cell **cell_result)
546{ 677{
678 bool r;
679 size_t pb_size;
680 struct per_bio_data *pb;
681 struct dm_cell_key_v2 key;
547 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 682 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
548 return bio_detain_range(cache, oblock, end, bio, 683 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
549 cell_prealloc, free_fn, free_context, cell_result);
550}
551 684
552static int get_cell(struct cache *cache, 685 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
553 dm_oblock_t oblock, 686 if (!cell_prealloc) {
554 struct prealloc *structs, 687 defer_bio(cache, bio);
555 struct dm_bio_prison_cell **cell_result) 688 return false;
556{ 689 }
557 int r;
558 struct dm_cell_key key;
559 struct dm_bio_prison_cell *cell_prealloc;
560 690
561 cell_prealloc = prealloc_get_cell(structs); 691 build_key(oblock, end, &key);
692 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
693 if (!r) {
694 /*
695 * Failed to get the lock.
696 */
697 free_prison_cell(cache, cell_prealloc);
698 return r;
699 }
562 700
563 build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); 701 if (cell != cell_prealloc)
564 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 702 free_prison_cell(cache, cell_prealloc);
565 if (r) 703
566 prealloc_put_cell(structs, cell_prealloc); 704 pb_size = get_per_bio_data_size(cache);
705 pb = get_per_bio_data(bio, pb_size);
706 pb->cell = cell;
567 707
568 return r; 708 return r;
569} 709}
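
The shared reference stored in pb->cell has to be dropped once the bio completes; the endio side is outside this excerpt, so the following is only an assumed sketch. dm_cell_put_v2() belongs to the bio-prison v2 API but does not appear in this hunk, its "returns true on the last put" behaviour is an assumption here, and the real code also has to release any bios that were waiting on the cell.

/* --- illustrative sketch, not part of the patch --- */
static void example_release_cell(struct cache *cache, struct bio *bio)
{
	size_t pb_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_size);

	if (pb->cell) {
		if (dm_cell_put_v2(cache->prison, pb->cell))
			free_prison_cell(cache, pb->cell);	/* last reference */
		pb->cell = NULL;
	}
}
/* --- end sketch --- */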
@@ -575,21 +715,33 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b)
575 return test_bit(from_cblock(b), cache->dirty_bitset); 715 return test_bit(from_cblock(b), cache->dirty_bitset);
576} 716}
577 717
578static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 718static void set_dirty(struct cache *cache, dm_cblock_t cblock)
579{ 719{
580 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 720 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
581 atomic_inc(&cache->nr_dirty); 721 atomic_inc(&cache->nr_dirty);
582 policy_set_dirty(cache->policy, oblock); 722 policy_set_dirty(cache->policy, cblock);
583 } 723 }
584} 724}
585 725
586static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 726/*
727 * These two are called when setting after migrations to force the policy
728 * and dirty bitset to be in sync.
729 */
730static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
731{
732 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
733 atomic_inc(&cache->nr_dirty);
734 policy_set_dirty(cache->policy, cblock);
735}
736
737static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
587{ 738{
588 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 739 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
589 policy_clear_dirty(cache->policy, oblock);
590 if (atomic_dec_return(&cache->nr_dirty) == 0) 740 if (atomic_dec_return(&cache->nr_dirty) == 0)
591 dm_table_event(cache->ti->table); 741 dm_table_event(cache->ti->table);
592 } 742 }
743
744 policy_clear_dirty(cache->policy, cblock);
593} 745}
594 746
595/*----------------------------------------------------------------*/ 747/*----------------------------------------------------------------*/
@@ -628,11 +780,6 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
628 oblocks_per_dblock(cache))); 780 oblocks_per_dblock(cache)));
629} 781}
630 782
631static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
632{
633 return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
634}
635
636static void set_discard(struct cache *cache, dm_dblock_t b) 783static void set_discard(struct cache *cache, dm_dblock_t b)
637{ 784{
638 unsigned long flags; 785 unsigned long flags;
@@ -679,83 +826,6 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
679 return r; 826 return r;
680} 827}
681 828
682/*----------------------------------------------------------------*/
683
684static void load_stats(struct cache *cache)
685{
686 struct dm_cache_statistics stats;
687
688 dm_cache_metadata_get_stats(cache->cmd, &stats);
689 atomic_set(&cache->stats.read_hit, stats.read_hits);
690 atomic_set(&cache->stats.read_miss, stats.read_misses);
691 atomic_set(&cache->stats.write_hit, stats.write_hits);
692 atomic_set(&cache->stats.write_miss, stats.write_misses);
693}
694
695static void save_stats(struct cache *cache)
696{
697 struct dm_cache_statistics stats;
698
699 if (get_cache_mode(cache) >= CM_READ_ONLY)
700 return;
701
702 stats.read_hits = atomic_read(&cache->stats.read_hit);
703 stats.read_misses = atomic_read(&cache->stats.read_miss);
704 stats.write_hits = atomic_read(&cache->stats.write_hit);
705 stats.write_misses = atomic_read(&cache->stats.write_miss);
706
707 dm_cache_metadata_set_stats(cache->cmd, &stats);
708}
709
710/*----------------------------------------------------------------
711 * Per bio data
712 *--------------------------------------------------------------*/
713
714/*
715 * If using writeback, leave out struct per_bio_data's writethrough fields.
716 */
717#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
718#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
719
720static bool writethrough_mode(struct cache_features *f)
721{
722 return f->io_mode == CM_IO_WRITETHROUGH;
723}
724
725static bool writeback_mode(struct cache_features *f)
726{
727 return f->io_mode == CM_IO_WRITEBACK;
728}
729
730static bool passthrough_mode(struct cache_features *f)
731{
732 return f->io_mode == CM_IO_PASSTHROUGH;
733}
734
735static size_t get_per_bio_data_size(struct cache *cache)
736{
737 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
738}
739
740static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
741{
742 struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
743 BUG_ON(!pb);
744 return pb;
745}
746
747static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
748{
749 struct per_bio_data *pb = get_per_bio_data(bio, data_size);
750
751 pb->tick = false;
752 pb->req_nr = dm_bio_get_target_bio_nr(bio);
753 pb->all_io_entry = NULL;
754 pb->len = 0;
755
756 return pb;
757}
758
759/*---------------------------------------------------------------- 829/*----------------------------------------------------------------
760 * Remapping 830 * Remapping
761 *--------------------------------------------------------------*/ 831 *--------------------------------------------------------------*/
@@ -797,8 +867,9 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
797} 867}
798 868
799static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 869static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
800 dm_oblock_t oblock) 870 dm_oblock_t oblock)
801{ 871{
872 // FIXME: this is called way too much.
802 check_if_tick_bio_needed(cache, bio); 873 check_if_tick_bio_needed(cache, bio);
803 remap_to_origin(cache, bio); 874 remap_to_origin(cache, bio);
804 if (bio_data_dir(bio) == WRITE) 875 if (bio_data_dir(bio) == WRITE)
@@ -811,7 +882,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
811 check_if_tick_bio_needed(cache, bio); 882 check_if_tick_bio_needed(cache, bio);
812 remap_to_cache(cache, bio, cblock); 883 remap_to_cache(cache, bio, cblock);
813 if (bio_data_dir(bio) == WRITE) { 884 if (bio_data_dir(bio) == WRITE) {
814 set_dirty(cache, oblock, cblock); 885 set_dirty(cache, cblock);
815 clear_discard(cache, oblock_to_dblock(cache, oblock)); 886 clear_discard(cache, oblock_to_dblock(cache, oblock));
816 } 887 }
817} 888}
@@ -828,22 +899,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
828 return to_oblock(block_nr); 899 return to_oblock(block_nr);
829} 900}
830 901
831/*
832 * You must increment the deferred set whilst the prison cell is held. To
833 * encourage this, we ask for 'cell' to be passed in.
834 */
835static void inc_ds(struct cache *cache, struct bio *bio,
836 struct dm_bio_prison_cell *cell)
837{
838 size_t pb_data_size = get_per_bio_data_size(cache);
839 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
840
841 BUG_ON(!cell);
842 BUG_ON(pb->all_io_entry);
843
844 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
845}
846
847static bool accountable_bio(struct cache *cache, struct bio *bio) 902static bool accountable_bio(struct cache *cache, struct bio *bio)
848{ 903{
849 return ((bio->bi_bdev == cache->origin_dev->bdev) && 904 return ((bio->bi_bdev == cache->origin_dev->bdev) &&
@@ -875,29 +930,10 @@ static void accounted_request(struct cache *cache, struct bio *bio)
875 generic_make_request(bio); 930 generic_make_request(bio);
876} 931}
877 932
878static void issue(struct cache *cache, struct bio *bio) 933static void issue_op(struct bio *bio, void *context)
879{ 934{
880 unsigned long flags; 935 struct cache *cache = context;
881 936 accounted_request(cache, bio);
882 if (!op_is_flush(bio->bi_opf)) {
883 accounted_request(cache, bio);
884 return;
885 }
886
887 /*
888 * Batch together any bios that trigger commits and then issue a
889 * single commit for them in do_worker().
890 */
891 spin_lock_irqsave(&cache->lock, flags);
892 cache->commit_requested = true;
893 bio_list_add(&cache->deferred_flush_bios, bio);
894 spin_unlock_irqrestore(&cache->lock, flags);
895}
896
897static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
898{
899 inc_ds(cache, bio, cell);
900 issue(cache, bio);
901} 937}
902 938
903static void defer_writethrough_bio(struct cache *cache, struct bio *bio) 939static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
@@ -908,7 +944,7 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
908 bio_list_add(&cache->deferred_writethrough_bios, bio); 944 bio_list_add(&cache->deferred_writethrough_bios, bio);
909 spin_unlock_irqrestore(&cache->lock, flags); 945 spin_unlock_irqrestore(&cache->lock, flags);
910 946
911 wake_worker(cache); 947 wake_deferred_writethrough_worker(cache);
912} 948}
913 949
914static void writethrough_endio(struct bio *bio) 950static void writethrough_endio(struct bio *bio)
@@ -934,6 +970,7 @@ static void writethrough_endio(struct bio *bio)
934} 970}
935 971
936/* 972/*
973 * FIXME: send in parallel, huge latency as is.
937 * When running in writethrough mode we need to send writes to clean blocks 974 * When running in writethrough mode we need to send writes to clean blocks
938 * to both the cache and origin devices. In future we'd like to clone the 975 * to both the cache and origin devices. In future we'd like to clone the
939 * bio and send them in parallel, but for now we're doing them in 976 * bio and send them in parallel, but for now we're doing them in
@@ -1046,12 +1083,58 @@ static void metadata_operation_failed(struct cache *cache, const char *op, int r
1046 set_cache_mode(cache, CM_READ_ONLY); 1083 set_cache_mode(cache, CM_READ_ONLY);
1047} 1084}
1048 1085
1086/*----------------------------------------------------------------*/
1087
1088static void load_stats(struct cache *cache)
1089{
1090 struct dm_cache_statistics stats;
1091
1092 dm_cache_metadata_get_stats(cache->cmd, &stats);
1093 atomic_set(&cache->stats.read_hit, stats.read_hits);
1094 atomic_set(&cache->stats.read_miss, stats.read_misses);
1095 atomic_set(&cache->stats.write_hit, stats.write_hits);
1096 atomic_set(&cache->stats.write_miss, stats.write_misses);
1097}
1098
1099static void save_stats(struct cache *cache)
1100{
1101 struct dm_cache_statistics stats;
1102
1103 if (get_cache_mode(cache) >= CM_READ_ONLY)
1104 return;
1105
1106 stats.read_hits = atomic_read(&cache->stats.read_hit);
1107 stats.read_misses = atomic_read(&cache->stats.read_miss);
1108 stats.write_hits = atomic_read(&cache->stats.write_hit);
1109 stats.write_misses = atomic_read(&cache->stats.write_miss);
1110
1111 dm_cache_metadata_set_stats(cache->cmd, &stats);
1112}
1113
1114static void update_stats(struct cache_stats *stats, enum policy_operation op)
1115{
1116 switch (op) {
1117 case POLICY_PROMOTE:
1118 atomic_inc(&stats->promotion);
1119 break;
1120
1121 case POLICY_DEMOTE:
1122 atomic_inc(&stats->demotion);
1123 break;
1124
1125 case POLICY_WRITEBACK:
1126 atomic_inc(&stats->writeback);
1127 break;
1128 }
1129}
1130
1049/*---------------------------------------------------------------- 1131/*----------------------------------------------------------------
1050 * Migration processing 1132 * Migration processing
1051 * 1133 *
1052 * Migration covers moving data from the origin device to the cache, or 1134 * Migration covers moving data from the origin device to the cache, or
1053 * vice versa. 1135 * vice versa.
1054 *--------------------------------------------------------------*/ 1136 *--------------------------------------------------------------*/
1137
1055static void inc_io_migrations(struct cache *cache) 1138static void inc_io_migrations(struct cache *cache)
1056{ 1139{
1057 atomic_inc(&cache->nr_io_migrations); 1140 atomic_inc(&cache->nr_io_migrations);
@@ -1067,213 +1150,109 @@ static bool discard_or_flush(struct bio *bio)
1067 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1150 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
1068} 1151}
1069 1152
1070static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) 1153static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1071{ 1154 dm_dblock_t *b, dm_dblock_t *e)
1072 if (discard_or_flush(cell->holder)) {
1073 /*
1074 * We have to handle these bios individually.
1075 */
1076 dm_cell_release(cache->prison, cell, &cache->deferred_bios);
1077 free_prison_cell(cache, cell);
1078 } else
1079 list_add_tail(&cell->user_list, &cache->deferred_cells);
1080}
1081
1082static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
1083{ 1155{
1084 unsigned long flags; 1156 sector_t sb = bio->bi_iter.bi_sector;
1085 1157 sector_t se = bio_end_sector(bio);
1086 if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
1087 /*
1088 * There was no prisoner to promote to holder, the
1089 * cell has been released.
1090 */
1091 free_prison_cell(cache, cell);
1092 return;
1093 }
1094 1158
1095 spin_lock_irqsave(&cache->lock, flags); 1159 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1096 __cell_defer(cache, cell);
1097 spin_unlock_irqrestore(&cache->lock, flags);
1098 1160
1099 wake_worker(cache); 1161 if (se - sb < cache->discard_block_size)
1162 *e = *b;
1163 else
1164 *e = to_dblock(block_div(se, cache->discard_block_size));
1100} 1165}
1101 1166
1102static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) 1167/*----------------------------------------------------------------*/
1103{
1104 dm_cell_error(cache->prison, cell, err);
1105 free_prison_cell(cache, cell);
1106}
1107 1168
1108static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) 1169static void prevent_background_work(struct cache *cache)
1109{ 1170{
1110 cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE); 1171 lockdep_off();
1172 down_write(&cache->background_work_lock);
1173 lockdep_on();
1111} 1174}
1112 1175
1113static void free_io_migration(struct dm_cache_migration *mg) 1176static void allow_background_work(struct cache *cache)
1114{ 1177{
1115 struct cache *cache = mg->cache; 1178 lockdep_off();
1116 1179 up_write(&cache->background_work_lock);
1117 dec_io_migrations(cache); 1180 lockdep_on();
1118 free_migration(mg);
1119 wake_worker(cache);
1120} 1181}
1121 1182
1122static void migration_failure(struct dm_cache_migration *mg) 1183static bool background_work_begin(struct cache *cache)
1123{ 1184{
1124 struct cache *cache = mg->cache; 1185 bool r;
1125 const char *dev_name = cache_device_name(cache);
1126
1127 if (mg->writeback) {
1128 DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
1129 set_dirty(cache, mg->old_oblock, mg->cblock);
1130 cell_defer(cache, mg->old_ocell, false);
1131
1132 } else if (mg->demote) {
1133 DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
1134 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
1135 1186
1136 cell_defer(cache, mg->old_ocell, mg->promote ? false : true); 1187 lockdep_off();
1137 if (mg->promote) 1188 r = down_read_trylock(&cache->background_work_lock);
1138 cell_defer(cache, mg->new_ocell, true); 1189 lockdep_on();
1139 } else {
1140 DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
1141 policy_remove_mapping(cache->policy, mg->new_oblock);
1142 cell_defer(cache, mg->new_ocell, true);
1143 }
1144 1190
1145 free_io_migration(mg); 1191 return r;
1146} 1192}
1147 1193
1148static void migration_success_pre_commit(struct dm_cache_migration *mg) 1194static void background_work_end(struct cache *cache)
1149{ 1195{
1150 int r; 1196 lockdep_off();
1151 unsigned long flags; 1197 up_read(&cache->background_work_lock);
1152 struct cache *cache = mg->cache; 1198 lockdep_on();
1153 1199}
1154 if (mg->writeback) {
1155 clear_dirty(cache, mg->old_oblock, mg->cblock);
1156 cell_defer(cache, mg->old_ocell, false);
1157 free_io_migration(mg);
1158 return;
1159 1200
1160 } else if (mg->demote) { 1201/*----------------------------------------------------------------*/
1161 r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
1162 if (r) {
1163 DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
1164 cache_device_name(cache));
1165 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1166 policy_force_mapping(cache->policy, mg->new_oblock,
1167 mg->old_oblock);
1168 if (mg->promote)
1169 cell_defer(cache, mg->new_ocell, true);
1170 free_io_migration(mg);
1171 return;
1172 }
1173 } else {
1174 r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
1175 if (r) {
1176 DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
1177 cache_device_name(cache));
1178 metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1179 policy_remove_mapping(cache->policy, mg->new_oblock);
1180 free_io_migration(mg);
1181 return;
1182 }
1183 }
1184 1202
1185 spin_lock_irqsave(&cache->lock, flags); 1203static void quiesce(struct dm_cache_migration *mg,
1186 list_add_tail(&mg->list, &cache->need_commit_migrations); 1204 void (*continuation)(struct work_struct *))
1187 cache->commit_requested = true; 1205{
1188 spin_unlock_irqrestore(&cache->lock, flags); 1206 init_continuation(&mg->k, continuation);
1207 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
1189} 1208}
1190 1209
1191static void migration_success_post_commit(struct dm_cache_migration *mg) 1210static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
1192{ 1211{
1193 unsigned long flags; 1212 struct continuation *k = container_of(ws, struct continuation, ws);
1194 struct cache *cache = mg->cache; 1213 return container_of(k, struct dm_cache_migration, k);
1195
1196 if (mg->writeback) {
1197 DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
1198 cache_device_name(cache));
1199 return;
1200
1201 } else if (mg->demote) {
1202 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
1203
1204 if (mg->promote) {
1205 mg->demote = false;
1206
1207 spin_lock_irqsave(&cache->lock, flags);
1208 list_add_tail(&mg->list, &cache->quiesced_migrations);
1209 spin_unlock_irqrestore(&cache->lock, flags);
1210
1211 } else {
1212 if (mg->invalidate)
1213 policy_remove_mapping(cache->policy, mg->old_oblock);
1214 free_io_migration(mg);
1215 }
1216
1217 } else {
1218 if (mg->requeue_holder) {
1219 clear_dirty(cache, mg->new_oblock, mg->cblock);
1220 cell_defer(cache, mg->new_ocell, true);
1221 } else {
1222 /*
1223 * The block was promoted via an overwrite, so it's dirty.
1224 */
1225 set_dirty(cache, mg->new_oblock, mg->cblock);
1226 bio_endio(mg->new_ocell->holder);
1227 cell_defer(cache, mg->new_ocell, false);
1228 }
1229 free_io_migration(mg);
1230 }
1231} 1214}
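ws_to_mg() above recovers the enclosing migration from a queued work item with two container_of() steps (work_struct -> continuation -> migration). A minimal, self-contained userspace sketch of that recovery pattern, using hypothetical struct names rather than the kernel types:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work { int pending; };				/* stand-in for work_struct */
struct continuation { struct work ws; int input; };
struct migration { int id; struct continuation k; };

static struct migration *ws_to_migration(struct work *ws)
{
	/* same two-step recovery as ws_to_mg() in the patch */
	struct continuation *k = container_of(ws, struct continuation, ws);
	return container_of(k, struct migration, k);
}

int main(void)
{
	struct migration mg = { .id = 7 };
	printf("%d\n", ws_to_migration(&mg.k.ws)->id);	/* prints 7 */
	return 0;
}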
1232 1215
1233static void copy_complete(int read_err, unsigned long write_err, void *context) 1216static void copy_complete(int read_err, unsigned long write_err, void *context)
1234{ 1217{
1235 unsigned long flags; 1218 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
1236 struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
1237 struct cache *cache = mg->cache;
1238 1219
1239 if (read_err || write_err) 1220 if (read_err || write_err)
1240 mg->err = true; 1221 mg->k.input = -EIO;
1241
1242 spin_lock_irqsave(&cache->lock, flags);
1243 list_add_tail(&mg->list, &cache->completed_migrations);
1244 spin_unlock_irqrestore(&cache->lock, flags);
1245 1222
1246 wake_worker(cache); 1223 queue_continuation(mg->cache->wq, &mg->k);
1247} 1224}
1248 1225
1249static void issue_copy(struct dm_cache_migration *mg) 1226static int copy(struct dm_cache_migration *mg, bool promote)
1250{ 1227{
1251 int r; 1228 int r;
1252 struct dm_io_region o_region, c_region; 1229 struct dm_io_region o_region, c_region;
1253 struct cache *cache = mg->cache; 1230 struct cache *cache = mg->cache;
1254 sector_t cblock = from_cblock(mg->cblock);
1255 1231
1256 o_region.bdev = cache->origin_dev->bdev; 1232 o_region.bdev = cache->origin_dev->bdev;
1233 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
1257 o_region.count = cache->sectors_per_block; 1234 o_region.count = cache->sectors_per_block;
1258 1235
1259 c_region.bdev = cache->cache_dev->bdev; 1236 c_region.bdev = cache->cache_dev->bdev;
1260 c_region.sector = cblock * cache->sectors_per_block; 1237 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
1261 c_region.count = cache->sectors_per_block; 1238 c_region.count = cache->sectors_per_block;
1262 1239
1263 if (mg->writeback || mg->demote) { 1240 if (promote)
1264 /* demote */ 1241 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
1265 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; 1242 else
1266 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); 1243 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
1267 } else {
1268 /* promote */
1269 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
1270 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
1271 }
1272 1244
1273 if (r < 0) { 1245 return r;
1274 DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache)); 1246}
1275 migration_failure(mg); 1247
1276 } 1248static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
1249{
1250 size_t pb_data_size = get_per_bio_data_size(cache);
1251 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1252
1253 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
1254 free_prison_cell(cache, pb->cell);
1255 pb->cell = NULL;
1277} 1256}
1278 1257
1279static void overwrite_endio(struct bio *bio) 1258static void overwrite_endio(struct bio *bio)
@@ -1282,368 +1261,475 @@ static void overwrite_endio(struct bio *bio)
1282 struct cache *cache = mg->cache; 1261 struct cache *cache = mg->cache;
1283 size_t pb_data_size = get_per_bio_data_size(cache); 1262 size_t pb_data_size = get_per_bio_data_size(cache);
1284 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1263 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1285 unsigned long flags;
1286 1264
1287 dm_unhook_bio(&pb->hook_info, bio); 1265 dm_unhook_bio(&pb->hook_info, bio);
1288 1266
1289 if (bio->bi_error) 1267 if (bio->bi_error)
1290 mg->err = true; 1268 mg->k.input = bio->bi_error;
1291 1269
1292 mg->requeue_holder = false; 1270 queue_continuation(mg->cache->wq, &mg->k);
1293
1294 spin_lock_irqsave(&cache->lock, flags);
1295 list_add_tail(&mg->list, &cache->completed_migrations);
1296 spin_unlock_irqrestore(&cache->lock, flags);
1297
1298 wake_worker(cache);
1299} 1271}
1300 1272
1301static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) 1273static void overwrite(struct dm_cache_migration *mg,
1274 void (*continuation)(struct work_struct *))
1302{ 1275{
1276 struct bio *bio = mg->overwrite_bio;
1303 size_t pb_data_size = get_per_bio_data_size(mg->cache); 1277 size_t pb_data_size = get_per_bio_data_size(mg->cache);
1304 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1278 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1305 1279
1306 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1280 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1307 remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1308 1281
1309 /* 1282 /*
1310 * No need to inc_ds() here, since the cell will be held for the 1283 * The overwrite bio is part of the copy operation, as such it does
1311 * duration of the io. 1284 * not set/clear discard or dirty flags.
1312 */ 1285 */
1286 if (mg->op->op == POLICY_PROMOTE)
1287 remap_to_cache(mg->cache, bio, mg->op->cblock);
1288 else
1289 remap_to_origin(mg->cache, bio);
1290
1291 init_continuation(&mg->k, continuation);
1313 accounted_request(mg->cache, bio); 1292 accounted_request(mg->cache, bio);
1314} 1293}
1315 1294
1316static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1295/*
1296 * Migration steps:
1297 *
1298 * 1) exclusive lock preventing WRITEs
1299 * 2) quiesce
1300 * 3) copy or issue overwrite bio
1301 * 4) upgrade to exclusive lock preventing READs and WRITEs
1302 * 5) quiesce
1303 * 6) update metadata and commit
1304 * 7) unlock
1305 */
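/*
 * As implemented below, these steps roughly map onto the continuation
 * chain: mg_start() -> mg_lock_writes() (step 1; steps 1+4 at once when
 * an overwrite bio is supplied) -> quiesce() -> mg_copy(), which issues
 * either copy() or overwrite() (steps 2-3) -> mg_upgrade_lock() ->
 * quiesce() (steps 4-5) -> mg_update_metadata() (step 6; demotions also
 * force a commit via continue_after_commit()/schedule_commit()) ->
 * mg_success()/mg_complete(), which unlocks the cell and re-issues any
 * deferred bios (step 7).
 */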
1306static void mg_complete(struct dm_cache_migration *mg, bool success)
1317{ 1307{
1318 return (bio_data_dir(bio) == WRITE) && 1308 struct bio_list bios;
1319 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1309 struct cache *cache = mg->cache;
1320} 1310 struct policy_work *op = mg->op;
1311 dm_cblock_t cblock = op->cblock;
1312
1313 if (success)
1314 update_stats(&cache->stats, op->op);
1315
1316 switch (op->op) {
1317 case POLICY_PROMOTE:
1318 clear_discard(cache, oblock_to_dblock(cache, op->oblock));
1319 policy_complete_background_work(cache->policy, op, success);
1320
1321 if (mg->overwrite_bio) {
1322 if (success)
1323 force_set_dirty(cache, cblock);
1324 else
1325 mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO);
1326 bio_endio(mg->overwrite_bio);
1327 } else {
1328 if (success)
1329 force_clear_dirty(cache, cblock);
1330 dec_io_migrations(cache);
1331 }
1332 break;
1321 1333
1322static void avoid_copy(struct dm_cache_migration *mg) 1334 case POLICY_DEMOTE:
1323{ 1335 /*
1324 atomic_inc(&mg->cache->stats.copies_avoided); 1336 * We clear dirty here to update the nr_dirty counter.
1325 migration_success_pre_commit(mg); 1337 */
1326} 1338 if (success)
1339 force_clear_dirty(cache, cblock);
1340 policy_complete_background_work(cache->policy, op, success);
1341 dec_io_migrations(cache);
1342 break;
1327 1343
1328static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1344 case POLICY_WRITEBACK:
1329 dm_dblock_t *b, dm_dblock_t *e) 1345 if (success)
1330{ 1346 force_clear_dirty(cache, cblock);
1331 sector_t sb = bio->bi_iter.bi_sector; 1347 policy_complete_background_work(cache->policy, op, success);
1332 sector_t se = bio_end_sector(bio); 1348 dec_io_migrations(cache);
1349 break;
1350 }
1333 1351
1334 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1352 bio_list_init(&bios);
1353 if (mg->cell) {
1354 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1355 free_prison_cell(cache, mg->cell);
1356 }
1335 1357
1336 if (se - sb < cache->discard_block_size) 1358 free_migration(mg);
1337 *e = *b; 1359 defer_bios(cache, &bios);
1338 else 1360 wake_migration_worker(cache);
1339 *e = to_dblock(block_div(se, cache->discard_block_size)); 1361
1362 background_work_end(cache);
1340} 1363}
1341 1364
1342static void issue_discard(struct dm_cache_migration *mg) 1365static void mg_success(struct work_struct *ws)
1343{ 1366{
1344 dm_dblock_t b, e; 1367 struct dm_cache_migration *mg = ws_to_mg(ws);
1345 struct bio *bio = mg->new_ocell->holder; 1368 mg_complete(mg, mg->k.input == 0);
1346 struct cache *cache = mg->cache;
1347
1348 calc_discard_block_range(cache, bio, &b, &e);
1349 while (b != e) {
1350 set_discard(cache, b);
1351 b = to_dblock(from_dblock(b) + 1);
1352 }
1353
1354 bio_endio(bio);
1355 cell_defer(cache, mg->new_ocell, false);
1356 free_migration(mg);
1357 wake_worker(cache);
1358} 1369}
1359 1370
1360static void issue_copy_or_discard(struct dm_cache_migration *mg) 1371static void mg_update_metadata(struct work_struct *ws)
1361{ 1372{
1362 bool avoid; 1373 int r;
1374 struct dm_cache_migration *mg = ws_to_mg(ws);
1363 struct cache *cache = mg->cache; 1375 struct cache *cache = mg->cache;
1376 struct policy_work *op = mg->op;
1364 1377
1365 if (mg->discard) { 1378 switch (op->op) {
1366 issue_discard(mg); 1379 case POLICY_PROMOTE:
1367 return; 1380 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
1368 } 1381 if (r) {
1382 DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
1383 cache_device_name(cache));
1384 metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1369 1385
1370 if (mg->writeback || mg->demote) 1386 mg_complete(mg, false);
1371 avoid = !is_dirty(cache, mg->cblock) || 1387 return;
1372 is_discarded_oblock(cache, mg->old_oblock); 1388 }
1373 else { 1389 mg_complete(mg, true);
1374 struct bio *bio = mg->new_ocell->holder; 1390 break;
1375 1391
1376 avoid = is_discarded_oblock(cache, mg->new_oblock); 1392 case POLICY_DEMOTE:
1393 r = dm_cache_remove_mapping(cache->cmd, op->cblock);
1394 if (r) {
1395 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
1396 cache_device_name(cache));
1397 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1377 1398
1378 if (writeback_mode(&cache->features) && 1399 mg_complete(mg, false);
1379 !avoid && bio_writes_complete_block(cache, bio)) {
1380 issue_overwrite(mg, bio);
1381 return; 1400 return;
1382 } 1401 }
1383 }
1384 1402
1385 avoid ? avoid_copy(mg) : issue_copy(mg); 1403 /*
1404 * It would be nice if we only had to commit when a REQ_FLUSH
1405 * comes through. But there's one scenario that we have to
1406 * look out for:
1407 *
1408 * - vblock x in a cache block
1409 * - demotion occurs
1410 * - cache block gets reallocated and overwritten
1411 * - crash
1412 *
1413 * When we recover, because there was no commit the cache will
1414 * roll back to having the data for vblock x in the cache block.
1415 * But the cache block has since been overwritten, so it'll end
1416 * up pointing to data that was never in 'x' during the history
1417 * of the device.
1418 *
1419 * To avoid this issue we require a commit as part of the
1420 * demotion operation.
1421 */
1422 init_continuation(&mg->k, mg_success);
1423 continue_after_commit(&cache->committer, &mg->k);
1424 schedule_commit(&cache->committer);
1425 break;
1426
1427 case POLICY_WRITEBACK:
1428 mg_complete(mg, true);
1429 break;
1430 }
1386} 1431}
1387 1432
1388static void complete_migration(struct dm_cache_migration *mg) 1433static void mg_update_metadata_after_copy(struct work_struct *ws)
1389{ 1434{
1390 if (mg->err) 1435 struct dm_cache_migration *mg = ws_to_mg(ws);
1391 migration_failure(mg); 1436
1437 /*
1438 * Did the copy succeed?
1439 */
1440 if (mg->k.input)
1441 mg_complete(mg, false);
1392 else 1442 else
1393 migration_success_pre_commit(mg); 1443 mg_update_metadata(ws);
1394} 1444}
1395 1445
1396static void process_migrations(struct cache *cache, struct list_head *head, 1446static void mg_upgrade_lock(struct work_struct *ws)
1397 void (*fn)(struct dm_cache_migration *))
1398{ 1447{
1399 unsigned long flags; 1448 int r;
1400 struct list_head list; 1449 struct dm_cache_migration *mg = ws_to_mg(ws);
1401 struct dm_cache_migration *mg, *tmp;
1402 1450
1403 INIT_LIST_HEAD(&list); 1451 /*
1404 spin_lock_irqsave(&cache->lock, flags); 1452 * Did the copy succeed?
1405 list_splice_init(head, &list); 1453 */
1406 spin_unlock_irqrestore(&cache->lock, flags); 1454 if (mg->k.input)
1455 mg_complete(mg, false);
1407 1456
1408 list_for_each_entry_safe(mg, tmp, &list, list) 1457 else {
1409 fn(mg); 1458 /*
1410} 1459 * Now we want the lock to prevent both reads and writes.
1460 */
1461 r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
1462 READ_WRITE_LOCK_LEVEL);
1463 if (r < 0)
1464 mg_complete(mg, false);
1411 1465
1412static void __queue_quiesced_migration(struct dm_cache_migration *mg) 1466 else if (r)
1413{ 1467 quiesce(mg, mg_update_metadata);
1414 list_add_tail(&mg->list, &mg->cache->quiesced_migrations); 1468
1469 else
1470 mg_update_metadata(ws);
1471 }
1415} 1472}
1416 1473
1417static void queue_quiesced_migration(struct dm_cache_migration *mg) 1474static void mg_copy(struct work_struct *ws)
1418{ 1475{
1419 unsigned long flags; 1476 int r;
1420 struct cache *cache = mg->cache; 1477 struct dm_cache_migration *mg = ws_to_mg(ws);
1421 1478
1422 spin_lock_irqsave(&cache->lock, flags); 1479 if (mg->overwrite_bio) {
1423 __queue_quiesced_migration(mg); 1480 /*
1424 spin_unlock_irqrestore(&cache->lock, flags); 1481 * It's safe to do this here, even though it's new data
1482 * because all IO has been locked out of the block.
1483 *
1484 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
1485 * so _not_ using mg_upgrade_lock() as continuation.
1486 */
1487 overwrite(mg, mg_update_metadata_after_copy);
1425 1488
1426 wake_worker(cache); 1489 } else {
1427} 1490 struct cache *cache = mg->cache;
1491 struct policy_work *op = mg->op;
1492 bool is_policy_promote = (op->op == POLICY_PROMOTE);
1428 1493
1429static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) 1494 if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
1430{ 1495 is_discarded_oblock(cache, op->oblock)) {
1431 unsigned long flags; 1496 mg_upgrade_lock(ws);
1432 struct dm_cache_migration *mg, *tmp; 1497 return;
1498 }
1433 1499
1434 spin_lock_irqsave(&cache->lock, flags); 1500 init_continuation(&mg->k, mg_upgrade_lock);
1435 list_for_each_entry_safe(mg, tmp, work, list)
1436 __queue_quiesced_migration(mg);
1437 spin_unlock_irqrestore(&cache->lock, flags);
1438 1501
1439 wake_worker(cache); 1502 r = copy(mg, is_policy_promote);
1503 if (r) {
1504 DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
1505 mg->k.input = -EIO;
1506 mg_complete(mg, false);
1507 }
1508 }
1440} 1509}
1441 1510
1442static void check_for_quiesced_migrations(struct cache *cache, 1511static int mg_lock_writes(struct dm_cache_migration *mg)
1443 struct per_bio_data *pb)
1444{ 1512{
1445 struct list_head work; 1513 int r;
1514 struct dm_cell_key_v2 key;
1515 struct cache *cache = mg->cache;
1516 struct dm_bio_prison_cell_v2 *prealloc;
1446 1517
1447 if (!pb->all_io_entry) 1518 prealloc = alloc_prison_cell(cache);
1448 return; 1519 if (!prealloc) {
1520 DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
1521 mg_complete(mg, false);
1522 return -ENOMEM;
1523 }
1524
1525 /*
1526 * Prevent writes to the block, but allow reads to continue.
1527 * Unless we're using an overwrite bio, in which case we lock
1528 * everything.
1529 */
1530 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
1531 r = dm_cell_lock_v2(cache->prison, &key,
1532 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
1533 prealloc, &mg->cell);
1534 if (r < 0) {
1535 free_prison_cell(cache, prealloc);
1536 mg_complete(mg, false);
1537 return r;
1538 }
1449 1539
1450 INIT_LIST_HEAD(&work); 1540 if (mg->cell != prealloc)
1451 dm_deferred_entry_dec(pb->all_io_entry, &work); 1541 free_prison_cell(cache, prealloc);
1452 1542
1453 if (!list_empty(&work)) 1543 if (r == 0)
1454 queue_quiesced_migrations(cache, &work); 1544 mg_copy(&mg->k.ws);
1455} 1545 else
1546 quiesce(mg, mg_copy);
1456 1547
1457static void quiesce_migration(struct dm_cache_migration *mg) 1548 return 0;
1458{
1459 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
1460 queue_quiesced_migration(mg);
1461} 1549}
1462 1550
1463static void promote(struct cache *cache, struct prealloc *structs, 1551static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
1464 dm_oblock_t oblock, dm_cblock_t cblock,
1465 struct dm_bio_prison_cell *cell)
1466{ 1552{
1467 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1553 struct dm_cache_migration *mg;
1554
1555 if (!background_work_begin(cache)) {
1556 policy_complete_background_work(cache->policy, op, false);
1557 return -EPERM;
1558 }
1559
1560 mg = alloc_migration(cache);
1561 if (!mg) {
1562 policy_complete_background_work(cache->policy, op, false);
1563 background_work_end(cache);
1564 return -ENOMEM;
1565 }
1566
1567 memset(mg, 0, sizeof(*mg));
1468 1568
1469 mg->err = false;
1470 mg->discard = false;
1471 mg->writeback = false;
1472 mg->demote = false;
1473 mg->promote = true;
1474 mg->requeue_holder = true;
1475 mg->invalidate = false;
1476 mg->cache = cache; 1569 mg->cache = cache;
1477 mg->new_oblock = oblock; 1570 mg->op = op;
1478 mg->cblock = cblock; 1571 mg->overwrite_bio = bio;
1479 mg->old_ocell = NULL;
1480 mg->new_ocell = cell;
1481 mg->start_jiffies = jiffies;
1482 1572
1483 inc_io_migrations(cache); 1573 if (!bio)
1484 quiesce_migration(mg); 1574 inc_io_migrations(cache);
1575
1576 return mg_lock_writes(mg);
1485} 1577}
1486 1578
1487static void writeback(struct cache *cache, struct prealloc *structs, 1579/*----------------------------------------------------------------
1488 dm_oblock_t oblock, dm_cblock_t cblock, 1580 * invalidation processing
1489 struct dm_bio_prison_cell *cell) 1581 *--------------------------------------------------------------*/
1582
1583static void invalidate_complete(struct dm_cache_migration *mg, bool success)
1490{ 1584{
1491 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1585 struct bio_list bios;
1586 struct cache *cache = mg->cache;
1492 1587
1493 mg->err = false; 1588 bio_list_init(&bios);
1494 mg->discard = false; 1589 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1495 mg->writeback = true; 1590 free_prison_cell(cache, mg->cell);
1496 mg->demote = false;
1497 mg->promote = false;
1498 mg->requeue_holder = true;
1499 mg->invalidate = false;
1500 mg->cache = cache;
1501 mg->old_oblock = oblock;
1502 mg->cblock = cblock;
1503 mg->old_ocell = cell;
1504 mg->new_ocell = NULL;
1505 mg->start_jiffies = jiffies;
1506
1507 inc_io_migrations(cache);
1508 quiesce_migration(mg);
1509}
1510
1511static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1512 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1513 dm_cblock_t cblock,
1514 struct dm_bio_prison_cell *old_ocell,
1515 struct dm_bio_prison_cell *new_ocell)
1516{
1517 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1518
1519 mg->err = false;
1520 mg->discard = false;
1521 mg->writeback = false;
1522 mg->demote = true;
1523 mg->promote = true;
1524 mg->requeue_holder = true;
1525 mg->invalidate = false;
1526 mg->cache = cache;
1527 mg->old_oblock = old_oblock;
1528 mg->new_oblock = new_oblock;
1529 mg->cblock = cblock;
1530 mg->old_ocell = old_ocell;
1531 mg->new_ocell = new_ocell;
1532 mg->start_jiffies = jiffies;
1533 1591
1534 inc_io_migrations(cache); 1592 if (!success && mg->overwrite_bio)
1535 quiesce_migration(mg); 1593 bio_io_error(mg->overwrite_bio);
1536}
1537 1594
1538/* 1595 free_migration(mg);
1539 * Invalidate a cache entry. No writeback occurs; any changes in the cache 1596 defer_bios(cache, &bios);
1540 * block are thrown away.
1541 */
1542static void invalidate(struct cache *cache, struct prealloc *structs,
1543 dm_oblock_t oblock, dm_cblock_t cblock,
1544 struct dm_bio_prison_cell *cell)
1545{
1546 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1547
1548 mg->err = false;
1549 mg->discard = false;
1550 mg->writeback = false;
1551 mg->demote = true;
1552 mg->promote = false;
1553 mg->requeue_holder = true;
1554 mg->invalidate = true;
1555 mg->cache = cache;
1556 mg->old_oblock = oblock;
1557 mg->cblock = cblock;
1558 mg->old_ocell = cell;
1559 mg->new_ocell = NULL;
1560 mg->start_jiffies = jiffies;
1561 1597
1562 inc_io_migrations(cache); 1598 background_work_end(cache);
1563 quiesce_migration(mg);
1564} 1599}
1565 1600
1566static void discard(struct cache *cache, struct prealloc *structs, 1601static void invalidate_completed(struct work_struct *ws)
1567 struct dm_bio_prison_cell *cell)
1568{ 1602{
1569 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1603 struct dm_cache_migration *mg = ws_to_mg(ws);
1604 invalidate_complete(mg, !mg->k.input);
1605}
1570 1606
1571 mg->err = false; 1607static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
1572 mg->discard = true; 1608{
1573 mg->writeback = false; 1609 int r = policy_invalidate_mapping(cache->policy, cblock);
1574 mg->demote = false; 1610 if (!r) {
1575 mg->promote = false; 1611 r = dm_cache_remove_mapping(cache->cmd, cblock);
1576 mg->requeue_holder = false; 1612 if (r) {
1577 mg->invalidate = false; 1613 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
1578 mg->cache = cache; 1614 cache_device_name(cache));
1579 mg->old_ocell = NULL; 1615 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1580 mg->new_ocell = cell; 1616 }
1581 mg->start_jiffies = jiffies; 1617
1618 } else if (r == -ENODATA) {
1619 /*
1620 * Harmless, already unmapped.
1621 */
1622 r = 0;
1623
1624 } else
1625 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
1582 1626
1583 quiesce_migration(mg); 1627 return r;
1584} 1628}
1585 1629
1586/*---------------------------------------------------------------- 1630static void invalidate_remove(struct work_struct *ws)
1587 * bio processing
1588 *--------------------------------------------------------------*/
1589static void defer_bio(struct cache *cache, struct bio *bio)
1590{ 1631{
1591 unsigned long flags; 1632 int r;
1633 struct dm_cache_migration *mg = ws_to_mg(ws);
1634 struct cache *cache = mg->cache;
1592 1635
1593 spin_lock_irqsave(&cache->lock, flags); 1636 r = invalidate_cblock(cache, mg->invalidate_cblock);
1594 bio_list_add(&cache->deferred_bios, bio); 1637 if (r) {
1595 spin_unlock_irqrestore(&cache->lock, flags); 1638 invalidate_complete(mg, false);
1639 return;
1640 }
1596 1641
1597 wake_worker(cache); 1642 init_continuation(&mg->k, invalidate_completed);
1643 continue_after_commit(&cache->committer, &mg->k);
1644 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
1645 mg->overwrite_bio = NULL;
1646 schedule_commit(&cache->committer);
1598} 1647}
1599 1648
1600static void process_flush_bio(struct cache *cache, struct bio *bio) 1649static int invalidate_lock(struct dm_cache_migration *mg)
1601{ 1650{
1602 size_t pb_data_size = get_per_bio_data_size(cache); 1651 int r;
1603 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1652 struct dm_cell_key_v2 key;
1653 struct cache *cache = mg->cache;
1654 struct dm_bio_prison_cell_v2 *prealloc;
1604 1655
1605 BUG_ON(bio->bi_iter.bi_size); 1656 prealloc = alloc_prison_cell(cache);
1606 if (!pb->req_nr) 1657 if (!prealloc) {
1607 remap_to_origin(cache, bio); 1658 invalidate_complete(mg, false);
1608 else 1659 return -ENOMEM;
1609 remap_to_cache(cache, bio, 0); 1660 }
1610 1661
1611 /* 1662 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
1612 * REQ_PREFLUSH is not directed at any particular block so we don't 1663 r = dm_cell_lock_v2(cache->prison, &key,
1613 * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH 1664 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
1614 * by dm-core. 1665 if (r < 0) {
1615 */ 1666 free_prison_cell(cache, prealloc);
1616 issue(cache, bio); 1667 invalidate_complete(mg, false);
1668 return r;
1669 }
1670
1671 if (mg->cell != prealloc)
1672 free_prison_cell(cache, prealloc);
1673
1674 if (r)
1675 quiesce(mg, invalidate_remove);
1676
1677 else {
1678 /*
1679 * We can't call invalidate_remove() directly here because we
1680 * might still be in request context.
1681 */
1682 init_continuation(&mg->k, invalidate_remove);
1683 queue_work(cache->wq, &mg->k.ws);
1684 }
1685
1686 return 0;
1617} 1687}
1618 1688
1619static void process_discard_bio(struct cache *cache, struct prealloc *structs, 1689static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
1620 struct bio *bio) 1690 dm_oblock_t oblock, struct bio *bio)
1621{ 1691{
1622 int r; 1692 struct dm_cache_migration *mg;
1623 dm_dblock_t b, e;
1624 struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1625 1693
1626 calc_discard_block_range(cache, bio, &b, &e); 1694 if (!background_work_begin(cache))
1627 if (b == e) { 1695 return -EPERM;
1628 bio_endio(bio); 1696
1629 return; 1697 mg = alloc_migration(cache);
1698 if (!mg) {
1699 background_work_end(cache);
1700 return -ENOMEM;
1630 } 1701 }
1631 1702
1632 cell_prealloc = prealloc_get_cell(structs); 1703 memset(mg, 0, sizeof(*mg));
1633 r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
1634 (cell_free_fn) prealloc_put_cell,
1635 structs, &new_ocell);
1636 if (r > 0)
1637 return;
1638 1704
1639 discard(cache, structs, new_ocell); 1705 mg->cache = cache;
1706 mg->overwrite_bio = bio;
1707 mg->invalidate_cblock = cblock;
1708 mg->invalidate_oblock = oblock;
1709
1710 return invalidate_lock(mg);
1640} 1711}
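/*
 * Taken together, passthrough invalidation runs: invalidate_start() ->
 * invalidate_lock() (exclusive READ_WRITE_LOCK_LEVEL on the origin
 * block) -> invalidate_remove(), which drops the mapping from the
 * policy and the on-disk metadata and remaps the triggering write to
 * the origin -> a forced commit -> invalidate_completed() ->
 * invalidate_complete(), which unlocks the cell and defers any bios
 * that were queued against it.
 */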
1641 1712
1642static bool spare_migration_bandwidth(struct cache *cache) 1713/*----------------------------------------------------------------
1714 * bio processing
1715 *--------------------------------------------------------------*/
1716
1717enum busy {
1718 IDLE,
1719 MODERATE,
1720 BUSY
1721};
1722
1723static enum busy spare_migration_bandwidth(struct cache *cache)
1643{ 1724{
1725 bool idle = iot_idle_for(&cache->origin_tracker, HZ);
1644 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1726 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1645 cache->sectors_per_block; 1727 cache->sectors_per_block;
1646 return current_volume < cache->migration_threshold; 1728
1729 if (current_volume <= cache->migration_threshold)
1730 return idle ? IDLE : MODERATE;
1731 else
1732 return idle ? MODERATE : BUSY;
1647} 1733}
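spare_migration_bandwidth() now folds two inputs into a three-level answer: whether the origin has been idle for about a second (iot_idle_for(..., HZ)) and whether the volume of in-flight migration IO is still within migration_threshold. A minimal standalone sketch of just that decision (hypothetical helper, not kernel code):

#include <stdbool.h>
#include <stdio.h>

enum busy { IDLE, MODERATE, BUSY };

/* idle origin + spare bandwidth -> IDLE; neither -> BUSY; otherwise MODERATE */
static enum busy busy_level(bool origin_idle, bool under_threshold)
{
	if (under_threshold)
		return origin_idle ? IDLE : MODERATE;
	return origin_idle ? MODERATE : BUSY;
}

int main(void)
{
	printf("%d %d %d\n",
	       busy_level(true, true),		/* IDLE (0) */
	       busy_level(true, false),		/* MODERATE (1) */
	       busy_level(false, false));	/* BUSY (2) */
	return 0;
}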
1648 1734
1649static void inc_hit_counter(struct cache *cache, struct bio *bio) 1735static void inc_hit_counter(struct cache *cache, struct bio *bio)
@@ -1660,255 +1746,143 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
1660 1746
1661/*----------------------------------------------------------------*/ 1747/*----------------------------------------------------------------*/
1662 1748
1663struct inc_detail { 1749static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1664 struct cache *cache;
1665 struct bio_list bios_for_issue;
1666 struct bio_list unhandled_bios;
1667 bool any_writes;
1668};
1669
1670static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
1671{ 1750{
1672 struct bio *bio; 1751 return (bio_data_dir(bio) == WRITE) &&
1673 struct inc_detail *detail = context; 1752 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1674 struct cache *cache = detail->cache;
1675
1676 inc_ds(cache, cell->holder, cell);
1677 if (bio_data_dir(cell->holder) == WRITE)
1678 detail->any_writes = true;
1679
1680 while ((bio = bio_list_pop(&cell->bios))) {
1681 if (discard_or_flush(bio)) {
1682 bio_list_add(&detail->unhandled_bios, bio);
1683 continue;
1684 }
1685
1686 if (bio_data_dir(bio) == WRITE)
1687 detail->any_writes = true;
1688
1689 bio_list_add(&detail->bios_for_issue, bio);
1690 inc_ds(cache, bio, cell);
1691 }
1692} 1753}
1693 1754
1694// FIXME: refactor these two 1755static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
1695static void remap_cell_to_origin_clear_discard(struct cache *cache,
1696 struct dm_bio_prison_cell *cell,
1697 dm_oblock_t oblock, bool issue_holder)
1698{ 1756{
1699 struct bio *bio; 1757 return writeback_mode(&cache->features) &&
1700 unsigned long flags; 1758 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
1701 struct inc_detail detail;
1702
1703 detail.cache = cache;
1704 bio_list_init(&detail.bios_for_issue);
1705 bio_list_init(&detail.unhandled_bios);
1706 detail.any_writes = false;
1707
1708 spin_lock_irqsave(&cache->lock, flags);
1709 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
1710 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
1711 spin_unlock_irqrestore(&cache->lock, flags);
1712
1713 remap_to_origin(cache, cell->holder);
1714 if (issue_holder)
1715 issue(cache, cell->holder);
1716 else
1717 accounted_begin(cache, cell->holder);
1718
1719 if (detail.any_writes)
1720 clear_discard(cache, oblock_to_dblock(cache, oblock));
1721
1722 while ((bio = bio_list_pop(&detail.bios_for_issue))) {
1723 remap_to_origin(cache, bio);
1724 issue(cache, bio);
1725 }
1726
1727 free_prison_cell(cache, cell);
1728} 1759}
1729 1760
1730static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, 1761static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
1731 dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) 1762 bool *commit_needed)
1732{ 1763{
1733 struct bio *bio; 1764 int r, data_dir;
1734 unsigned long flags; 1765 bool rb, background_queued;
1735 struct inc_detail detail; 1766 dm_cblock_t cblock;
1736 1767 size_t pb_data_size = get_per_bio_data_size(cache);
1737 detail.cache = cache; 1768 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1738 bio_list_init(&detail.bios_for_issue);
1739 bio_list_init(&detail.unhandled_bios);
1740 detail.any_writes = false;
1741
1742 spin_lock_irqsave(&cache->lock, flags);
1743 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
1744 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
1745 spin_unlock_irqrestore(&cache->lock, flags);
1746 1769
1747 remap_to_cache(cache, cell->holder, cblock); 1770 *commit_needed = false;
1748 if (issue_holder)
1749 issue(cache, cell->holder);
1750 else
1751 accounted_begin(cache, cell->holder);
1752 1771
1753 if (detail.any_writes) { 1772 rb = bio_detain_shared(cache, block, bio);
1754 set_dirty(cache, oblock, cblock); 1773 if (!rb) {
1755 clear_discard(cache, oblock_to_dblock(cache, oblock)); 1774 /*
1756 } 1775 * An exclusive lock is held for this block, so we have to
1757 1776 * wait. We set the commit_needed flag so the current
1758 while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1777 * transaction will be committed asap, allowing this lock
1759 remap_to_cache(cache, bio, cblock); 1778 * to be dropped.
1760 issue(cache, bio); 1779 */
1780 *commit_needed = true;
1781 return DM_MAPIO_SUBMITTED;
1761 } 1782 }
1762 1783
1763 free_prison_cell(cache, cell); 1784 data_dir = bio_data_dir(bio);
1764}
1765 1785
1766/*----------------------------------------------------------------*/ 1786 if (optimisable_bio(cache, bio, block)) {
1767 1787 struct policy_work *op = NULL;
1768struct old_oblock_lock {
1769 struct policy_locker locker;
1770 struct cache *cache;
1771 struct prealloc *structs;
1772 struct dm_bio_prison_cell *cell;
1773};
1774 1788
1775static int null_locker(struct policy_locker *locker, dm_oblock_t b) 1789 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
1776{ 1790 if (unlikely(r && r != -ENOENT)) {
1777 /* This should never be called */ 1791 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
1778 BUG(); 1792 cache_device_name(cache), r);
1779 return 0; 1793 bio_io_error(bio);
1780} 1794 return DM_MAPIO_SUBMITTED;
1795 }
1781 1796
1782static int cell_locker(struct policy_locker *locker, dm_oblock_t b) 1797 if (r == -ENOENT && op) {
1783{ 1798 bio_drop_shared_lock(cache, bio);
1784 struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker); 1799 BUG_ON(op->op != POLICY_PROMOTE);
1785 struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs); 1800 mg_start(cache, op, bio);
1801 return DM_MAPIO_SUBMITTED;
1802 }
1803 } else {
1804 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
1805 if (unlikely(r && r != -ENOENT)) {
1806 DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
1807 cache_device_name(cache), r);
1808 bio_io_error(bio);
1809 return DM_MAPIO_SUBMITTED;
1810 }
1786 1811
1787 return bio_detain(l->cache, b, NULL, cell_prealloc, 1812 if (background_queued)
1788 (cell_free_fn) prealloc_put_cell, 1813 wake_migration_worker(cache);
1789 l->structs, &l->cell); 1814 }
1790}
1791 1815
1792static void process_cell(struct cache *cache, struct prealloc *structs, 1816 if (r == -ENOENT) {
1793 struct dm_bio_prison_cell *new_ocell) 1817 /*
1794{ 1818 * Miss.
1795 int r; 1819 */
1796 bool release_cell = true; 1820 inc_miss_counter(cache, bio);
1797 struct bio *bio = new_ocell->holder; 1821 if (pb->req_nr == 0) {
1798 dm_oblock_t block = get_bio_block(cache, bio); 1822 accounted_begin(cache, bio);
1799 struct policy_result lookup_result; 1823 remap_to_origin_clear_discard(cache, bio, block);
1800 bool passthrough = passthrough_mode(&cache->features);
1801 bool fast_promotion, can_migrate;
1802 struct old_oblock_lock ool;
1803
1804 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
1805 can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
1806
1807 ool.locker.fn = cell_locker;
1808 ool.cache = cache;
1809 ool.structs = structs;
1810 ool.cell = NULL;
1811 r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
1812 bio, &ool.locker, &lookup_result);
1813
1814 if (r == -EWOULDBLOCK)
1815 /* migration has been denied */
1816 lookup_result.op = POLICY_MISS;
1817
1818 switch (lookup_result.op) {
1819 case POLICY_HIT:
1820 if (passthrough) {
1821 inc_miss_counter(cache, bio);
1822 1824
1825 } else {
1823 /* 1826 /*
1824 * Passthrough always maps to the origin, 1827 * This is a duplicate writethrough io that is no
1825 * invalidating any cache blocks that are written 1828 * longer needed because the block has been demoted.
1826 * to.
1827 */ 1829 */
1830 bio_endio(bio);
1831 return DM_MAPIO_SUBMITTED;
1832 }
1833 } else {
1834 /*
1835 * Hit.
1836 */
1837 inc_hit_counter(cache, bio);
1828 1838
1839 /*
1840 * Passthrough always maps to the origin, invalidating any
1841 * cache blocks that are written to.
1842 */
1843 if (passthrough_mode(&cache->features)) {
1829 if (bio_data_dir(bio) == WRITE) { 1844 if (bio_data_dir(bio) == WRITE) {
1845 bio_drop_shared_lock(cache, bio);
1830 atomic_inc(&cache->stats.demotion); 1846 atomic_inc(&cache->stats.demotion);
1831 invalidate(cache, structs, block, lookup_result.cblock, new_ocell); 1847 invalidate_start(cache, cblock, block, bio);
1832 release_cell = false; 1848 } else
1833
1834 } else {
1835 /* FIXME: factor out issue_origin() */
1836 remap_to_origin_clear_discard(cache, bio, block); 1849 remap_to_origin_clear_discard(cache, bio, block);
1837 inc_and_issue(cache, bio, new_ocell); 1850
1838 }
1839 } else { 1851 } else {
1840 inc_hit_counter(cache, bio); 1852 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
1841 1853 !is_dirty(cache, cblock)) {
1842 if (bio_data_dir(bio) == WRITE && 1854 remap_to_origin_then_cache(cache, bio, block, cblock);
1843 writethrough_mode(&cache->features) && 1855 accounted_begin(cache, bio);
1844 !is_dirty(cache, lookup_result.cblock)) { 1856 } else
1845 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1857 remap_to_cache_dirty(cache, bio, block, cblock);
1846 inc_and_issue(cache, bio, new_ocell);
1847
1848 } else {
1849 remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
1850 release_cell = false;
1851 }
1852 } 1858 }
1853
1854 break;
1855
1856 case POLICY_MISS:
1857 inc_miss_counter(cache, bio);
1858 remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
1859 release_cell = false;
1860 break;
1861
1862 case POLICY_NEW:
1863 atomic_inc(&cache->stats.promotion);
1864 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1865 release_cell = false;
1866 break;
1867
1868 case POLICY_REPLACE:
1869 atomic_inc(&cache->stats.demotion);
1870 atomic_inc(&cache->stats.promotion);
1871 demote_then_promote(cache, structs, lookup_result.old_oblock,
1872 block, lookup_result.cblock,
1873 ool.cell, new_ocell);
1874 release_cell = false;
1875 break;
1876
1877 default:
1878 DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
1879 cache_device_name(cache), __func__,
1880 (unsigned) lookup_result.op);
1881 bio_io_error(bio);
1882 } 1859 }
1883 1860
1884 if (release_cell)
1885 cell_defer(cache, new_ocell, false);
1886}
1887
1888static void process_bio(struct cache *cache, struct prealloc *structs,
1889 struct bio *bio)
1890{
1891 int r;
1892 dm_oblock_t block = get_bio_block(cache, bio);
1893 struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1894
1895 /* 1861 /*
1896 * Check to see if that block is currently migrating. 1862 * dm core turns FUA requests into a separate payload and FLUSH req.
1897 */ 1863 */
1898 cell_prealloc = prealloc_get_cell(structs); 1864 if (bio->bi_opf & REQ_FUA) {
1899 r = bio_detain(cache, block, bio, cell_prealloc, 1865 /*
1900 (cell_free_fn) prealloc_put_cell, 1866 * issue_after_commit will call accounted_begin a second time. So
1901 structs, &new_ocell); 1867 * we call accounted_complete() to avoid double accounting.
1902 if (r > 0) 1868 */
1903 return; 1869 accounted_complete(cache, bio);
1870 issue_after_commit(&cache->committer, bio);
1871 *commit_needed = true;
1872 return DM_MAPIO_SUBMITTED;
1873 }
1904 1874
1905 process_cell(cache, structs, new_ocell); 1875 return DM_MAPIO_REMAPPED;
1906} 1876}
1907 1877
1908static int need_commit_due_to_time(struct cache *cache) 1878static bool process_bio(struct cache *cache, struct bio *bio)
1909{ 1879{
1910 return jiffies < cache->last_commit_jiffies || 1880 bool commit_needed;
1911 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; 1881
1882 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
1883 generic_make_request(bio);
1884
1885 return commit_needed;
1912} 1886}
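/*
 * map_bio() in outline: take a shared lock with bio_detain_shared(); if
 * an exclusive lock already covers the block, leave the bio in the cell
 * and request a commit so the lock can be dropped. Otherwise consult
 * the policy (policy_lookup_with_work() for bios that may be promoted
 * by overwrite, policy_lookup() otherwise) and remap: a miss goes to
 * the origin, a write in passthrough mode kicks off invalidate_start(),
 * a writethrough write to a clean block uses
 * remap_to_origin_then_cache(), and everything else maps to the cache.
 * FUA bios are routed through the committer via issue_after_commit().
 */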
1913 1887
1914/* 1888/*
@@ -1929,123 +1903,88 @@ static int commit(struct cache *cache, bool clean_shutdown)
1929 return r; 1903 return r;
1930} 1904}
1931 1905
1932static int commit_if_needed(struct cache *cache) 1906/*
1907 * Used by the batcher.
1908 */
1909static int commit_op(void *context)
1933{ 1910{
1934 int r = 0; 1911 struct cache *cache = context;
1935 1912
1936 if ((cache->commit_requested || need_commit_due_to_time(cache)) && 1913 if (dm_cache_changed_this_transaction(cache->cmd))
1937 dm_cache_changed_this_transaction(cache->cmd)) { 1914 return commit(cache, false);
1938 r = commit(cache, false);
1939 cache->commit_requested = false;
1940 cache->last_commit_jiffies = jiffies;
1941 }
1942 1915
1943 return r; 1916 return 0;
1944} 1917}
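/*
 * commit_op() is the callback handed to the 'committer' batcher: as
 * used in this patch, schedule_commit() asks the batcher to run it
 * soon, while bios queued with issue_after_commit() and continuations
 * registered with continue_after_commit() are only released once that
 * commit has completed.
 */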
1945 1918
1946static void process_deferred_bios(struct cache *cache) 1919/*----------------------------------------------------------------*/
1947{
1948 bool prealloc_used = false;
1949 unsigned long flags;
1950 struct bio_list bios;
1951 struct bio *bio;
1952 struct prealloc structs;
1953
1954 memset(&structs, 0, sizeof(structs));
1955 bio_list_init(&bios);
1956
1957 spin_lock_irqsave(&cache->lock, flags);
1958 bio_list_merge(&bios, &cache->deferred_bios);
1959 bio_list_init(&cache->deferred_bios);
1960 spin_unlock_irqrestore(&cache->lock, flags);
1961
1962 while (!bio_list_empty(&bios)) {
1963 /*
1964 * If we've got no free migration structs, and processing
1965 * this bio might require one, we pause until there are some
1966 * prepared mappings to process.
1967 */
1968 prealloc_used = true;
1969 if (prealloc_data_structs(cache, &structs)) {
1970 spin_lock_irqsave(&cache->lock, flags);
1971 bio_list_merge(&cache->deferred_bios, &bios);
1972 spin_unlock_irqrestore(&cache->lock, flags);
1973 break;
1974 }
1975 1920
1976 bio = bio_list_pop(&bios); 1921static bool process_flush_bio(struct cache *cache, struct bio *bio)
1922{
1923 size_t pb_data_size = get_per_bio_data_size(cache);
1924 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1977 1925
1978 if (bio->bi_opf & REQ_PREFLUSH) 1926 if (!pb->req_nr)
1979 process_flush_bio(cache, bio); 1927 remap_to_origin(cache, bio);
1980 else if (bio_op(bio) == REQ_OP_DISCARD) 1928 else
1981 process_discard_bio(cache, &structs, bio); 1929 remap_to_cache(cache, bio, 0);
1982 else
1983 process_bio(cache, &structs, bio);
1984 }
1985 1930
1986 if (prealloc_used) 1931 issue_after_commit(&cache->committer, bio);
1987 prealloc_free_structs(cache, &structs); 1932 return true;
1988} 1933}
1989 1934
1990static void process_deferred_cells(struct cache *cache) 1935static bool process_discard_bio(struct cache *cache, struct bio *bio)
1991{ 1936{
1992 bool prealloc_used = false; 1937 dm_dblock_t b, e;
1993 unsigned long flags;
1994 struct dm_bio_prison_cell *cell, *tmp;
1995 struct list_head cells;
1996 struct prealloc structs;
1997
1998 memset(&structs, 0, sizeof(structs));
1999
2000 INIT_LIST_HEAD(&cells);
2001
2002 spin_lock_irqsave(&cache->lock, flags);
2003 list_splice_init(&cache->deferred_cells, &cells);
2004 spin_unlock_irqrestore(&cache->lock, flags);
2005
2006 list_for_each_entry_safe(cell, tmp, &cells, user_list) {
2007 /*
2008 * If we've got no free migration structs, and processing
2009 * this bio might require one, we pause until there are some
2010 * prepared mappings to process.
2011 */
2012 prealloc_used = true;
2013 if (prealloc_data_structs(cache, &structs)) {
2014 spin_lock_irqsave(&cache->lock, flags);
2015 list_splice(&cells, &cache->deferred_cells);
2016 spin_unlock_irqrestore(&cache->lock, flags);
2017 break;
2018 }
2019 1938
2020 process_cell(cache, &structs, cell); 1939 // FIXME: do we need to lock the region? Or can we just assume the
1940 // user won't be so foolish as to issue discard concurrently with
1941 // other IO?
1942 calc_discard_block_range(cache, bio, &b, &e);
1943 while (b != e) {
1944 set_discard(cache, b);
1945 b = to_dblock(from_dblock(b) + 1);
2021 } 1946 }
2022 1947
2023 if (prealloc_used) 1948 bio_endio(bio);
2024 prealloc_free_structs(cache, &structs); 1949
1950 return false;
2025} 1951}
2026 1952
2027static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 1953static void process_deferred_bios(struct work_struct *ws)
2028{ 1954{
1955 struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
1956
2029 unsigned long flags; 1957 unsigned long flags;
1958 bool commit_needed = false;
2030 struct bio_list bios; 1959 struct bio_list bios;
2031 struct bio *bio; 1960 struct bio *bio;
2032 1961
2033 bio_list_init(&bios); 1962 bio_list_init(&bios);
2034 1963
2035 spin_lock_irqsave(&cache->lock, flags); 1964 spin_lock_irqsave(&cache->lock, flags);
2036 bio_list_merge(&bios, &cache->deferred_flush_bios); 1965 bio_list_merge(&bios, &cache->deferred_bios);
2037 bio_list_init(&cache->deferred_flush_bios); 1966 bio_list_init(&cache->deferred_bios);
2038 spin_unlock_irqrestore(&cache->lock, flags); 1967 spin_unlock_irqrestore(&cache->lock, flags);
2039 1968
2040 /* 1969 while ((bio = bio_list_pop(&bios))) {
2041 * These bios have already been through inc_ds() 1970 if (bio->bi_opf & REQ_PREFLUSH)
2042 */ 1971 commit_needed = process_flush_bio(cache, bio) || commit_needed;
2043 while ((bio = bio_list_pop(&bios))) 1972
2044 submit_bios ? accounted_request(cache, bio) : bio_io_error(bio); 1973 else if (bio_op(bio) == REQ_OP_DISCARD)
1974 commit_needed = process_discard_bio(cache, bio) || commit_needed;
1975
1976 else
1977 commit_needed = process_bio(cache, bio) || commit_needed;
1978 }
1979
1980 if (commit_needed)
1981 schedule_commit(&cache->committer);
2045} 1982}
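The bios drained by the worker above are parked on cache->deferred_bios by a small deferral helper elsewhere in this patch. A minimal sketch of that path, using the field names from this hunk but with the details simplified:

	static void wake_deferred_bio_worker(struct cache *cache)
	{
		queue_work(cache->wq, &cache->deferred_bio_worker);
	}

	static void defer_bio(struct cache *cache, struct bio *bio)
	{
		unsigned long flags;

		/* Park the bio on the deferred list under the cache lock... */
		spin_lock_irqsave(&cache->lock, flags);
		bio_list_add(&cache->deferred_bios, bio);
		spin_unlock_irqrestore(&cache->lock, flags);

		/* ...then let process_deferred_bios() drain it from the workqueue. */
		wake_deferred_bio_worker(cache);
	}

This replaces the old model where everything funnelled through the single do_worker() loop; each class of deferred work now has its own work item on the shared workqueue.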
2046 1983
2047static void process_deferred_writethrough_bios(struct cache *cache) 1984static void process_deferred_writethrough_bios(struct work_struct *ws)
2048{ 1985{
1986 struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker);
1987
2049 unsigned long flags; 1988 unsigned long flags;
2050 struct bio_list bios; 1989 struct bio_list bios;
2051 struct bio *bio; 1990 struct bio *bio;
@@ -2058,153 +1997,15 @@ static void process_deferred_writethrough_bios(struct cache *cache)
2058 spin_unlock_irqrestore(&cache->lock, flags); 1997 spin_unlock_irqrestore(&cache->lock, flags);
2059 1998
2060 /* 1999 /*
2061 * These bios have already been through inc_ds() 2000 * These bios have already been through accounted_begin()
2062 */ 2001 */
2063 while ((bio = bio_list_pop(&bios))) 2002 while ((bio = bio_list_pop(&bios)))
2064 accounted_request(cache, bio); 2003 generic_make_request(bio);
2065}
2066
2067static void writeback_some_dirty_blocks(struct cache *cache)
2068{
2069 bool prealloc_used = false;
2070 dm_oblock_t oblock;
2071 dm_cblock_t cblock;
2072 struct prealloc structs;
2073 struct dm_bio_prison_cell *old_ocell;
2074 bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
2075
2076 memset(&structs, 0, sizeof(structs));
2077
2078 while (spare_migration_bandwidth(cache)) {
2079 if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
2080 break; /* no work to do */
2081
2082 prealloc_used = true;
2083 if (prealloc_data_structs(cache, &structs) ||
2084 get_cell(cache, oblock, &structs, &old_ocell)) {
2085 policy_set_dirty(cache->policy, oblock);
2086 break;
2087 }
2088
2089 writeback(cache, &structs, oblock, cblock, old_ocell);
2090 }
2091
2092 if (prealloc_used)
2093 prealloc_free_structs(cache, &structs);
2094}
2095
2096/*----------------------------------------------------------------
2097 * Invalidations.
2098 * Dropping something from the cache *without* writing back.
2099 *--------------------------------------------------------------*/
2100
2101static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
2102{
2103 int r = 0;
2104 uint64_t begin = from_cblock(req->cblocks->begin);
2105 uint64_t end = from_cblock(req->cblocks->end);
2106
2107 while (begin != end) {
2108 r = policy_remove_cblock(cache->policy, to_cblock(begin));
2109 if (!r) {
2110 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
2111 if (r) {
2112 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
2113 break;
2114 }
2115
2116 } else if (r == -ENODATA) {
2117 /* harmless, already unmapped */
2118 r = 0;
2119
2120 } else {
2121 DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
2122 break;
2123 }
2124
2125 begin++;
2126 }
2127
2128 cache->commit_requested = true;
2129
2130 req->err = r;
2131 atomic_set(&req->complete, 1);
2132
2133 wake_up(&req->result_wait);
2134}
2135
2136static void process_invalidation_requests(struct cache *cache)
2137{
2138 struct list_head list;
2139 struct invalidation_request *req, *tmp;
2140
2141 INIT_LIST_HEAD(&list);
2142 spin_lock(&cache->invalidation_lock);
2143 list_splice_init(&cache->invalidation_requests, &list);
2144 spin_unlock(&cache->invalidation_lock);
2145
2146 list_for_each_entry_safe (req, tmp, &list, list)
2147 process_invalidation_request(cache, req);
2148} 2004}
2149 2005
2150/*---------------------------------------------------------------- 2006/*----------------------------------------------------------------
2151 * Main worker loop 2007 * Main worker loop
2152 *--------------------------------------------------------------*/ 2008 *--------------------------------------------------------------*/
2153static bool is_quiescing(struct cache *cache)
2154{
2155 return atomic_read(&cache->quiescing);
2156}
2157
2158static void ack_quiescing(struct cache *cache)
2159{
2160 if (is_quiescing(cache)) {
2161 atomic_inc(&cache->quiescing_ack);
2162 wake_up(&cache->quiescing_wait);
2163 }
2164}
2165
2166static void wait_for_quiescing_ack(struct cache *cache)
2167{
2168 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
2169}
2170
2171static void start_quiescing(struct cache *cache)
2172{
2173 atomic_inc(&cache->quiescing);
2174 wait_for_quiescing_ack(cache);
2175}
2176
2177static void stop_quiescing(struct cache *cache)
2178{
2179 atomic_set(&cache->quiescing, 0);
2180 atomic_set(&cache->quiescing_ack, 0);
2181}
2182
2183static void wait_for_migrations(struct cache *cache)
2184{
2185 wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
2186}
2187
2188static void stop_worker(struct cache *cache)
2189{
2190 cancel_delayed_work(&cache->waker);
2191 flush_workqueue(cache->wq);
2192}
2193
2194static void requeue_deferred_cells(struct cache *cache)
2195{
2196 unsigned long flags;
2197 struct list_head cells;
2198 struct dm_bio_prison_cell *cell, *tmp;
2199
2200 INIT_LIST_HEAD(&cells);
2201 spin_lock_irqsave(&cache->lock, flags);
2202 list_splice_init(&cache->deferred_cells, &cells);
2203 spin_unlock_irqrestore(&cache->lock, flags);
2204
2205 list_for_each_entry_safe(cell, tmp, &cells, user_list)
2206 cell_requeue(cache, cell);
2207}
2208 2009
2209static void requeue_deferred_bios(struct cache *cache) 2010static void requeue_deferred_bios(struct cache *cache)
2210{ 2011{
@@ -2221,53 +2022,6 @@ static void requeue_deferred_bios(struct cache *cache)
2221 } 2022 }
2222} 2023}
2223 2024
2224static int more_work(struct cache *cache)
2225{
2226 if (is_quiescing(cache))
2227 return !list_empty(&cache->quiesced_migrations) ||
2228 !list_empty(&cache->completed_migrations) ||
2229 !list_empty(&cache->need_commit_migrations);
2230 else
2231 return !bio_list_empty(&cache->deferred_bios) ||
2232 !list_empty(&cache->deferred_cells) ||
2233 !bio_list_empty(&cache->deferred_flush_bios) ||
2234 !bio_list_empty(&cache->deferred_writethrough_bios) ||
2235 !list_empty(&cache->quiesced_migrations) ||
2236 !list_empty(&cache->completed_migrations) ||
2237 !list_empty(&cache->need_commit_migrations) ||
2238 cache->invalidate;
2239}
2240
2241static void do_worker(struct work_struct *ws)
2242{
2243 struct cache *cache = container_of(ws, struct cache, worker);
2244
2245 do {
2246 if (!is_quiescing(cache)) {
2247 writeback_some_dirty_blocks(cache);
2248 process_deferred_writethrough_bios(cache);
2249 process_deferred_bios(cache);
2250 process_deferred_cells(cache);
2251 process_invalidation_requests(cache);
2252 }
2253
2254 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
2255 process_migrations(cache, &cache->completed_migrations, complete_migration);
2256
2257 if (commit_if_needed(cache)) {
2258 process_deferred_flush_bios(cache, false);
2259 process_migrations(cache, &cache->need_commit_migrations, migration_failure);
2260 } else {
2261 process_deferred_flush_bios(cache, true);
2262 process_migrations(cache, &cache->need_commit_migrations,
2263 migration_success_post_commit);
2264 }
2265
2266 ack_quiescing(cache);
2267
2268 } while (more_work(cache));
2269}
2270
2271/* 2025/*
2272 * We want to commit periodically so that not too much 2026 * We want to commit periodically so that not too much
2273 * unwritten metadata builds up. 2027 * unwritten metadata builds up.
@@ -2275,25 +2029,39 @@ static void do_worker(struct work_struct *ws)
2275static void do_waker(struct work_struct *ws) 2029static void do_waker(struct work_struct *ws)
2276{ 2030{
2277 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 2031 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
2032
2278 policy_tick(cache->policy, true); 2033 policy_tick(cache->policy, true);
2279 wake_worker(cache); 2034 wake_migration_worker(cache);
2035 schedule_commit(&cache->committer);
2280 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 2036 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
2281} 2037}
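COMMIT_PERIOD itself is defined near the top of dm-cache-target.c, outside this hunk; in mainline it has been a second's worth of jiffies, i.e. roughly:

	/* Assumed value: how often the waker ticks the policy and schedules a commit. */
	#define COMMIT_PERIOD HZ

cache_resume() further down primes the cycle by calling do_waker() directly, after which the work re-queues itself every period.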
2282 2038
2283/*----------------------------------------------------------------*/ 2039static void check_migrations(struct work_struct *ws)
2284
2285static int is_congested(struct dm_dev *dev, int bdi_bits)
2286{ 2040{
2287 struct request_queue *q = bdev_get_queue(dev->bdev); 2041 int r;
2288 return bdi_congested(q->backing_dev_info, bdi_bits); 2042 struct policy_work *op;
2289} 2043 struct cache *cache = container_of(ws, struct cache, migration_worker);
2044 enum busy b;
2290 2045
2291static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2046 for (;;) {
2292{ 2047 b = spare_migration_bandwidth(cache);
2293 struct cache *cache = container_of(cb, struct cache, callbacks); 2048 if (b == BUSY)
2049 break;
2294 2050
2295 return is_congested(cache->origin_dev, bdi_bits) || 2051 r = policy_get_background_work(cache->policy, b == IDLE, &op);
2296 is_congested(cache->cache_dev, bdi_bits); 2052 if (r == -ENODATA)
2053 break;
2054
2055 if (r) {
2056 DMERR_LIMIT("%s: policy_get_background_work failed",
2057 cache_device_name(cache));
2058 break;
2059 }
2060
2061 r = mg_start(cache, op, NULL);
2062 if (r)
2063 break;
2064 }
2297} 2065}
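Other paths kick this loop through a one-line wrapper rather than calling it directly; a sketch of that wrapper (the in-tree helper may carry an extra guard, e.g. skipping the queueing in passthrough mode):

	static void wake_migration_worker(struct cache *cache)
	{
		/* Re-run the background-work loop above from the shared workqueue. */
		queue_work(cache->wq, &cache->migration_worker);
	}

Note that -ENODATA from policy_get_background_work() just means the policy has nothing queued, so the loop exits quietly; any other error is logged and also stops the loop.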
2298 2066
2299/*---------------------------------------------------------------- 2067/*----------------------------------------------------------------
@@ -2310,11 +2078,8 @@ static void destroy(struct cache *cache)
2310 2078
2311 mempool_destroy(cache->migration_pool); 2079 mempool_destroy(cache->migration_pool);
2312 2080
2313 if (cache->all_io_ds)
2314 dm_deferred_set_destroy(cache->all_io_ds);
2315
2316 if (cache->prison) 2081 if (cache->prison)
2317 dm_bio_prison_destroy(cache->prison); 2082 dm_bio_prison_destroy_v2(cache->prison);
2318 2083
2319 if (cache->wq) 2084 if (cache->wq)
2320 destroy_workqueue(cache->wq); 2085 destroy_workqueue(cache->wq);
@@ -2707,6 +2472,7 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2707 return PTR_ERR(p); 2472 return PTR_ERR(p);
2708 } 2473 }
2709 cache->policy = p; 2474 cache->policy = p;
2475 BUG_ON(!cache->policy);
2710 2476
2711 return 0; 2477 return 0;
2712} 2478}
@@ -2750,6 +2516,20 @@ static void set_cache_size(struct cache *cache, dm_cblock_t size)
2750 cache->cache_size = size; 2516 cache->cache_size = size;
2751} 2517}
2752 2518
2519static int is_congested(struct dm_dev *dev, int bdi_bits)
2520{
2521 struct request_queue *q = bdev_get_queue(dev->bdev);
2522 return bdi_congested(q->backing_dev_info, bdi_bits);
2523}
2524
2525static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2526{
2527 struct cache *cache = container_of(cb, struct cache, callbacks);
2528
2529 return is_congested(cache->origin_dev, bdi_bits) ||
2530 is_congested(cache->cache_dev, bdi_bits);
2531}
2532
2753#define DEFAULT_MIGRATION_THRESHOLD 2048 2533#define DEFAULT_MIGRATION_THRESHOLD 2048
2754 2534
2755static int cache_create(struct cache_args *ca, struct cache **result) 2535static int cache_create(struct cache_args *ca, struct cache **result)
@@ -2788,7 +2568,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2788 2568
2789 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2569 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2790 2570
2791 /* FIXME: factor out this whole section */
2792 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2571 origin_blocks = cache->origin_sectors = ca->origin_sectors;
2793 origin_blocks = block_div(origin_blocks, ca->block_size); 2572 origin_blocks = block_div(origin_blocks, ca->block_size);
2794 cache->origin_blocks = to_oblock(origin_blocks); 2573 cache->origin_blocks = to_oblock(origin_blocks);
@@ -2854,24 +2633,18 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2854 r = -EINVAL; 2633 r = -EINVAL;
2855 goto bad; 2634 goto bad;
2856 } 2635 }
2636
2637 policy_allow_migrations(cache->policy, false);
2857 } 2638 }
2858 2639
2859 spin_lock_init(&cache->lock); 2640 spin_lock_init(&cache->lock);
2860 INIT_LIST_HEAD(&cache->deferred_cells); 2641 INIT_LIST_HEAD(&cache->deferred_cells);
2861 bio_list_init(&cache->deferred_bios); 2642 bio_list_init(&cache->deferred_bios);
2862 bio_list_init(&cache->deferred_flush_bios);
2863 bio_list_init(&cache->deferred_writethrough_bios); 2643 bio_list_init(&cache->deferred_writethrough_bios);
2864 INIT_LIST_HEAD(&cache->quiesced_migrations);
2865 INIT_LIST_HEAD(&cache->completed_migrations);
2866 INIT_LIST_HEAD(&cache->need_commit_migrations);
2867 atomic_set(&cache->nr_allocated_migrations, 0); 2644 atomic_set(&cache->nr_allocated_migrations, 0);
2868 atomic_set(&cache->nr_io_migrations, 0); 2645 atomic_set(&cache->nr_io_migrations, 0);
2869 init_waitqueue_head(&cache->migration_wait); 2646 init_waitqueue_head(&cache->migration_wait);
2870 2647
2871 init_waitqueue_head(&cache->quiescing_wait);
2872 atomic_set(&cache->quiescing, 0);
2873 atomic_set(&cache->quiescing_ack, 0);
2874
2875 r = -ENOMEM; 2648 r = -ENOMEM;
2876 atomic_set(&cache->nr_dirty, 0); 2649 atomic_set(&cache->nr_dirty, 0);
2877 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2650 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
@@ -2900,27 +2673,23 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2900 goto bad; 2673 goto bad;
2901 } 2674 }
2902 2675
2903 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2676 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
2904 if (!cache->wq) { 2677 if (!cache->wq) {
2905 *error = "could not create workqueue for metadata object"; 2678 *error = "could not create workqueue for metadata object";
2906 goto bad; 2679 goto bad;
2907 } 2680 }
2908 INIT_WORK(&cache->worker, do_worker); 2681 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
2682 INIT_WORK(&cache->deferred_writethrough_worker,
2683 process_deferred_writethrough_bios);
2684 INIT_WORK(&cache->migration_worker, check_migrations);
2909 INIT_DELAYED_WORK(&cache->waker, do_waker); 2685 INIT_DELAYED_WORK(&cache->waker, do_waker);
2910 cache->last_commit_jiffies = jiffies;
2911 2686
2912 cache->prison = dm_bio_prison_create(); 2687 cache->prison = dm_bio_prison_create_v2(cache->wq);
2913 if (!cache->prison) { 2688 if (!cache->prison) {
2914 *error = "could not create bio prison"; 2689 *error = "could not create bio prison";
2915 goto bad; 2690 goto bad;
2916 } 2691 }
2917 2692
2918 cache->all_io_ds = dm_deferred_set_create();
2919 if (!cache->all_io_ds) {
2920 *error = "could not create all_io deferred set";
2921 goto bad;
2922 }
2923
2924 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2693 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2925 migration_cache); 2694 migration_cache);
2926 if (!cache->migration_pool) { 2695 if (!cache->migration_pool) {
@@ -2947,11 +2716,15 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2947 spin_lock_init(&cache->invalidation_lock); 2716 spin_lock_init(&cache->invalidation_lock);
2948 INIT_LIST_HEAD(&cache->invalidation_requests); 2717 INIT_LIST_HEAD(&cache->invalidation_requests);
2949 2718
2719 batcher_init(&cache->committer, commit_op, cache,
2720 issue_op, cache, cache->wq);
2950 iot_init(&cache->origin_tracker); 2721 iot_init(&cache->origin_tracker);
2951 2722
2723 init_rwsem(&cache->background_work_lock);
2724 prevent_background_work(cache);
2725
2952 *result = cache; 2726 *result = cache;
2953 return 0; 2727 return 0;
2954
2955bad: 2728bad:
2956 destroy(cache); 2729 destroy(cache);
2957 return r; 2730 return r;
@@ -3009,7 +2782,6 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
3009 } 2782 }
3010 2783
3011 ti->private = cache; 2784 ti->private = cache;
3012
3013out: 2785out:
3014 destroy_cache_args(ca); 2786 destroy_cache_args(ca);
3015 return r; 2787 return r;
@@ -3022,17 +2794,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
3022 struct cache *cache = ti->private; 2794 struct cache *cache = ti->private;
3023 2795
3024 int r; 2796 int r;
3025 struct dm_bio_prison_cell *cell = NULL; 2797 bool commit_needed;
3026 dm_oblock_t block = get_bio_block(cache, bio); 2798 dm_oblock_t block = get_bio_block(cache, bio);
3027 size_t pb_data_size = get_per_bio_data_size(cache); 2799 size_t pb_data_size = get_per_bio_data_size(cache);
3028 bool can_migrate = false;
3029 bool fast_promotion;
3030 struct policy_result lookup_result;
3031 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
3032 struct old_oblock_lock ool;
3033
3034 ool.locker.fn = null_locker;
3035 2800
2801 init_per_bio_data(bio, pb_data_size);
3036 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2802 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
3037 /* 2803 /*
3038 * This can only occur if the io goes to a partial block at 2804 * This can only occur if the io goes to a partial block at
@@ -3049,101 +2815,9 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
3049 return DM_MAPIO_SUBMITTED; 2815 return DM_MAPIO_SUBMITTED;
3050 } 2816 }
3051 2817
3052 /* 2818 r = map_bio(cache, bio, block, &commit_needed);
3053 * Check to see if that block is currently migrating. 2819 if (commit_needed)
3054 */ 2820 schedule_commit(&cache->committer);
3055 cell = alloc_prison_cell(cache);
3056 if (!cell) {
3057 defer_bio(cache, bio);
3058 return DM_MAPIO_SUBMITTED;
3059 }
3060
3061 r = bio_detain(cache, block, bio, cell,
3062 (cell_free_fn) free_prison_cell,
3063 cache, &cell);
3064 if (r) {
3065 if (r < 0)
3066 defer_bio(cache, bio);
3067
3068 return DM_MAPIO_SUBMITTED;
3069 }
3070
3071 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
3072
3073 r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
3074 bio, &ool.locker, &lookup_result);
3075 if (r == -EWOULDBLOCK) {
3076 cell_defer(cache, cell, true);
3077 return DM_MAPIO_SUBMITTED;
3078
3079 } else if (r) {
3080 DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
3081 cache_device_name(cache), r);
3082 cell_defer(cache, cell, false);
3083 bio_io_error(bio);
3084 return DM_MAPIO_SUBMITTED;
3085 }
3086
3087 r = DM_MAPIO_REMAPPED;
3088 switch (lookup_result.op) {
3089 case POLICY_HIT:
3090 if (passthrough_mode(&cache->features)) {
3091 if (bio_data_dir(bio) == WRITE) {
3092 /*
3093 * We need to invalidate this block, so
3094 * defer for the worker thread.
3095 */
3096 cell_defer(cache, cell, true);
3097 r = DM_MAPIO_SUBMITTED;
3098
3099 } else {
3100 inc_miss_counter(cache, bio);
3101 remap_to_origin_clear_discard(cache, bio, block);
3102 accounted_begin(cache, bio);
3103 inc_ds(cache, bio, cell);
3104 // FIXME: we want to remap hits or misses straight
3105 // away rather than passing over to the worker.
3106 cell_defer(cache, cell, false);
3107 }
3108
3109 } else {
3110 inc_hit_counter(cache, bio);
3111 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
3112 !is_dirty(cache, lookup_result.cblock)) {
3113 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
3114 accounted_begin(cache, bio);
3115 inc_ds(cache, bio, cell);
3116 cell_defer(cache, cell, false);
3117
3118 } else
3119 remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
3120 }
3121 break;
3122
3123 case POLICY_MISS:
3124 inc_miss_counter(cache, bio);
3125 if (pb->req_nr != 0) {
3126 /*
3127 * This is a duplicate writethrough io that is no
3128 * longer needed because the block has been demoted.
3129 */
3130 bio_endio(bio);
3131 // FIXME: remap everything as a miss
3132 cell_defer(cache, cell, false);
3133 r = DM_MAPIO_SUBMITTED;
3134
3135 } else
3136 remap_cell_to_origin_clear_discard(cache, cell, block, false);
3137 break;
3138
3139 default:
3140 DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
3141 cache_device_name(cache), __func__,
3142 (unsigned) lookup_result.op);
3143 cell_defer(cache, cell, false);
3144 bio_io_error(bio);
3145 r = DM_MAPIO_SUBMITTED;
3146 }
3147 2821
3148 return r; 2822 return r;
3149} 2823}
@@ -3163,7 +2837,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
3163 spin_unlock_irqrestore(&cache->lock, flags); 2837 spin_unlock_irqrestore(&cache->lock, flags);
3164 } 2838 }
3165 2839
3166 check_for_quiesced_migrations(cache, pb); 2840 bio_drop_shared_lock(cache, bio);
3167 accounted_complete(cache, bio); 2841 accounted_complete(cache, bio);
3168 2842
3169 return 0; 2843 return 0;
@@ -3263,12 +2937,18 @@ static void cache_postsuspend(struct dm_target *ti)
3263{ 2937{
3264 struct cache *cache = ti->private; 2938 struct cache *cache = ti->private;
3265 2939
3266 start_quiescing(cache); 2940 prevent_background_work(cache);
3267 wait_for_migrations(cache); 2941 BUG_ON(atomic_read(&cache->nr_io_migrations));
3268 stop_worker(cache); 2942
2943 cancel_delayed_work(&cache->waker);
2944 flush_workqueue(cache->wq);
2945 WARN_ON(cache->origin_tracker.in_flight);
2946
2947 /*
2948 * If it's a flush suspend there won't be any deferred bios, so this
2949 * call is harmless.
2950 */
3269 requeue_deferred_bios(cache); 2951 requeue_deferred_bios(cache);
3270 requeue_deferred_cells(cache);
3271 stop_quiescing(cache);
3272 2952
3273 if (get_cache_mode(cache) == CM_WRITE) 2953 if (get_cache_mode(cache) == CM_WRITE)
3274 (void) sync_metadata(cache); 2954 (void) sync_metadata(cache);
@@ -3280,15 +2960,10 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
3280 int r; 2960 int r;
3281 struct cache *cache = context; 2961 struct cache *cache = context;
3282 2962
3283 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 2963 r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
3284 if (r) 2964 if (r)
3285 return r; 2965 return r;
3286 2966
3287 if (dirty)
3288 set_dirty(cache, oblock, cblock);
3289 else
3290 clear_dirty(cache, oblock, cblock);
3291
3292 return 0; 2967 return 0;
3293} 2968}
3294 2969
@@ -3487,6 +3162,7 @@ static void cache_resume(struct dm_target *ti)
3487 struct cache *cache = ti->private; 3162 struct cache *cache = ti->private;
3488 3163
3489 cache->need_tick_bio = true; 3164 cache->need_tick_bio = true;
3165 allow_background_work(cache);
3490 do_waker(&cache->waker.work); 3166 do_waker(&cache->waker.work);
3491} 3167}
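The prevent/allow pair used by cache_postsuspend() and cache_resume() sits on top of the background_work_lock rwsem initialised in cache_create() above. A minimal sketch, assuming background work takes the semaphore shared (the in-tree helpers may also suppress lockdep around the write side):

	static void prevent_background_work(struct cache *cache)
	{
		/* Exclude new background work until allow_background_work(). */
		down_write(&cache->background_work_lock);
	}

	static void allow_background_work(struct cache *cache)
	{
		up_write(&cache->background_work_lock);
	}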
3492 3168
@@ -3621,10 +3297,19 @@ err:
3621} 3297}
3622 3298
3623/* 3299/*
3300 * Defines a range of cblocks: begin to (end - 1) are in the range; end is
3301 * the one-past-the-end value.
3302 */
3303struct cblock_range {
3304 dm_cblock_t begin;
3305 dm_cblock_t end;
3306};
3307
3308/*
3624 * A cache block range can take two forms: 3309 * A cache block range can take two forms:
3625 * 3310 *
3626 * i) A single cblock, eg. '3456' 3311 * i) A single cblock, eg. '3456'
3627 * ii) A begin and end cblock with dots between, eg. 123-234 3312 * ii) A begin and end cblock with a dash between, eg. 123-234
3628 */ 3313 */
3629static int parse_cblock_range(struct cache *cache, const char *str, 3314static int parse_cblock_range(struct cache *cache, const char *str,
3630 struct cblock_range *result) 3315 struct cblock_range *result)
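The body of parse_cblock_range() isn't shown in this hunk. For illustration only, the two accepted forms can be distinguished with sscanf(); a standalone sketch (names and return conventions here are illustrative, not the in-tree code):

	#include <stdio.h>

	/* Illustrative parser: accept "3456" or "123-234" as a cblock range. */
	static int parse_range(const char *str, unsigned long long *begin,
			       unsigned long long *end)
	{
		char dummy;
		unsigned long long b, e;

		if (sscanf(str, "%llu-%llu%c", &b, &e, &dummy) == 2) {
			*begin = b;
			*end = e;		/* end is treated as one-past-the-end */
			return 0;
		}

		if (sscanf(str, "%llu%c", &b, &dummy) == 1) {
			*begin = b;
			*end = b + 1;		/* single block: half-open [b, b + 1) */
			return 0;
		}

		return -1;			/* neither form matched */
	}

	int main(void)
	{
		unsigned long long b, e;

		if (!parse_range("123-234", &b, &e))
			printf("begin=%llu end=%llu\n", b, e);
		return 0;
	}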
@@ -3690,23 +3375,31 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range
3690 return 0; 3375 return 0;
3691} 3376}
3692 3377
3378static inline dm_cblock_t cblock_succ(dm_cblock_t b)
3379{
3380 return to_cblock(from_cblock(b) + 1);
3381}
3382
3693static int request_invalidation(struct cache *cache, struct cblock_range *range) 3383static int request_invalidation(struct cache *cache, struct cblock_range *range)
3694{ 3384{
3695 struct invalidation_request req; 3385 int r = 0;
3696 3386
3697 INIT_LIST_HEAD(&req.list); 3387 /*
3698 req.cblocks = range; 3388 * We don't need to do any locking here because we know we're in
3699 atomic_set(&req.complete, 0); 3389 * passthrough mode. There is potential for a race between an
3700 req.err = 0; 3390 * invalidation triggered by an io and an invalidation message. This
3701 init_waitqueue_head(&req.result_wait); 3391 * is harmless; we needn't worry if the policy call fails.
3392 */
3393 while (range->begin != range->end) {
3394 r = invalidate_cblock(cache, range->begin);
3395 if (r)
3396 return r;
3702 3397
3703 spin_lock(&cache->invalidation_lock); 3398 range->begin = cblock_succ(range->begin);
3704 list_add(&req.list, &cache->invalidation_requests); 3399 }
3705 spin_unlock(&cache->invalidation_lock);
3706 wake_worker(cache);
3707 3400
3708 wait_event(req.result_wait, atomic_read(&req.complete)); 3401 cache->commit_requested = true;
3709 return req.err; 3402 return r;
3710} 3403}
3711 3404
3712static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3405static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
@@ -3816,7 +3509,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3816 3509
3817static struct target_type cache_target = { 3510static struct target_type cache_target = {
3818 .name = "cache", 3511 .name = "cache",
3819 .version = {1, 10, 0}, 3512 .version = {2, 0, 0},
3820 .module = THIS_MODULE, 3513 .module = THIS_MODULE,
3821 .ctr = cache_ctr, 3514 .ctr = cache_ctr,
3822 .dtr = cache_dtr, 3515 .dtr = cache_dtr,