path: root/drivers/md/dm-cache-target.c
author     Joe Thornber <ejt@redhat.com>        2016-12-15 04:57:31 -0500
committer  Mike Snitzer <snitzer@redhat.com>    2017-03-07 13:28:31 -0500
commit     b29d4986d0da1a27cd35917cdb433672f5c95d7f
tree       a5d94b86cf1eb759bfef5761015135d747e80561 /drivers/md/dm-cache-target.c
parent     742c8fdc31e820503f9267070311d894978d1349
dm cache: significant rework to leverage dm-bio-prison-v2
The cache policy interfaces have been updated to work well with the new
bio-prison v2 interface's ability to queue work immediately (promotion,
demotion, etc) -- the overriding benefit being reduced latency when
processing IO through the cache. Previously such work would be left for
the DM cache core to queue on various lists and then process in batches
later -- this added significant latency to IO driven by the cache.

The background tracker code was factored out so that all cache policies
can make use of it.

Also, the "cleaner" policy has been removed and is now a variant of the
smq policy that simply disallows migrations.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
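At the heart of the rework is a "continuation" (a deferred piece of work that is handed the result of a shared operation, typically a metadata commit) and a "batcher" that coalesces many such requests behind a single commit. Below is a minimal userspace sketch of that idea, assuming nothing beyond standard C; the names mirror the structures added in the diff below, but the code is illustrative only, not the kernel implementation.

/*
 * Userspace model of the continuation/batcher pattern this commit adds.
 * Not kernel code: the real continuations are work_structs queued on the
 * cache workqueue, and the commit is the dm-cache metadata commit.
 */
#include <stdio.h>

struct continuation {
	void (*fn)(struct continuation *k);
	int input;                       /* result handed over by the commit */
	struct continuation *next;
};

struct batcher {
	int (*commit_op)(void *context);
	void *commit_context;
	struct continuation *work_items; /* waiting on the next commit */
};

/* Queue a continuation behind the next commit. */
static void continue_after_commit(struct batcher *b, struct continuation *k)
{
	k->next = b->work_items;
	b->work_items = k;
}

/* Run the commit once, then hand its result to every queued continuation. */
static void commit_and_run(struct batcher *b)
{
	struct continuation *k = b->work_items;
	int r = b->commit_op(b->commit_context);

	b->work_items = NULL;
	while (k) {
		struct continuation *next = k->next;

		k->input = r;
		k->fn(k);
		k = next;
	}
}

static int fake_commit(void *context)
{
	(void) context;
	return 0;                        /* pretend the commit succeeded */
}

static void done(struct continuation *k)
{
	printf("continuation ran, commit result = %d\n", k->input);
}

int main(void)
{
	struct batcher b = { .commit_op = fake_commit };
	struct continuation k = { .fn = done };

	continue_after_commit(&b, &k);
	commit_and_run(&b);
	return 0;
}

The sketch only models the hand-off of the commit result via 'input'; in the target proper that result is what lets each migration decide whether to complete successfully or error its bios.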
Diffstat (limited to 'drivers/md/dm-cache-target.c')
-rw-r--r--  drivers/md/dm-cache-target.c  2469
1 file changed, 1081 insertions, 1388 deletions
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 2eaa414e1509..b7de289a10bb 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -5,7 +5,7 @@
5 */ 5 */
6 6
7#include "dm.h" 7#include "dm.h"
8#include "dm-bio-prison-v1.h" 8#include "dm-bio-prison-v2.h"
9#include "dm-bio-record.h" 9#include "dm-bio-record.h"
10#include "dm-cache-metadata.h" 10#include "dm-cache-metadata.h"
11 11
@@ -15,6 +15,7 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/mempool.h> 16#include <linux/mempool.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rwsem.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
19#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
20 21
@@ -25,7 +26,18 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
25 26
26/*----------------------------------------------------------------*/ 27/*----------------------------------------------------------------*/
27 28
28#define IOT_RESOLUTION 4 29/*
30 * Glossary:
31 *
32 * oblock: index of an origin block
33 * cblock: index of a cache block
34 * promotion: movement of a block from origin to cache
35 * demotion: movement of a block from cache to origin
36 * migration: movement of a block between the origin and cache device,
37 * either direction
38 */
39
40/*----------------------------------------------------------------*/
29 41
30struct io_tracker { 42struct io_tracker {
31 spinlock_t lock; 43 spinlock_t lock;
@@ -99,19 +111,178 @@ static void iot_io_end(struct io_tracker *iot, sector_t len)
99/*----------------------------------------------------------------*/ 111/*----------------------------------------------------------------*/
100 112
101/* 113/*
102 * Glossary: 114 * Represents a chunk of future work. 'input' allows continuations to pass
103 * 115 * values between themselves, typically error values.
104 * oblock: index of an origin block
105 * cblock: index of a cache block
106 * promotion: movement of a block from origin to cache
107 * demotion: movement of a block from cache to origin
108 * migration: movement of a block between the origin and cache device,
109 * either direction
110 */ 116 */
117struct continuation {
118 struct work_struct ws;
119 int input;
120};
121
122static inline void init_continuation(struct continuation *k,
123 void (*fn)(struct work_struct *))
124{
125 INIT_WORK(&k->ws, fn);
126 k->input = 0;
127}
128
129static inline void queue_continuation(struct workqueue_struct *wq,
130 struct continuation *k)
131{
132 queue_work(wq, &k->ws);
133}
111 134
112/*----------------------------------------------------------------*/ 135/*----------------------------------------------------------------*/
113 136
114/* 137/*
138 * The batcher collects together pieces of work that need a particular
139 * operation to occur before they can proceed (typically a commit).
140 */
141struct batcher {
142 /*
143 * The operation that everyone is waiting for.
144 */
145 int (*commit_op)(void *context);
146 void *commit_context;
147
148 /*
149 * This is how bios should be issued once the commit op is complete
150 * (accounted_request).
151 */
152 void (*issue_op)(struct bio *bio, void *context);
153 void *issue_context;
154
155 /*
156 * Queued work gets put on here after commit.
157 */
158 struct workqueue_struct *wq;
159
160 spinlock_t lock;
161 struct list_head work_items;
162 struct bio_list bios;
163 struct work_struct commit_work;
164
165 bool commit_scheduled;
166};
167
168static void __commit(struct work_struct *_ws)
169{
170 struct batcher *b = container_of(_ws, struct batcher, commit_work);
171
172 int r;
173 unsigned long flags;
174 struct list_head work_items;
175 struct work_struct *ws, *tmp;
176 struct continuation *k;
177 struct bio *bio;
178 struct bio_list bios;
179
180 INIT_LIST_HEAD(&work_items);
181 bio_list_init(&bios);
182
183 /*
184 * We have to grab these before the commit_op to avoid a race
185 * condition.
186 */
187 spin_lock_irqsave(&b->lock, flags);
188 list_splice_init(&b->work_items, &work_items);
189 bio_list_merge(&bios, &b->bios);
190 bio_list_init(&b->bios);
191 b->commit_scheduled = false;
192 spin_unlock_irqrestore(&b->lock, flags);
193
194 r = b->commit_op(b->commit_context);
195
196 list_for_each_entry_safe(ws, tmp, &work_items, entry) {
197 k = container_of(ws, struct continuation, ws);
198 k->input = r;
199 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
200 queue_work(b->wq, ws);
201 }
202
203 while ((bio = bio_list_pop(&bios))) {
204 if (r) {
205 bio->bi_error = r;
206 bio_endio(bio);
207 } else
208 b->issue_op(bio, b->issue_context);
209 }
210}
211
212static void batcher_init(struct batcher *b,
213 int (*commit_op)(void *),
214 void *commit_context,
215 void (*issue_op)(struct bio *bio, void *),
216 void *issue_context,
217 struct workqueue_struct *wq)
218{
219 b->commit_op = commit_op;
220 b->commit_context = commit_context;
221 b->issue_op = issue_op;
222 b->issue_context = issue_context;
223 b->wq = wq;
224
225 spin_lock_init(&b->lock);
226 INIT_LIST_HEAD(&b->work_items);
227 bio_list_init(&b->bios);
228 INIT_WORK(&b->commit_work, __commit);
229 b->commit_scheduled = false;
230}
231
232static void async_commit(struct batcher *b)
233{
234 queue_work(b->wq, &b->commit_work);
235}
236
237static void continue_after_commit(struct batcher *b, struct continuation *k)
238{
239 unsigned long flags;
240 bool commit_scheduled;
241
242 spin_lock_irqsave(&b->lock, flags);
243 commit_scheduled = b->commit_scheduled;
244 list_add_tail(&k->ws.entry, &b->work_items);
245 spin_unlock_irqrestore(&b->lock, flags);
246
247 if (commit_scheduled)
248 async_commit(b);
249}
250
251/*
252 * Bios are errored if commit failed.
253 */
254static void issue_after_commit(struct batcher *b, struct bio *bio)
255{
256 unsigned long flags;
257 bool commit_scheduled;
258
259 spin_lock_irqsave(&b->lock, flags);
260 commit_scheduled = b->commit_scheduled;
261 bio_list_add(&b->bios, bio);
262 spin_unlock_irqrestore(&b->lock, flags);
263
264 if (commit_scheduled)
265 async_commit(b);
266}
267
268/*
269 * Call this if some urgent work is waiting for the commit to complete.
270 */
271static void schedule_commit(struct batcher *b)
272{
273 bool immediate;
274 unsigned long flags;
275
276 spin_lock_irqsave(&b->lock, flags);
277 immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
278 b->commit_scheduled = true;
279 spin_unlock_irqrestore(&b->lock, flags);
280
281 if (immediate)
282 async_commit(b);
283}
284
285/*
115 * There are a couple of places where we let a bio run, but want to do some 286 * There are a couple of places where we let a bio run, but want to do some
116 * work before calling its endio function. We do this by temporarily 287 * work before calling its endio function. We do this by temporarily
117 * changing the endio fn. 288 * changing the endio fn.
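A short sketch of how the batcher introduced in the hunk above is meant to be used; the helper name below is hypothetical (not part of this hunk), and 'committer' is the struct batcher embedded in struct cache later in this patch.

/*
 * Illustrative sketch only: a caller that must not complete a flush bio
 * before the metadata commit it depends on.
 */
static void example_issue_flush(struct cache *cache, struct bio *bio)
{
	/* queue the bio; issue_op() submits it once commit_op() succeeds */
	issue_after_commit(&cache->committer, bio);

	/* urgent work is now waiting, so kick the commit promptly */
	schedule_commit(&cache->committer);
}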
@@ -189,31 +360,13 @@ struct cache_stats {
189 atomic_t write_miss; 360 atomic_t write_miss;
190 atomic_t demotion; 361 atomic_t demotion;
191 atomic_t promotion; 362 atomic_t promotion;
363 atomic_t writeback;
192 atomic_t copies_avoided; 364 atomic_t copies_avoided;
193 atomic_t cache_cell_clash; 365 atomic_t cache_cell_clash;
194 atomic_t commit_count; 366 atomic_t commit_count;
195 atomic_t discard_count; 367 atomic_t discard_count;
196}; 368};
197 369
198/*
199 * Defines a range of cblocks, begin to (end - 1) are in the range. end is
200 * the one-past-the-end value.
201 */
202struct cblock_range {
203 dm_cblock_t begin;
204 dm_cblock_t end;
205};
206
207struct invalidation_request {
208 struct list_head list;
209 struct cblock_range *cblocks;
210
211 atomic_t complete;
212 int err;
213
214 wait_queue_head_t result_wait;
215};
216
217struct cache { 370struct cache {
218 struct dm_target *ti; 371 struct dm_target *ti;
219 struct dm_target_callbacks callbacks; 372 struct dm_target_callbacks callbacks;
@@ -255,11 +408,7 @@ struct cache {
255 spinlock_t lock; 408 spinlock_t lock;
256 struct list_head deferred_cells; 409 struct list_head deferred_cells;
257 struct bio_list deferred_bios; 410 struct bio_list deferred_bios;
258 struct bio_list deferred_flush_bios;
259 struct bio_list deferred_writethrough_bios; 411 struct bio_list deferred_writethrough_bios;
260 struct list_head quiesced_migrations;
261 struct list_head completed_migrations;
262 struct list_head need_commit_migrations;
263 sector_t migration_threshold; 412 sector_t migration_threshold;
264 wait_queue_head_t migration_wait; 413 wait_queue_head_t migration_wait;
265 atomic_t nr_allocated_migrations; 414 atomic_t nr_allocated_migrations;
@@ -270,9 +419,7 @@ struct cache {
270 */ 419 */
271 atomic_t nr_io_migrations; 420 atomic_t nr_io_migrations;
272 421
273 wait_queue_head_t quiescing_wait; 422 struct rw_semaphore quiesce_lock;
274 atomic_t quiescing;
275 atomic_t quiescing_ack;
276 423
277 /* 424 /*
278 * cache_size entries, dirty if set 425 * cache_size entries, dirty if set
@@ -296,13 +443,11 @@ struct cache {
296 443
297 struct dm_kcopyd_client *copier; 444 struct dm_kcopyd_client *copier;
298 struct workqueue_struct *wq; 445 struct workqueue_struct *wq;
299 struct work_struct worker; 446 struct work_struct deferred_bio_worker;
300 447 struct work_struct deferred_writethrough_worker;
448 struct work_struct migration_worker;
301 struct delayed_work waker; 449 struct delayed_work waker;
302 unsigned long last_commit_jiffies; 450 struct dm_bio_prison_v2 *prison;
303
304 struct dm_bio_prison *prison;
305 struct dm_deferred_set *all_io_ds;
306 451
307 mempool_t *migration_pool; 452 mempool_t *migration_pool;
308 453
@@ -330,12 +475,17 @@ struct cache {
330 struct list_head invalidation_requests; 475 struct list_head invalidation_requests;
331 476
332 struct io_tracker origin_tracker; 477 struct io_tracker origin_tracker;
478
479 struct work_struct commit_ws;
480 struct batcher committer;
481
482 struct rw_semaphore background_work_lock;
333}; 483};
334 484
335struct per_bio_data { 485struct per_bio_data {
336 bool tick:1; 486 bool tick:1;
337 unsigned req_nr:2; 487 unsigned req_nr:2;
338 struct dm_deferred_entry *all_io_entry; 488 struct dm_bio_prison_cell_v2 *cell;
339 struct dm_hook_info hook_info; 489 struct dm_hook_info hook_info;
340 sector_t len; 490 sector_t len;
341 491
@@ -350,55 +500,64 @@ struct per_bio_data {
350}; 500};
351 501
352struct dm_cache_migration { 502struct dm_cache_migration {
353 struct list_head list; 503 struct continuation k;
354 struct cache *cache; 504 struct cache *cache;
355 505
356 unsigned long start_jiffies; 506 struct policy_work *op;
357 dm_oblock_t old_oblock; 507 struct bio *overwrite_bio;
358 dm_oblock_t new_oblock; 508 struct dm_bio_prison_cell_v2 *cell;
359 dm_cblock_t cblock;
360
361 bool err:1;
362 bool discard:1;
363 bool writeback:1;
364 bool demote:1;
365 bool promote:1;
366 bool requeue_holder:1;
367 bool invalidate:1;
368 509
369 struct dm_bio_prison_cell *old_ocell; 510 dm_cblock_t invalidate_cblock;
370 struct dm_bio_prison_cell *new_ocell; 511 dm_oblock_t invalidate_oblock;
371}; 512};
372 513
373/* 514/*----------------------------------------------------------------*/
374 * Processing a bio in the worker thread may require these memory 515
375 * allocations. We prealloc to avoid deadlocks (the same worker thread 516static bool writethrough_mode(struct cache_features *f)
376 * frees them back to the mempool). 517{
377 */ 518 return f->io_mode == CM_IO_WRITETHROUGH;
378struct prealloc { 519}
379 struct dm_cache_migration *mg;
380 struct dm_bio_prison_cell *cell1;
381 struct dm_bio_prison_cell *cell2;
382};
383 520
384static enum cache_metadata_mode get_cache_mode(struct cache *cache); 521static bool writeback_mode(struct cache_features *f)
522{
523 return f->io_mode == CM_IO_WRITEBACK;
524}
385 525
386static void wake_worker(struct cache *cache) 526static inline bool passthrough_mode(struct cache_features *f)
387{ 527{
388 queue_work(cache->wq, &cache->worker); 528 return unlikely(f->io_mode == CM_IO_PASSTHROUGH);
389} 529}
390 530
391/*----------------------------------------------------------------*/ 531/*----------------------------------------------------------------*/
392 532
393static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) 533static void wake_deferred_bio_worker(struct cache *cache)
394{ 534{
395 /* FIXME: change to use a local slab. */ 535 queue_work(cache->wq, &cache->deferred_bio_worker);
396 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
397} 536}
398 537
399static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) 538static void wake_deferred_writethrough_worker(struct cache *cache)
400{ 539{
401 dm_bio_prison_free_cell(cache->prison, cell); 540 queue_work(cache->wq, &cache->deferred_writethrough_worker);
541}
542
543static void wake_migration_worker(struct cache *cache)
544{
545 if (passthrough_mode(&cache->features))
546 return;
547
548 queue_work(cache->wq, &cache->migration_worker);
549}
550
551/*----------------------------------------------------------------*/
552
553static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
554{
555 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
556}
557
558static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
559{
560 dm_bio_prison_free_cell_v2(cache->prison, cell);
402} 561}
403 562
404static struct dm_cache_migration *alloc_migration(struct cache *cache) 563static struct dm_cache_migration *alloc_migration(struct cache *cache)
@@ -424,146 +583,127 @@ static void free_migration(struct dm_cache_migration *mg)
424 mempool_free(mg, cache->migration_pool); 583 mempool_free(mg, cache->migration_pool);
425} 584}
426 585
427static int prealloc_data_structs(struct cache *cache, struct prealloc *p) 586/*----------------------------------------------------------------*/
428{
429 if (!p->mg) {
430 p->mg = alloc_migration(cache);
431 if (!p->mg)
432 return -ENOMEM;
433 }
434
435 if (!p->cell1) {
436 p->cell1 = alloc_prison_cell(cache);
437 if (!p->cell1)
438 return -ENOMEM;
439 }
440
441 if (!p->cell2) {
442 p->cell2 = alloc_prison_cell(cache);
443 if (!p->cell2)
444 return -ENOMEM;
445 }
446
447 return 0;
448}
449 587
450static void prealloc_free_structs(struct cache *cache, struct prealloc *p) 588static inline dm_oblock_t oblock_succ(dm_oblock_t b)
451{ 589{
452 if (p->cell2) 590 return to_oblock(from_oblock(b) + 1ull);
453 free_prison_cell(cache, p->cell2);
454
455 if (p->cell1)
456 free_prison_cell(cache, p->cell1);
457
458 if (p->mg)
459 free_migration(p->mg);
460} 591}
461 592
462static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) 593static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
463{ 594{
464 struct dm_cache_migration *mg = p->mg; 595 key->virtual = 0;
465 596 key->dev = 0;
466 BUG_ON(!mg); 597 key->block_begin = from_oblock(begin);
467 p->mg = NULL; 598 key->block_end = from_oblock(end);
468
469 return mg;
470} 599}
471 600
472/* 601/*
473 * You must have a cell within the prealloc struct to return. If not this 602 * We have two lock levels. Level 0, which is used to prevent WRITEs, and
474 * function will BUG() rather than returning NULL. 603 * level 1 which prevents *both* READs and WRITEs.
475 */ 604 */
476static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) 605#define WRITE_LOCK_LEVEL 0
606#define READ_WRITE_LOCK_LEVEL 1
607
608static unsigned lock_level(struct bio *bio)
477{ 609{
478 struct dm_bio_prison_cell *r = NULL; 610 return bio_data_dir(bio) == WRITE ?
611 WRITE_LOCK_LEVEL :
612 READ_WRITE_LOCK_LEVEL;
613}
479 614
480 if (p->cell1) { 615/*----------------------------------------------------------------
481 r = p->cell1; 616 * Per bio data
482 p->cell1 = NULL; 617 *--------------------------------------------------------------*/
483 618
484 } else if (p->cell2) { 619/*
485 r = p->cell2; 620 * If using writeback, leave out struct per_bio_data's writethrough fields.
486 p->cell2 = NULL; 621 */
487 } else 622#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
488 BUG(); 623#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
489 624
490 return r; 625static size_t get_per_bio_data_size(struct cache *cache)
626{
627 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
491} 628}
492 629
493/* 630static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
494 * You can't have more than two cells in a prealloc struct. BUG() will be
495 * called if you try and overfill.
496 */
497static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
498{ 631{
499 if (!p->cell2) 632 struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
500 p->cell2 = cell; 633 BUG_ON(!pb);
634 return pb;
635}
501 636
502 else if (!p->cell1) 637static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
503 p->cell1 = cell; 638{
639 struct per_bio_data *pb = get_per_bio_data(bio, data_size);
504 640
505 else 641 pb->tick = false;
506 BUG(); 642 pb->req_nr = dm_bio_get_target_bio_nr(bio);
643 pb->cell = NULL;
644 pb->len = 0;
645
646 return pb;
507} 647}
508 648
509/*----------------------------------------------------------------*/ 649/*----------------------------------------------------------------*/
510 650
511static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) 651static void defer_bio(struct cache *cache, struct bio *bio)
512{ 652{
513 key->virtual = 0; 653 unsigned long flags;
514 key->dev = 0;
515 key->block_begin = from_oblock(begin);
516 key->block_end = from_oblock(end);
517}
518 654
519/* 655 spin_lock_irqsave(&cache->lock, flags);
520 * The caller hands in a preallocated cell, and a free function for it. 656 bio_list_add(&cache->deferred_bios, bio);
521 * The cell will be freed if there's an error, or if it wasn't used because 657 spin_unlock_irqrestore(&cache->lock, flags);
522 * a cell with that key already exists.
523 */
524typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
525 658
526static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, 659 wake_deferred_bio_worker(cache);
527 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 660}
528 cell_free_fn free_fn, void *free_context, 661
529 struct dm_bio_prison_cell **cell_result) 662static void defer_bios(struct cache *cache, struct bio_list *bios)
530{ 663{
531 int r; 664 unsigned long flags;
532 struct dm_cell_key key;
533 665
534 build_key(oblock_begin, oblock_end, &key); 666 spin_lock_irqsave(&cache->lock, flags);
535 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 667 bio_list_merge(&cache->deferred_bios, bios);
536 if (r) 668 bio_list_init(bios);
537 free_fn(free_context, cell_prealloc); 669 spin_unlock_irqrestore(&cache->lock, flags);
538 670
539 return r; 671 wake_deferred_bio_worker(cache);
540} 672}
541 673
542static int bio_detain(struct cache *cache, dm_oblock_t oblock, 674/*----------------------------------------------------------------*/
543 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 675
544 cell_free_fn free_fn, void *free_context, 676static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
545 struct dm_bio_prison_cell **cell_result)
546{ 677{
678 bool r;
679 size_t pb_size;
680 struct per_bio_data *pb;
681 struct dm_cell_key_v2 key;
547 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 682 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
548 return bio_detain_range(cache, oblock, end, bio, 683 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
549 cell_prealloc, free_fn, free_context, cell_result);
550}
551 684
552static int get_cell(struct cache *cache, 685 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
553 dm_oblock_t oblock, 686 if (!cell_prealloc) {
554 struct prealloc *structs, 687 defer_bio(cache, bio);
555 struct dm_bio_prison_cell **cell_result) 688 return false;
556{ 689 }
557 int r;
558 struct dm_cell_key key;
559 struct dm_bio_prison_cell *cell_prealloc;
560 690
561 cell_prealloc = prealloc_get_cell(structs); 691 build_key(oblock, end, &key);
692 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
693 if (!r) {
694 /*
695 * Failed to get the lock.
696 */
697 free_prison_cell(cache, cell_prealloc);
698 return r;
699 }
562 700
563 build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); 701 if (cell != cell_prealloc)
564 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 702 free_prison_cell(cache, cell_prealloc);
565 if (r) 703
566 prealloc_put_cell(structs, cell_prealloc); 704 pb_size = get_per_bio_data_size(cache);
705 pb = get_per_bio_data(bio, pb_size);
706 pb->cell = cell;
567 707
568 return r; 708 return r;
569} 709}
@@ -575,21 +715,33 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b)
575 return test_bit(from_cblock(b), cache->dirty_bitset); 715 return test_bit(from_cblock(b), cache->dirty_bitset);
576} 716}
577 717
578static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 718static void set_dirty(struct cache *cache, dm_cblock_t cblock)
579{ 719{
580 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 720 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
581 atomic_inc(&cache->nr_dirty); 721 atomic_inc(&cache->nr_dirty);
582 policy_set_dirty(cache->policy, oblock); 722 policy_set_dirty(cache->policy, cblock);
583 } 723 }
584} 724}
585 725
586static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 726/*
727 * These two are called when setting after migrations to force the policy
728 * and dirty bitset to be in sync.
729 */
730static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
731{
732 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
733 atomic_inc(&cache->nr_dirty);
734 policy_set_dirty(cache->policy, cblock);
735}
736
737static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
587{ 738{
588 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 739 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
589 policy_clear_dirty(cache->policy, oblock);
590 if (atomic_dec_return(&cache->nr_dirty) == 0) 740 if (atomic_dec_return(&cache->nr_dirty) == 0)
591 dm_table_event(cache->ti->table); 741 dm_table_event(cache->ti->table);
592 } 742 }
743
744 policy_clear_dirty(cache->policy, cblock);
593} 745}
594 746
595/*----------------------------------------------------------------*/ 747/*----------------------------------------------------------------*/
@@ -628,11 +780,6 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
628 oblocks_per_dblock(cache))); 780 oblocks_per_dblock(cache)));
629} 781}
630 782
631static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
632{
633 return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
634}
635
636static void set_discard(struct cache *cache, dm_dblock_t b) 783static void set_discard(struct cache *cache, dm_dblock_t b)
637{ 784{
638 unsigned long flags; 785 unsigned long flags;
@@ -679,83 +826,6 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
679 return r; 826 return r;
680} 827}
681 828
682/*----------------------------------------------------------------*/
683
684static void load_stats(struct cache *cache)
685{
686 struct dm_cache_statistics stats;
687
688 dm_cache_metadata_get_stats(cache->cmd, &stats);
689 atomic_set(&cache->stats.read_hit, stats.read_hits);
690 atomic_set(&cache->stats.read_miss, stats.read_misses);
691 atomic_set(&cache->stats.write_hit, stats.write_hits);
692 atomic_set(&cache->stats.write_miss, stats.write_misses);
693}
694
695static void save_stats(struct cache *cache)
696{
697 struct dm_cache_statistics stats;
698
699 if (get_cache_mode(cache) >= CM_READ_ONLY)
700 return;
701
702 stats.read_hits = atomic_read(&cache->stats.read_hit);
703 stats.read_misses = atomic_read(&cache->stats.read_miss);
704 stats.write_hits = atomic_read(&cache->stats.write_hit);
705 stats.write_misses = atomic_read(&cache->stats.write_miss);
706
707 dm_cache_metadata_set_stats(cache->cmd, &stats);
708}
709
710/*----------------------------------------------------------------
711 * Per bio data
712 *--------------------------------------------------------------*/
713
714/*
715 * If using writeback, leave out struct per_bio_data's writethrough fields.
716 */
717#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
718#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
719
720static bool writethrough_mode(struct cache_features *f)
721{
722 return f->io_mode == CM_IO_WRITETHROUGH;
723}
724
725static bool writeback_mode(struct cache_features *f)
726{
727 return f->io_mode == CM_IO_WRITEBACK;
728}
729
730static bool passthrough_mode(struct cache_features *f)
731{
732 return f->io_mode == CM_IO_PASSTHROUGH;
733}
734
735static size_t get_per_bio_data_size(struct cache *cache)
736{
737 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
738}
739
740static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
741{
742 struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
743 BUG_ON(!pb);
744 return pb;
745}
746
747static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
748{
749 struct per_bio_data *pb = get_per_bio_data(bio, data_size);
750
751 pb->tick = false;
752 pb->req_nr = dm_bio_get_target_bio_nr(bio);
753 pb->all_io_entry = NULL;
754 pb->len = 0;
755
756 return pb;
757}
758
759/*---------------------------------------------------------------- 829/*----------------------------------------------------------------
760 * Remapping 830 * Remapping
761 *--------------------------------------------------------------*/ 831 *--------------------------------------------------------------*/
@@ -797,8 +867,9 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
797} 867}
798 868
799static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 869static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
800 dm_oblock_t oblock) 870 dm_oblock_t oblock)
801{ 871{
872 // FIXME: this is called way too much.
802 check_if_tick_bio_needed(cache, bio); 873 check_if_tick_bio_needed(cache, bio);
803 remap_to_origin(cache, bio); 874 remap_to_origin(cache, bio);
804 if (bio_data_dir(bio) == WRITE) 875 if (bio_data_dir(bio) == WRITE)
@@ -811,7 +882,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
811 check_if_tick_bio_needed(cache, bio); 882 check_if_tick_bio_needed(cache, bio);
812 remap_to_cache(cache, bio, cblock); 883 remap_to_cache(cache, bio, cblock);
813 if (bio_data_dir(bio) == WRITE) { 884 if (bio_data_dir(bio) == WRITE) {
814 set_dirty(cache, oblock, cblock); 885 set_dirty(cache, cblock);
815 clear_discard(cache, oblock_to_dblock(cache, oblock)); 886 clear_discard(cache, oblock_to_dblock(cache, oblock));
816 } 887 }
817} 888}
@@ -828,22 +899,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
828 return to_oblock(block_nr); 899 return to_oblock(block_nr);
829} 900}
830 901
831/*
832 * You must increment the deferred set whilst the prison cell is held. To
833 * encourage this, we ask for 'cell' to be passed in.
834 */
835static void inc_ds(struct cache *cache, struct bio *bio,
836 struct dm_bio_prison_cell *cell)
837{
838 size_t pb_data_size = get_per_bio_data_size(cache);
839 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
840
841 BUG_ON(!cell);
842 BUG_ON(pb->all_io_entry);
843
844 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
845}
846
847static bool accountable_bio(struct cache *cache, struct bio *bio) 902static bool accountable_bio(struct cache *cache, struct bio *bio)
848{ 903{
849 return ((bio->bi_bdev == cache->origin_dev->bdev) && 904 return ((bio->bi_bdev == cache->origin_dev->bdev) &&
@@ -875,29 +930,10 @@ static void accounted_request(struct cache *cache, struct bio *bio)
875 generic_make_request(bio); 930 generic_make_request(bio);
876} 931}
877 932
878static void issue(struct cache *cache, struct bio *bio) 933static void issue_op(struct bio *bio, void *context)
879{ 934{
880 unsigned long flags; 935 struct cache *cache = context;
881 936 accounted_request(cache, bio);
882 if (!op_is_flush(bio->bi_opf)) {
883 accounted_request(cache, bio);
884 return;
885 }
886
887 /*
888 * Batch together any bios that trigger commits and then issue a
889 * single commit for them in do_worker().
890 */
891 spin_lock_irqsave(&cache->lock, flags);
892 cache->commit_requested = true;
893 bio_list_add(&cache->deferred_flush_bios, bio);
894 spin_unlock_irqrestore(&cache->lock, flags);
895}
896
897static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
898{
899 inc_ds(cache, bio, cell);
900 issue(cache, bio);
901} 937}
902 938
903static void defer_writethrough_bio(struct cache *cache, struct bio *bio) 939static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
@@ -908,7 +944,7 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
908 bio_list_add(&cache->deferred_writethrough_bios, bio); 944 bio_list_add(&cache->deferred_writethrough_bios, bio);
909 spin_unlock_irqrestore(&cache->lock, flags); 945 spin_unlock_irqrestore(&cache->lock, flags);
910 946
911 wake_worker(cache); 947 wake_deferred_writethrough_worker(cache);
912} 948}
913 949
914static void writethrough_endio(struct bio *bio) 950static void writethrough_endio(struct bio *bio)
@@ -934,6 +970,7 @@ static void writethrough_endio(struct bio *bio)
934} 970}
935 971
936/* 972/*
973 * FIXME: send in parallel, huge latency as is.
937 * When running in writethrough mode we need to send writes to clean blocks 974 * When running in writethrough mode we need to send writes to clean blocks
938 * to both the cache and origin devices. In future we'd like to clone the 975 * to both the cache and origin devices. In future we'd like to clone the
939 * bio and send them in parallel, but for now we're doing them in 976 * bio and send them in parallel, but for now we're doing them in
@@ -1046,12 +1083,58 @@ static void metadata_operation_failed(struct cache *cache, const char *op, int r
1046 set_cache_mode(cache, CM_READ_ONLY); 1083 set_cache_mode(cache, CM_READ_ONLY);
1047} 1084}
1048 1085
1086/*----------------------------------------------------------------*/
1087
1088static void load_stats(struct cache *cache)
1089{
1090 struct dm_cache_statistics stats;
1091
1092 dm_cache_metadata_get_stats(cache->cmd, &stats);
1093 atomic_set(&cache->stats.read_hit, stats.read_hits);
1094 atomic_set(&cache->stats.read_miss, stats.read_misses);
1095 atomic_set(&cache->stats.write_hit, stats.write_hits);
1096 atomic_set(&cache->stats.write_miss, stats.write_misses);
1097}
1098
1099static void save_stats(struct cache *cache)
1100{
1101 struct dm_cache_statistics stats;
1102
1103 if (get_cache_mode(cache) >= CM_READ_ONLY)
1104 return;
1105
1106 stats.read_hits = atomic_read(&cache->stats.read_hit);
1107 stats.read_misses = atomic_read(&cache->stats.read_miss);
1108 stats.write_hits = atomic_read(&cache->stats.write_hit);
1109 stats.write_misses = atomic_read(&cache->stats.write_miss);
1110
1111 dm_cache_metadata_set_stats(cache->cmd, &stats);
1112}
1113
1114static void update_stats(struct cache_stats *stats, enum policy_operation op)
1115{
1116 switch (op) {
1117 case POLICY_PROMOTE:
1118 atomic_inc(&stats->promotion);
1119 break;
1120
1121 case POLICY_DEMOTE:
1122 atomic_inc(&stats->demotion);
1123 break;
1124
1125 case POLICY_WRITEBACK:
1126 atomic_inc(&stats->writeback);
1127 break;
1128 }
1129}
1130
1049/*---------------------------------------------------------------- 1131/*----------------------------------------------------------------
1050 * Migration processing 1132 * Migration processing
1051 * 1133 *
1052 * Migration covers moving data from the origin device to the cache, or 1134 * Migration covers moving data from the origin device to the cache, or
1053 * vice versa. 1135 * vice versa.
1054 *--------------------------------------------------------------*/ 1136 *--------------------------------------------------------------*/
1137
1055static void inc_io_migrations(struct cache *cache) 1138static void inc_io_migrations(struct cache *cache)
1056{ 1139{
1057 atomic_inc(&cache->nr_io_migrations); 1140 atomic_inc(&cache->nr_io_migrations);
@@ -1067,213 +1150,109 @@ static bool discard_or_flush(struct bio *bio)
1067 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1150 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
1068} 1151}
1069 1152
1070static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) 1153static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1071{ 1154 dm_dblock_t *b, dm_dblock_t *e)
1072 if (discard_or_flush(cell->holder)) {
1073 /*
1074 * We have to handle these bios individually.
1075 */
1076 dm_cell_release(cache->prison, cell, &cache->deferred_bios);
1077 free_prison_cell(cache, cell);
1078 } else
1079 list_add_tail(&cell->user_list, &cache->deferred_cells);
1080}
1081
1082static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
1083{ 1155{
1084 unsigned long flags; 1156 sector_t sb = bio->bi_iter.bi_sector;
1085 1157 sector_t se = bio_end_sector(bio);
1086 if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
1087 /*
1088 * There was no prisoner to promote to holder, the
1089 * cell has been released.
1090 */
1091 free_prison_cell(cache, cell);
1092 return;
1093 }
1094 1158
1095 spin_lock_irqsave(&cache->lock, flags); 1159 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1096 __cell_defer(cache, cell);
1097 spin_unlock_irqrestore(&cache->lock, flags);
1098 1160
1099 wake_worker(cache); 1161 if (se - sb < cache->discard_block_size)
1162 *e = *b;
1163 else
1164 *e = to_dblock(block_div(se, cache->discard_block_size));
1100} 1165}
1101 1166
1102static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) 1167/*----------------------------------------------------------------*/
1103{
1104 dm_cell_error(cache->prison, cell, err);
1105 free_prison_cell(cache, cell);
1106}
1107 1168
1108static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) 1169static void prevent_background_work(struct cache *cache)
1109{ 1170{
1110 cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE); 1171 lockdep_off();
1172 down_write(&cache->background_work_lock);
1173 lockdep_on();
1111} 1174}
1112 1175
1113static void free_io_migration(struct dm_cache_migration *mg) 1176static void allow_background_work(struct cache *cache)
1114{ 1177{
1115 struct cache *cache = mg->cache; 1178 lockdep_off();
1116 1179 up_write(&cache->background_work_lock);
1117 dec_io_migrations(cache); 1180 lockdep_on();
1118 free_migration(mg);
1119 wake_worker(cache);
1120} 1181}
1121 1182
1122static void migration_failure(struct dm_cache_migration *mg) 1183static bool background_work_begin(struct cache *cache)
1123{ 1184{
1124 struct cache *cache = mg->cache; 1185 bool r;
1125 const char *dev_name = cache_device_name(cache);
1126
1127 if (mg->writeback) {
1128 DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
1129 set_dirty(cache, mg->old_oblock, mg->cblock);
1130 cell_defer(cache, mg->old_ocell, false);
1131
1132 } else if (mg->demote) {
1133 DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
1134 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
1135 1186
1136 cell_defer(cache, mg->old_ocell, mg->promote ? false : true); 1187 lockdep_off();
1137 if (mg->promote) 1188 r = down_read_trylock(&cache->background_work_lock);
1138 cell_defer(cache, mg->new_ocell, true); 1189 lockdep_on();
1139 } else {
1140 DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
1141 policy_remove_mapping(cache->policy, mg->new_oblock);
1142 cell_defer(cache, mg->new_ocell, true);
1143 }
1144 1190
1145 free_io_migration(mg); 1191 return r;
1146} 1192}
1147 1193
1148static void migration_success_pre_commit(struct dm_cache_migration *mg) 1194static void background_work_end(struct cache *cache)
1149{ 1195{
1150 int r; 1196 lockdep_off();
1151 unsigned long flags; 1197 up_read(&cache->background_work_lock);
1152 struct cache *cache = mg->cache; 1198 lockdep_on();
1153 1199}
1154 if (mg->writeback) {
1155 clear_dirty(cache, mg->old_oblock, mg->cblock);
1156 cell_defer(cache, mg->old_ocell, false);
1157 free_io_migration(mg);
1158 return;
1159 1200
1160 } else if (mg->demote) { 1201/*----------------------------------------------------------------*/
1161 r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
1162 if (r) {
1163 DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
1164 cache_device_name(cache));
1165 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1166 policy_force_mapping(cache->policy, mg->new_oblock,
1167 mg->old_oblock);
1168 if (mg->promote)
1169 cell_defer(cache, mg->new_ocell, true);
1170 free_io_migration(mg);
1171 return;
1172 }
1173 } else {
1174 r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
1175 if (r) {
1176 DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
1177 cache_device_name(cache));
1178 metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1179 policy_remove_mapping(cache->policy, mg->new_oblock);
1180 free_io_migration(mg);
1181 return;
1182 }
1183 }
1184 1202
1185 spin_lock_irqsave(&cache->lock, flags); 1203static void quiesce(struct dm_cache_migration *mg,
1186 list_add_tail(&mg->list, &cache->need_commit_migrations); 1204 void (*continuation)(struct work_struct *))
1187 cache->commit_requested = true; 1205{
1188 spin_unlock_irqrestore(&cache->lock, flags); 1206 init_continuation(&mg->k, continuation);
1207 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
1189} 1208}
1190 1209
1191static void migration_success_post_commit(struct dm_cache_migration *mg) 1210static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
1192{ 1211{
1193 unsigned long flags; 1212 struct continuation *k = container_of(ws, struct continuation, ws);
1194 struct cache *cache = mg->cache; 1213 return container_of(k, struct dm_cache_migration, k);
1195
1196 if (mg->writeback) {
1197 DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
1198 cache_device_name(cache));
1199 return;
1200
1201 } else if (mg->demote) {
1202 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
1203
1204 if (mg->promote) {
1205 mg->demote = false;
1206
1207 spin_lock_irqsave(&cache->lock, flags);
1208 list_add_tail(&mg->list, &cache->quiesced_migrations);
1209 spin_unlock_irqrestore(&cache->lock, flags);
1210
1211 } else {
1212 if (mg->invalidate)
1213 policy_remove_mapping(cache->policy, mg->old_oblock);
1214 free_io_migration(mg);
1215 }
1216
1217 } else {
1218 if (mg->requeue_holder) {
1219 clear_dirty(cache, mg->new_oblock, mg->cblock);
1220 cell_defer(cache, mg->new_ocell, true);
1221 } else {
1222 /*
1223 * The block was promoted via an overwrite, so it's dirty.
1224 */
1225 set_dirty(cache, mg->new_oblock, mg->cblock);
1226 bio_endio(mg->new_ocell->holder);
1227 cell_defer(cache, mg->new_ocell, false);
1228 }
1229 free_io_migration(mg);
1230 }
1231} 1214}
1232 1215
1233static void copy_complete(int read_err, unsigned long write_err, void *context) 1216static void copy_complete(int read_err, unsigned long write_err, void *context)
1234{ 1217{
1235 unsigned long flags; 1218 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
1236 struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
1237 struct cache *cache = mg->cache;
1238 1219
1239 if (read_err || write_err) 1220 if (read_err || write_err)
1240 mg->err = true; 1221 mg->k.input = -EIO;
1241
1242 spin_lock_irqsave(&cache->lock, flags);
1243 list_add_tail(&mg->list, &cache->completed_migrations);
1244 spin_unlock_irqrestore(&cache->lock, flags);
1245 1222
1246 wake_worker(cache); 1223 queue_continuation(mg->cache->wq, &mg->k);
1247} 1224}
1248 1225
1249static void issue_copy(struct dm_cache_migration *mg) 1226static int copy(struct dm_cache_migration *mg, bool promote)
1250{ 1227{
1251 int r; 1228 int r;
1252 struct dm_io_region o_region, c_region; 1229 struct dm_io_region o_region, c_region;
1253 struct cache *cache = mg->cache; 1230 struct cache *cache = mg->cache;
1254 sector_t cblock = from_cblock(mg->cblock);
1255 1231
1256 o_region.bdev = cache->origin_dev->bdev; 1232 o_region.bdev = cache->origin_dev->bdev;
1233 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
1257 o_region.count = cache->sectors_per_block; 1234 o_region.count = cache->sectors_per_block;
1258 1235
1259 c_region.bdev = cache->cache_dev->bdev; 1236 c_region.bdev = cache->cache_dev->bdev;
1260 c_region.sector = cblock * cache->sectors_per_block; 1237 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
1261 c_region.count = cache->sectors_per_block; 1238 c_region.count = cache->sectors_per_block;
1262 1239
1263 if (mg->writeback || mg->demote) { 1240 if (promote)
1264 /* demote */ 1241 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
1265 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; 1242 else
1266 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); 1243 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
1267 } else {
1268 /* promote */
1269 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
1270 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
1271 }
1272 1244
1273 if (r < 0) { 1245 return r;
1274 DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache)); 1246}
1275 migration_failure(mg); 1247
1276 } 1248static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
1249{
1250 size_t pb_data_size = get_per_bio_data_size(cache);
1251 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1252
1253 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
1254 free_prison_cell(cache, pb->cell);
1255 pb->cell = NULL;
1277} 1256}
1278 1257
1279static void overwrite_endio(struct bio *bio) 1258static void overwrite_endio(struct bio *bio)
@@ -1282,368 +1261,475 @@ static void overwrite_endio(struct bio *bio)
1282 struct cache *cache = mg->cache; 1261 struct cache *cache = mg->cache;
1283 size_t pb_data_size = get_per_bio_data_size(cache); 1262 size_t pb_data_size = get_per_bio_data_size(cache);
1284 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1263 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1285 unsigned long flags;
1286 1264
1287 dm_unhook_bio(&pb->hook_info, bio); 1265 dm_unhook_bio(&pb->hook_info, bio);
1288 1266
1289 if (bio->bi_error) 1267 if (bio->bi_error)
1290 mg->err = true; 1268 mg->k.input = bio->bi_error;
1291 1269
1292 mg->requeue_holder = false; 1270 queue_continuation(mg->cache->wq, &mg->k);
1293
1294 spin_lock_irqsave(&cache->lock, flags);
1295 list_add_tail(&mg->list, &cache->completed_migrations);
1296 spin_unlock_irqrestore(&cache->lock, flags);
1297
1298 wake_worker(cache);
1299} 1271}
1300 1272
1301static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) 1273static void overwrite(struct dm_cache_migration *mg,
1274 void (*continuation)(struct work_struct *))
1302{ 1275{
1276 struct bio *bio = mg->overwrite_bio;
1303 size_t pb_data_size = get_per_bio_data_size(mg->cache); 1277 size_t pb_data_size = get_per_bio_data_size(mg->cache);
1304 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1278 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1305 1279
1306 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1280 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1307 remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1308 1281
1309 /* 1282 /*
1310 * No need to inc_ds() here, since the cell will be held for the 1283 * The overwrite bio is part of the copy operation, as such it does
1311 * duration of the io. 1284 * not set/clear discard or dirty flags.
1312 */ 1285 */
1286 if (mg->op->op == POLICY_PROMOTE)
1287 remap_to_cache(mg->cache, bio, mg->op->cblock);
1288 else
1289 remap_to_origin(mg->cache, bio);
1290
1291 init_continuation(&mg->k, continuation);
1313 accounted_request(mg->cache, bio); 1292 accounted_request(mg->cache, bio);
1314} 1293}
1315 1294
1316static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1295/*
1296 * Migration steps:
1297 *
1298 * 1) exclusive lock preventing WRITEs
1299 * 2) quiesce
1300 * 3) copy or issue overwrite bio
1301 * 4) upgrade to exclusive lock preventing READs and WRITEs
1302 * 5) quiesce
1303 * 6) update metadata and commit
1304 * 7) unlock
1305 */
1306static void mg_complete(struct dm_cache_migration *mg, bool success)
1317{ 1307{
1318 return (bio_data_dir(bio) == WRITE) && 1308 struct bio_list bios;
1319 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1309 struct cache *cache = mg->cache;
1320} 1310 struct policy_work *op = mg->op;
1311 dm_cblock_t cblock = op->cblock;
1312
1313 if (success)
1314 update_stats(&cache->stats, op->op);
1315
1316 switch (op->op) {
1317 case POLICY_PROMOTE:
1318 clear_discard(cache, oblock_to_dblock(cache, op->oblock));
1319 policy_complete_background_work(cache->policy, op, success);
1320
1321 if (mg->overwrite_bio) {
1322 if (success)
1323 force_set_dirty(cache, cblock);
1324 else
1325 mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO);
1326 bio_endio(mg->overwrite_bio);
1327 } else {
1328 if (success)
1329 force_clear_dirty(cache, cblock);
1330 dec_io_migrations(cache);
1331 }
1332 break;
1321 1333
1322static void avoid_copy(struct dm_cache_migration *mg) 1334 case POLICY_DEMOTE:
1323{ 1335 /*
1324 atomic_inc(&mg->cache->stats.copies_avoided); 1336 * We clear dirty here to update the nr_dirty counter.
1325 migration_success_pre_commit(mg); 1337 */
1326} 1338 if (success)
1339 force_clear_dirty(cache, cblock);
1340 policy_complete_background_work(cache->policy, op, success);
1341 dec_io_migrations(cache);
1342 break;
1327 1343
1328static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1344 case POLICY_WRITEBACK:
1329 dm_dblock_t *b, dm_dblock_t *e) 1345 if (success)
1330{ 1346 force_clear_dirty(cache, cblock);
1331 sector_t sb = bio->bi_iter.bi_sector; 1347 policy_complete_background_work(cache->policy, op, success);
1332 sector_t se = bio_end_sector(bio); 1348 dec_io_migrations(cache);
1349 break;
1350 }
1333 1351
1334 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1352 bio_list_init(&bios);
1353 if (mg->cell) {
1354 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1355 free_prison_cell(cache, mg->cell);
1356 }
1335 1357
1336 if (se - sb < cache->discard_block_size) 1358 free_migration(mg);
1337 *e = *b; 1359 defer_bios(cache, &bios);
1338 else 1360 wake_migration_worker(cache);
1339 *e = to_dblock(block_div(se, cache->discard_block_size)); 1361
1362 background_work_end(cache);
1340} 1363}
1341 1364
1342static void issue_discard(struct dm_cache_migration *mg) 1365static void mg_success(struct work_struct *ws)
1343{ 1366{
1344 dm_dblock_t b, e; 1367 struct dm_cache_migration *mg = ws_to_mg(ws);
1345 struct bio *bio = mg->new_ocell->holder; 1368 mg_complete(mg, mg->k.input == 0);
1346 struct cache *cache = mg->cache;
1347
1348 calc_discard_block_range(cache, bio, &b, &e);
1349 while (b != e) {
1350 set_discard(cache, b);
1351 b = to_dblock(from_dblock(b) + 1);
1352 }
1353
1354 bio_endio(bio);
1355 cell_defer(cache, mg->new_ocell, false);
1356 free_migration(mg);
1357 wake_worker(cache);
1358} 1369}
1359 1370
1360static void issue_copy_or_discard(struct dm_cache_migration *mg) 1371static void mg_update_metadata(struct work_struct *ws)
1361{ 1372{
1362 bool avoid; 1373 int r;
1374 struct dm_cache_migration *mg = ws_to_mg(ws);
1363 struct cache *cache = mg->cache; 1375 struct cache *cache = mg->cache;
1376 struct policy_work *op = mg->op;
1364 1377
1365 if (mg->discard) { 1378 switch (op->op) {
1366 issue_discard(mg); 1379 case POLICY_PROMOTE:
1367 return; 1380 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
1368 } 1381 if (r) {
1382 DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
1383 cache_device_name(cache));
1384 metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1369 1385
1370 if (mg->writeback || mg->demote) 1386 mg_complete(mg, false);
1371 avoid = !is_dirty(cache, mg->cblock) || 1387 return;
1372 is_discarded_oblock(cache, mg->old_oblock); 1388 }
1373 else { 1389 mg_complete(mg, true);
1374 struct bio *bio = mg->new_ocell->holder; 1390 break;
1375 1391
1376 avoid = is_discarded_oblock(cache, mg->new_oblock); 1392 case POLICY_DEMOTE:
1393 r = dm_cache_remove_mapping(cache->cmd, op->cblock);
1394 if (r) {
1395 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
1396 cache_device_name(cache));
1397 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1377 1398
1378 if (writeback_mode(&cache->features) && 1399 mg_complete(mg, false);
1379 !avoid && bio_writes_complete_block(cache, bio)) {
1380 issue_overwrite(mg, bio);
1381 return; 1400 return;
1382 } 1401 }
1383 }
1384 1402
1385 avoid ? avoid_copy(mg) : issue_copy(mg); 1403 /*
1404 * It would be nice if we only had to commit when a REQ_FLUSH
1405 * comes through. But there's one scenario that we have to
1406 * look out for:
1407 *
1408 * - vblock x in a cache block
1409 * - demotion occurs
1410 * - cache block gets reallocated and overwritten
1411 * - crash
1412 *
1413 * When we recover, because there was no commit the cache will
1414 * rollback to having the data for vblock x in the cache block.
1415 * But the cache block has since been overwritten, so it'll end
1416 * up pointing to data that was never in 'x' during the history
1417 * of the device.
1418 *
1419 * To avoid this issue we require a commit as part of the
1420 * demotion operation.
1421 */
1422 init_continuation(&mg->k, mg_success);
1423 continue_after_commit(&cache->committer, &mg->k);
1424 schedule_commit(&cache->committer);
1425 break;
1426
1427 case POLICY_WRITEBACK:
1428 mg_complete(mg, true);
1429 break;
1430 }
1386} 1431}
1387 1432
1388static void complete_migration(struct dm_cache_migration *mg) 1433static void mg_update_metadata_after_copy(struct work_struct *ws)
1389{ 1434{
1390 if (mg->err) 1435 struct dm_cache_migration *mg = ws_to_mg(ws);
1391 migration_failure(mg); 1436
1437 /*
1438 * Did the copy succeed?
1439 */
1440 if (mg->k.input)
1441 mg_complete(mg, false);
1392 else 1442 else
1393 migration_success_pre_commit(mg); 1443 mg_update_metadata(ws);
1394} 1444}
1395 1445
1396static void process_migrations(struct cache *cache, struct list_head *head, 1446static void mg_upgrade_lock(struct work_struct *ws)
1397 void (*fn)(struct dm_cache_migration *))
1398{ 1447{
1399 unsigned long flags; 1448 int r;
1400 struct list_head list; 1449 struct dm_cache_migration *mg = ws_to_mg(ws);
1401 struct dm_cache_migration *mg, *tmp;
1402 1450
1403 INIT_LIST_HEAD(&list); 1451 /*
1404 spin_lock_irqsave(&cache->lock, flags); 1452 * Did the copy succeed?
1405 list_splice_init(head, &list); 1453 */
1406 spin_unlock_irqrestore(&cache->lock, flags); 1454 if (mg->k.input)
1455 mg_complete(mg, false);
1407 1456
1408 list_for_each_entry_safe(mg, tmp, &list, list) 1457 else {
1409 fn(mg); 1458 /*
1410} 1459 * Now we want the lock to prevent both reads and writes.
1460 */
1461 r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
1462 READ_WRITE_LOCK_LEVEL);
1463 if (r < 0)
1464 mg_complete(mg, false);
1411 1465
1412static void __queue_quiesced_migration(struct dm_cache_migration *mg) 1466 else if (r)
1413{ 1467 quiesce(mg, mg_update_metadata);
1414 list_add_tail(&mg->list, &mg->cache->quiesced_migrations); 1468
1469 else
1470 mg_update_metadata(ws);
1471 }
1415} 1472}
1416 1473
1417static void queue_quiesced_migration(struct dm_cache_migration *mg) 1474static void mg_copy(struct work_struct *ws)
1418{ 1475{
1419 unsigned long flags; 1476 int r;
1420 struct cache *cache = mg->cache; 1477 struct dm_cache_migration *mg = ws_to_mg(ws);
1421 1478
1422 spin_lock_irqsave(&cache->lock, flags); 1479 if (mg->overwrite_bio) {
1423 __queue_quiesced_migration(mg); 1480 /*
1424 spin_unlock_irqrestore(&cache->lock, flags); 1481 * It's safe to do this here, even though it's new data
1482 * because all IO has been locked out of the block.
1483 *
1484 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
1485 * so _not_ using mg_upgrade_lock() as the continuation.
1486 */
1487 overwrite(mg, mg_update_metadata_after_copy);
1425 1488
1426 wake_worker(cache); 1489 } else {
1427} 1490 struct cache *cache = mg->cache;
1491 struct policy_work *op = mg->op;
1492 bool is_policy_promote = (op->op == POLICY_PROMOTE);
1428 1493
1429static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) 1494 if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
1430{ 1495 is_discarded_oblock(cache, op->oblock)) {
1431 unsigned long flags; 1496 mg_upgrade_lock(ws);
1432 struct dm_cache_migration *mg, *tmp; 1497 return;
1498 }
1433 1499
1434 spin_lock_irqsave(&cache->lock, flags); 1500 init_continuation(&mg->k, mg_upgrade_lock);
1435 list_for_each_entry_safe(mg, tmp, work, list)
1436 __queue_quiesced_migration(mg);
1437 spin_unlock_irqrestore(&cache->lock, flags);
1438 1501
1439 wake_worker(cache); 1502 r = copy(mg, is_policy_promote);
1503 if (r) {
1504 DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
1505 mg->k.input = -EIO;
1506 mg_complete(mg, false);
1507 }
1508 }
1440} 1509}
1441 1510
1442static void check_for_quiesced_migrations(struct cache *cache, 1511static int mg_lock_writes(struct dm_cache_migration *mg)
1443 struct per_bio_data *pb)
1444{ 1512{
1445 struct list_head work; 1513 int r;
1514 struct dm_cell_key_v2 key;
1515 struct cache *cache = mg->cache;
1516 struct dm_bio_prison_cell_v2 *prealloc;
1446 1517
1447 if (!pb->all_io_entry) 1518 prealloc = alloc_prison_cell(cache);
1448 return; 1519 if (!prealloc) {
1520 DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
1521 mg_complete(mg, false);
1522 return -ENOMEM;
1523 }
1524
1525 /*
1526 * Prevent writes to the block, but allow reads to continue.
1527 * Unless we're using an overwrite bio, in which case we lock
1528 * everything.
1529 */
1530 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
1531 r = dm_cell_lock_v2(cache->prison, &key,
1532 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
1533 prealloc, &mg->cell);
1534 if (r < 0) {
1535 free_prison_cell(cache, prealloc);
1536 mg_complete(mg, false);
1537 return r;
1538 }
1449 1539
1450 INIT_LIST_HEAD(&work); 1540 if (mg->cell != prealloc)
1451 dm_deferred_entry_dec(pb->all_io_entry, &work); 1541 free_prison_cell(cache, prealloc);
1452 1542
1453 if (!list_empty(&work)) 1543 if (r == 0)
1454 queue_quiesced_migrations(cache, &work); 1544 mg_copy(&mg->k.ws);
1455} 1545 else
1546 quiesce(mg, mg_copy);
1456 1547
1457static void quiesce_migration(struct dm_cache_migration *mg) 1548 return 0;
1458{
1459 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
1460 queue_quiesced_migration(mg);
1461} 1549}
1462 1550
1463static void promote(struct cache *cache, struct prealloc *structs, 1551static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
1464 dm_oblock_t oblock, dm_cblock_t cblock,
1465 struct dm_bio_prison_cell *cell)
1466{ 1552{
1467 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1553 struct dm_cache_migration *mg;
1554
1555 if (!background_work_begin(cache)) {
1556 policy_complete_background_work(cache->policy, op, false);
1557 return -EPERM;
1558 }
1559
1560 mg = alloc_migration(cache);
1561 if (!mg) {
1562 policy_complete_background_work(cache->policy, op, false);
1563 background_work_end(cache);
1564 return -ENOMEM;
1565 }
1566
1567 memset(mg, 0, sizeof(*mg));
1468 1568
1469 mg->err = false;
1470 mg->discard = false;
1471 mg->writeback = false;
1472 mg->demote = false;
1473 mg->promote = true;
1474 mg->requeue_holder = true;
1475 mg->invalidate = false;
1476 mg->cache = cache; 1569 mg->cache = cache;
1477 mg->new_oblock = oblock; 1570 mg->op = op;
1478 mg->cblock = cblock; 1571 mg->overwrite_bio = bio;
1479 mg->old_ocell = NULL;
1480 mg->new_ocell = cell;
1481 mg->start_jiffies = jiffies;
1482 1572
1483 inc_io_migrations(cache); 1573 if (!bio)
1484 quiesce_migration(mg); 1574 inc_io_migrations(cache);
1575
1576 return mg_lock_writes(mg);
1485} 1577}
1486 1578
1487static void writeback(struct cache *cache, struct prealloc *structs, 1579/*----------------------------------------------------------------
1488 dm_oblock_t oblock, dm_cblock_t cblock, 1580 * invalidation processing
1489 struct dm_bio_prison_cell *cell) 1581 *--------------------------------------------------------------*/
1582
1583static void invalidate_complete(struct dm_cache_migration *mg, bool success)
1490{ 1584{
1491 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1585 struct bio_list bios;
1586 struct cache *cache = mg->cache;
1492 1587
1493 mg->err = false; 1588 bio_list_init(&bios);
1494 mg->discard = false; 1589 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1495 mg->writeback = true; 1590 free_prison_cell(cache, mg->cell);
1496 mg->demote = false;
1497 mg->promote = false;
1498 mg->requeue_holder = true;
1499 mg->invalidate = false;
1500 mg->cache = cache;
1501 mg->old_oblock = oblock;
1502 mg->cblock = cblock;
1503 mg->old_ocell = cell;
1504 mg->new_ocell = NULL;
1505 mg->start_jiffies = jiffies;
1506
1507 inc_io_migrations(cache);
1508 quiesce_migration(mg);
1509}
1510
1511static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1512 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1513 dm_cblock_t cblock,
1514 struct dm_bio_prison_cell *old_ocell,
1515 struct dm_bio_prison_cell *new_ocell)
1516{
1517 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1518
1519 mg->err = false;
1520 mg->discard = false;
1521 mg->writeback = false;
1522 mg->demote = true;
1523 mg->promote = true;
1524 mg->requeue_holder = true;
1525 mg->invalidate = false;
1526 mg->cache = cache;
1527 mg->old_oblock = old_oblock;
1528 mg->new_oblock = new_oblock;
1529 mg->cblock = cblock;
1530 mg->old_ocell = old_ocell;
1531 mg->new_ocell = new_ocell;
1532 mg->start_jiffies = jiffies;
1533 1591
1534 inc_io_migrations(cache); 1592 if (!success && mg->overwrite_bio)
1535 quiesce_migration(mg); 1593 bio_io_error(mg->overwrite_bio);
1536}
1537 1594
1538/* 1595 free_migration(mg);
1539 * Invalidate a cache entry. No writeback occurs; any changes in the cache 1596 defer_bios(cache, &bios);
1540 * block are thrown away.
1541 */
1542static void invalidate(struct cache *cache, struct prealloc *structs,
1543 dm_oblock_t oblock, dm_cblock_t cblock,
1544 struct dm_bio_prison_cell *cell)
1545{
1546 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1547
1548 mg->err = false;
1549 mg->discard = false;
1550 mg->writeback = false;
1551 mg->demote = true;
1552 mg->promote = false;
1553 mg->requeue_holder = true;
1554 mg->invalidate = true;
1555 mg->cache = cache;
1556 mg->old_oblock = oblock;
1557 mg->cblock = cblock;
1558 mg->old_ocell = cell;
1559 mg->new_ocell = NULL;
1560 mg->start_jiffies = jiffies;
1561 1597
1562 inc_io_migrations(cache); 1598 background_work_end(cache);
1563 quiesce_migration(mg);
1564} 1599}
1565 1600
1566static void discard(struct cache *cache, struct prealloc *structs, 1601static void invalidate_completed(struct work_struct *ws)
1567 struct dm_bio_prison_cell *cell)
1568{ 1602{
1569 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1603 struct dm_cache_migration *mg = ws_to_mg(ws);
1604 invalidate_complete(mg, !mg->k.input);
1605}
1570 1606
1571 mg->err = false; 1607static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
1572 mg->discard = true; 1608{
1573 mg->writeback = false; 1609 int r = policy_invalidate_mapping(cache->policy, cblock);
1574 mg->demote = false; 1610 if (!r) {
1575 mg->promote = false; 1611 r = dm_cache_remove_mapping(cache->cmd, cblock);
1576 mg->requeue_holder = false; 1612 if (r) {
1577 mg->invalidate = false; 1613 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
1578 mg->cache = cache; 1614 cache_device_name(cache));
1579 mg->old_ocell = NULL; 1615 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1580 mg->new_ocell = cell; 1616 }
1581 mg->start_jiffies = jiffies; 1617
1618 } else if (r == -ENODATA) {
1619 /*
1620 * Harmless, already unmapped.
1621 */
1622 r = 0;
1623
1624 } else
1625 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
1582 1626
1583 quiesce_migration(mg); 1627 return r;
1584} 1628}
1585 1629
1586/*---------------------------------------------------------------- 1630static void invalidate_remove(struct work_struct *ws)
1587 * bio processing
1588 *--------------------------------------------------------------*/
1589static void defer_bio(struct cache *cache, struct bio *bio)
1590{ 1631{
1591 unsigned long flags; 1632 int r;
1633 struct dm_cache_migration *mg = ws_to_mg(ws);
1634 struct cache *cache = mg->cache;
1592 1635
1593 spin_lock_irqsave(&cache->lock, flags); 1636 r = invalidate_cblock(cache, mg->invalidate_cblock);
1594 bio_list_add(&cache->deferred_bios, bio); 1637 if (r) {
1595 spin_unlock_irqrestore(&cache->lock, flags); 1638 invalidate_complete(mg, false);
1639 return;
1640 }
1596 1641
1597 wake_worker(cache); 1642 init_continuation(&mg->k, invalidate_completed);
1643 continue_after_commit(&cache->committer, &mg->k);
1644 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
1645 mg->overwrite_bio = NULL;
1646 schedule_commit(&cache->committer);
1598} 1647}
1599 1648
1600static void process_flush_bio(struct cache *cache, struct bio *bio) 1649static int invalidate_lock(struct dm_cache_migration *mg)
1601{ 1650{
1602 size_t pb_data_size = get_per_bio_data_size(cache); 1651 int r;
1603 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1652 struct dm_cell_key_v2 key;
1653 struct cache *cache = mg->cache;
1654 struct dm_bio_prison_cell_v2 *prealloc;
1604 1655
1605 BUG_ON(bio->bi_iter.bi_size); 1656 prealloc = alloc_prison_cell(cache);
1606 if (!pb->req_nr) 1657 if (!prealloc) {
1607 remap_to_origin(cache, bio); 1658 invalidate_complete(mg, false);
1608 else 1659 return -ENOMEM;
1609 remap_to_cache(cache, bio, 0); 1660 }
1610 1661
1611 /* 1662 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
1612 * REQ_PREFLUSH is not directed at any particular block so we don't 1663 r = dm_cell_lock_v2(cache->prison, &key,
1613 * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH 1664 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
1614 * by dm-core. 1665 if (r < 0) {
1615 */ 1666 free_prison_cell(cache, prealloc);
1616 issue(cache, bio); 1667 invalidate_complete(mg, false);
1668 return r;
1669 }
1670
1671 if (mg->cell != prealloc)
1672 free_prison_cell(cache, prealloc);
1673
1674 if (r)
1675 quiesce(mg, invalidate_remove);
1676
1677 else {
1678 /*
1679 * We can't call invalidate_remove() directly here because we
1680 * might still be in request context.
1681 */
1682 init_continuation(&mg->k, invalidate_remove);
1683 queue_work(cache->wq, &mg->k.ws);
1684 }
1685
1686 return 0;
1617} 1687}
1618 1688
1619static void process_discard_bio(struct cache *cache, struct prealloc *structs, 1689static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
1620 struct bio *bio) 1690 dm_oblock_t oblock, struct bio *bio)
1621{ 1691{
1622 int r; 1692 struct dm_cache_migration *mg;
1623 dm_dblock_t b, e;
1624 struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1625 1693
1626 calc_discard_block_range(cache, bio, &b, &e); 1694 if (!background_work_begin(cache))
1627 if (b == e) { 1695 return -EPERM;
1628 bio_endio(bio); 1696
1629 return; 1697 mg = alloc_migration(cache);
1698 if (!mg) {
1699 background_work_end(cache);
1700 return -ENOMEM;
1630 } 1701 }
1631 1702
1632 cell_prealloc = prealloc_get_cell(structs); 1703 memset(mg, 0, sizeof(*mg));
1633 r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
1634 (cell_free_fn) prealloc_put_cell,
1635 structs, &new_ocell);
1636 if (r > 0)
1637 return;
1638 1704
1639 discard(cache, structs, new_ocell); 1705 mg->cache = cache;
1706 mg->overwrite_bio = bio;
1707 mg->invalidate_cblock = cblock;
1708 mg->invalidate_oblock = oblock;
1709
1710 return invalidate_lock(mg);
1640} 1711}
1641 1712
1642static bool spare_migration_bandwidth(struct cache *cache) 1713/*----------------------------------------------------------------
1714 * bio processing
1715 *--------------------------------------------------------------*/
1716
1717enum busy {
1718 IDLE,
1719 MODERATE,
1720 BUSY
1721};
1722
1723static enum busy spare_migration_bandwidth(struct cache *cache)
1643{ 1724{
1725 bool idle = iot_idle_for(&cache->origin_tracker, HZ);
1644 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1726 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1645 cache->sectors_per_block; 1727 cache->sectors_per_block;
1646 return current_volume < cache->migration_threshold; 1728
1729 if (current_volume <= cache->migration_threshold)
1730 return idle ? IDLE : MODERATE;
1731 else
1732 return idle ? MODERATE : BUSY;
1647} 1733}
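
The reworked spare_migration_bandwidth() above no longer returns a simple bool; it grades the device as IDLE, MODERATE or BUSY so that check_migrations() below can decide how aggressively to pull background work from the policy. A minimal standalone sketch of that grading, with plain integers standing in for the kernel's sector_t and io_tracker state (all names here are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

enum busy { IDLE, MODERATE, BUSY };

/*
 * 'idle' stands in for iot_idle_for(&cache->origin_tracker, HZ) and
 * 'current_volume' for (nr_io_migrations + 1) * sectors_per_block.
 */
static enum busy classify_bandwidth(bool idle, unsigned long long current_volume,
				    unsigned long long migration_threshold)
{
	if (current_volume <= migration_threshold)
		return idle ? IDLE : MODERATE;
	else
		return idle ? MODERATE : BUSY;
}

int main(void)
{
	/* three in-flight migrations of 512 sectors against a 2048-sector budget */
	printf("%d\n", classify_bandwidth(false, (3 + 1) * 512, 2048)); /* 1 = MODERATE */
	printf("%d\n", classify_bandwidth(true, (3 + 1) * 512, 2048));  /* 0 = IDLE */
	return 0;
}
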
1648 1734
1649static void inc_hit_counter(struct cache *cache, struct bio *bio) 1735static void inc_hit_counter(struct cache *cache, struct bio *bio)
@@ -1660,255 +1746,143 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
1660 1746
1661/*----------------------------------------------------------------*/ 1747/*----------------------------------------------------------------*/
1662 1748
1663struct inc_detail { 1749static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1664 struct cache *cache;
1665 struct bio_list bios_for_issue;
1666 struct bio_list unhandled_bios;
1667 bool any_writes;
1668};
1669
1670static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
1671{ 1750{
1672 struct bio *bio; 1751 return (bio_data_dir(bio) == WRITE) &&
1673 struct inc_detail *detail = context; 1752 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1674 struct cache *cache = detail->cache;
1675
1676 inc_ds(cache, cell->holder, cell);
1677 if (bio_data_dir(cell->holder) == WRITE)
1678 detail->any_writes = true;
1679
1680 while ((bio = bio_list_pop(&cell->bios))) {
1681 if (discard_or_flush(bio)) {
1682 bio_list_add(&detail->unhandled_bios, bio);
1683 continue;
1684 }
1685
1686 if (bio_data_dir(bio) == WRITE)
1687 detail->any_writes = true;
1688
1689 bio_list_add(&detail->bios_for_issue, bio);
1690 inc_ds(cache, bio, cell);
1691 }
1692} 1753}
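
bio_writes_complete_block() above boils down to one piece of arithmetic: a bio overwrites a whole cache block only when its byte count equals sectors_per_block shifted into bytes (SECTOR_SHIFT is 9 for 512-byte sectors). A small self-contained illustration of that check, using made-up values rather than real bio fields:

#include <stdbool.h>
#include <stdio.h>

#define SECTOR_SHIFT 9	/* a sector is 512 bytes, so bytes = sectors << 9 */

/* Hypothetical stand-ins for bio_data_dir(), bi_size and sectors_per_block. */
static bool writes_complete_block(bool is_write, unsigned bio_bytes,
				  unsigned sectors_per_block)
{
	return is_write && bio_bytes == (sectors_per_block << SECTOR_SHIFT);
}

int main(void)
{
	/* a 128-sector (64KiB) cache block: only a 65536-byte write qualifies */
	printf("%d\n", writes_complete_block(true, 128 << SECTOR_SHIFT, 128)); /* 1 */
	printf("%d\n", writes_complete_block(true, 4096, 128));                /* 0 */
	return 0;
}
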
1693 1754
1694// FIXME: refactor these two 1755static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
1695static void remap_cell_to_origin_clear_discard(struct cache *cache,
1696 struct dm_bio_prison_cell *cell,
1697 dm_oblock_t oblock, bool issue_holder)
1698{ 1756{
1699 struct bio *bio; 1757 return writeback_mode(&cache->features) &&
1700 unsigned long flags; 1758 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
1701 struct inc_detail detail;
1702
1703 detail.cache = cache;
1704 bio_list_init(&detail.bios_for_issue);
1705 bio_list_init(&detail.unhandled_bios);
1706 detail.any_writes = false;
1707
1708 spin_lock_irqsave(&cache->lock, flags);
1709 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
1710 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
1711 spin_unlock_irqrestore(&cache->lock, flags);
1712
1713 remap_to_origin(cache, cell->holder);
1714 if (issue_holder)
1715 issue(cache, cell->holder);
1716 else
1717 accounted_begin(cache, cell->holder);
1718
1719 if (detail.any_writes)
1720 clear_discard(cache, oblock_to_dblock(cache, oblock));
1721
1722 while ((bio = bio_list_pop(&detail.bios_for_issue))) {
1723 remap_to_origin(cache, bio);
1724 issue(cache, bio);
1725 }
1726
1727 free_prison_cell(cache, cell);
1728} 1759}
1729 1760
1730static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, 1761static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
1731 dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) 1762 bool *commit_needed)
1732{ 1763{
1733 struct bio *bio; 1764 int r, data_dir;
1734 unsigned long flags; 1765 bool rb, background_queued;
1735 struct inc_detail detail; 1766 dm_cblock_t cblock;
1736 1767 size_t pb_data_size = get_per_bio_data_size(cache);
1737 detail.cache = cache; 1768 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1738 bio_list_init(&detail.bios_for_issue);
1739 bio_list_init(&detail.unhandled_bios);
1740 detail.any_writes = false;
1741
1742 spin_lock_irqsave(&cache->lock, flags);
1743 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
1744 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
1745 spin_unlock_irqrestore(&cache->lock, flags);
1746 1769
1747 remap_to_cache(cache, cell->holder, cblock); 1770 *commit_needed = false;
1748 if (issue_holder)
1749 issue(cache, cell->holder);
1750 else
1751 accounted_begin(cache, cell->holder);
1752 1771
1753 if (detail.any_writes) { 1772 rb = bio_detain_shared(cache, block, bio);
1754 set_dirty(cache, oblock, cblock); 1773 if (!rb) {
1755 clear_discard(cache, oblock_to_dblock(cache, oblock)); 1774 /*
1756 } 1775 * An exclusive lock is held for this block, so we have to
1757 1776 * wait. We set the commit_needed flag so the current
1758 while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1777 * transaction will be committed asap, allowing this lock
1759 remap_to_cache(cache, bio, cblock); 1778 * to be dropped.
1760 issue(cache, bio); 1779 */
1780 *commit_needed = true;
1781 return DM_MAPIO_SUBMITTED;
1761 } 1782 }
1762 1783
1763 free_prison_cell(cache, cell); 1784 data_dir = bio_data_dir(bio);
1764}
1765 1785
1766/*----------------------------------------------------------------*/ 1786 if (optimisable_bio(cache, bio, block)) {
1767 1787 struct policy_work *op = NULL;
1768struct old_oblock_lock {
1769 struct policy_locker locker;
1770 struct cache *cache;
1771 struct prealloc *structs;
1772 struct dm_bio_prison_cell *cell;
1773};
1774 1788
1775static int null_locker(struct policy_locker *locker, dm_oblock_t b) 1789 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
1776{ 1790 if (unlikely(r && r != -ENOENT)) {
1777 /* This should never be called */ 1791 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
1778 BUG(); 1792 cache_device_name(cache), r);
1779 return 0; 1793 bio_io_error(bio);
1780} 1794 return DM_MAPIO_SUBMITTED;
1795 }
1781 1796
1782static int cell_locker(struct policy_locker *locker, dm_oblock_t b) 1797 if (r == -ENOENT && op) {
1783{ 1798 bio_drop_shared_lock(cache, bio);
1784 struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker); 1799 BUG_ON(op->op != POLICY_PROMOTE);
1785 struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs); 1800 mg_start(cache, op, bio);
1801 return DM_MAPIO_SUBMITTED;
1802 }
1803 } else {
1804 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
1805 if (unlikely(r && r != -ENOENT)) {
1806 DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
1807 cache_device_name(cache), r);
1808 bio_io_error(bio);
1809 return DM_MAPIO_SUBMITTED;
1810 }
1786 1811
1787 return bio_detain(l->cache, b, NULL, cell_prealloc, 1812 if (background_queued)
1788 (cell_free_fn) prealloc_put_cell, 1813 wake_migration_worker(cache);
1789 l->structs, &l->cell); 1814 }
1790}
1791 1815
1792static void process_cell(struct cache *cache, struct prealloc *structs, 1816 if (r == -ENOENT) {
1793 struct dm_bio_prison_cell *new_ocell) 1817 /*
1794{ 1818 * Miss.
1795 int r; 1819 */
1796 bool release_cell = true; 1820 inc_miss_counter(cache, bio);
1797 struct bio *bio = new_ocell->holder; 1821 if (pb->req_nr == 0) {
1798 dm_oblock_t block = get_bio_block(cache, bio); 1822 accounted_begin(cache, bio);
1799 struct policy_result lookup_result; 1823 remap_to_origin_clear_discard(cache, bio, block);
1800 bool passthrough = passthrough_mode(&cache->features);
1801 bool fast_promotion, can_migrate;
1802 struct old_oblock_lock ool;
1803
1804 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
1805 can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
1806
1807 ool.locker.fn = cell_locker;
1808 ool.cache = cache;
1809 ool.structs = structs;
1810 ool.cell = NULL;
1811 r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
1812 bio, &ool.locker, &lookup_result);
1813
1814 if (r == -EWOULDBLOCK)
1815 /* migration has been denied */
1816 lookup_result.op = POLICY_MISS;
1817
1818 switch (lookup_result.op) {
1819 case POLICY_HIT:
1820 if (passthrough) {
1821 inc_miss_counter(cache, bio);
1822 1824
1825 } else {
1823 /* 1826 /*
1824 * Passthrough always maps to the origin, 1827 * This is a duplicate writethrough io that is no
1825 * invalidating any cache blocks that are written 1828 * longer needed because the block has been demoted.
1826 * to.
1827 */ 1829 */
1830 bio_endio(bio);
1831 return DM_MAPIO_SUBMITTED;
1832 }
1833 } else {
1834 /*
1835 * Hit.
1836 */
1837 inc_hit_counter(cache, bio);
1828 1838
1839 /*
1840 * Passthrough always maps to the origin, invalidating any
1841 * cache blocks that are written to.
1842 */
1843 if (passthrough_mode(&cache->features)) {
1829 if (bio_data_dir(bio) == WRITE) { 1844 if (bio_data_dir(bio) == WRITE) {
1845 bio_drop_shared_lock(cache, bio);
1830 atomic_inc(&cache->stats.demotion); 1846 atomic_inc(&cache->stats.demotion);
1831 invalidate(cache, structs, block, lookup_result.cblock, new_ocell); 1847 invalidate_start(cache, cblock, block, bio);
1832 release_cell = false; 1848 } else
1833
1834 } else {
1835 /* FIXME: factor out issue_origin() */
1836 remap_to_origin_clear_discard(cache, bio, block); 1849 remap_to_origin_clear_discard(cache, bio, block);
1837 inc_and_issue(cache, bio, new_ocell); 1850
1838 }
1839 } else { 1851 } else {
1840 inc_hit_counter(cache, bio); 1852 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
1841 1853 !is_dirty(cache, cblock)) {
1842 if (bio_data_dir(bio) == WRITE && 1854 remap_to_origin_then_cache(cache, bio, block, cblock);
1843 writethrough_mode(&cache->features) && 1855 accounted_begin(cache, bio);
1844 !is_dirty(cache, lookup_result.cblock)) { 1856 } else
1845 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1857 remap_to_cache_dirty(cache, bio, block, cblock);
1846 inc_and_issue(cache, bio, new_ocell);
1847
1848 } else {
1849 remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
1850 release_cell = false;
1851 }
1852 } 1858 }
1853
1854 break;
1855
1856 case POLICY_MISS:
1857 inc_miss_counter(cache, bio);
1858 remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
1859 release_cell = false;
1860 break;
1861
1862 case POLICY_NEW:
1863 atomic_inc(&cache->stats.promotion);
1864 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1865 release_cell = false;
1866 break;
1867
1868 case POLICY_REPLACE:
1869 atomic_inc(&cache->stats.demotion);
1870 atomic_inc(&cache->stats.promotion);
1871 demote_then_promote(cache, structs, lookup_result.old_oblock,
1872 block, lookup_result.cblock,
1873 ool.cell, new_ocell);
1874 release_cell = false;
1875 break;
1876
1877 default:
1878 DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
1879 cache_device_name(cache), __func__,
1880 (unsigned) lookup_result.op);
1881 bio_io_error(bio);
1882 } 1859 }
1883 1860
1884 if (release_cell)
1885 cell_defer(cache, new_ocell, false);
1886}
1887
1888static void process_bio(struct cache *cache, struct prealloc *structs,
1889 struct bio *bio)
1890{
1891 int r;
1892 dm_oblock_t block = get_bio_block(cache, bio);
1893 struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1894
1895 /* 1861 /*
1896 * Check to see if that block is currently migrating. 1862 * dm core turns FUA requests into a separate payload and FLUSH req.
1897 */ 1863 */
1898 cell_prealloc = prealloc_get_cell(structs); 1864 if (bio->bi_opf & REQ_FUA) {
1899 r = bio_detain(cache, block, bio, cell_prealloc, 1865 /*
1900 (cell_free_fn) prealloc_put_cell, 1866 * issue_after_commit will call accounted_begin a second time. So
1901 structs, &new_ocell); 1867 * we call accounted_complete() to avoid double accounting.
1902 if (r > 0) 1868 */
1903 return; 1869 accounted_complete(cache, bio);
1870 issue_after_commit(&cache->committer, bio);
1871 *commit_needed = true;
1872 return DM_MAPIO_SUBMITTED;
1873 }
1904 1874
1905 process_cell(cache, structs, new_ocell); 1875 return DM_MAPIO_REMAPPED;
1906} 1876}
1907 1877
1908static int need_commit_due_to_time(struct cache *cache) 1878static bool process_bio(struct cache *cache, struct bio *bio)
1909{ 1879{
1910 return jiffies < cache->last_commit_jiffies || 1880 bool commit_needed;
1911 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; 1881
1882 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
1883 generic_make_request(bio);
1884
1885 return commit_needed;
1912} 1886}
1913 1887
1914/* 1888/*
@@ -1929,123 +1903,88 @@ static int commit(struct cache *cache, bool clean_shutdown)
1929 return r; 1903 return r;
1930} 1904}
1931 1905
1932static int commit_if_needed(struct cache *cache) 1906/*
1907 * Used by the batcher.
1908 */
1909static int commit_op(void *context)
1933{ 1910{
1934 int r = 0; 1911 struct cache *cache = context;
1935 1912
1936 if ((cache->commit_requested || need_commit_due_to_time(cache)) && 1913 if (dm_cache_changed_this_transaction(cache->cmd))
1937 dm_cache_changed_this_transaction(cache->cmd)) { 1914 return commit(cache, false);
1938 r = commit(cache, false);
1939 cache->commit_requested = false;
1940 cache->last_commit_jiffies = jiffies;
1941 }
1942 1915
1943 return r; 1916 return 0;
1944} 1917}
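
commit_op() above is the commit callback that batcher_init(), further down in this patch, wires into the cache's committer. The idea behind the batcher is that any number of bios can queue up behind one metadata commit and are all released when that single commit completes, rather than each forcing its own. The following is a deliberately stripped-down, single-threaded sketch of that pattern; the names are hypothetical and it omits the locking and work-queue plumbing of the real struct batcher:

#include <stdio.h>

#define MAX_WAITERS 16

struct toy_batcher {
	int (*commit_op)(void *);		/* e.g. a commit_op()-like callback */
	void *context;
	int nr_waiting;
	const char *waiting[MAX_WAITERS];	/* stand-ins for queued bios */
};

/* Queue an item that may only be issued once the next commit has landed. */
static void issue_after_commit(struct toy_batcher *b, const char *item)
{
	if (b->nr_waiting < MAX_WAITERS)
		b->waiting[b->nr_waiting++] = item;
}

/* One commit releases every queued item, however many there are. */
static void schedule_commit(struct toy_batcher *b)
{
	int i;

	if (!b->nr_waiting)
		return;
	if (b->commit_op(b->context)) {
		printf("commit failed, erroring %d items\n", b->nr_waiting);
		b->nr_waiting = 0;
		return;
	}
	for (i = 0; i < b->nr_waiting; i++)
		printf("issuing %s\n", b->waiting[i]);
	b->nr_waiting = 0;
}

static int fake_commit(void *context)
{
	(void)context;
	printf("committing metadata once\n");
	return 0;
}

int main(void)
{
	struct toy_batcher b = { .commit_op = fake_commit };

	issue_after_commit(&b, "REQ_FUA bio A");
	issue_after_commit(&b, "flush bio B");
	schedule_commit(&b);	/* one commit, both bios released */
	return 0;
}
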
1945 1918
1946static void process_deferred_bios(struct cache *cache) 1919/*----------------------------------------------------------------*/
1947{
1948 bool prealloc_used = false;
1949 unsigned long flags;
1950 struct bio_list bios;
1951 struct bio *bio;
1952 struct prealloc structs;
1953
1954 memset(&structs, 0, sizeof(structs));
1955 bio_list_init(&bios);
1956
1957 spin_lock_irqsave(&cache->lock, flags);
1958 bio_list_merge(&bios, &cache->deferred_bios);
1959 bio_list_init(&cache->deferred_bios);
1960 spin_unlock_irqrestore(&cache->lock, flags);
1961
1962 while (!bio_list_empty(&bios)) {
1963 /*
1964 * If we've got no free migration structs, and processing
1965 * this bio might require one, we pause until there are some
1966 * prepared mappings to process.
1967 */
1968 prealloc_used = true;
1969 if (prealloc_data_structs(cache, &structs)) {
1970 spin_lock_irqsave(&cache->lock, flags);
1971 bio_list_merge(&cache->deferred_bios, &bios);
1972 spin_unlock_irqrestore(&cache->lock, flags);
1973 break;
1974 }
1975 1920
1976 bio = bio_list_pop(&bios); 1921static bool process_flush_bio(struct cache *cache, struct bio *bio)
1922{
1923 size_t pb_data_size = get_per_bio_data_size(cache);
1924 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1977 1925
1978 if (bio->bi_opf & REQ_PREFLUSH) 1926 if (!pb->req_nr)
1979 process_flush_bio(cache, bio); 1927 remap_to_origin(cache, bio);
1980 else if (bio_op(bio) == REQ_OP_DISCARD) 1928 else
1981 process_discard_bio(cache, &structs, bio); 1929 remap_to_cache(cache, bio, 0);
1982 else
1983 process_bio(cache, &structs, bio);
1984 }
1985 1930
1986 if (prealloc_used) 1931 issue_after_commit(&cache->committer, bio);
1987 prealloc_free_structs(cache, &structs); 1932 return true;
1988} 1933}
1989 1934
1990static void process_deferred_cells(struct cache *cache) 1935static bool process_discard_bio(struct cache *cache, struct bio *bio)
1991{ 1936{
1992 bool prealloc_used = false; 1937 dm_dblock_t b, e;
1993 unsigned long flags;
1994 struct dm_bio_prison_cell *cell, *tmp;
1995 struct list_head cells;
1996 struct prealloc structs;
1997
1998 memset(&structs, 0, sizeof(structs));
1999
2000 INIT_LIST_HEAD(&cells);
2001
2002 spin_lock_irqsave(&cache->lock, flags);
2003 list_splice_init(&cache->deferred_cells, &cells);
2004 spin_unlock_irqrestore(&cache->lock, flags);
2005
2006 list_for_each_entry_safe(cell, tmp, &cells, user_list) {
2007 /*
2008 * If we've got no free migration structs, and processing
2009 * this bio might require one, we pause until there are some
2010 * prepared mappings to process.
2011 */
2012 prealloc_used = true;
2013 if (prealloc_data_structs(cache, &structs)) {
2014 spin_lock_irqsave(&cache->lock, flags);
2015 list_splice(&cells, &cache->deferred_cells);
2016 spin_unlock_irqrestore(&cache->lock, flags);
2017 break;
2018 }
2019 1938
2020 process_cell(cache, &structs, cell); 1939 // FIXME: do we need to lock the region? Or can we just assume the
1940 // user wont be so foolish as to issue discard concurrently with
1941 // other IO?
1942 calc_discard_block_range(cache, bio, &b, &e);
1943 while (b != e) {
1944 set_discard(cache, b);
1945 b = to_dblock(from_dblock(b) + 1);
2021 } 1946 }
2022 1947
2023 if (prealloc_used) 1948 bio_endio(bio);
2024 prealloc_free_structs(cache, &structs); 1949
1950 return false;
2025} 1951}
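
The new discard path above copies no data at all; it simply walks the half-open range [b, e) produced by calc_discard_block_range() and marks each discard-block. A toy bitmap version of that loop (a hypothetical helper, not the kernel's set_discard()/dm_dblock_t machinery):

#include <stdio.h>

#define BITS_PER_LONG (8 * (unsigned)sizeof(unsigned long))

static unsigned long bitmap[4];	/* enough for a small example device */

static void set_discard_bit(unsigned long b)
{
	bitmap[b / BITS_PER_LONG] |= 1UL << (b % BITS_PER_LONG);
}

int main(void)
{
	unsigned long b = 3, e = 7;	/* analogous to calc_discard_block_range() output */

	while (b != e) {		/* half-open: block 7 itself is left alone */
		set_discard_bit(b);
		b++;
	}
	printf("0x%lx\n", bitmap[0]);	/* bits 3..6 set => 0x78 */
	return 0;
}
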
2026 1952
2027static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 1953static void process_deferred_bios(struct work_struct *ws)
2028{ 1954{
1955 struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
1956
2029 unsigned long flags; 1957 unsigned long flags;
1958 bool commit_needed = false;
2030 struct bio_list bios; 1959 struct bio_list bios;
2031 struct bio *bio; 1960 struct bio *bio;
2032 1961
2033 bio_list_init(&bios); 1962 bio_list_init(&bios);
2034 1963
2035 spin_lock_irqsave(&cache->lock, flags); 1964 spin_lock_irqsave(&cache->lock, flags);
2036 bio_list_merge(&bios, &cache->deferred_flush_bios); 1965 bio_list_merge(&bios, &cache->deferred_bios);
2037 bio_list_init(&cache->deferred_flush_bios); 1966 bio_list_init(&cache->deferred_bios);
2038 spin_unlock_irqrestore(&cache->lock, flags); 1967 spin_unlock_irqrestore(&cache->lock, flags);
2039 1968
2040 /* 1969 while ((bio = bio_list_pop(&bios))) {
2041 * These bios have already been through inc_ds() 1970 if (bio->bi_opf & REQ_PREFLUSH)
2042 */ 1971 commit_needed = process_flush_bio(cache, bio) || commit_needed;
2043 while ((bio = bio_list_pop(&bios))) 1972
2044 submit_bios ? accounted_request(cache, bio) : bio_io_error(bio); 1973 else if (bio_op(bio) == REQ_OP_DISCARD)
1974 commit_needed = process_discard_bio(cache, bio) || commit_needed;
1975
1976 else
1977 commit_needed = process_bio(cache, bio) || commit_needed;
1978 }
1979
1980 if (commit_needed)
1981 schedule_commit(&cache->committer);
2045} 1982}
2046 1983
2047static void process_deferred_writethrough_bios(struct cache *cache) 1984static void process_deferred_writethrough_bios(struct work_struct *ws)
2048{ 1985{
1986 struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker);
1987
2049 unsigned long flags; 1988 unsigned long flags;
2050 struct bio_list bios; 1989 struct bio_list bios;
2051 struct bio *bio; 1990 struct bio *bio;
@@ -2058,153 +1997,15 @@ static void process_deferred_writethrough_bios(struct cache *cache)
2058 spin_unlock_irqrestore(&cache->lock, flags); 1997 spin_unlock_irqrestore(&cache->lock, flags);
2059 1998
2060 /* 1999 /*
2061 * These bios have already been through inc_ds() 2000 * These bios have already been through accounted_begin()
2062 */ 2001 */
2063 while ((bio = bio_list_pop(&bios))) 2002 while ((bio = bio_list_pop(&bios)))
2064 accounted_request(cache, bio); 2003 generic_make_request(bio);
2065}
2066
2067static void writeback_some_dirty_blocks(struct cache *cache)
2068{
2069 bool prealloc_used = false;
2070 dm_oblock_t oblock;
2071 dm_cblock_t cblock;
2072 struct prealloc structs;
2073 struct dm_bio_prison_cell *old_ocell;
2074 bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
2075
2076 memset(&structs, 0, sizeof(structs));
2077
2078 while (spare_migration_bandwidth(cache)) {
2079 if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
2080 break; /* no work to do */
2081
2082 prealloc_used = true;
2083 if (prealloc_data_structs(cache, &structs) ||
2084 get_cell(cache, oblock, &structs, &old_ocell)) {
2085 policy_set_dirty(cache->policy, oblock);
2086 break;
2087 }
2088
2089 writeback(cache, &structs, oblock, cblock, old_ocell);
2090 }
2091
2092 if (prealloc_used)
2093 prealloc_free_structs(cache, &structs);
2094}
2095
2096/*----------------------------------------------------------------
2097 * Invalidations.
2098 * Dropping something from the cache *without* writing back.
2099 *--------------------------------------------------------------*/
2100
2101static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
2102{
2103 int r = 0;
2104 uint64_t begin = from_cblock(req->cblocks->begin);
2105 uint64_t end = from_cblock(req->cblocks->end);
2106
2107 while (begin != end) {
2108 r = policy_remove_cblock(cache->policy, to_cblock(begin));
2109 if (!r) {
2110 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
2111 if (r) {
2112 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
2113 break;
2114 }
2115
2116 } else if (r == -ENODATA) {
2117 /* harmless, already unmapped */
2118 r = 0;
2119
2120 } else {
2121 DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
2122 break;
2123 }
2124
2125 begin++;
2126 }
2127
2128 cache->commit_requested = true;
2129
2130 req->err = r;
2131 atomic_set(&req->complete, 1);
2132
2133 wake_up(&req->result_wait);
2134}
2135
2136static void process_invalidation_requests(struct cache *cache)
2137{
2138 struct list_head list;
2139 struct invalidation_request *req, *tmp;
2140
2141 INIT_LIST_HEAD(&list);
2142 spin_lock(&cache->invalidation_lock);
2143 list_splice_init(&cache->invalidation_requests, &list);
2144 spin_unlock(&cache->invalidation_lock);
2145
2146 list_for_each_entry_safe (req, tmp, &list, list)
2147 process_invalidation_request(cache, req);
2148} 2004}
2149 2005
2150/*---------------------------------------------------------------- 2006/*----------------------------------------------------------------
2151 * Main worker loop 2007 * Main worker loop
2152 *--------------------------------------------------------------*/ 2008 *--------------------------------------------------------------*/
2153static bool is_quiescing(struct cache *cache)
2154{
2155 return atomic_read(&cache->quiescing);
2156}
2157
2158static void ack_quiescing(struct cache *cache)
2159{
2160 if (is_quiescing(cache)) {
2161 atomic_inc(&cache->quiescing_ack);
2162 wake_up(&cache->quiescing_wait);
2163 }
2164}
2165
2166static void wait_for_quiescing_ack(struct cache *cache)
2167{
2168 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
2169}
2170
2171static void start_quiescing(struct cache *cache)
2172{
2173 atomic_inc(&cache->quiescing);
2174 wait_for_quiescing_ack(cache);
2175}
2176
2177static void stop_quiescing(struct cache *cache)
2178{
2179 atomic_set(&cache->quiescing, 0);
2180 atomic_set(&cache->quiescing_ack, 0);
2181}
2182
2183static void wait_for_migrations(struct cache *cache)
2184{
2185 wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
2186}
2187
2188static void stop_worker(struct cache *cache)
2189{
2190 cancel_delayed_work(&cache->waker);
2191 flush_workqueue(cache->wq);
2192}
2193
2194static void requeue_deferred_cells(struct cache *cache)
2195{
2196 unsigned long flags;
2197 struct list_head cells;
2198 struct dm_bio_prison_cell *cell, *tmp;
2199
2200 INIT_LIST_HEAD(&cells);
2201 spin_lock_irqsave(&cache->lock, flags);
2202 list_splice_init(&cache->deferred_cells, &cells);
2203 spin_unlock_irqrestore(&cache->lock, flags);
2204
2205 list_for_each_entry_safe(cell, tmp, &cells, user_list)
2206 cell_requeue(cache, cell);
2207}
2208 2009
2209static void requeue_deferred_bios(struct cache *cache) 2010static void requeue_deferred_bios(struct cache *cache)
2210{ 2011{
@@ -2221,53 +2022,6 @@ static void requeue_deferred_bios(struct cache *cache)
2221 } 2022 }
2222} 2023}
2223 2024
2224static int more_work(struct cache *cache)
2225{
2226 if (is_quiescing(cache))
2227 return !list_empty(&cache->quiesced_migrations) ||
2228 !list_empty(&cache->completed_migrations) ||
2229 !list_empty(&cache->need_commit_migrations);
2230 else
2231 return !bio_list_empty(&cache->deferred_bios) ||
2232 !list_empty(&cache->deferred_cells) ||
2233 !bio_list_empty(&cache->deferred_flush_bios) ||
2234 !bio_list_empty(&cache->deferred_writethrough_bios) ||
2235 !list_empty(&cache->quiesced_migrations) ||
2236 !list_empty(&cache->completed_migrations) ||
2237 !list_empty(&cache->need_commit_migrations) ||
2238 cache->invalidate;
2239}
2240
2241static void do_worker(struct work_struct *ws)
2242{
2243 struct cache *cache = container_of(ws, struct cache, worker);
2244
2245 do {
2246 if (!is_quiescing(cache)) {
2247 writeback_some_dirty_blocks(cache);
2248 process_deferred_writethrough_bios(cache);
2249 process_deferred_bios(cache);
2250 process_deferred_cells(cache);
2251 process_invalidation_requests(cache);
2252 }
2253
2254 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
2255 process_migrations(cache, &cache->completed_migrations, complete_migration);
2256
2257 if (commit_if_needed(cache)) {
2258 process_deferred_flush_bios(cache, false);
2259 process_migrations(cache, &cache->need_commit_migrations, migration_failure);
2260 } else {
2261 process_deferred_flush_bios(cache, true);
2262 process_migrations(cache, &cache->need_commit_migrations,
2263 migration_success_post_commit);
2264 }
2265
2266 ack_quiescing(cache);
2267
2268 } while (more_work(cache));
2269}
2270
2271/* 2025/*
2272 * We want to commit periodically so that not too much 2026 * We want to commit periodically so that not too much
2273 * unwritten metadata builds up. 2027 * unwritten metadata builds up.
@@ -2275,25 +2029,39 @@ static void do_worker(struct work_struct *ws)
2275static void do_waker(struct work_struct *ws) 2029static void do_waker(struct work_struct *ws)
2276{ 2030{
2277 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 2031 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
2032
2278 policy_tick(cache->policy, true); 2033 policy_tick(cache->policy, true);
2279 wake_worker(cache); 2034 wake_migration_worker(cache);
2035 schedule_commit(&cache->committer);
2280 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 2036 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
2281} 2037}
2282 2038
2283/*----------------------------------------------------------------*/ 2039static void check_migrations(struct work_struct *ws)
2284
2285static int is_congested(struct dm_dev *dev, int bdi_bits)
2286{ 2040{
2287 struct request_queue *q = bdev_get_queue(dev->bdev); 2041 int r;
2288 return bdi_congested(q->backing_dev_info, bdi_bits); 2042 struct policy_work *op;
2289} 2043 struct cache *cache = container_of(ws, struct cache, migration_worker);
2044 enum busy b;
2290 2045
2291static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2046 for (;;) {
2292{ 2047 b = spare_migration_bandwidth(cache);
2293 struct cache *cache = container_of(cb, struct cache, callbacks); 2048 if (b == BUSY)
2049 break;
2294 2050
2295 return is_congested(cache->origin_dev, bdi_bits) || 2051 r = policy_get_background_work(cache->policy, b == IDLE, &op);
2296 is_congested(cache->cache_dev, bdi_bits); 2052 if (r == -ENODATA)
2053 break;
2054
2055 if (r) {
2056 DMERR_LIMIT("%s: policy_background_work failed",
2057 cache_device_name(cache));
2058 break;
2059 }
2060
2061 r = mg_start(cache, op, NULL);
2062 if (r)
2063 break;
2064 }
2297} 2065}
2298 2066
2299/*---------------------------------------------------------------- 2067/*----------------------------------------------------------------
@@ -2310,11 +2078,8 @@ static void destroy(struct cache *cache)
2310 2078
2311 mempool_destroy(cache->migration_pool); 2079 mempool_destroy(cache->migration_pool);
2312 2080
2313 if (cache->all_io_ds)
2314 dm_deferred_set_destroy(cache->all_io_ds);
2315
2316 if (cache->prison) 2081 if (cache->prison)
2317 dm_bio_prison_destroy(cache->prison); 2082 dm_bio_prison_destroy_v2(cache->prison);
2318 2083
2319 if (cache->wq) 2084 if (cache->wq)
2320 destroy_workqueue(cache->wq); 2085 destroy_workqueue(cache->wq);
@@ -2707,6 +2472,7 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2707 return PTR_ERR(p); 2472 return PTR_ERR(p);
2708 } 2473 }
2709 cache->policy = p; 2474 cache->policy = p;
2475 BUG_ON(!cache->policy);
2710 2476
2711 return 0; 2477 return 0;
2712} 2478}
@@ -2750,6 +2516,20 @@ static void set_cache_size(struct cache *cache, dm_cblock_t size)
2750 cache->cache_size = size; 2516 cache->cache_size = size;
2751} 2517}
2752 2518
2519static int is_congested(struct dm_dev *dev, int bdi_bits)
2520{
2521 struct request_queue *q = bdev_get_queue(dev->bdev);
2522 return bdi_congested(q->backing_dev_info, bdi_bits);
2523}
2524
2525static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2526{
2527 struct cache *cache = container_of(cb, struct cache, callbacks);
2528
2529 return is_congested(cache->origin_dev, bdi_bits) ||
2530 is_congested(cache->cache_dev, bdi_bits);
2531}
2532
2753#define DEFAULT_MIGRATION_THRESHOLD 2048 2533#define DEFAULT_MIGRATION_THRESHOLD 2048
2754 2534
2755static int cache_create(struct cache_args *ca, struct cache **result) 2535static int cache_create(struct cache_args *ca, struct cache **result)
@@ -2788,7 +2568,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2788 2568
2789 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2569 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2790 2570
2791 /* FIXME: factor out this whole section */
2792 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2571 origin_blocks = cache->origin_sectors = ca->origin_sectors;
2793 origin_blocks = block_div(origin_blocks, ca->block_size); 2572 origin_blocks = block_div(origin_blocks, ca->block_size);
2794 cache->origin_blocks = to_oblock(origin_blocks); 2573 cache->origin_blocks = to_oblock(origin_blocks);
@@ -2854,24 +2633,18 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2854 r = -EINVAL; 2633 r = -EINVAL;
2855 goto bad; 2634 goto bad;
2856 } 2635 }
2636
2637 policy_allow_migrations(cache->policy, false);
2857 } 2638 }
2858 2639
2859 spin_lock_init(&cache->lock); 2640 spin_lock_init(&cache->lock);
2860 INIT_LIST_HEAD(&cache->deferred_cells); 2641 INIT_LIST_HEAD(&cache->deferred_cells);
2861 bio_list_init(&cache->deferred_bios); 2642 bio_list_init(&cache->deferred_bios);
2862 bio_list_init(&cache->deferred_flush_bios);
2863 bio_list_init(&cache->deferred_writethrough_bios); 2643 bio_list_init(&cache->deferred_writethrough_bios);
2864 INIT_LIST_HEAD(&cache->quiesced_migrations);
2865 INIT_LIST_HEAD(&cache->completed_migrations);
2866 INIT_LIST_HEAD(&cache->need_commit_migrations);
2867 atomic_set(&cache->nr_allocated_migrations, 0); 2644 atomic_set(&cache->nr_allocated_migrations, 0);
2868 atomic_set(&cache->nr_io_migrations, 0); 2645 atomic_set(&cache->nr_io_migrations, 0);
2869 init_waitqueue_head(&cache->migration_wait); 2646 init_waitqueue_head(&cache->migration_wait);
2870 2647
2871 init_waitqueue_head(&cache->quiescing_wait);
2872 atomic_set(&cache->quiescing, 0);
2873 atomic_set(&cache->quiescing_ack, 0);
2874
2875 r = -ENOMEM; 2648 r = -ENOMEM;
2876 atomic_set(&cache->nr_dirty, 0); 2649 atomic_set(&cache->nr_dirty, 0);
2877 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2650 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
@@ -2900,27 +2673,23 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2900 goto bad; 2673 goto bad;
2901 } 2674 }
2902 2675
2903 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2676 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
2904 if (!cache->wq) { 2677 if (!cache->wq) {
2905 *error = "could not create workqueue for metadata object"; 2678 *error = "could not create workqueue for metadata object";
2906 goto bad; 2679 goto bad;
2907 } 2680 }
2908 INIT_WORK(&cache->worker, do_worker); 2681 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
2682 INIT_WORK(&cache->deferred_writethrough_worker,
2683 process_deferred_writethrough_bios);
2684 INIT_WORK(&cache->migration_worker, check_migrations);
2909 INIT_DELAYED_WORK(&cache->waker, do_waker); 2685 INIT_DELAYED_WORK(&cache->waker, do_waker);
2910 cache->last_commit_jiffies = jiffies;
2911 2686
2912 cache->prison = dm_bio_prison_create(); 2687 cache->prison = dm_bio_prison_create_v2(cache->wq);
2913 if (!cache->prison) { 2688 if (!cache->prison) {
2914 *error = "could not create bio prison"; 2689 *error = "could not create bio prison";
2915 goto bad; 2690 goto bad;
2916 } 2691 }
2917 2692
2918 cache->all_io_ds = dm_deferred_set_create();
2919 if (!cache->all_io_ds) {
2920 *error = "could not create all_io deferred set";
2921 goto bad;
2922 }
2923
2924 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2693 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2925 migration_cache); 2694 migration_cache);
2926 if (!cache->migration_pool) { 2695 if (!cache->migration_pool) {
@@ -2947,11 +2716,15 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2947 spin_lock_init(&cache->invalidation_lock); 2716 spin_lock_init(&cache->invalidation_lock);
2948 INIT_LIST_HEAD(&cache->invalidation_requests); 2717 INIT_LIST_HEAD(&cache->invalidation_requests);
2949 2718
2719 batcher_init(&cache->committer, commit_op, cache,
2720 issue_op, cache, cache->wq);
2950 iot_init(&cache->origin_tracker); 2721 iot_init(&cache->origin_tracker);
2951 2722
2723 init_rwsem(&cache->background_work_lock);
2724 prevent_background_work(cache);
2725
2952 *result = cache; 2726 *result = cache;
2953 return 0; 2727 return 0;
2954
2955bad: 2728bad:
2956 destroy(cache); 2729 destroy(cache);
2957 return r; 2730 return r;
@@ -3009,7 +2782,6 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
3009 } 2782 }
3010 2783
3011 ti->private = cache; 2784 ti->private = cache;
3012
3013out: 2785out:
3014 destroy_cache_args(ca); 2786 destroy_cache_args(ca);
3015 return r; 2787 return r;
@@ -3022,17 +2794,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
3022 struct cache *cache = ti->private; 2794 struct cache *cache = ti->private;
3023 2795
3024 int r; 2796 int r;
3025 struct dm_bio_prison_cell *cell = NULL; 2797 bool commit_needed;
3026 dm_oblock_t block = get_bio_block(cache, bio); 2798 dm_oblock_t block = get_bio_block(cache, bio);
3027 size_t pb_data_size = get_per_bio_data_size(cache); 2799 size_t pb_data_size = get_per_bio_data_size(cache);
3028 bool can_migrate = false;
3029 bool fast_promotion;
3030 struct policy_result lookup_result;
3031 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
3032 struct old_oblock_lock ool;
3033
3034 ool.locker.fn = null_locker;
3035 2800
2801 init_per_bio_data(bio, pb_data_size);
3036 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2802 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
3037 /* 2803 /*
3038 * This can only occur if the io goes to a partial block at 2804 * This can only occur if the io goes to a partial block at
@@ -3049,101 +2815,9 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
3049 return DM_MAPIO_SUBMITTED; 2815 return DM_MAPIO_SUBMITTED;
3050 } 2816 }
3051 2817
3052 /* 2818 r = map_bio(cache, bio, block, &commit_needed);
3053 * Check to see if that block is currently migrating. 2819 if (commit_needed)
3054 */ 2820 schedule_commit(&cache->committer);
3055 cell = alloc_prison_cell(cache);
3056 if (!cell) {
3057 defer_bio(cache, bio);
3058 return DM_MAPIO_SUBMITTED;
3059 }
3060
3061 r = bio_detain(cache, block, bio, cell,
3062 (cell_free_fn) free_prison_cell,
3063 cache, &cell);
3064 if (r) {
3065 if (r < 0)
3066 defer_bio(cache, bio);
3067
3068 return DM_MAPIO_SUBMITTED;
3069 }
3070
3071 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
3072
3073 r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
3074 bio, &ool.locker, &lookup_result);
3075 if (r == -EWOULDBLOCK) {
3076 cell_defer(cache, cell, true);
3077 return DM_MAPIO_SUBMITTED;
3078
3079 } else if (r) {
3080 DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
3081 cache_device_name(cache), r);
3082 cell_defer(cache, cell, false);
3083 bio_io_error(bio);
3084 return DM_MAPIO_SUBMITTED;
3085 }
3086
3087 r = DM_MAPIO_REMAPPED;
3088 switch (lookup_result.op) {
3089 case POLICY_HIT:
3090 if (passthrough_mode(&cache->features)) {
3091 if (bio_data_dir(bio) == WRITE) {
3092 /*
3093 * We need to invalidate this block, so
3094 * defer for the worker thread.
3095 */
3096 cell_defer(cache, cell, true);
3097 r = DM_MAPIO_SUBMITTED;
3098
3099 } else {
3100 inc_miss_counter(cache, bio);
3101 remap_to_origin_clear_discard(cache, bio, block);
3102 accounted_begin(cache, bio);
3103 inc_ds(cache, bio, cell);
3104 // FIXME: we want to remap hits or misses straight
3105 // away rather than passing over to the worker.
3106 cell_defer(cache, cell, false);
3107 }
3108
3109 } else {
3110 inc_hit_counter(cache, bio);
3111 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
3112 !is_dirty(cache, lookup_result.cblock)) {
3113 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
3114 accounted_begin(cache, bio);
3115 inc_ds(cache, bio, cell);
3116 cell_defer(cache, cell, false);
3117
3118 } else
3119 remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
3120 }
3121 break;
3122
3123 case POLICY_MISS:
3124 inc_miss_counter(cache, bio);
3125 if (pb->req_nr != 0) {
3126 /*
3127 * This is a duplicate writethrough io that is no
3128 * longer needed because the block has been demoted.
3129 */
3130 bio_endio(bio);
3131 // FIXME: remap everything as a miss
3132 cell_defer(cache, cell, false);
3133 r = DM_MAPIO_SUBMITTED;
3134
3135 } else
3136 remap_cell_to_origin_clear_discard(cache, cell, block, false);
3137 break;
3138
3139 default:
3140 DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
3141 cache_device_name(cache), __func__,
3142 (unsigned) lookup_result.op);
3143 cell_defer(cache, cell, false);
3144 bio_io_error(bio);
3145 r = DM_MAPIO_SUBMITTED;
3146 }
3147 2821
3148 return r; 2822 return r;
3149} 2823}
@@ -3163,7 +2837,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
3163 spin_unlock_irqrestore(&cache->lock, flags); 2837 spin_unlock_irqrestore(&cache->lock, flags);
3164 } 2838 }
3165 2839
3166 check_for_quiesced_migrations(cache, pb); 2840 bio_drop_shared_lock(cache, bio);
3167 accounted_complete(cache, bio); 2841 accounted_complete(cache, bio);
3168 2842
3169 return 0; 2843 return 0;
@@ -3263,12 +2937,18 @@ static void cache_postsuspend(struct dm_target *ti)
3263{ 2937{
3264 struct cache *cache = ti->private; 2938 struct cache *cache = ti->private;
3265 2939
3266 start_quiescing(cache); 2940 prevent_background_work(cache);
3267 wait_for_migrations(cache); 2941 BUG_ON(atomic_read(&cache->nr_io_migrations));
3268 stop_worker(cache); 2942
2943 cancel_delayed_work(&cache->waker);
2944 flush_workqueue(cache->wq);
2945 WARN_ON(cache->origin_tracker.in_flight);
2946
2947 /*
2948 * If it's a flush suspend there won't be any deferred bios, so this
2949 * call is harmless.
2950 */
3269 requeue_deferred_bios(cache); 2951 requeue_deferred_bios(cache);
3270 requeue_deferred_cells(cache);
3271 stop_quiescing(cache);
3272 2952
3273 if (get_cache_mode(cache) == CM_WRITE) 2953 if (get_cache_mode(cache) == CM_WRITE)
3274 (void) sync_metadata(cache); 2954 (void) sync_metadata(cache);
@@ -3280,15 +2960,10 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
3280 int r; 2960 int r;
3281 struct cache *cache = context; 2961 struct cache *cache = context;
3282 2962
3283 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 2963 r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
3284 if (r) 2964 if (r)
3285 return r; 2965 return r;
3286 2966
3287 if (dirty)
3288 set_dirty(cache, oblock, cblock);
3289 else
3290 clear_dirty(cache, oblock, cblock);
3291
3292 return 0; 2967 return 0;
3293} 2968}
3294 2969
@@ -3487,6 +3162,7 @@ static void cache_resume(struct dm_target *ti)
3487 struct cache *cache = ti->private; 3162 struct cache *cache = ti->private;
3488 3163
3489 cache->need_tick_bio = true; 3164 cache->need_tick_bio = true;
3165 allow_background_work(cache);
3490 do_waker(&cache->waker.work); 3166 do_waker(&cache->waker.work);
3491} 3167}
3492 3168
@@ -3621,10 +3297,19 @@ err:
3621} 3297}
3622 3298
3623/* 3299/*
 3300 * Defines a range of cblocks: begin to (end - 1) are in the range; end is
3301 * the one-past-the-end value.
3302 */
3303struct cblock_range {
3304 dm_cblock_t begin;
3305 dm_cblock_t end;
3306};
3307
3308/*
3624 * A cache block range can take two forms: 3309 * A cache block range can take two forms:
3625 * 3310 *
3626 * i) A single cblock, eg. '3456' 3311 * i) A single cblock, eg. '3456'
3627 * ii) A begin and end cblock with dots between, eg. 123-234 3312 * ii) A begin and end cblock with a dash between, eg. 123-234
3628 */ 3313 */
3629static int parse_cblock_range(struct cache *cache, const char *str, 3314static int parse_cblock_range(struct cache *cache, const char *str,
3630 struct cblock_range *result) 3315 struct cblock_range *result)
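
As the comment above says, an invalidation range is either a single cblock such as '3456' or a dash-separated pair such as '123-234', with the second value read as one-past-the-end per struct cblock_range. A self-contained sketch of such a parser, using plain integers instead of dm_cblock_t and simplified error handling, so it is only an approximation of the kernel function:

#include <stdio.h>

struct toy_cblock_range {
	unsigned long long begin;
	unsigned long long end;	/* one past the last cblock in the range */
};

/* Returns 0 on success, -1 if the string is malformed. */
static int parse_range(const char *str, struct toy_cblock_range *result)
{
	unsigned long long b, e;
	char dummy;

	/* the trailing %c must fail to match, which rejects trailing garbage */
	if (sscanf(str, "%llu-%llu%c", &b, &e, &dummy) == 2) {
		result->begin = b;
		result->end = e;
		return 0;
	}
	if (sscanf(str, "%llu%c", &b, &dummy) == 1) {
		result->begin = b;
		result->end = b + 1;
		return 0;
	}
	return -1;
}

int main(void)
{
	struct toy_cblock_range r;

	if (!parse_range("123-234", &r))
		printf("[%llu, %llu)\n", r.begin, r.end);	/* [123, 234) */
	if (!parse_range("3456", &r))
		printf("[%llu, %llu)\n", r.begin, r.end);	/* [3456, 3457) */
	return 0;
}
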
@@ -3690,23 +3375,31 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range
3690 return 0; 3375 return 0;
3691} 3376}
3692 3377
3378static inline dm_cblock_t cblock_succ(dm_cblock_t b)
3379{
3380 return to_cblock(from_cblock(b) + 1);
3381}
3382
3693static int request_invalidation(struct cache *cache, struct cblock_range *range) 3383static int request_invalidation(struct cache *cache, struct cblock_range *range)
3694{ 3384{
3695 struct invalidation_request req; 3385 int r = 0;
3696 3386
3697 INIT_LIST_HEAD(&req.list); 3387 /*
3698 req.cblocks = range; 3388 * We don't need to do any locking here because we know we're in
 3699 atomic_set(&req.complete, 0); 3389 * passthrough mode. There is potential for a race between an
3700 req.err = 0; 3390 * invalidation triggered by an io and an invalidation message. This
 3701 init_waitqueue_head(&req.result_wait); 3391 * is harmless; we needn't worry if the policy call fails.
3392 */
3393 while (range->begin != range->end) {
3394 r = invalidate_cblock(cache, range->begin);
3395 if (r)
3396 return r;
3702 3397
3703 spin_lock(&cache->invalidation_lock); 3398 range->begin = cblock_succ(range->begin);
3704 list_add(&req.list, &cache->invalidation_requests); 3399 }
3705 spin_unlock(&cache->invalidation_lock);
3706 wake_worker(cache);
3707 3400
3708 wait_event(req.result_wait, atomic_read(&req.complete)); 3401 cache->commit_requested = true;
3709 return req.err; 3402 return r;
3710} 3403}
3711 3404
3712static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3405static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
@@ -3816,7 +3509,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3816 3509
3817static struct target_type cache_target = { 3510static struct target_type cache_target = {
3818 .name = "cache", 3511 .name = "cache",
3819 .version = {1, 10, 0}, 3512 .version = {2, 0, 0},
3820 .module = THIS_MODULE, 3513 .module = THIS_MODULE,
3821 .ctr = cache_ctr, 3514 .ctr = cache_ctr,
3822 .dtr = cache_dtr, 3515 .dtr = cache_dtr,