path: root/drivers/md/dm-thin.c
author    Linus Torvalds <torvalds@linux-foundation.org>  2012-03-28 15:55:04 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-03-28 15:55:04 -0400
commit    89e5d6f0d979f6e7dc2bbb1ebd9e239217e2e952 (patch)
tree      1126044004b73df905a6183430376f1d97c3b6c9 /drivers/md/dm-thin.c
parent    516e77977085c9c50703fabb5dc61bd57a8cc1d0 (diff)
parent    a4ffc152198efba2ed9e6eac0eb97f17bfebce85 (diff)
Merge tag 'dm-3.4-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm
Pull device-mapper changes for 3.4 from Alasdair Kergon:

 - Update thin provisioning to support read-only external snapshot
   origins and discards.
 - A new target, dm verity, for device content validation.
 - Mark dm uevent and dm raid as no-longer-experimental.
 - Miscellaneous other fixes and clean-ups.

* tag 'dm-3.4-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm: (27 commits)
  dm: add verity target
  dm bufio: prefetch
  dm thin: add pool target flags to control discard
  dm thin: support discards
  dm thin: prepare to support discard
  dm thin: use dm_target_offset
  dm thin: support read only external snapshot origins
  dm thin: relax hard limit on the maximum size of a metadata device
  dm persistent data: remove space map ref_count entries if redundant
  dm thin: commit outstanding data every second
  dm: reject trailing characters in sccanf input
  dm raid: handle failed devices during start up
  dm thin metadata: pass correct space map to dm_sm_root_size
  dm persistent data: remove redundant value_size arg from value_ptr
  dm mpath: detect invalid map_context
  dm: clear bi_end_io on remapping failure
  dm table: simplify call to free_devices
  dm thin: correct comments
  dm raid: no longer experimental
  dm uevent: no longer experimental
  ...
Diffstat (limited to 'drivers/md/dm-thin.c')
-rw-r--r--  drivers/md/dm-thin.c  |  680
1 file changed, 508 insertions(+), 172 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c3087575fef0..213ae32a0fc4 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -23,6 +23,7 @@
23#define DEFERRED_SET_SIZE 64 23#define DEFERRED_SET_SIZE 64
24#define MAPPING_POOL_SIZE 1024 24#define MAPPING_POOL_SIZE 1024
25#define PRISON_CELLS 1024 25#define PRISON_CELLS 1024
26#define COMMIT_PERIOD HZ
26 27
27/* 28/*
28 * The block size of the device holding pool data must be 29 * The block size of the device holding pool data must be
@@ -32,16 +33,6 @@
32#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) 33#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
33 34
34/* 35/*
35 * The metadata device is currently limited in size. The limitation is
36 * checked lower down in dm-space-map-metadata, but we also check it here
37 * so we can fail early.
38 *
39 * We have one block of index, which can hold 255 index entries. Each
40 * index entry contains allocation info about 16k metadata blocks.
41 */
42#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
43
44/*
45 * Device id is restricted to 24 bits. 36 * Device id is restricted to 24 bits.
46 */ 37 */
47#define MAX_DEV_ID ((1 << 24) - 1) 38#define MAX_DEV_ID ((1 << 24) - 1)
@@ -72,7 +63,7 @@
72 * missed out if the io covers the block. (schedule_copy). 63 * missed out if the io covers the block. (schedule_copy).
73 * 64 *
74 * iv) insert the new mapping into the origin's btree 65 * iv) insert the new mapping into the origin's btree
75 * (process_prepared_mappings). This act of inserting breaks some 66 * (process_prepared_mapping). This act of inserting breaks some
76 * sharing of btree nodes between the two devices. Breaking sharing only 67 * sharing of btree nodes between the two devices. Breaking sharing only
77 * effects the btree of that specific device. Btrees for the other 68 * effects the btree of that specific device. Btrees for the other
78 * devices that share the block never change. The btree for the origin 69 * devices that share the block never change. The btree for the origin
@@ -124,7 +115,7 @@ struct cell {
124 struct hlist_node list; 115 struct hlist_node list;
125 struct bio_prison *prison; 116 struct bio_prison *prison;
126 struct cell_key key; 117 struct cell_key key;
127 unsigned count; 118 struct bio *holder;
128 struct bio_list bios; 119 struct bio_list bios;
129}; 120};
130 121
@@ -220,54 +211,59 @@ static struct cell *__search_bucket(struct hlist_head *bucket,
220 * This may block if a new cell needs allocating. You must ensure that 211 * This may block if a new cell needs allocating. You must ensure that
221 * cells will be unlocked even if the calling thread is blocked. 212 * cells will be unlocked even if the calling thread is blocked.
222 * 213 *
223 * Returns the number of entries in the cell prior to the new addition 214 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
224 * or < 0 on failure.
225 */ 215 */
226static int bio_detain(struct bio_prison *prison, struct cell_key *key, 216static int bio_detain(struct bio_prison *prison, struct cell_key *key,
227 struct bio *inmate, struct cell **ref) 217 struct bio *inmate, struct cell **ref)
228{ 218{
229 int r; 219 int r = 1;
230 unsigned long flags; 220 unsigned long flags;
231 uint32_t hash = hash_key(prison, key); 221 uint32_t hash = hash_key(prison, key);
232 struct cell *uninitialized_var(cell), *cell2 = NULL; 222 struct cell *cell, *cell2;
233 223
234 BUG_ON(hash > prison->nr_buckets); 224 BUG_ON(hash > prison->nr_buckets);
235 225
236 spin_lock_irqsave(&prison->lock, flags); 226 spin_lock_irqsave(&prison->lock, flags);
227
237 cell = __search_bucket(prison->cells + hash, key); 228 cell = __search_bucket(prison->cells + hash, key);
229 if (cell) {
230 bio_list_add(&cell->bios, inmate);
231 goto out;
232 }
238 233
239 if (!cell) { 234 /*
240 /* 235 * Allocate a new cell
241 * Allocate a new cell 236 */
242 */ 237 spin_unlock_irqrestore(&prison->lock, flags);
243 spin_unlock_irqrestore(&prison->lock, flags); 238 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
244 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); 239 spin_lock_irqsave(&prison->lock, flags);
245 spin_lock_irqsave(&prison->lock, flags);
246 240
247 /* 241 /*
248 * We've been unlocked, so we have to double check that 242 * We've been unlocked, so we have to double check that
249 * nobody else has inserted this cell in the meantime. 243 * nobody else has inserted this cell in the meantime.
250 */ 244 */
251 cell = __search_bucket(prison->cells + hash, key); 245 cell = __search_bucket(prison->cells + hash, key);
246 if (cell) {
247 mempool_free(cell2, prison->cell_pool);
248 bio_list_add(&cell->bios, inmate);
249 goto out;
250 }
252 251
253 if (!cell) { 252 /*
254 cell = cell2; 253 * Use new cell.
255 cell2 = NULL; 254 */
255 cell = cell2;
256 256
257 cell->prison = prison; 257 cell->prison = prison;
258 memcpy(&cell->key, key, sizeof(cell->key)); 258 memcpy(&cell->key, key, sizeof(cell->key));
259 cell->count = 0; 259 cell->holder = inmate;
260 bio_list_init(&cell->bios); 260 bio_list_init(&cell->bios);
261 hlist_add_head(&cell->list, prison->cells + hash); 261 hlist_add_head(&cell->list, prison->cells + hash);
262 }
263 }
264 262
265 r = cell->count++; 263 r = 0;
266 bio_list_add(&cell->bios, inmate);
267 spin_unlock_irqrestore(&prison->lock, flags);
268 264
269 if (cell2) 265out:
270 mempool_free(cell2, prison->cell_pool); 266 spin_unlock_irqrestore(&prison->lock, flags);
271 267
272 *ref = cell; 268 *ref = cell;
273 269
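
The reworked bio_detain() above makes the first bio to reach a block the cell's "holder" and merely queues any later arrivals, returning 1 for them and 0 for the holder. A minimal userspace sketch of that calling convention, using one toy cell, an integer in place of a bio and a small array in place of the bio list (none of these names are the kernel API; only the holder/queued split is being illustrated):

#include <stdio.h>
#include <string.h>

#define MAX_WAITERS 8

/* Toy cell: the first arrival becomes the holder, later ones queue up. */
struct cell {
	int in_use;
	long key;                 /* stands in for struct cell_key */
	int holder;               /* stands in for the holder bio  */
	int waiters[MAX_WAITERS]; /* stands in for cell->bios      */
	int nr_waiters;
};

/* Returns 1 if the cell was already held, 0 if @inmate is the new holder. */
static int bio_detain(struct cell *c, long key, int inmate)
{
	if (c->in_use && c->key == key) {
		c->waiters[c->nr_waiters++] = inmate;	/* demo only, no bounds check */
		return 1;
	}
	c->in_use = 1;
	c->key = key;
	c->holder = inmate;
	c->nr_waiters = 0;
	return 0;
}

int main(void)
{
	struct cell c;
	memset(&c, 0, sizeof(c));

	/* First bio for block 5 becomes the holder and does the work. */
	printf("bio 100 -> %s\n", bio_detain(&c, 5, 100) ? "queued" : "holder");
	/* Later bios for the same block just park in the cell. */
	printf("bio 101 -> %s\n", bio_detain(&c, 5, 101) ? "queued" : "holder");
	printf("bio 102 -> %s\n", bio_detain(&c, 5, 102) ? "queued" : "holder");

	printf("holder=%d, %d waiter(s)\n", c.holder, c.nr_waiters);
	return 0;
}

Callers in the rest of this patch, such as process_discard(), rely on exactly this: a non-zero return means the bio is already parked in the cell, so the caller simply returns and lets the holder finish the work.
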
@@ -283,8 +279,8 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
283 279
284 hlist_del(&cell->list); 280 hlist_del(&cell->list);
285 281
286 if (inmates) 282 bio_list_add(inmates, cell->holder);
287 bio_list_merge(inmates, &cell->bios); 283 bio_list_merge(inmates, &cell->bios);
288 284
289 mempool_free(cell, prison->cell_pool); 285 mempool_free(cell, prison->cell_pool);
290} 286}
@@ -305,22 +301,44 @@ static void cell_release(struct cell *cell, struct bio_list *bios)
305 * bio may be in the cell. This function releases the cell, and also does 301 * bio may be in the cell. This function releases the cell, and also does
306 * a sanity check. 302 * a sanity check.
307 */ 303 */
304static void __cell_release_singleton(struct cell *cell, struct bio *bio)
305{
306 hlist_del(&cell->list);
307 BUG_ON(cell->holder != bio);
308 BUG_ON(!bio_list_empty(&cell->bios));
309}
310
308static void cell_release_singleton(struct cell *cell, struct bio *bio) 311static void cell_release_singleton(struct cell *cell, struct bio *bio)
309{ 312{
310 struct bio_prison *prison = cell->prison;
311 struct bio_list bios;
312 struct bio *b;
313 unsigned long flags; 313 unsigned long flags;
314 314 struct bio_prison *prison = cell->prison;
315 bio_list_init(&bios);
316 315
317 spin_lock_irqsave(&prison->lock, flags); 316 spin_lock_irqsave(&prison->lock, flags);
318 __cell_release(cell, &bios); 317 __cell_release_singleton(cell, bio);
319 spin_unlock_irqrestore(&prison->lock, flags); 318 spin_unlock_irqrestore(&prison->lock, flags);
319}
320
321/*
322 * Sometimes we don't want the holder, just the additional bios.
323 */
324static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
325{
326 struct bio_prison *prison = cell->prison;
327
328 hlist_del(&cell->list);
329 bio_list_merge(inmates, &cell->bios);
320 330
321 b = bio_list_pop(&bios); 331 mempool_free(cell, prison->cell_pool);
322 BUG_ON(b != bio); 332}
323 BUG_ON(!bio_list_empty(&bios)); 333
334static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
335{
336 unsigned long flags;
337 struct bio_prison *prison = cell->prison;
338
339 spin_lock_irqsave(&prison->lock, flags);
340 __cell_release_no_holder(cell, inmates);
341 spin_unlock_irqrestore(&prison->lock, flags);
324} 342}
325 343
326static void cell_error(struct cell *cell) 344static void cell_error(struct cell *cell)
@@ -471,6 +489,13 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
471 * devices. 489 * devices.
472 */ 490 */
473struct new_mapping; 491struct new_mapping;
492
493struct pool_features {
494 unsigned zero_new_blocks:1;
495 unsigned discard_enabled:1;
496 unsigned discard_passdown:1;
497};
498
474struct pool { 499struct pool {
475 struct list_head list; 500 struct list_head list;
476 struct dm_target *ti; /* Only set if a pool target is bound */ 501 struct dm_target *ti; /* Only set if a pool target is bound */
@@ -484,7 +509,7 @@ struct pool {
484 dm_block_t offset_mask; 509 dm_block_t offset_mask;
485 dm_block_t low_water_blocks; 510 dm_block_t low_water_blocks;
486 511
487 unsigned zero_new_blocks:1; 512 struct pool_features pf;
488 unsigned low_water_triggered:1; /* A dm event has been sent */ 513 unsigned low_water_triggered:1; /* A dm event has been sent */
489 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ 514 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
490 515
@@ -493,17 +518,21 @@ struct pool {
493 518
494 struct workqueue_struct *wq; 519 struct workqueue_struct *wq;
495 struct work_struct worker; 520 struct work_struct worker;
521 struct delayed_work waker;
496 522
497 unsigned ref_count; 523 unsigned ref_count;
524 unsigned long last_commit_jiffies;
498 525
499 spinlock_t lock; 526 spinlock_t lock;
500 struct bio_list deferred_bios; 527 struct bio_list deferred_bios;
501 struct bio_list deferred_flush_bios; 528 struct bio_list deferred_flush_bios;
502 struct list_head prepared_mappings; 529 struct list_head prepared_mappings;
530 struct list_head prepared_discards;
503 531
504 struct bio_list retry_on_resume_list; 532 struct bio_list retry_on_resume_list;
505 533
506 struct deferred_set ds; /* FIXME: move to thin_c */ 534 struct deferred_set shared_read_ds;
535 struct deferred_set all_io_ds;
507 536
508 struct new_mapping *next_mapping; 537 struct new_mapping *next_mapping;
509 mempool_t *mapping_pool; 538 mempool_t *mapping_pool;
@@ -521,7 +550,7 @@ struct pool_c {
521 struct dm_target_callbacks callbacks; 550 struct dm_target_callbacks callbacks;
522 551
523 dm_block_t low_water_blocks; 552 dm_block_t low_water_blocks;
524 unsigned zero_new_blocks:1; 553 struct pool_features pf;
525}; 554};
526 555
527/* 556/*
@@ -529,6 +558,7 @@ struct pool_c {
529 */ 558 */
530struct thin_c { 559struct thin_c {
531 struct dm_dev *pool_dev; 560 struct dm_dev *pool_dev;
561 struct dm_dev *origin_dev;
532 dm_thin_id dev_id; 562 dm_thin_id dev_id;
533 563
534 struct pool *pool; 564 struct pool *pool;
@@ -597,6 +627,13 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
597 627
598/*----------------------------------------------------------------*/ 628/*----------------------------------------------------------------*/
599 629
630struct endio_hook {
631 struct thin_c *tc;
632 struct deferred_entry *shared_read_entry;
633 struct deferred_entry *all_io_entry;
634 struct new_mapping *overwrite_mapping;
635};
636
600static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) 637static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
601{ 638{
602 struct bio *bio; 639 struct bio *bio;
@@ -607,7 +644,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
607 bio_list_init(master); 644 bio_list_init(master);
608 645
609 while ((bio = bio_list_pop(&bios))) { 646 while ((bio = bio_list_pop(&bios))) {
610 if (dm_get_mapinfo(bio)->ptr == tc) 647 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
648 if (h->tc == tc)
611 bio_endio(bio, DM_ENDIO_REQUEUE); 649 bio_endio(bio, DM_ENDIO_REQUEUE);
612 else 650 else
613 bio_list_add(master, bio); 651 bio_list_add(master, bio);
@@ -646,14 +684,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
646 (bio->bi_sector & pool->offset_mask); 684 (bio->bi_sector & pool->offset_mask);
647} 685}
648 686
649static void remap_and_issue(struct thin_c *tc, struct bio *bio, 687static void remap_to_origin(struct thin_c *tc, struct bio *bio)
650 dm_block_t block) 688{
689 bio->bi_bdev = tc->origin_dev->bdev;
690}
691
692static void issue(struct thin_c *tc, struct bio *bio)
651{ 693{
652 struct pool *pool = tc->pool; 694 struct pool *pool = tc->pool;
653 unsigned long flags; 695 unsigned long flags;
654 696
655 remap(tc, bio, block);
656
657 /* 697 /*
658 * Batch together any FUA/FLUSH bios we find and then issue 698 * Batch together any FUA/FLUSH bios we find and then issue
659 * a single commit for them in process_deferred_bios(). 699 * a single commit for them in process_deferred_bios().
@@ -666,6 +706,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
666 generic_make_request(bio); 706 generic_make_request(bio);
667} 707}
668 708
709static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
710{
711 remap_to_origin(tc, bio);
712 issue(tc, bio);
713}
714
715static void remap_and_issue(struct thin_c *tc, struct bio *bio,
716 dm_block_t block)
717{
718 remap(tc, bio, block);
719 issue(tc, bio);
720}
721
669/* 722/*
670 * wake_worker() is used when new work is queued and when pool_resume is 723 * wake_worker() is used when new work is queued and when pool_resume is
671 * ready to continue deferred IO processing. 724 * ready to continue deferred IO processing.
@@ -680,21 +733,17 @@ static void wake_worker(struct pool *pool)
680/* 733/*
681 * Bio endio functions. 734 * Bio endio functions.
682 */ 735 */
683struct endio_hook {
684 struct thin_c *tc;
685 bio_end_io_t *saved_bi_end_io;
686 struct deferred_entry *entry;
687};
688
689struct new_mapping { 736struct new_mapping {
690 struct list_head list; 737 struct list_head list;
691 738
692 int prepared; 739 unsigned quiesced:1;
740 unsigned prepared:1;
741 unsigned pass_discard:1;
693 742
694 struct thin_c *tc; 743 struct thin_c *tc;
695 dm_block_t virt_block; 744 dm_block_t virt_block;
696 dm_block_t data_block; 745 dm_block_t data_block;
697 struct cell *cell; 746 struct cell *cell, *cell2;
698 int err; 747 int err;
699 748
700 /* 749 /*
@@ -711,7 +760,7 @@ static void __maybe_add_mapping(struct new_mapping *m)
711{ 760{
712 struct pool *pool = m->tc->pool; 761 struct pool *pool = m->tc->pool;
713 762
714 if (list_empty(&m->list) && m->prepared) { 763 if (m->quiesced && m->prepared) {
715 list_add(&m->list, &pool->prepared_mappings); 764 list_add(&m->list, &pool->prepared_mappings);
716 wake_worker(pool); 765 wake_worker(pool);
717 } 766 }
@@ -734,7 +783,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
734static void overwrite_endio(struct bio *bio, int err) 783static void overwrite_endio(struct bio *bio, int err)
735{ 784{
736 unsigned long flags; 785 unsigned long flags;
737 struct new_mapping *m = dm_get_mapinfo(bio)->ptr; 786 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
787 struct new_mapping *m = h->overwrite_mapping;
738 struct pool *pool = m->tc->pool; 788 struct pool *pool = m->tc->pool;
739 789
740 m->err = err; 790 m->err = err;
@@ -745,31 +795,6 @@ static void overwrite_endio(struct bio *bio, int err)
745 spin_unlock_irqrestore(&pool->lock, flags); 795 spin_unlock_irqrestore(&pool->lock, flags);
746} 796}
747 797
748static void shared_read_endio(struct bio *bio, int err)
749{
750 struct list_head mappings;
751 struct new_mapping *m, *tmp;
752 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
753 unsigned long flags;
754 struct pool *pool = h->tc->pool;
755
756 bio->bi_end_io = h->saved_bi_end_io;
757 bio_endio(bio, err);
758
759 INIT_LIST_HEAD(&mappings);
760 ds_dec(h->entry, &mappings);
761
762 spin_lock_irqsave(&pool->lock, flags);
763 list_for_each_entry_safe(m, tmp, &mappings, list) {
764 list_del(&m->list);
765 INIT_LIST_HEAD(&m->list);
766 __maybe_add_mapping(m);
767 }
768 spin_unlock_irqrestore(&pool->lock, flags);
769
770 mempool_free(h, pool->endio_hook_pool);
771}
772
773/*----------------------------------------------------------------*/ 798/*----------------------------------------------------------------*/
774 799
775/* 800/*
@@ -800,21 +825,16 @@ static void cell_defer(struct thin_c *tc, struct cell *cell,
800 * Same as cell_defer above, except it omits one particular detainee, 825 * Same as cell_defer above, except it omits one particular detainee,
801 * a write bio that covers the block and has already been processed. 826 * a write bio that covers the block and has already been processed.
802 */ 827 */
803static void cell_defer_except(struct thin_c *tc, struct cell *cell, 828static void cell_defer_except(struct thin_c *tc, struct cell *cell)
804 struct bio *exception)
805{ 829{
806 struct bio_list bios; 830 struct bio_list bios;
807 struct bio *bio;
808 struct pool *pool = tc->pool; 831 struct pool *pool = tc->pool;
809 unsigned long flags; 832 unsigned long flags;
810 833
811 bio_list_init(&bios); 834 bio_list_init(&bios);
812 cell_release(cell, &bios);
813 835
814 spin_lock_irqsave(&pool->lock, flags); 836 spin_lock_irqsave(&pool->lock, flags);
815 while ((bio = bio_list_pop(&bios))) 837 cell_release_no_holder(cell, &pool->deferred_bios);
816 if (bio != exception)
817 bio_list_add(&pool->deferred_bios, bio);
818 spin_unlock_irqrestore(&pool->lock, flags); 838 spin_unlock_irqrestore(&pool->lock, flags);
819 839
820 wake_worker(pool); 840 wake_worker(pool);
@@ -854,7 +874,7 @@ static void process_prepared_mapping(struct new_mapping *m)
854 * the bios in the cell. 874 * the bios in the cell.
855 */ 875 */
856 if (bio) { 876 if (bio) {
857 cell_defer_except(tc, m->cell, bio); 877 cell_defer_except(tc, m->cell);
858 bio_endio(bio, 0); 878 bio_endio(bio, 0);
859 } else 879 } else
860 cell_defer(tc, m->cell, m->data_block); 880 cell_defer(tc, m->cell, m->data_block);
@@ -863,7 +883,30 @@ static void process_prepared_mapping(struct new_mapping *m)
863 mempool_free(m, tc->pool->mapping_pool); 883 mempool_free(m, tc->pool->mapping_pool);
864} 884}
865 885
866static void process_prepared_mappings(struct pool *pool) 886static void process_prepared_discard(struct new_mapping *m)
887{
888 int r;
889 struct thin_c *tc = m->tc;
890
891 r = dm_thin_remove_block(tc->td, m->virt_block);
892 if (r)
893 DMERR("dm_thin_remove_block() failed");
894
895 /*
896 * Pass the discard down to the underlying device?
897 */
898 if (m->pass_discard)
899 remap_and_issue(tc, m->bio, m->data_block);
900 else
901 bio_endio(m->bio, 0);
902
903 cell_defer_except(tc, m->cell);
904 cell_defer_except(tc, m->cell2);
905 mempool_free(m, tc->pool->mapping_pool);
906}
907
908static void process_prepared(struct pool *pool, struct list_head *head,
909 void (*fn)(struct new_mapping *))
867{ 910{
868 unsigned long flags; 911 unsigned long flags;
869 struct list_head maps; 912 struct list_head maps;
@@ -871,21 +914,27 @@ static void process_prepared_mappings(struct pool *pool)
871 914
872 INIT_LIST_HEAD(&maps); 915 INIT_LIST_HEAD(&maps);
873 spin_lock_irqsave(&pool->lock, flags); 916 spin_lock_irqsave(&pool->lock, flags);
874 list_splice_init(&pool->prepared_mappings, &maps); 917 list_splice_init(head, &maps);
875 spin_unlock_irqrestore(&pool->lock, flags); 918 spin_unlock_irqrestore(&pool->lock, flags);
876 919
877 list_for_each_entry_safe(m, tmp, &maps, list) 920 list_for_each_entry_safe(m, tmp, &maps, list)
878 process_prepared_mapping(m); 921 fn(m);
879} 922}
880 923
881/* 924/*
882 * Deferred bio jobs. 925 * Deferred bio jobs.
883 */ 926 */
884static int io_overwrites_block(struct pool *pool, struct bio *bio) 927static int io_overlaps_block(struct pool *pool, struct bio *bio)
885{ 928{
886 return ((bio_data_dir(bio) == WRITE) && 929 return !(bio->bi_sector & pool->offset_mask) &&
887 !(bio->bi_sector & pool->offset_mask)) &&
888 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); 930 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
931
932}
933
934static int io_overwrites_block(struct pool *pool, struct bio *bio)
935{
936 return (bio_data_dir(bio) == WRITE) &&
937 io_overlaps_block(pool, bio);
889} 938}
890 939
891static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, 940static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
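
io_overlaps_block() and io_overwrites_block() above boil down to two checks: the bio starts on a block boundary (offset_mask) and spans exactly one pool block. A standalone sketch of the same arithmetic, assuming a hypothetical 128-sector block size rather than values from a real pool:

#include <stdio.h>
#include <stdint.h>

#define SECTOR_SHIFT 9				/* 512-byte sectors */
#define SECTORS_PER_BLOCK 128			/* hypothetical pool block size */
#define OFFSET_MASK (SECTORS_PER_BLOCK - 1)	/* block size must be a power of two */

/* Does an I/O starting at @sector with @size bytes cover exactly one block? */
static int io_overlaps_block(uint64_t sector, uint32_t size)
{
	return !(sector & OFFSET_MASK) &&
	       size == (SECTORS_PER_BLOCK << SECTOR_SHIFT);
}

int main(void)
{
	/* Aligned, full-block I/O: eligible for the overwrite fast path. */
	printf("%d\n", io_overlaps_block(256, 128 << SECTOR_SHIFT)); /* 1 */
	/* Misaligned start: not a whole-block overwrite. */
	printf("%d\n", io_overlaps_block(260, 128 << SECTOR_SHIFT)); /* 0 */
	/* Aligned but short: also not a whole-block overwrite. */
	printf("%d\n", io_overlaps_block(256, 8 << SECTOR_SHIFT));   /* 0 */
	return 0;
}

Only bios that pass this test can take the fast path in schedule_copy()/schedule_zero(), where the write itself covers the whole destination block and no kcopyd copy or zero is needed first.
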
@@ -917,7 +966,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool)
917} 966}
918 967
919static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 968static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
920 dm_block_t data_origin, dm_block_t data_dest, 969 struct dm_dev *origin, dm_block_t data_origin,
970 dm_block_t data_dest,
921 struct cell *cell, struct bio *bio) 971 struct cell *cell, struct bio *bio)
922{ 972{
923 int r; 973 int r;
@@ -925,6 +975,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
925 struct new_mapping *m = get_next_mapping(pool); 975 struct new_mapping *m = get_next_mapping(pool);
926 976
927 INIT_LIST_HEAD(&m->list); 977 INIT_LIST_HEAD(&m->list);
978 m->quiesced = 0;
928 m->prepared = 0; 979 m->prepared = 0;
929 m->tc = tc; 980 m->tc = tc;
930 m->virt_block = virt_block; 981 m->virt_block = virt_block;
@@ -933,7 +984,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
933 m->err = 0; 984 m->err = 0;
934 m->bio = NULL; 985 m->bio = NULL;
935 986
936 ds_add_work(&pool->ds, &m->list); 987 if (!ds_add_work(&pool->shared_read_ds, &m->list))
988 m->quiesced = 1;
937 989
938 /* 990 /*
939 * IO to pool_dev remaps to the pool target's data_dev. 991 * IO to pool_dev remaps to the pool target's data_dev.
@@ -942,14 +994,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
942 * bio immediately. Otherwise we use kcopyd to clone the data first. 994 * bio immediately. Otherwise we use kcopyd to clone the data first.
943 */ 995 */
944 if (io_overwrites_block(pool, bio)) { 996 if (io_overwrites_block(pool, bio)) {
997 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
998 h->overwrite_mapping = m;
945 m->bio = bio; 999 m->bio = bio;
946 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1000 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
947 dm_get_mapinfo(bio)->ptr = m;
948 remap_and_issue(tc, bio, data_dest); 1001 remap_and_issue(tc, bio, data_dest);
949 } else { 1002 } else {
950 struct dm_io_region from, to; 1003 struct dm_io_region from, to;
951 1004
952 from.bdev = tc->pool_dev->bdev; 1005 from.bdev = origin->bdev;
953 from.sector = data_origin * pool->sectors_per_block; 1006 from.sector = data_origin * pool->sectors_per_block;
954 from.count = pool->sectors_per_block; 1007 from.count = pool->sectors_per_block;
955 1008
@@ -967,6 +1020,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
967 } 1020 }
968} 1021}
969 1022
1023static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
1024 dm_block_t data_origin, dm_block_t data_dest,
1025 struct cell *cell, struct bio *bio)
1026{
1027 schedule_copy(tc, virt_block, tc->pool_dev,
1028 data_origin, data_dest, cell, bio);
1029}
1030
1031static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
1032 dm_block_t data_dest,
1033 struct cell *cell, struct bio *bio)
1034{
1035 schedule_copy(tc, virt_block, tc->origin_dev,
1036 virt_block, data_dest, cell, bio);
1037}
1038
970static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, 1039static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
971 dm_block_t data_block, struct cell *cell, 1040 dm_block_t data_block, struct cell *cell,
972 struct bio *bio) 1041 struct bio *bio)
@@ -975,6 +1044,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
975 struct new_mapping *m = get_next_mapping(pool); 1044 struct new_mapping *m = get_next_mapping(pool);
976 1045
977 INIT_LIST_HEAD(&m->list); 1046 INIT_LIST_HEAD(&m->list);
1047 m->quiesced = 1;
978 m->prepared = 0; 1048 m->prepared = 0;
979 m->tc = tc; 1049 m->tc = tc;
980 m->virt_block = virt_block; 1050 m->virt_block = virt_block;
@@ -988,13 +1058,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
988 * zeroing pre-existing data, we can issue the bio immediately. 1058 * zeroing pre-existing data, we can issue the bio immediately.
989 * Otherwise we use kcopyd to zero the data first. 1059 * Otherwise we use kcopyd to zero the data first.
990 */ 1060 */
991 if (!pool->zero_new_blocks) 1061 if (!pool->pf.zero_new_blocks)
992 process_prepared_mapping(m); 1062 process_prepared_mapping(m);
993 1063
994 else if (io_overwrites_block(pool, bio)) { 1064 else if (io_overwrites_block(pool, bio)) {
1065 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1066 h->overwrite_mapping = m;
995 m->bio = bio; 1067 m->bio = bio;
996 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1068 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
997 dm_get_mapinfo(bio)->ptr = m;
998 remap_and_issue(tc, bio, data_block); 1069 remap_and_issue(tc, bio, data_block);
999 1070
1000 } else { 1071 } else {
@@ -1081,7 +1152,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1081 */ 1152 */
1082static void retry_on_resume(struct bio *bio) 1153static void retry_on_resume(struct bio *bio)
1083{ 1154{
1084 struct thin_c *tc = dm_get_mapinfo(bio)->ptr; 1155 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1156 struct thin_c *tc = h->tc;
1085 struct pool *pool = tc->pool; 1157 struct pool *pool = tc->pool;
1086 unsigned long flags; 1158 unsigned long flags;
1087 1159
@@ -1102,6 +1174,86 @@ static void no_space(struct cell *cell)
1102 retry_on_resume(bio); 1174 retry_on_resume(bio);
1103} 1175}
1104 1176
1177static void process_discard(struct thin_c *tc, struct bio *bio)
1178{
1179 int r;
1180 struct pool *pool = tc->pool;
1181 struct cell *cell, *cell2;
1182 struct cell_key key, key2;
1183 dm_block_t block = get_bio_block(tc, bio);
1184 struct dm_thin_lookup_result lookup_result;
1185 struct new_mapping *m;
1186
1187 build_virtual_key(tc->td, block, &key);
1188 if (bio_detain(tc->pool->prison, &key, bio, &cell))
1189 return;
1190
1191 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1192 switch (r) {
1193 case 0:
1194 /*
1195 * Check nobody is fiddling with this pool block. This can
1196 * happen if someone's in the process of breaking sharing
1197 * on this block.
1198 */
1199 build_data_key(tc->td, lookup_result.block, &key2);
1200 if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
1201 cell_release_singleton(cell, bio);
1202 break;
1203 }
1204
1205 if (io_overlaps_block(pool, bio)) {
1206 /*
1207 * IO may still be going to the destination block. We must
1208 * quiesce before we can do the removal.
1209 */
1210 m = get_next_mapping(pool);
1211 m->tc = tc;
1212 m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown;
1213 m->virt_block = block;
1214 m->data_block = lookup_result.block;
1215 m->cell = cell;
1216 m->cell2 = cell2;
1217 m->err = 0;
1218 m->bio = bio;
1219
1220 if (!ds_add_work(&pool->all_io_ds, &m->list)) {
1221 list_add(&m->list, &pool->prepared_discards);
1222 wake_worker(pool);
1223 }
1224 } else {
1225 /*
1226 * This path is hit if people are ignoring
1227 * limits->discard_granularity. It ignores any
1228 * part of the discard that is in a subsequent
1229 * block.
1230 */
1231 sector_t offset = bio->bi_sector - (block << pool->block_shift);
1232 unsigned remaining = (pool->sectors_per_block - offset) << 9;
1233 bio->bi_size = min(bio->bi_size, remaining);
1234
1235 cell_release_singleton(cell, bio);
1236 cell_release_singleton(cell2, bio);
1237 remap_and_issue(tc, bio, lookup_result.block);
1238 }
1239 break;
1240
1241 case -ENODATA:
1242 /*
1243 * It isn't provisioned, just forget it.
1244 */
1245 cell_release_singleton(cell, bio);
1246 bio_endio(bio, 0);
1247 break;
1248
1249 default:
1250 DMERR("discard: find block unexpectedly returned %d", r);
1251 cell_release_singleton(cell, bio);
1252 bio_io_error(bio);
1253 break;
1254 }
1255}
1256
1105static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1257static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1106 struct cell_key *key, 1258 struct cell_key *key,
1107 struct dm_thin_lookup_result *lookup_result, 1259 struct dm_thin_lookup_result *lookup_result,
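
The fallback branch of process_discard() above handles callers that ignore limits->discard_granularity: it keeps only the part of the discard that falls inside the first block and drops the rest. The same sector arithmetic as a standalone program, with an assumed 128-sector block (block_shift = 7) purely for illustration:

#include <stdio.h>
#include <stdint.h>

#define SECTOR_SHIFT 9
#define BLOCK_SHIFT 7				/* hypothetical: 128-sector blocks */
#define SECTORS_PER_BLOCK (1u << BLOCK_SHIFT)

int main(void)
{
	uint64_t bi_sector = 300;		/* discard starts mid-block */
	uint32_t bi_size = 100 << SECTOR_SHIFT;	/* 100 sectors' worth of bytes */

	uint64_t block = bi_sector >> BLOCK_SHIFT;		/* virtual block 2 */
	uint64_t offset = bi_sector - (block << BLOCK_SHIFT);	/* 44 sectors in  */
	uint32_t remaining = (SECTORS_PER_BLOCK - offset) << SECTOR_SHIFT;

	/* Anything past the end of this block is simply dropped. */
	if (bi_size > remaining)
		bi_size = remaining;

	printf("block %llu, offset %llu sectors, trimmed to %u sectors\n",
	       (unsigned long long)block, (unsigned long long)offset,
	       bi_size >> SECTOR_SHIFT);
	return 0;
}
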
@@ -1113,8 +1265,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1113 r = alloc_data_block(tc, &data_block); 1265 r = alloc_data_block(tc, &data_block);
1114 switch (r) { 1266 switch (r) {
1115 case 0: 1267 case 0:
1116 schedule_copy(tc, block, lookup_result->block, 1268 schedule_internal_copy(tc, block, lookup_result->block,
1117 data_block, cell, bio); 1269 data_block, cell, bio);
1118 break; 1270 break;
1119 1271
1120 case -ENOSPC: 1272 case -ENOSPC:
@@ -1147,13 +1299,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1147 if (bio_data_dir(bio) == WRITE) 1299 if (bio_data_dir(bio) == WRITE)
1148 break_sharing(tc, bio, block, &key, lookup_result, cell); 1300 break_sharing(tc, bio, block, &key, lookup_result, cell);
1149 else { 1301 else {
1150 struct endio_hook *h; 1302 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1151 h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1152 1303
1153 h->tc = tc; 1304 h->shared_read_entry = ds_inc(&pool->shared_read_ds);
1154 h->entry = ds_inc(&pool->ds);
1155 save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
1156 dm_get_mapinfo(bio)->ptr = h;
1157 1305
1158 cell_release_singleton(cell, bio); 1306 cell_release_singleton(cell, bio);
1159 remap_and_issue(tc, bio, lookup_result->block); 1307 remap_and_issue(tc, bio, lookup_result->block);
@@ -1188,7 +1336,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1188 r = alloc_data_block(tc, &data_block); 1336 r = alloc_data_block(tc, &data_block);
1189 switch (r) { 1337 switch (r) {
1190 case 0: 1338 case 0:
1191 schedule_zero(tc, block, data_block, cell, bio); 1339 if (tc->origin_dev)
1340 schedule_external_copy(tc, block, data_block, cell, bio);
1341 else
1342 schedule_zero(tc, block, data_block, cell, bio);
1192 break; 1343 break;
1193 1344
1194 case -ENOSPC: 1345 case -ENOSPC:
@@ -1239,16 +1390,27 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
1239 break; 1390 break;
1240 1391
1241 case -ENODATA: 1392 case -ENODATA:
1242 provision_block(tc, bio, block, cell); 1393 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1394 cell_release_singleton(cell, bio);
1395 remap_to_origin_and_issue(tc, bio);
1396 } else
1397 provision_block(tc, bio, block, cell);
1243 break; 1398 break;
1244 1399
1245 default: 1400 default:
1246 DMERR("dm_thin_find_block() failed, error = %d", r); 1401 DMERR("dm_thin_find_block() failed, error = %d", r);
1402 cell_release_singleton(cell, bio);
1247 bio_io_error(bio); 1403 bio_io_error(bio);
1248 break; 1404 break;
1249 } 1405 }
1250} 1406}
1251 1407
1408static int need_commit_due_to_time(struct pool *pool)
1409{
1410 return jiffies < pool->last_commit_jiffies ||
1411 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1412}
1413
1252static void process_deferred_bios(struct pool *pool) 1414static void process_deferred_bios(struct pool *pool)
1253{ 1415{
1254 unsigned long flags; 1416 unsigned long flags;
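
need_commit_due_to_time() above asks for a commit once more than COMMIT_PERIOD jiffies have passed since the last one, and its first clause also fires if the jiffies counter has wrapped. A userspace sketch with a fake tick counter (the HZ value and tick numbers are made up for the example):

#include <stdio.h>

#define HZ 250			/* illustrative tick rate */
#define COMMIT_PERIOD HZ	/* commit at least once per "second" */

static unsigned long jiffies;		/* fake tick counter */
static unsigned long last_commit_jiffies;

static int need_commit_due_to_time(void)
{
	return jiffies < last_commit_jiffies ||
	       jiffies > last_commit_jiffies + COMMIT_PERIOD;
}

int main(void)
{
	last_commit_jiffies = 1000;

	jiffies = 1100;		/* 100 ticks later: still within the period */
	printf("t=1100: commit needed? %d\n", need_commit_due_to_time());

	jiffies = 1300;		/* more than COMMIT_PERIOD since the last commit */
	printf("t=1300: commit needed? %d\n", need_commit_due_to_time());

	jiffies = 10;		/* counter wrapped: be safe and commit */
	printf("wrapped: commit needed? %d\n", need_commit_due_to_time());

	return 0;
}

process_deferred_bios() now performs this check even when no FLUSH/FUA bios are queued, and do_waker() keeps re-arming itself every COMMIT_PERIOD, so an otherwise idle pool still commits its outstanding metadata roughly once a second.
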
@@ -1264,7 +1426,9 @@ static void process_deferred_bios(struct pool *pool)
1264 spin_unlock_irqrestore(&pool->lock, flags); 1426 spin_unlock_irqrestore(&pool->lock, flags);
1265 1427
1266 while ((bio = bio_list_pop(&bios))) { 1428 while ((bio = bio_list_pop(&bios))) {
1267 struct thin_c *tc = dm_get_mapinfo(bio)->ptr; 1429 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1430 struct thin_c *tc = h->tc;
1431
1268 /* 1432 /*
1269 * If we've got no free new_mapping structs, and processing 1433 * If we've got no free new_mapping structs, and processing
1270 * this bio might require one, we pause until there are some 1434 * this bio might require one, we pause until there are some
@@ -1277,7 +1441,11 @@ static void process_deferred_bios(struct pool *pool)
1277 1441
1278 break; 1442 break;
1279 } 1443 }
1280 process_bio(tc, bio); 1444
1445 if (bio->bi_rw & REQ_DISCARD)
1446 process_discard(tc, bio);
1447 else
1448 process_bio(tc, bio);
1281 } 1449 }
1282 1450
1283 /* 1451 /*
@@ -1290,7 +1458,7 @@ static void process_deferred_bios(struct pool *pool)
1290 bio_list_init(&pool->deferred_flush_bios); 1458 bio_list_init(&pool->deferred_flush_bios);
1291 spin_unlock_irqrestore(&pool->lock, flags); 1459 spin_unlock_irqrestore(&pool->lock, flags);
1292 1460
1293 if (bio_list_empty(&bios)) 1461 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1294 return; 1462 return;
1295 1463
1296 r = dm_pool_commit_metadata(pool->pmd); 1464 r = dm_pool_commit_metadata(pool->pmd);
@@ -1301,6 +1469,7 @@ static void process_deferred_bios(struct pool *pool)
1301 bio_io_error(bio); 1469 bio_io_error(bio);
1302 return; 1470 return;
1303 } 1471 }
1472 pool->last_commit_jiffies = jiffies;
1304 1473
1305 while ((bio = bio_list_pop(&bios))) 1474 while ((bio = bio_list_pop(&bios)))
1306 generic_make_request(bio); 1475 generic_make_request(bio);
@@ -1310,10 +1479,22 @@ static void do_worker(struct work_struct *ws)
1310{ 1479{
1311 struct pool *pool = container_of(ws, struct pool, worker); 1480 struct pool *pool = container_of(ws, struct pool, worker);
1312 1481
1313 process_prepared_mappings(pool); 1482 process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
1483 process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
1314 process_deferred_bios(pool); 1484 process_deferred_bios(pool);
1315} 1485}
1316 1486
1487/*
1488 * We want to commit periodically so that not too much
1489 * unwritten data builds up.
1490 */
1491static void do_waker(struct work_struct *ws)
1492{
1493 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1494 wake_worker(pool);
1495 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1496}
1497
1317/*----------------------------------------------------------------*/ 1498/*----------------------------------------------------------------*/
1318 1499
1319/* 1500/*
@@ -1335,6 +1516,19 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1335 wake_worker(pool); 1516 wake_worker(pool);
1336} 1517}
1337 1518
1519static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
1520{
1521 struct pool *pool = tc->pool;
1522 struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1523
1524 h->tc = tc;
1525 h->shared_read_entry = NULL;
1526 h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
1527 h->overwrite_mapping = NULL;
1528
1529 return h;
1530}
1531
1338/* 1532/*
1339 * Non-blocking function called from the thin target's map function. 1533 * Non-blocking function called from the thin target's map function.
1340 */ 1534 */
@@ -1347,12 +1541,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1347 struct dm_thin_device *td = tc->td; 1541 struct dm_thin_device *td = tc->td;
1348 struct dm_thin_lookup_result result; 1542 struct dm_thin_lookup_result result;
1349 1543
1350 /* 1544 map_context->ptr = thin_hook_bio(tc, bio);
1351 * Save the thin context for easy access from the deferred bio later. 1545 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1352 */
1353 map_context->ptr = tc;
1354
1355 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1356 thin_defer_bio(tc, bio); 1546 thin_defer_bio(tc, bio);
1357 return DM_MAPIO_SUBMITTED; 1547 return DM_MAPIO_SUBMITTED;
1358 } 1548 }
@@ -1434,7 +1624,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1434 1624
1435 pool->ti = ti; 1625 pool->ti = ti;
1436 pool->low_water_blocks = pt->low_water_blocks; 1626 pool->low_water_blocks = pt->low_water_blocks;
1437 pool->zero_new_blocks = pt->zero_new_blocks; 1627 pool->pf = pt->pf;
1438 1628
1439 return 0; 1629 return 0;
1440} 1630}
@@ -1448,6 +1638,14 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1448/*---------------------------------------------------------------- 1638/*----------------------------------------------------------------
1449 * Pool creation 1639 * Pool creation
1450 *--------------------------------------------------------------*/ 1640 *--------------------------------------------------------------*/
1641/* Initialize pool features. */
1642static void pool_features_init(struct pool_features *pf)
1643{
1644 pf->zero_new_blocks = 1;
1645 pf->discard_enabled = 1;
1646 pf->discard_passdown = 1;
1647}
1648
1451static void __pool_destroy(struct pool *pool) 1649static void __pool_destroy(struct pool *pool)
1452{ 1650{
1453 __pool_table_remove(pool); 1651 __pool_table_remove(pool);
@@ -1495,7 +1693,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1495 pool->block_shift = ffs(block_size) - 1; 1693 pool->block_shift = ffs(block_size) - 1;
1496 pool->offset_mask = block_size - 1; 1694 pool->offset_mask = block_size - 1;
1497 pool->low_water_blocks = 0; 1695 pool->low_water_blocks = 0;
1498 pool->zero_new_blocks = 1; 1696 pool_features_init(&pool->pf);
1499 pool->prison = prison_create(PRISON_CELLS); 1697 pool->prison = prison_create(PRISON_CELLS);
1500 if (!pool->prison) { 1698 if (!pool->prison) {
1501 *error = "Error creating pool's bio prison"; 1699 *error = "Error creating pool's bio prison";
@@ -1523,14 +1721,17 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1523 } 1721 }
1524 1722
1525 INIT_WORK(&pool->worker, do_worker); 1723 INIT_WORK(&pool->worker, do_worker);
1724 INIT_DELAYED_WORK(&pool->waker, do_waker);
1526 spin_lock_init(&pool->lock); 1725 spin_lock_init(&pool->lock);
1527 bio_list_init(&pool->deferred_bios); 1726 bio_list_init(&pool->deferred_bios);
1528 bio_list_init(&pool->deferred_flush_bios); 1727 bio_list_init(&pool->deferred_flush_bios);
1529 INIT_LIST_HEAD(&pool->prepared_mappings); 1728 INIT_LIST_HEAD(&pool->prepared_mappings);
1729 INIT_LIST_HEAD(&pool->prepared_discards);
1530 pool->low_water_triggered = 0; 1730 pool->low_water_triggered = 0;
1531 pool->no_free_space = 0; 1731 pool->no_free_space = 0;
1532 bio_list_init(&pool->retry_on_resume_list); 1732 bio_list_init(&pool->retry_on_resume_list);
1533 ds_init(&pool->ds); 1733 ds_init(&pool->shared_read_ds);
1734 ds_init(&pool->all_io_ds);
1534 1735
1535 pool->next_mapping = NULL; 1736 pool->next_mapping = NULL;
1536 pool->mapping_pool = 1737 pool->mapping_pool =
@@ -1549,6 +1750,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1549 goto bad_endio_hook_pool; 1750 goto bad_endio_hook_pool;
1550 } 1751 }
1551 pool->ref_count = 1; 1752 pool->ref_count = 1;
1753 pool->last_commit_jiffies = jiffies;
1552 pool->pool_md = pool_md; 1754 pool->pool_md = pool_md;
1553 pool->md_dev = metadata_dev; 1755 pool->md_dev = metadata_dev;
1554 __pool_table_insert(pool); 1756 __pool_table_insert(pool);
@@ -1588,7 +1790,8 @@ static void __pool_dec(struct pool *pool)
1588 1790
1589static struct pool *__pool_find(struct mapped_device *pool_md, 1791static struct pool *__pool_find(struct mapped_device *pool_md,
1590 struct block_device *metadata_dev, 1792 struct block_device *metadata_dev,
1591 unsigned long block_size, char **error) 1793 unsigned long block_size, char **error,
1794 int *created)
1592{ 1795{
1593 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 1796 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1594 1797
@@ -1604,8 +1807,10 @@ static struct pool *__pool_find(struct mapped_device *pool_md,
1604 return ERR_PTR(-EINVAL); 1807 return ERR_PTR(-EINVAL);
1605 __pool_inc(pool); 1808 __pool_inc(pool);
1606 1809
1607 } else 1810 } else {
1608 pool = pool_create(pool_md, metadata_dev, block_size, error); 1811 pool = pool_create(pool_md, metadata_dev, block_size, error);
1812 *created = 1;
1813 }
1609 } 1814 }
1610 1815
1611 return pool; 1816 return pool;
@@ -1629,10 +1834,6 @@ static void pool_dtr(struct dm_target *ti)
1629 mutex_unlock(&dm_thin_pool_table.mutex); 1834 mutex_unlock(&dm_thin_pool_table.mutex);
1630} 1835}
1631 1836
1632struct pool_features {
1633 unsigned zero_new_blocks:1;
1634};
1635
1636static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, 1837static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1637 struct dm_target *ti) 1838 struct dm_target *ti)
1638{ 1839{
@@ -1641,7 +1842,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1641 const char *arg_name; 1842 const char *arg_name;
1642 1843
1643 static struct dm_arg _args[] = { 1844 static struct dm_arg _args[] = {
1644 {0, 1, "Invalid number of pool feature arguments"}, 1845 {0, 3, "Invalid number of pool feature arguments"},
1645 }; 1846 };
1646 1847
1647 /* 1848 /*
@@ -1661,6 +1862,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1661 if (!strcasecmp(arg_name, "skip_block_zeroing")) { 1862 if (!strcasecmp(arg_name, "skip_block_zeroing")) {
1662 pf->zero_new_blocks = 0; 1863 pf->zero_new_blocks = 0;
1663 continue; 1864 continue;
1865 } else if (!strcasecmp(arg_name, "ignore_discard")) {
1866 pf->discard_enabled = 0;
1867 continue;
1868 } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
1869 pf->discard_passdown = 0;
1870 continue;
1664 } 1871 }
1665 1872
1666 ti->error = "Unrecognised pool feature requested"; 1873 ti->error = "Unrecognised pool feature requested";
@@ -1678,10 +1885,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1678 * 1885 *
1679 * Optional feature arguments are: 1886 * Optional feature arguments are:
1680 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 1887 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1888 * ignore_discard: disable discard
1889 * no_discard_passdown: don't pass discards down to the data device
1681 */ 1890 */
1682static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 1891static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1683{ 1892{
1684 int r; 1893 int r, pool_created = 0;
1685 struct pool_c *pt; 1894 struct pool_c *pt;
1686 struct pool *pool; 1895 struct pool *pool;
1687 struct pool_features pf; 1896 struct pool_features pf;
@@ -1691,6 +1900,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1691 dm_block_t low_water_blocks; 1900 dm_block_t low_water_blocks;
1692 struct dm_dev *metadata_dev; 1901 struct dm_dev *metadata_dev;
1693 sector_t metadata_dev_size; 1902 sector_t metadata_dev_size;
1903 char b[BDEVNAME_SIZE];
1694 1904
1695 /* 1905 /*
1696 * FIXME Remove validation from scope of lock. 1906 * FIXME Remove validation from scope of lock.
@@ -1712,11 +1922,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1712 } 1922 }
1713 1923
1714 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; 1924 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1715 if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) { 1925 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1716 ti->error = "Metadata device is too large"; 1926 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1717 r = -EINVAL; 1927 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1718 goto out_metadata;
1719 }
1720 1928
1721 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 1929 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1722 if (r) { 1930 if (r) {
@@ -1742,8 +1950,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1742 /* 1950 /*
1743 * Set default pool features. 1951 * Set default pool features.
1744 */ 1952 */
1745 memset(&pf, 0, sizeof(pf)); 1953 pool_features_init(&pf);
1746 pf.zero_new_blocks = 1;
1747 1954
1748 dm_consume_args(&as, 4); 1955 dm_consume_args(&as, 4);
1749 r = parse_pool_features(&as, &pf, ti); 1956 r = parse_pool_features(&as, &pf, ti);
@@ -1757,20 +1964,58 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1757 } 1964 }
1758 1965
1759 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 1966 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1760 block_size, &ti->error); 1967 block_size, &ti->error, &pool_created);
1761 if (IS_ERR(pool)) { 1968 if (IS_ERR(pool)) {
1762 r = PTR_ERR(pool); 1969 r = PTR_ERR(pool);
1763 goto out_free_pt; 1970 goto out_free_pt;
1764 } 1971 }
1765 1972
1973 /*
1974 * 'pool_created' reflects whether this is the first table load.
1975 * Top level discard support is not allowed to be changed after
1976 * initial load. This would require a pool reload to trigger thin
1977 * device changes.
1978 */
1979 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
1980 ti->error = "Discard support cannot be disabled once enabled";
1981 r = -EINVAL;
1982 goto out_flags_changed;
1983 }
1984
1985 /*
1986 * If discard_passdown was enabled verify that the data device
1987 * supports discards. Disable discard_passdown if not; otherwise
1988 * -EOPNOTSUPP will be returned.
1989 */
1990 if (pf.discard_passdown) {
1991 struct request_queue *q = bdev_get_queue(data_dev->bdev);
1992 if (!q || !blk_queue_discard(q)) {
1993 DMWARN("Discard unsupported by data device: Disabling discard passdown.");
1994 pf.discard_passdown = 0;
1995 }
1996 }
1997
1766 pt->pool = pool; 1998 pt->pool = pool;
1767 pt->ti = ti; 1999 pt->ti = ti;
1768 pt->metadata_dev = metadata_dev; 2000 pt->metadata_dev = metadata_dev;
1769 pt->data_dev = data_dev; 2001 pt->data_dev = data_dev;
1770 pt->low_water_blocks = low_water_blocks; 2002 pt->low_water_blocks = low_water_blocks;
1771 pt->zero_new_blocks = pf.zero_new_blocks; 2003 pt->pf = pf;
1772 ti->num_flush_requests = 1; 2004 ti->num_flush_requests = 1;
1773 ti->num_discard_requests = 0; 2005 /*
2006 * Only need to enable discards if the pool should pass
2007 * them down to the data device. The thin device's discard
2008 * processing will cause mappings to be removed from the btree.
2009 */
2010 if (pf.discard_enabled && pf.discard_passdown) {
2011 ti->num_discard_requests = 1;
2012 /*
2013 * Setting 'discards_supported' circumvents the normal
2014 * stacking of discard limits (this keeps the pool and
2015 * thin devices' discard limits consistent).
2016 */
2017 ti->discards_supported = 1;
2018 }
1774 ti->private = pt; 2019 ti->private = pt;
1775 2020
1776 pt->callbacks.congested_fn = pool_is_congested; 2021 pt->callbacks.congested_fn = pool_is_congested;
@@ -1780,6 +2025,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1780 2025
1781 return 0; 2026 return 0;
1782 2027
2028out_flags_changed:
2029 __pool_dec(pool);
1783out_free_pt: 2030out_free_pt:
1784 kfree(pt); 2031 kfree(pt);
1785out: 2032out:
@@ -1878,7 +2125,7 @@ static void pool_resume(struct dm_target *ti)
1878 __requeue_bios(pool); 2125 __requeue_bios(pool);
1879 spin_unlock_irqrestore(&pool->lock, flags); 2126 spin_unlock_irqrestore(&pool->lock, flags);
1880 2127
1881 wake_worker(pool); 2128 do_waker(&pool->waker.work);
1882} 2129}
1883 2130
1884static void pool_postsuspend(struct dm_target *ti) 2131static void pool_postsuspend(struct dm_target *ti)
@@ -1887,6 +2134,7 @@ static void pool_postsuspend(struct dm_target *ti)
1887 struct pool_c *pt = ti->private; 2134 struct pool_c *pt = ti->private;
1888 struct pool *pool = pt->pool; 2135 struct pool *pool = pt->pool;
1889 2136
2137 cancel_delayed_work(&pool->waker);
1890 flush_workqueue(pool->wq); 2138 flush_workqueue(pool->wq);
1891 2139
1892 r = dm_pool_commit_metadata(pool->pmd); 2140 r = dm_pool_commit_metadata(pool->pmd);
@@ -2067,7 +2315,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2067static int pool_status(struct dm_target *ti, status_type_t type, 2315static int pool_status(struct dm_target *ti, status_type_t type,
2068 char *result, unsigned maxlen) 2316 char *result, unsigned maxlen)
2069{ 2317{
2070 int r; 2318 int r, count;
2071 unsigned sz = 0; 2319 unsigned sz = 0;
2072 uint64_t transaction_id; 2320 uint64_t transaction_id;
2073 dm_block_t nr_free_blocks_data; 2321 dm_block_t nr_free_blocks_data;
@@ -2130,10 +2378,19 @@ static int pool_status(struct dm_target *ti, status_type_t type,
2130 (unsigned long)pool->sectors_per_block, 2378 (unsigned long)pool->sectors_per_block,
2131 (unsigned long long)pt->low_water_blocks); 2379 (unsigned long long)pt->low_water_blocks);
2132 2380
2133 DMEMIT("%u ", !pool->zero_new_blocks); 2381 count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
2382 !pool->pf.discard_passdown;
2383 DMEMIT("%u ", count);
2134 2384
2135 if (!pool->zero_new_blocks) 2385 if (!pool->pf.zero_new_blocks)
2136 DMEMIT("skip_block_zeroing "); 2386 DMEMIT("skip_block_zeroing ");
2387
2388 if (!pool->pf.discard_enabled)
2389 DMEMIT("ignore_discard ");
2390
2391 if (!pool->pf.discard_passdown)
2392 DMEMIT("no_discard_passdown ");
2393
2137 break; 2394 break;
2138 } 2395 }
2139 2396
@@ -2162,6 +2419,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2162 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2419 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2163} 2420}
2164 2421
2422static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
2423{
2424 /*
2425 * FIXME: these limits may be incompatible with the pool's data device
2426 */
2427 limits->max_discard_sectors = pool->sectors_per_block;
2428
2429 /*
2430 * This is just a hint, and not enforced. We have to cope with
2431 * bios that overlap 2 blocks.
2432 */
2433 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2434 limits->discard_zeroes_data = pool->pf.zero_new_blocks;
2435}
2436
2165static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) 2437static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2166{ 2438{
2167 struct pool_c *pt = ti->private; 2439 struct pool_c *pt = ti->private;
@@ -2169,13 +2441,15 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2169 2441
2170 blk_limits_io_min(limits, 0); 2442 blk_limits_io_min(limits, 0);
2171 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2443 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2444 if (pool->pf.discard_enabled)
2445 set_discard_limits(pool, limits);
2172} 2446}
2173 2447
2174static struct target_type pool_target = { 2448static struct target_type pool_target = {
2175 .name = "thin-pool", 2449 .name = "thin-pool",
2176 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2450 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2177 DM_TARGET_IMMUTABLE, 2451 DM_TARGET_IMMUTABLE,
2178 .version = {1, 0, 0}, 2452 .version = {1, 1, 0},
2179 .module = THIS_MODULE, 2453 .module = THIS_MODULE,
2180 .ctr = pool_ctr, 2454 .ctr = pool_ctr,
2181 .dtr = pool_dtr, 2455 .dtr = pool_dtr,
@@ -2202,6 +2476,8 @@ static void thin_dtr(struct dm_target *ti)
2202 __pool_dec(tc->pool); 2476 __pool_dec(tc->pool);
2203 dm_pool_close_thin_device(tc->td); 2477 dm_pool_close_thin_device(tc->td);
2204 dm_put_device(ti, tc->pool_dev); 2478 dm_put_device(ti, tc->pool_dev);
2479 if (tc->origin_dev)
2480 dm_put_device(ti, tc->origin_dev);
2205 kfree(tc); 2481 kfree(tc);
2206 2482
2207 mutex_unlock(&dm_thin_pool_table.mutex); 2483 mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2210,21 +2486,25 @@ static void thin_dtr(struct dm_target *ti)
2210/* 2486/*
2211 * Thin target parameters: 2487 * Thin target parameters:
2212 * 2488 *
2213 * <pool_dev> <dev_id> 2489 * <pool_dev> <dev_id> [origin_dev]
2214 * 2490 *
2215 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) 2491 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2216 * dev_id: the internal device identifier 2492 * dev_id: the internal device identifier
2493 * origin_dev: a device external to the pool that should act as the origin
2494 *
2495 * If the pool device has discards disabled, they get disabled for the thin
2496 * device as well.
2217 */ 2497 */
2218static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) 2498static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2219{ 2499{
2220 int r; 2500 int r;
2221 struct thin_c *tc; 2501 struct thin_c *tc;
2222 struct dm_dev *pool_dev; 2502 struct dm_dev *pool_dev, *origin_dev;
2223 struct mapped_device *pool_md; 2503 struct mapped_device *pool_md;
2224 2504
2225 mutex_lock(&dm_thin_pool_table.mutex); 2505 mutex_lock(&dm_thin_pool_table.mutex);
2226 2506
2227 if (argc != 2) { 2507 if (argc != 2 && argc != 3) {
2228 ti->error = "Invalid argument count"; 2508 ti->error = "Invalid argument count";
2229 r = -EINVAL; 2509 r = -EINVAL;
2230 goto out_unlock; 2510 goto out_unlock;
@@ -2237,6 +2517,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2237 goto out_unlock; 2517 goto out_unlock;
2238 } 2518 }
2239 2519
2520 if (argc == 3) {
2521 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2522 if (r) {
2523 ti->error = "Error opening origin device";
2524 goto bad_origin_dev;
2525 }
2526 tc->origin_dev = origin_dev;
2527 }
2528
2240 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); 2529 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2241 if (r) { 2530 if (r) {
2242 ti->error = "Error opening pool device"; 2531 ti->error = "Error opening pool device";
@@ -2273,8 +2562,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2273 2562
2274 ti->split_io = tc->pool->sectors_per_block; 2563 ti->split_io = tc->pool->sectors_per_block;
2275 ti->num_flush_requests = 1; 2564 ti->num_flush_requests = 1;
2276 ti->num_discard_requests = 0; 2565
2277 ti->discards_supported = 0; 2566 /* In case the pool supports discards, pass them on. */
2567 if (tc->pool->pf.discard_enabled) {
2568 ti->discards_supported = 1;
2569 ti->num_discard_requests = 1;
2570 }
2278 2571
2279 dm_put(pool_md); 2572 dm_put(pool_md);
2280 2573
@@ -2289,6 +2582,9 @@ bad_pool_lookup:
2289bad_common: 2582bad_common:
2290 dm_put_device(ti, tc->pool_dev); 2583 dm_put_device(ti, tc->pool_dev);
2291bad_pool_dev: 2584bad_pool_dev:
2585 if (tc->origin_dev)
2586 dm_put_device(ti, tc->origin_dev);
2587bad_origin_dev:
2292 kfree(tc); 2588 kfree(tc);
2293out_unlock: 2589out_unlock:
2294 mutex_unlock(&dm_thin_pool_table.mutex); 2590 mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2299,11 +2595,46 @@ out_unlock:
2299static int thin_map(struct dm_target *ti, struct bio *bio, 2595static int thin_map(struct dm_target *ti, struct bio *bio,
2300 union map_info *map_context) 2596 union map_info *map_context)
2301{ 2597{
2302 bio->bi_sector -= ti->begin; 2598 bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
2303 2599
2304 return thin_bio_map(ti, bio, map_context); 2600 return thin_bio_map(ti, bio, map_context);
2305} 2601}
2306 2602
2603static int thin_endio(struct dm_target *ti,
2604 struct bio *bio, int err,
2605 union map_info *map_context)
2606{
2607 unsigned long flags;
2608 struct endio_hook *h = map_context->ptr;
2609 struct list_head work;
2610 struct new_mapping *m, *tmp;
2611 struct pool *pool = h->tc->pool;
2612
2613 if (h->shared_read_entry) {
2614 INIT_LIST_HEAD(&work);
2615 ds_dec(h->shared_read_entry, &work);
2616
2617 spin_lock_irqsave(&pool->lock, flags);
2618 list_for_each_entry_safe(m, tmp, &work, list) {
2619 list_del(&m->list);
2620 m->quiesced = 1;
2621 __maybe_add_mapping(m);
2622 }
2623 spin_unlock_irqrestore(&pool->lock, flags);
2624 }
2625
2626 if (h->all_io_entry) {
2627 INIT_LIST_HEAD(&work);
2628 ds_dec(h->all_io_entry, &work);
2629 list_for_each_entry_safe(m, tmp, &work, list)
2630 list_add(&m->list, &pool->prepared_discards);
2631 }
2632
2633 mempool_free(h, pool->endio_hook_pool);
2634
2635 return 0;
2636}
2637
2307static void thin_postsuspend(struct dm_target *ti) 2638static void thin_postsuspend(struct dm_target *ti)
2308{ 2639{
2309 if (dm_noflush_suspending(ti)) 2640 if (dm_noflush_suspending(ti))
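
The new thin_endio() above is where the deferred sets pay off: when the last tracked bio completes, ds_dec() returns the new_mappings (or deferred discards) that were waiting to be quiesced, and they are queued for the worker. A heavily simplified single-entry model of that idea, with one in-flight counter and a parked-work list (the real deferred_set in dm-thin tracks many entries and is not part of this diff, so treat this only as an illustration):

#include <stdio.h>

#define MAX_WORK 8

/* Toy "deferred set": one entry, a count of in-flight I/Os and parked work. */
struct deferred_set {
	unsigned in_flight;
	const char *work[MAX_WORK];
	unsigned nr_work;
};

static void ds_io_start(struct deferred_set *ds)
{
	ds->in_flight++;			/* like ds_inc() when a bio is hooked */
}

/* Park work until every I/O that was in flight has completed. */
static void ds_defer_work(struct deferred_set *ds, const char *what)
{
	if (!ds->in_flight) {			/* like ds_add_work() returning 0 */
		printf("no I/O in flight, '%s' runs immediately\n", what);
		return;
	}
	ds->work[ds->nr_work++] = what;
}

static void ds_io_end(struct deferred_set *ds)
{
	if (--ds->in_flight == 0) {		/* like ds_dec() from thin_endio() */
		unsigned i;
		for (i = 0; i < ds->nr_work; i++)
			printf("quiesced, now running '%s'\n", ds->work[i]);
		ds->nr_work = 0;
	}
}

int main(void)
{
	struct deferred_set ds = { 0, { 0 }, 0 };

	ds_io_start(&ds);			/* two reads against a shared block */
	ds_io_start(&ds);
	ds_defer_work(&ds, "insert new mapping");	/* must wait for both */

	ds_io_end(&ds);				/* first read completes */
	ds_io_end(&ds);				/* last read completes -> work runs */
	return 0;
}
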
@@ -2347,6 +2678,8 @@ static int thin_status(struct dm_target *ti, status_type_t type,
2347 DMEMIT("%s %lu", 2678 DMEMIT("%s %lu",
2348 format_dev_t(buf, tc->pool_dev->bdev->bd_dev), 2679 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2349 (unsigned long) tc->dev_id); 2680 (unsigned long) tc->dev_id);
2681 if (tc->origin_dev)
2682 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
2350 break; 2683 break;
2351 } 2684 }
2352 } 2685 }
@@ -2377,18 +2710,21 @@ static int thin_iterate_devices(struct dm_target *ti,
2377static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) 2710static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2378{ 2711{
2379 struct thin_c *tc = ti->private; 2712 struct thin_c *tc = ti->private;
2713 struct pool *pool = tc->pool;
2380 2714
2381 blk_limits_io_min(limits, 0); 2715 blk_limits_io_min(limits, 0);
2382 blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT); 2716 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2717 set_discard_limits(pool, limits);
2383} 2718}
2384 2719
2385static struct target_type thin_target = { 2720static struct target_type thin_target = {
2386 .name = "thin", 2721 .name = "thin",
2387 .version = {1, 0, 0}, 2722 .version = {1, 1, 0},
2388 .module = THIS_MODULE, 2723 .module = THIS_MODULE,
2389 .ctr = thin_ctr, 2724 .ctr = thin_ctr,
2390 .dtr = thin_dtr, 2725 .dtr = thin_dtr,
2391 .map = thin_map, 2726 .map = thin_map,
2727 .end_io = thin_endio,
2392 .postsuspend = thin_postsuspend, 2728 .postsuspend = thin_postsuspend,
2393 .status = thin_status, 2729 .status = thin_status,
2394 .iterate_devices = thin_iterate_devices, 2730 .iterate_devices = thin_iterate_devices,