author     Joe Thornber <thornber@redhat.com>      2011-10-31 16:21:18 -0400
committer  Alasdair G Kergon <agk@redhat.com>      2011-10-31 16:21:18 -0400
commit     991d9fa02da0dd1f843dc011376965e0c8c6c9b5 (patch)
tree       a64c94710246b77bb74cd77634581cea3d32cfe1 /drivers/md/dm-thin.c
parent     3241b1d3e0aaafbfcd320f4d71ade629728cc4f4 (diff)
dm: add thin provisioning target
Initial EXPERIMENTAL implementation of device-mapper thin provisioning
with snapshot support.  The 'thin' target is used to create instances of
the virtual devices that are hosted in the 'thin-pool' target.  The
thin-pool target provides data sharing among devices.  This sharing is
made possible using the persistent-data library in the previous patch.

The main highlight of this implementation, compared to the previous
implementation of snapshots, is that it allows many virtual devices to
be stored on the same data volume, simplifying administration and
allowing sharing of data between volumes (thus reducing disk usage).

Another big feature is support for an arbitrary depth of recursive
snapshots (snapshots of snapshots of snapshots ...).  The previous
implementation of snapshots did this by chaining together lookup tables,
so performance was O(depth).  This new implementation uses a single data
structure, so there is no such degradation with depth.

For further information and examples of how to use this, please read
Documentation/device-mapper/thin-provisioning.txt

Signed-off-by: Joe Thornber <thornber@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
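A minimal usage sketch, not part of the commit: device names, sizes and device ids are illustrative, and the 'thin' target table format (thin <pool dev> <dev id>) is assumed from the thin constructor later in this file and the referenced documentation. It creates a pool with 64KB blocks, provisions thin device 0, and snapshots it as device 1:

    # thin-pool <metadata dev> <data dev> <block size (sectors)> <low water mark (blocks)> [<#feature args> <arg>*]
    dmsetup create pool --table "0 20971520 thin-pool /dev/sdb1 /dev/sdb2 128 16384"

    # Provision virtual device 0 inside the pool, then activate it as a 1GiB thin volume.
    dmsetup message /dev/mapper/pool 0 "create_thin 0"
    dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"

    # Quiesce the origin before snapshotting it (device 1 becomes a snapshot of device 0).
    dmsetup suspend /dev/mapper/thin
    dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
    dmsetup resume /dev/mapper/thin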
Diffstat (limited to 'drivers/md/dm-thin.c')
-rw-r--r--   drivers/md/dm-thin.c   2428
1 file changed, 2428 insertions(+), 0 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
new file mode 100644
index 000000000000..c3087575fef0
--- /dev/null
+++ b/drivers/md/dm-thin.c
@@ -0,0 +1,2428 @@
1/*
2 * Copyright (C) 2011 Red Hat UK.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-thin-metadata.h"
8
9#include <linux/device-mapper.h>
10#include <linux/dm-io.h>
11#include <linux/dm-kcopyd.h>
12#include <linux/list.h>
13#include <linux/init.h>
14#include <linux/module.h>
15#include <linux/slab.h>
16
17#define DM_MSG_PREFIX "thin"
18
19/*
20 * Tunable constants
21 */
22#define ENDIO_HOOK_POOL_SIZE 10240
23#define DEFERRED_SET_SIZE 64
24#define MAPPING_POOL_SIZE 1024
25#define PRISON_CELLS 1024
26
27/*
28 * The block size of the device holding pool data must be
29 * between 64KB and 1GB.
30 */
31#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
32#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
33
34/*
35 * The metadata device is currently limited in size. The limitation is
36 * checked lower down in dm-space-map-metadata, but we also check it here
37 * so we can fail early.
38 *
39 * We have one block of index, which can hold 255 index entries. Each
40 * index entry contains allocation info about 16k metadata blocks.
41 */
42#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
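/*
 * For example, assuming the 4KB metadata block size defined in
 * dm-thin-metadata.h, this works out to 255 * 16384 * 8 sectors,
 * i.e. 33423360 sectors, or a little under 16GB of metadata.
 */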
43
44/*
45 * Device id is restricted to 24 bits.
46 */
47#define MAX_DEV_ID ((1 << 24) - 1)
48
49/*
50 * How do we handle breaking sharing of data blocks?
51 * =================================================
52 *
53 * We use a standard copy-on-write btree to store the mappings for the
54 * devices (note I'm talking about copy-on-write of the metadata here, not
55 * the data). When you take an internal snapshot you clone the root node
56 * of the origin btree. After this there is no concept of an origin or a
57 * snapshot. They are just two device trees that happen to point to the
58 * same data blocks.
59 *
60 * When we get a write in we decide if it's to a shared data block using
61 * some timestamp magic. If it is, we have to break sharing.
62 *
63 * Let's say we write to a shared block in what was the origin. The
64 * steps are:
65 *
66 * i) plug further io to this physical block. (see bio_prison code).
67 *
68 * ii) quiesce any read io to that shared data block. Obviously
69 * including all devices that share this block. (see deferred_set code)
70 *
71 * iii) copy the data block to a newly allocated block. This step can be
72 * skipped if the io covers the whole block. (schedule_copy).
73 *
74 * iv) insert the new mapping into the origin's btree
75 * (process_prepared_mappings). This act of inserting breaks some
76 * sharing of btree nodes between the two devices. Breaking sharing only
77 * affects the btree of that specific device. Btrees for the other
78 * devices that share the block never change. The btree for the origin
79 * device as it was after the last commit is untouched, ie. we're using
80 * persistent data structures in the functional programming sense.
81 *
82 * v) unplug io to this physical block, including the io that triggered
83 * the breaking of sharing.
84 *
85 * Steps (ii) and (iii) occur in parallel.
86 *
87 * The metadata _doesn't_ need to be committed before the io continues. We
88 * get away with this because the io is always written to a _new_ block.
89 * If there's a crash, then:
90 *
91 * - The origin mapping will point to the old origin block (the shared
92 * one). This will contain the data as it was before the io that triggered
93 * the breaking of sharing came in.
94 *
95 * - The snap mapping still points to the old block. As it would after
96 * the commit.
97 *
98 * The downside of this scheme is that the timestamp magic isn't perfect, and
99 * will continue to think that the data block in the snapshot device is shared
100 * even after the write to the origin has broken sharing. I suspect data
101 * blocks will typically be shared by many different devices, so we're
102 * breaking sharing n + 1 times, rather than n, where n is the number of
103 * devices that reference this data block. At the moment I think the
104 * benefits far, far outweigh the disadvantages.
105 */
106
107/*----------------------------------------------------------------*/
108
109/*
110 * Sometimes we can't deal with a bio straight away. We put it in prison
111 * where it can't cause any mischief. Bios are put in a cell identified
112 * by a key; multiple bios can be in the same cell. When the cell is
113 * subsequently unlocked the bios become available.
114 */
115struct bio_prison;
116
117struct cell_key {
118 int virtual;
119 dm_thin_id dev;
120 dm_block_t block;
121};
122
123struct cell {
124 struct hlist_node list;
125 struct bio_prison *prison;
126 struct cell_key key;
127 unsigned count;
128 struct bio_list bios;
129};
130
131struct bio_prison {
132 spinlock_t lock;
133 mempool_t *cell_pool;
134
135 unsigned nr_buckets;
136 unsigned hash_mask;
137 struct hlist_head *cells;
138};
139
140static uint32_t calc_nr_buckets(unsigned nr_cells)
141{
142 uint32_t n = 128;
143
144 nr_cells /= 4;
145 nr_cells = min(nr_cells, 8192u);
146
147 while (n < nr_cells)
148 n <<= 1;
149
150 return n;
151}
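/*
 * For example, the pool creates its prison with PRISON_CELLS (1024)
 * cells, for which calc_nr_buckets() above gives 1024 / 4 = 256, already
 * a power of two, so the prison gets 256 hash buckets.
 */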
152
153/*
154 * @nr_cells should be the number of cells you want in use _concurrently_.
155 * Don't confuse it with the number of distinct keys.
156 */
157static struct bio_prison *prison_create(unsigned nr_cells)
158{
159 unsigned i;
160 uint32_t nr_buckets = calc_nr_buckets(nr_cells);
161 size_t len = sizeof(struct bio_prison) +
162 (sizeof(struct hlist_head) * nr_buckets);
163 struct bio_prison *prison = kmalloc(len, GFP_KERNEL);
164
165 if (!prison)
166 return NULL;
167
168 spin_lock_init(&prison->lock);
169 prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
170 sizeof(struct cell));
171 if (!prison->cell_pool) {
172 kfree(prison);
173 return NULL;
174 }
175
176 prison->nr_buckets = nr_buckets;
177 prison->hash_mask = nr_buckets - 1;
178 prison->cells = (struct hlist_head *) (prison + 1);
179 for (i = 0; i < nr_buckets; i++)
180 INIT_HLIST_HEAD(prison->cells + i);
181
182 return prison;
183}
184
185static void prison_destroy(struct bio_prison *prison)
186{
187 mempool_destroy(prison->cell_pool);
188 kfree(prison);
189}
190
191static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
192{
193 const unsigned long BIG_PRIME = 4294967291UL;
194 uint64_t hash = key->block * BIG_PRIME;
195
196 return (uint32_t) (hash & prison->hash_mask);
197}
198
199static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
200{
201 return (lhs->virtual == rhs->virtual) &&
202 (lhs->dev == rhs->dev) &&
203 (lhs->block == rhs->block);
204}
205
206static struct cell *__search_bucket(struct hlist_head *bucket,
207 struct cell_key *key)
208{
209 struct cell *cell;
210 struct hlist_node *tmp;
211
212 hlist_for_each_entry(cell, tmp, bucket, list)
213 if (keys_equal(&cell->key, key))
214 return cell;
215
216 return NULL;
217}
218
219/*
220 * This may block if a new cell needs allocating. You must ensure that
221 * cells will be unlocked even if the calling thread is blocked.
222 *
223 * Returns the number of entries in the cell prior to the new addition
224 * or < 0 on failure.
225 */
226static int bio_detain(struct bio_prison *prison, struct cell_key *key,
227 struct bio *inmate, struct cell **ref)
228{
229 int r;
230 unsigned long flags;
231 uint32_t hash = hash_key(prison, key);
232 struct cell *uninitialized_var(cell), *cell2 = NULL;
233
234 BUG_ON(hash > prison->nr_buckets);
235
236 spin_lock_irqsave(&prison->lock, flags);
237 cell = __search_bucket(prison->cells + hash, key);
238
239 if (!cell) {
240 /*
241 * Allocate a new cell
242 */
243 spin_unlock_irqrestore(&prison->lock, flags);
244 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
245 spin_lock_irqsave(&prison->lock, flags);
246
247 /*
248 * We've been unlocked, so we have to double check that
249 * nobody else has inserted this cell in the meantime.
250 */
251 cell = __search_bucket(prison->cells + hash, key);
252
253 if (!cell) {
254 cell = cell2;
255 cell2 = NULL;
256
257 cell->prison = prison;
258 memcpy(&cell->key, key, sizeof(cell->key));
259 cell->count = 0;
260 bio_list_init(&cell->bios);
261 hlist_add_head(&cell->list, prison->cells + hash);
262 }
263 }
264
265 r = cell->count++;
266 bio_list_add(&cell->bios, inmate);
267 spin_unlock_irqrestore(&prison->lock, flags);
268
269 if (cell2)
270 mempool_free(cell2, prison->cell_pool);
271
272 *ref = cell;
273
274 return r;
275}
276
277/*
278 * @inmates must have been initialised prior to this call
279 */
280static void __cell_release(struct cell *cell, struct bio_list *inmates)
281{
282 struct bio_prison *prison = cell->prison;
283
284 hlist_del(&cell->list);
285
286 if (inmates)
287 bio_list_merge(inmates, &cell->bios);
288
289 mempool_free(cell, prison->cell_pool);
290}
291
292static void cell_release(struct cell *cell, struct bio_list *bios)
293{
294 unsigned long flags;
295 struct bio_prison *prison = cell->prison;
296
297 spin_lock_irqsave(&prison->lock, flags);
298 __cell_release(cell, bios);
299 spin_unlock_irqrestore(&prison->lock, flags);
300}
301
302/*
303 * There are a couple of places where we put a bio into a cell briefly
304 * before taking it out again. In these situations we know that no other
305 * bio may be in the cell. This function releases the cell, and also does
306 * a sanity check.
307 */
308static void cell_release_singleton(struct cell *cell, struct bio *bio)
309{
310 struct bio_prison *prison = cell->prison;
311 struct bio_list bios;
312 struct bio *b;
313 unsigned long flags;
314
315 bio_list_init(&bios);
316
317 spin_lock_irqsave(&prison->lock, flags);
318 __cell_release(cell, &bios);
319 spin_unlock_irqrestore(&prison->lock, flags);
320
321 b = bio_list_pop(&bios);
322 BUG_ON(b != bio);
323 BUG_ON(!bio_list_empty(&bios));
324}
325
326static void cell_error(struct cell *cell)
327{
328 struct bio_prison *prison = cell->prison;
329 struct bio_list bios;
330 struct bio *bio;
331 unsigned long flags;
332
333 bio_list_init(&bios);
334
335 spin_lock_irqsave(&prison->lock, flags);
336 __cell_release(cell, &bios);
337 spin_unlock_irqrestore(&prison->lock, flags);
338
339 while ((bio = bio_list_pop(&bios)))
340 bio_io_error(bio);
341}
342
343/*----------------------------------------------------------------*/
344
345/*
346 * We use the deferred set to keep track of pending reads to shared blocks.
347 * We do this to ensure the new mapping caused by a write isn't performed
348 * until these prior reads have completed. Otherwise the insertion of the
349 * new mapping could free the old block that the read bios are mapped to.
350 */
351
352struct deferred_set;
353struct deferred_entry {
354 struct deferred_set *ds;
355 unsigned count;
356 struct list_head work_items;
357};
358
359struct deferred_set {
360 spinlock_t lock;
361 unsigned current_entry;
362 unsigned sweeper;
363 struct deferred_entry entries[DEFERRED_SET_SIZE];
364};
365
366static void ds_init(struct deferred_set *ds)
367{
368 int i;
369
370 spin_lock_init(&ds->lock);
371 ds->current_entry = 0;
372 ds->sweeper = 0;
373 for (i = 0; i < DEFERRED_SET_SIZE; i++) {
374 ds->entries[i].ds = ds;
375 ds->entries[i].count = 0;
376 INIT_LIST_HEAD(&ds->entries[i].work_items);
377 }
378}
379
380static struct deferred_entry *ds_inc(struct deferred_set *ds)
381{
382 unsigned long flags;
383 struct deferred_entry *entry;
384
385 spin_lock_irqsave(&ds->lock, flags);
386 entry = ds->entries + ds->current_entry;
387 entry->count++;
388 spin_unlock_irqrestore(&ds->lock, flags);
389
390 return entry;
391}
392
393static unsigned ds_next(unsigned index)
394{
395 return (index + 1) % DEFERRED_SET_SIZE;
396}
397
398static void __sweep(struct deferred_set *ds, struct list_head *head)
399{
400 while ((ds->sweeper != ds->current_entry) &&
401 !ds->entries[ds->sweeper].count) {
402 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
403 ds->sweeper = ds_next(ds->sweeper);
404 }
405
406 if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
407 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
408}
409
410static void ds_dec(struct deferred_entry *entry, struct list_head *head)
411{
412 unsigned long flags;
413
414 spin_lock_irqsave(&entry->ds->lock, flags);
415 BUG_ON(!entry->count);
416 --entry->count;
417 __sweep(entry->ds, head);
418 spin_unlock_irqrestore(&entry->ds->lock, flags);
419}
420
421/*
422 * Returns 1 if the work was deferred, or 0 if there are no pending items to delay the job.
423 */
424static int ds_add_work(struct deferred_set *ds, struct list_head *work)
425{
426 int r = 1;
427 unsigned long flags;
428 unsigned next_entry;
429
430 spin_lock_irqsave(&ds->lock, flags);
431 if ((ds->sweeper == ds->current_entry) &&
432 !ds->entries[ds->current_entry].count)
433 r = 0;
434 else {
435 list_add(work, &ds->entries[ds->current_entry].work_items);
436 next_entry = ds_next(ds->current_entry);
437 if (!ds->entries[next_entry].count)
438 ds->current_entry = next_entry;
439 }
440 spin_unlock_irqrestore(&ds->lock, flags);
441
442 return r;
443}
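/*
 * Putting the pieces above together: ds_inc() pins the current entry on
 * behalf of an in-flight shared read, ds_add_work() parks a work item (a
 * new_mapping's list head) on the current entry and advances
 * current_entry when the next slot is idle, and ds_dec() unpins an
 * entry, with __sweep() handing back, in order, the work items of every
 * entry that has fully drained.  So if two shared reads are in flight
 * when a write needs to break sharing, the write's new_mapping sits on
 * the entry those reads pinned and is only handed to the worker once
 * both reads have called ds_dec() via shared_read_endio().
 */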
444
445/*----------------------------------------------------------------*/
446
447/*
448 * Key building.
449 */
450static void build_data_key(struct dm_thin_device *td,
451 dm_block_t b, struct cell_key *key)
452{
453 key->virtual = 0;
454 key->dev = dm_thin_dev_id(td);
455 key->block = b;
456}
457
458static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
459 struct cell_key *key)
460{
461 key->virtual = 1;
462 key->dev = dm_thin_dev_id(td);
463 key->block = b;
464}
465
466/*----------------------------------------------------------------*/
467
468/*
469 * A pool device ties together a metadata device and a data device. It
470 * also provides the interface for creating and destroying internal
471 * devices.
472 */
473struct new_mapping;
474struct pool {
475 struct list_head list;
476 struct dm_target *ti; /* Only set if a pool target is bound */
477
478 struct mapped_device *pool_md;
479 struct block_device *md_dev;
480 struct dm_pool_metadata *pmd;
481
482 uint32_t sectors_per_block;
483 unsigned block_shift;
484 dm_block_t offset_mask;
485 dm_block_t low_water_blocks;
486
487 unsigned zero_new_blocks:1;
488 unsigned low_water_triggered:1; /* A dm event has been sent */
489 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
490
491 struct bio_prison *prison;
492 struct dm_kcopyd_client *copier;
493
494 struct workqueue_struct *wq;
495 struct work_struct worker;
496
497 unsigned ref_count;
498
499 spinlock_t lock;
500 struct bio_list deferred_bios;
501 struct bio_list deferred_flush_bios;
502 struct list_head prepared_mappings;
503
504 struct bio_list retry_on_resume_list;
505
506 struct deferred_set ds; /* FIXME: move to thin_c */
507
508 struct new_mapping *next_mapping;
509 mempool_t *mapping_pool;
510 mempool_t *endio_hook_pool;
511};
512
513/*
514 * Target context for a pool.
515 */
516struct pool_c {
517 struct dm_target *ti;
518 struct pool *pool;
519 struct dm_dev *data_dev;
520 struct dm_dev *metadata_dev;
521 struct dm_target_callbacks callbacks;
522
523 dm_block_t low_water_blocks;
524 unsigned zero_new_blocks:1;
525};
526
527/*
528 * Target context for a thin.
529 */
530struct thin_c {
531 struct dm_dev *pool_dev;
532 dm_thin_id dev_id;
533
534 struct pool *pool;
535 struct dm_thin_device *td;
536};
537
538/*----------------------------------------------------------------*/
539
540/*
541 * A global list of pools that uses a struct mapped_device as a key.
542 */
543static struct dm_thin_pool_table {
544 struct mutex mutex;
545 struct list_head pools;
546} dm_thin_pool_table;
547
548static void pool_table_init(void)
549{
550 mutex_init(&dm_thin_pool_table.mutex);
551 INIT_LIST_HEAD(&dm_thin_pool_table.pools);
552}
553
554static void __pool_table_insert(struct pool *pool)
555{
556 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
557 list_add(&pool->list, &dm_thin_pool_table.pools);
558}
559
560static void __pool_table_remove(struct pool *pool)
561{
562 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
563 list_del(&pool->list);
564}
565
566static struct pool *__pool_table_lookup(struct mapped_device *md)
567{
568 struct pool *pool = NULL, *tmp;
569
570 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
571
572 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
573 if (tmp->pool_md == md) {
574 pool = tmp;
575 break;
576 }
577 }
578
579 return pool;
580}
581
582static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
583{
584 struct pool *pool = NULL, *tmp;
585
586 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
587
588 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
589 if (tmp->md_dev == md_dev) {
590 pool = tmp;
591 break;
592 }
593 }
594
595 return pool;
596}
597
598/*----------------------------------------------------------------*/
599
600static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
601{
602 struct bio *bio;
603 struct bio_list bios;
604
605 bio_list_init(&bios);
606 bio_list_merge(&bios, master);
607 bio_list_init(master);
608
609 while ((bio = bio_list_pop(&bios))) {
610 if (dm_get_mapinfo(bio)->ptr == tc)
611 bio_endio(bio, DM_ENDIO_REQUEUE);
612 else
613 bio_list_add(master, bio);
614 }
615}
616
617static void requeue_io(struct thin_c *tc)
618{
619 struct pool *pool = tc->pool;
620 unsigned long flags;
621
622 spin_lock_irqsave(&pool->lock, flags);
623 __requeue_bio_list(tc, &pool->deferred_bios);
624 __requeue_bio_list(tc, &pool->retry_on_resume_list);
625 spin_unlock_irqrestore(&pool->lock, flags);
626}
627
628/*
629 * This section of code contains the logic for processing a thin device's IO.
630 * Much of the code depends on pool object resources (lists, workqueues, etc)
631 * but most is exclusively called from the thin target rather than the thin-pool
632 * target.
633 */
634
635static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
636{
637 return bio->bi_sector >> tc->pool->block_shift;
638}
639
640static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
641{
642 struct pool *pool = tc->pool;
643
644 bio->bi_bdev = tc->pool_dev->bdev;
645 bio->bi_sector = (block << pool->block_shift) +
646 (bio->bi_sector & pool->offset_mask);
647}
648
649static void remap_and_issue(struct thin_c *tc, struct bio *bio,
650 dm_block_t block)
651{
652 struct pool *pool = tc->pool;
653 unsigned long flags;
654
655 remap(tc, bio, block);
656
657 /*
658 * Batch together any FUA/FLUSH bios we find and then issue
659 * a single commit for them in process_deferred_bios().
660 */
661 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
662 spin_lock_irqsave(&pool->lock, flags);
663 bio_list_add(&pool->deferred_flush_bios, bio);
664 spin_unlock_irqrestore(&pool->lock, flags);
665 } else
666 generic_make_request(bio);
667}
668
669/*
670 * wake_worker() is used when new work is queued and when pool_resume is
671 * ready to continue deferred IO processing.
672 */
673static void wake_worker(struct pool *pool)
674{
675 queue_work(pool->wq, &pool->worker);
676}
677
678/*----------------------------------------------------------------*/
679
680/*
681 * Bio endio functions.
682 */
683struct endio_hook {
684 struct thin_c *tc;
685 bio_end_io_t *saved_bi_end_io;
686 struct deferred_entry *entry;
687};
688
689struct new_mapping {
690 struct list_head list;
691
692 int prepared;
693
694 struct thin_c *tc;
695 dm_block_t virt_block;
696 dm_block_t data_block;
697 struct cell *cell;
698 int err;
699
700 /*
701 * If the bio covers the whole area of a block then we can avoid
702 * zeroing or copying. Instead this bio is hooked. The bio will
703 * still be in the cell, so care has to be taken to avoid issuing
704 * the bio twice.
705 */
706 struct bio *bio;
707 bio_end_io_t *saved_bi_end_io;
708};
709
710static void __maybe_add_mapping(struct new_mapping *m)
711{
712 struct pool *pool = m->tc->pool;
713
714 if (list_empty(&m->list) && m->prepared) {
715 list_add(&m->list, &pool->prepared_mappings);
716 wake_worker(pool);
717 }
718}
719
720static void copy_complete(int read_err, unsigned long write_err, void *context)
721{
722 unsigned long flags;
723 struct new_mapping *m = context;
724 struct pool *pool = m->tc->pool;
725
726 m->err = read_err || write_err ? -EIO : 0;
727
728 spin_lock_irqsave(&pool->lock, flags);
729 m->prepared = 1;
730 __maybe_add_mapping(m);
731 spin_unlock_irqrestore(&pool->lock, flags);
732}
733
734static void overwrite_endio(struct bio *bio, int err)
735{
736 unsigned long flags;
737 struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
738 struct pool *pool = m->tc->pool;
739
740 m->err = err;
741
742 spin_lock_irqsave(&pool->lock, flags);
743 m->prepared = 1;
744 __maybe_add_mapping(m);
745 spin_unlock_irqrestore(&pool->lock, flags);
746}
747
748static void shared_read_endio(struct bio *bio, int err)
749{
750 struct list_head mappings;
751 struct new_mapping *m, *tmp;
752 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
753 unsigned long flags;
754 struct pool *pool = h->tc->pool;
755
756 bio->bi_end_io = h->saved_bi_end_io;
757 bio_endio(bio, err);
758
759 INIT_LIST_HEAD(&mappings);
760 ds_dec(h->entry, &mappings);
761
762 spin_lock_irqsave(&pool->lock, flags);
763 list_for_each_entry_safe(m, tmp, &mappings, list) {
764 list_del(&m->list);
765 INIT_LIST_HEAD(&m->list);
766 __maybe_add_mapping(m);
767 }
768 spin_unlock_irqrestore(&pool->lock, flags);
769
770 mempool_free(h, pool->endio_hook_pool);
771}
772
773/*----------------------------------------------------------------*/
774
775/*
776 * Workqueue.
777 */
778
779/*
780 * Prepared mapping jobs.
781 */
782
783/*
784 * This sends the bios in the cell back to the deferred_bios list.
785 */
786static void cell_defer(struct thin_c *tc, struct cell *cell,
787 dm_block_t data_block)
788{
789 struct pool *pool = tc->pool;
790 unsigned long flags;
791
792 spin_lock_irqsave(&pool->lock, flags);
793 cell_release(cell, &pool->deferred_bios);
794 spin_unlock_irqrestore(&tc->pool->lock, flags);
795
796 wake_worker(pool);
797}
798
799/*
800 * Same as cell_defer above, except it omits one particular detainee,
801 * a write bio that covers the block and has already been processed.
802 */
803static void cell_defer_except(struct thin_c *tc, struct cell *cell,
804 struct bio *exception)
805{
806 struct bio_list bios;
807 struct bio *bio;
808 struct pool *pool = tc->pool;
809 unsigned long flags;
810
811 bio_list_init(&bios);
812 cell_release(cell, &bios);
813
814 spin_lock_irqsave(&pool->lock, flags);
815 while ((bio = bio_list_pop(&bios)))
816 if (bio != exception)
817 bio_list_add(&pool->deferred_bios, bio);
818 spin_unlock_irqrestore(&pool->lock, flags);
819
820 wake_worker(pool);
821}
822
823static void process_prepared_mapping(struct new_mapping *m)
824{
825 struct thin_c *tc = m->tc;
826 struct bio *bio;
827 int r;
828
829 bio = m->bio;
830 if (bio)
831 bio->bi_end_io = m->saved_bi_end_io;
832
833 if (m->err) {
834 cell_error(m->cell);
835 return;
836 }
837
838 /*
839 * Commit the prepared block into the mapping btree.
840 * Any I/O for this block arriving after this point will get
841 * remapped to it directly.
842 */
843 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
844 if (r) {
845 DMERR("dm_thin_insert_block() failed");
846 cell_error(m->cell);
847 return;
848 }
849
850 /*
851 * Release any bios held while the block was being provisioned.
852 * If we are processing a write bio that completely covers the block,
853 * we already processed it so can ignore it now when processing
854 * the bios in the cell.
855 */
856 if (bio) {
857 cell_defer_except(tc, m->cell, bio);
858 bio_endio(bio, 0);
859 } else
860 cell_defer(tc, m->cell, m->data_block);
861
862 list_del(&m->list);
863 mempool_free(m, tc->pool->mapping_pool);
864}
865
866static void process_prepared_mappings(struct pool *pool)
867{
868 unsigned long flags;
869 struct list_head maps;
870 struct new_mapping *m, *tmp;
871
872 INIT_LIST_HEAD(&maps);
873 spin_lock_irqsave(&pool->lock, flags);
874 list_splice_init(&pool->prepared_mappings, &maps);
875 spin_unlock_irqrestore(&pool->lock, flags);
876
877 list_for_each_entry_safe(m, tmp, &maps, list)
878 process_prepared_mapping(m);
879}
880
881/*
882 * Deferred bio jobs.
883 */
884static int io_overwrites_block(struct pool *pool, struct bio *bio)
885{
886 return ((bio_data_dir(bio) == WRITE) &&
887 !(bio->bi_sector & pool->offset_mask)) &&
888 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
889}
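/*
 * In other words, only a WRITE that starts on a block boundary and is
 * exactly one pool block long (e.g. 64KB for the minimum 128-sector
 * block size) counts as overwriting the block, which lets
 * schedule_copy() and schedule_zero() skip the copy/zero step.
 */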
890
891static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
892 bio_end_io_t *fn)
893{
894 *save = bio->bi_end_io;
895 bio->bi_end_io = fn;
896}
897
898static int ensure_next_mapping(struct pool *pool)
899{
900 if (pool->next_mapping)
901 return 0;
902
903 pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
904
905 return pool->next_mapping ? 0 : -ENOMEM;
906}
907
908static struct new_mapping *get_next_mapping(struct pool *pool)
909{
910 struct new_mapping *r = pool->next_mapping;
911
912 BUG_ON(!pool->next_mapping);
913
914 pool->next_mapping = NULL;
915
916 return r;
917}
918
919static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
920 dm_block_t data_origin, dm_block_t data_dest,
921 struct cell *cell, struct bio *bio)
922{
923 int r;
924 struct pool *pool = tc->pool;
925 struct new_mapping *m = get_next_mapping(pool);
926
927 INIT_LIST_HEAD(&m->list);
928 m->prepared = 0;
929 m->tc = tc;
930 m->virt_block = virt_block;
931 m->data_block = data_dest;
932 m->cell = cell;
933 m->err = 0;
934 m->bio = NULL;
935
936 ds_add_work(&pool->ds, &m->list);
937
938 /*
939 * IO to pool_dev remaps to the pool target's data_dev.
940 *
941 * If the whole block of data is being overwritten, we can issue the
942 * bio immediately. Otherwise we use kcopyd to clone the data first.
943 */
944 if (io_overwrites_block(pool, bio)) {
945 m->bio = bio;
946 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
947 dm_get_mapinfo(bio)->ptr = m;
948 remap_and_issue(tc, bio, data_dest);
949 } else {
950 struct dm_io_region from, to;
951
952 from.bdev = tc->pool_dev->bdev;
953 from.sector = data_origin * pool->sectors_per_block;
954 from.count = pool->sectors_per_block;
955
956 to.bdev = tc->pool_dev->bdev;
957 to.sector = data_dest * pool->sectors_per_block;
958 to.count = pool->sectors_per_block;
959
960 r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
961 0, copy_complete, m);
962 if (r < 0) {
963 mempool_free(m, pool->mapping_pool);
964 DMERR("dm_kcopyd_copy() failed");
965 cell_error(cell);
966 }
967 }
968}
969
970static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
971 dm_block_t data_block, struct cell *cell,
972 struct bio *bio)
973{
974 struct pool *pool = tc->pool;
975 struct new_mapping *m = get_next_mapping(pool);
976
977 INIT_LIST_HEAD(&m->list);
978 m->prepared = 0;
979 m->tc = tc;
980 m->virt_block = virt_block;
981 m->data_block = data_block;
982 m->cell = cell;
983 m->err = 0;
984 m->bio = NULL;
985
986 /*
987 * If the whole block of data is being overwritten or we are not
988 * zeroing pre-existing data, we can issue the bio immediately.
989 * Otherwise we use kcopyd to zero the data first.
990 */
991 if (!pool->zero_new_blocks)
992 process_prepared_mapping(m);
993
994 else if (io_overwrites_block(pool, bio)) {
995 m->bio = bio;
996 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
997 dm_get_mapinfo(bio)->ptr = m;
998 remap_and_issue(tc, bio, data_block);
999
1000 } else {
1001 int r;
1002 struct dm_io_region to;
1003
1004 to.bdev = tc->pool_dev->bdev;
1005 to.sector = data_block * pool->sectors_per_block;
1006 to.count = pool->sectors_per_block;
1007
1008 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
1009 if (r < 0) {
1010 mempool_free(m, pool->mapping_pool);
1011 DMERR("dm_kcopyd_zero() failed");
1012 cell_error(cell);
1013 }
1014 }
1015}
1016
1017static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1018{
1019 int r;
1020 dm_block_t free_blocks;
1021 unsigned long flags;
1022 struct pool *pool = tc->pool;
1023
1024 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1025 if (r)
1026 return r;
1027
1028 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
1029 DMWARN("%s: reached low water mark, sending event.",
1030 dm_device_name(pool->pool_md));
1031 spin_lock_irqsave(&pool->lock, flags);
1032 pool->low_water_triggered = 1;
1033 spin_unlock_irqrestore(&pool->lock, flags);
1034 dm_table_event(pool->ti->table);
1035 }
1036
1037 if (!free_blocks) {
1038 if (pool->no_free_space)
1039 return -ENOSPC;
1040 else {
1041 /*
1042 * Try to commit to see if that will free up some
1043 * more space.
1044 */
1045 r = dm_pool_commit_metadata(pool->pmd);
1046 if (r) {
1047 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1048 __func__, r);
1049 return r;
1050 }
1051
1052 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1053 if (r)
1054 return r;
1055
1056 /*
1057 * If we still have no space we set a flag to avoid
1058 * doing all this checking and return -ENOSPC.
1059 */
1060 if (!free_blocks) {
1061 DMWARN("%s: no free space available.",
1062 dm_device_name(pool->pool_md));
1063 spin_lock_irqsave(&pool->lock, flags);
1064 pool->no_free_space = 1;
1065 spin_unlock_irqrestore(&pool->lock, flags);
1066 return -ENOSPC;
1067 }
1068 }
1069 }
1070
1071 r = dm_pool_alloc_data_block(pool->pmd, result);
1072 if (r)
1073 return r;
1074
1075 return 0;
1076}
1077
1078/*
1079 * If we have run out of space, queue bios until the device is
1080 * resumed, presumably after having been reloaded with more space.
1081 */
1082static void retry_on_resume(struct bio *bio)
1083{
1084 struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
1085 struct pool *pool = tc->pool;
1086 unsigned long flags;
1087
1088 spin_lock_irqsave(&pool->lock, flags);
1089 bio_list_add(&pool->retry_on_resume_list, bio);
1090 spin_unlock_irqrestore(&pool->lock, flags);
1091}
1092
1093static void no_space(struct cell *cell)
1094{
1095 struct bio *bio;
1096 struct bio_list bios;
1097
1098 bio_list_init(&bios);
1099 cell_release(cell, &bios);
1100
1101 while ((bio = bio_list_pop(&bios)))
1102 retry_on_resume(bio);
1103}
1104
1105static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1106 struct cell_key *key,
1107 struct dm_thin_lookup_result *lookup_result,
1108 struct cell *cell)
1109{
1110 int r;
1111 dm_block_t data_block;
1112
1113 r = alloc_data_block(tc, &data_block);
1114 switch (r) {
1115 case 0:
1116 schedule_copy(tc, block, lookup_result->block,
1117 data_block, cell, bio);
1118 break;
1119
1120 case -ENOSPC:
1121 no_space(cell);
1122 break;
1123
1124 default:
1125 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1126 cell_error(cell);
1127 break;
1128 }
1129}
1130
1131static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1132 dm_block_t block,
1133 struct dm_thin_lookup_result *lookup_result)
1134{
1135 struct cell *cell;
1136 struct pool *pool = tc->pool;
1137 struct cell_key key;
1138
1139 /*
1140 * If cell is already occupied, then sharing is already in the process
1141 * of being broken so we have nothing further to do here.
1142 */
1143 build_data_key(tc->td, lookup_result->block, &key);
1144 if (bio_detain(pool->prison, &key, bio, &cell))
1145 return;
1146
1147 if (bio_data_dir(bio) == WRITE)
1148 break_sharing(tc, bio, block, &key, lookup_result, cell);
1149 else {
1150 struct endio_hook *h;
1151 h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1152
1153 h->tc = tc;
1154 h->entry = ds_inc(&pool->ds);
1155 save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
1156 dm_get_mapinfo(bio)->ptr = h;
1157
1158 cell_release_singleton(cell, bio);
1159 remap_and_issue(tc, bio, lookup_result->block);
1160 }
1161}
1162
1163static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1164 struct cell *cell)
1165{
1166 int r;
1167 dm_block_t data_block;
1168
1169 /*
1170 * Remap empty bios (flushes) immediately, without provisioning.
1171 */
1172 if (!bio->bi_size) {
1173 cell_release_singleton(cell, bio);
1174 remap_and_issue(tc, bio, 0);
1175 return;
1176 }
1177
1178 /*
1179 * Fill read bios with zeroes and complete them immediately.
1180 */
1181 if (bio_data_dir(bio) == READ) {
1182 zero_fill_bio(bio);
1183 cell_release_singleton(cell, bio);
1184 bio_endio(bio, 0);
1185 return;
1186 }
1187
1188 r = alloc_data_block(tc, &data_block);
1189 switch (r) {
1190 case 0:
1191 schedule_zero(tc, block, data_block, cell, bio);
1192 break;
1193
1194 case -ENOSPC:
1195 no_space(cell);
1196 break;
1197
1198 default:
1199 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1200 cell_error(cell);
1201 break;
1202 }
1203}
1204
1205static void process_bio(struct thin_c *tc, struct bio *bio)
1206{
1207 int r;
1208 dm_block_t block = get_bio_block(tc, bio);
1209 struct cell *cell;
1210 struct cell_key key;
1211 struct dm_thin_lookup_result lookup_result;
1212
1213 /*
1214 * If cell is already occupied, then the block is already
1215 * being provisioned so we have nothing further to do here.
1216 */
1217 build_virtual_key(tc->td, block, &key);
1218 if (bio_detain(tc->pool->prison, &key, bio, &cell))
1219 return;
1220
1221 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1222 switch (r) {
1223 case 0:
1224 /*
1225 * We can release this cell now. This thread is the only
1226 * one that puts bios into a cell, and we know there were
1227 * no preceding bios.
1228 */
1229 /*
1230 * TODO: this will probably have to change when discard goes
1231 * back in.
1232 */
1233 cell_release_singleton(cell, bio);
1234
1235 if (lookup_result.shared)
1236 process_shared_bio(tc, bio, block, &lookup_result);
1237 else
1238 remap_and_issue(tc, bio, lookup_result.block);
1239 break;
1240
1241 case -ENODATA:
1242 provision_block(tc, bio, block, cell);
1243 break;
1244
1245 default:
1246 DMERR("dm_thin_find_block() failed, error = %d", r);
1247 bio_io_error(bio);
1248 break;
1249 }
1250}
1251
1252static void process_deferred_bios(struct pool *pool)
1253{
1254 unsigned long flags;
1255 struct bio *bio;
1256 struct bio_list bios;
1257 int r;
1258
1259 bio_list_init(&bios);
1260
1261 spin_lock_irqsave(&pool->lock, flags);
1262 bio_list_merge(&bios, &pool->deferred_bios);
1263 bio_list_init(&pool->deferred_bios);
1264 spin_unlock_irqrestore(&pool->lock, flags);
1265
1266 while ((bio = bio_list_pop(&bios))) {
1267 struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
1268 /*
1269 * If we've got no free new_mapping structs, and processing
1270 * this bio might require one, we pause until there are some
1271 * prepared mappings to process.
1272 */
1273 if (ensure_next_mapping(pool)) {
1274 spin_lock_irqsave(&pool->lock, flags);
1275 bio_list_merge(&pool->deferred_bios, &bios);
1276 spin_unlock_irqrestore(&pool->lock, flags);
1277
1278 break;
1279 }
1280 process_bio(tc, bio);
1281 }
1282
1283 /*
1284 * If there are any deferred flush bios, we must commit
1285 * the metadata before issuing them.
1286 */
1287 bio_list_init(&bios);
1288 spin_lock_irqsave(&pool->lock, flags);
1289 bio_list_merge(&bios, &pool->deferred_flush_bios);
1290 bio_list_init(&pool->deferred_flush_bios);
1291 spin_unlock_irqrestore(&pool->lock, flags);
1292
1293 if (bio_list_empty(&bios))
1294 return;
1295
1296 r = dm_pool_commit_metadata(pool->pmd);
1297 if (r) {
1298 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1299 __func__, r);
1300 while ((bio = bio_list_pop(&bios)))
1301 bio_io_error(bio);
1302 return;
1303 }
1304
1305 while ((bio = bio_list_pop(&bios)))
1306 generic_make_request(bio);
1307}
1308
1309static void do_worker(struct work_struct *ws)
1310{
1311 struct pool *pool = container_of(ws, struct pool, worker);
1312
1313 process_prepared_mappings(pool);
1314 process_deferred_bios(pool);
1315}
1316
1317/*----------------------------------------------------------------*/
1318
1319/*
1320 * Mapping functions.
1321 */
1322
1323/*
1324 * Called only while mapping a thin bio to hand it over to the workqueue.
1325 */
1326static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1327{
1328 unsigned long flags;
1329 struct pool *pool = tc->pool;
1330
1331 spin_lock_irqsave(&pool->lock, flags);
1332 bio_list_add(&pool->deferred_bios, bio);
1333 spin_unlock_irqrestore(&pool->lock, flags);
1334
1335 wake_worker(pool);
1336}
1337
1338/*
1339 * Non-blocking function called from the thin target's map function.
1340 */
1341static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1342 union map_info *map_context)
1343{
1344 int r;
1345 struct thin_c *tc = ti->private;
1346 dm_block_t block = get_bio_block(tc, bio);
1347 struct dm_thin_device *td = tc->td;
1348 struct dm_thin_lookup_result result;
1349
1350 /*
1351 * Save the thin context for easy access from the deferred bio later.
1352 */
1353 map_context->ptr = tc;
1354
1355 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1356 thin_defer_bio(tc, bio);
1357 return DM_MAPIO_SUBMITTED;
1358 }
1359
1360 r = dm_thin_find_block(td, block, 0, &result);
1361
1362 /*
1363 * Note that we defer readahead too.
1364 */
1365 switch (r) {
1366 case 0:
1367 if (unlikely(result.shared)) {
1368 /*
1369 * We have a race condition here between the
1370 * result.shared value returned by the lookup and
1371 * snapshot creation, which may cause new
1372 * sharing.
1373 *
1374 * To avoid this always quiesce the origin before
1375 * taking the snap. You want to do this anyway to
1376 * ensure a consistent application view
1377 * (i.e. lockfs).
1378 *
1379 * More distant ancestors are irrelevant. The
1380 * shared flag will be set in their case.
1381 */
1382 thin_defer_bio(tc, bio);
1383 r = DM_MAPIO_SUBMITTED;
1384 } else {
1385 remap(tc, bio, result.block);
1386 r = DM_MAPIO_REMAPPED;
1387 }
1388 break;
1389
1390 case -ENODATA:
1391 /*
1392 * In future, the failed dm_thin_find_block above could
1393 * provide the hint to load the metadata into cache.
1394 */
1395 case -EWOULDBLOCK:
1396 thin_defer_bio(tc, bio);
1397 r = DM_MAPIO_SUBMITTED;
1398 break;
1399 }
1400
1401 return r;
1402}
1403
1404static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1405{
1406 int r;
1407 unsigned long flags;
1408 struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1409
1410 spin_lock_irqsave(&pt->pool->lock, flags);
1411 r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1412 spin_unlock_irqrestore(&pt->pool->lock, flags);
1413
1414 if (!r) {
1415 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1416 r = bdi_congested(&q->backing_dev_info, bdi_bits);
1417 }
1418
1419 return r;
1420}
1421
1422static void __requeue_bios(struct pool *pool)
1423{
1424 bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1425 bio_list_init(&pool->retry_on_resume_list);
1426}
1427
1428/*----------------------------------------------------------------
1429 * Binding of control targets to a pool object
1430 *--------------------------------------------------------------*/
1431static int bind_control_target(struct pool *pool, struct dm_target *ti)
1432{
1433 struct pool_c *pt = ti->private;
1434
1435 pool->ti = ti;
1436 pool->low_water_blocks = pt->low_water_blocks;
1437 pool->zero_new_blocks = pt->zero_new_blocks;
1438
1439 return 0;
1440}
1441
1442static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1443{
1444 if (pool->ti == ti)
1445 pool->ti = NULL;
1446}
1447
1448/*----------------------------------------------------------------
1449 * Pool creation
1450 *--------------------------------------------------------------*/
1451static void __pool_destroy(struct pool *pool)
1452{
1453 __pool_table_remove(pool);
1454
1455 if (dm_pool_metadata_close(pool->pmd) < 0)
1456 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1457
1458 prison_destroy(pool->prison);
1459 dm_kcopyd_client_destroy(pool->copier);
1460
1461 if (pool->wq)
1462 destroy_workqueue(pool->wq);
1463
1464 if (pool->next_mapping)
1465 mempool_free(pool->next_mapping, pool->mapping_pool);
1466 mempool_destroy(pool->mapping_pool);
1467 mempool_destroy(pool->endio_hook_pool);
1468 kfree(pool);
1469}
1470
1471static struct pool *pool_create(struct mapped_device *pool_md,
1472 struct block_device *metadata_dev,
1473 unsigned long block_size, char **error)
1474{
1475 int r;
1476 void *err_p;
1477 struct pool *pool;
1478 struct dm_pool_metadata *pmd;
1479
1480 pmd = dm_pool_metadata_open(metadata_dev, block_size);
1481 if (IS_ERR(pmd)) {
1482 *error = "Error creating metadata object";
1483 return (struct pool *)pmd;
1484 }
1485
1486 pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1487 if (!pool) {
1488 *error = "Error allocating memory for pool";
1489 err_p = ERR_PTR(-ENOMEM);
1490 goto bad_pool;
1491 }
1492
1493 pool->pmd = pmd;
1494 pool->sectors_per_block = block_size;
1495 pool->block_shift = ffs(block_size) - 1;
1496 pool->offset_mask = block_size - 1;
1497 pool->low_water_blocks = 0;
1498 pool->zero_new_blocks = 1;
1499 pool->prison = prison_create(PRISON_CELLS);
1500 if (!pool->prison) {
1501 *error = "Error creating pool's bio prison";
1502 err_p = ERR_PTR(-ENOMEM);
1503 goto bad_prison;
1504 }
1505
1506 pool->copier = dm_kcopyd_client_create();
1507 if (IS_ERR(pool->copier)) {
1508 r = PTR_ERR(pool->copier);
1509 *error = "Error creating pool's kcopyd client";
1510 err_p = ERR_PTR(r);
1511 goto bad_kcopyd_client;
1512 }
1513
1514 /*
1515 * Create a single-threaded workqueue that will service all devices
1516 * that use this metadata.
1517 */
1518 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1519 if (!pool->wq) {
1520 *error = "Error creating pool's workqueue";
1521 err_p = ERR_PTR(-ENOMEM);
1522 goto bad_wq;
1523 }
1524
1525 INIT_WORK(&pool->worker, do_worker);
1526 spin_lock_init(&pool->lock);
1527 bio_list_init(&pool->deferred_bios);
1528 bio_list_init(&pool->deferred_flush_bios);
1529 INIT_LIST_HEAD(&pool->prepared_mappings);
1530 pool->low_water_triggered = 0;
1531 pool->no_free_space = 0;
1532 bio_list_init(&pool->retry_on_resume_list);
1533 ds_init(&pool->ds);
1534
1535 pool->next_mapping = NULL;
1536 pool->mapping_pool =
1537 mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping));
1538 if (!pool->mapping_pool) {
1539 *error = "Error creating pool's mapping mempool";
1540 err_p = ERR_PTR(-ENOMEM);
1541 goto bad_mapping_pool;
1542 }
1543
1544 pool->endio_hook_pool =
1545 mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook));
1546 if (!pool->endio_hook_pool) {
1547 *error = "Error creating pool's endio_hook mempool";
1548 err_p = ERR_PTR(-ENOMEM);
1549 goto bad_endio_hook_pool;
1550 }
1551 pool->ref_count = 1;
1552 pool->pool_md = pool_md;
1553 pool->md_dev = metadata_dev;
1554 __pool_table_insert(pool);
1555
1556 return pool;
1557
1558bad_endio_hook_pool:
1559 mempool_destroy(pool->mapping_pool);
1560bad_mapping_pool:
1561 destroy_workqueue(pool->wq);
1562bad_wq:
1563 dm_kcopyd_client_destroy(pool->copier);
1564bad_kcopyd_client:
1565 prison_destroy(pool->prison);
1566bad_prison:
1567 kfree(pool);
1568bad_pool:
1569 if (dm_pool_metadata_close(pmd))
1570 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1571
1572 return err_p;
1573}
1574
1575static void __pool_inc(struct pool *pool)
1576{
1577 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1578 pool->ref_count++;
1579}
1580
1581static void __pool_dec(struct pool *pool)
1582{
1583 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1584 BUG_ON(!pool->ref_count);
1585 if (!--pool->ref_count)
1586 __pool_destroy(pool);
1587}
1588
1589static struct pool *__pool_find(struct mapped_device *pool_md,
1590 struct block_device *metadata_dev,
1591 unsigned long block_size, char **error)
1592{
1593 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1594
1595 if (pool) {
1596 if (pool->pool_md != pool_md)
1597 return ERR_PTR(-EBUSY);
1598 __pool_inc(pool);
1599
1600 } else {
1601 pool = __pool_table_lookup(pool_md);
1602 if (pool) {
1603 if (pool->md_dev != metadata_dev)
1604 return ERR_PTR(-EINVAL);
1605 __pool_inc(pool);
1606
1607 } else
1608 pool = pool_create(pool_md, metadata_dev, block_size, error);
1609 }
1610
1611 return pool;
1612}
1613
1614/*----------------------------------------------------------------
1615 * Pool target methods
1616 *--------------------------------------------------------------*/
1617static void pool_dtr(struct dm_target *ti)
1618{
1619 struct pool_c *pt = ti->private;
1620
1621 mutex_lock(&dm_thin_pool_table.mutex);
1622
1623 unbind_control_target(pt->pool, ti);
1624 __pool_dec(pt->pool);
1625 dm_put_device(ti, pt->metadata_dev);
1626 dm_put_device(ti, pt->data_dev);
1627 kfree(pt);
1628
1629 mutex_unlock(&dm_thin_pool_table.mutex);
1630}
1631
1632struct pool_features {
1633 unsigned zero_new_blocks:1;
1634};
1635
1636static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1637 struct dm_target *ti)
1638{
1639 int r;
1640 unsigned argc;
1641 const char *arg_name;
1642
1643 static struct dm_arg _args[] = {
1644 {0, 1, "Invalid number of pool feature arguments"},
1645 };
1646
1647 /*
1648 * No feature arguments supplied.
1649 */
1650 if (!as->argc)
1651 return 0;
1652
1653 r = dm_read_arg_group(_args, as, &argc, &ti->error);
1654 if (r)
1655 return -EINVAL;
1656
1657 while (argc && !r) {
1658 arg_name = dm_shift_arg(as);
1659 argc--;
1660
1661 if (!strcasecmp(arg_name, "skip_block_zeroing")) {
1662 pf->zero_new_blocks = 0;
1663 continue;
1664 }
1665
1666 ti->error = "Unrecognised pool feature requested";
1667 r = -EINVAL;
1668 }
1669
1670 return r;
1671}
1672
1673/*
1674 * thin-pool <metadata dev> <data dev>
1675 * <data block size (sectors)>
1676 * <low water mark (blocks)>
1677 * [<#feature args> [<arg>]*]
1678 *
1679 * Optional feature arguments are:
1680 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1681 */
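/*
 * Example table line (illustrative device names and sizes):
 *
 *   0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data 128 16384 1 skip_block_zeroing
 *
 * i.e. a 10GB pool with 64KB (128-sector) data blocks, a low-water mark
 * of 16384 blocks and block zeroing disabled.
 */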
1682static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1683{
1684 int r;
1685 struct pool_c *pt;
1686 struct pool *pool;
1687 struct pool_features pf;
1688 struct dm_arg_set as;
1689 struct dm_dev *data_dev;
1690 unsigned long block_size;
1691 dm_block_t low_water_blocks;
1692 struct dm_dev *metadata_dev;
1693 sector_t metadata_dev_size;
1694
1695 /*
1696 * FIXME Remove validation from scope of lock.
1697 */
1698 mutex_lock(&dm_thin_pool_table.mutex);
1699
1700 if (argc < 4) {
1701 ti->error = "Invalid argument count";
1702 r = -EINVAL;
1703 goto out_unlock;
1704 }
1705 as.argc = argc;
1706 as.argv = argv;
1707
1708 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1709 if (r) {
1710 ti->error = "Error opening metadata block device";
1711 goto out_unlock;
1712 }
1713
1714 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1715 if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) {
1716 ti->error = "Metadata device is too large";
1717 r = -EINVAL;
1718 goto out_metadata;
1719 }
1720
1721 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1722 if (r) {
1723 ti->error = "Error getting data device";
1724 goto out_metadata;
1725 }
1726
1727 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1728 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1729 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1730 !is_power_of_2(block_size)) {
1731 ti->error = "Invalid block size";
1732 r = -EINVAL;
1733 goto out;
1734 }
1735
1736 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
1737 ti->error = "Invalid low water mark";
1738 r = -EINVAL;
1739 goto out;
1740 }
1741
1742 /*
1743 * Set default pool features.
1744 */
1745 memset(&pf, 0, sizeof(pf));
1746 pf.zero_new_blocks = 1;
1747
1748 dm_consume_args(&as, 4);
1749 r = parse_pool_features(&as, &pf, ti);
1750 if (r)
1751 goto out;
1752
1753 pt = kzalloc(sizeof(*pt), GFP_KERNEL);
1754 if (!pt) {
1755 r = -ENOMEM;
1756 goto out;
1757 }
1758
1759 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1760 block_size, &ti->error);
1761 if (IS_ERR(pool)) {
1762 r = PTR_ERR(pool);
1763 goto out_free_pt;
1764 }
1765
1766 pt->pool = pool;
1767 pt->ti = ti;
1768 pt->metadata_dev = metadata_dev;
1769 pt->data_dev = data_dev;
1770 pt->low_water_blocks = low_water_blocks;
1771 pt->zero_new_blocks = pf.zero_new_blocks;
1772 ti->num_flush_requests = 1;
1773 ti->num_discard_requests = 0;
1774 ti->private = pt;
1775
1776 pt->callbacks.congested_fn = pool_is_congested;
1777 dm_table_add_target_callbacks(ti->table, &pt->callbacks);
1778
1779 mutex_unlock(&dm_thin_pool_table.mutex);
1780
1781 return 0;
1782
1783out_free_pt:
1784 kfree(pt);
1785out:
1786 dm_put_device(ti, data_dev);
1787out_metadata:
1788 dm_put_device(ti, metadata_dev);
1789out_unlock:
1790 mutex_unlock(&dm_thin_pool_table.mutex);
1791
1792 return r;
1793}
1794
1795static int pool_map(struct dm_target *ti, struct bio *bio,
1796 union map_info *map_context)
1797{
1798 int r;
1799 struct pool_c *pt = ti->private;
1800 struct pool *pool = pt->pool;
1801 unsigned long flags;
1802
1803 /*
1804 * As this is a singleton target, ti->begin is always zero.
1805 */
1806 spin_lock_irqsave(&pool->lock, flags);
1807 bio->bi_bdev = pt->data_dev->bdev;
1808 r = DM_MAPIO_REMAPPED;
1809 spin_unlock_irqrestore(&pool->lock, flags);
1810
1811 return r;
1812}
1813
1814/*
1815 * Retrieves the number of blocks of the data device from
1816 * the superblock and compares it to the actual device size,
1817 * thus resizing the data device in case it has grown.
1818 *
1819 * This copes both with a preallocated data device opened in the ctr
1820 * and then followed by a resume,
1821 * -and-
1822 * with the resume method being called on its own after userspace has
1823 * grown the data device in reaction to a table event.
1824 */
1825static int pool_preresume(struct dm_target *ti)
1826{
1827 int r;
1828 struct pool_c *pt = ti->private;
1829 struct pool *pool = pt->pool;
1830 dm_block_t data_size, sb_data_size;
1831
1832 /*
1833 * Take control of the pool object.
1834 */
1835 r = bind_control_target(pool, ti);
1836 if (r)
1837 return r;
1838
1839 data_size = ti->len >> pool->block_shift;
1840 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
1841 if (r) {
1842 DMERR("failed to retrieve data device size");
1843 return r;
1844 }
1845
1846 if (data_size < sb_data_size) {
1847 DMERR("pool target too small, is %llu blocks (expected %llu)",
1848 data_size, sb_data_size);
1849 return -EINVAL;
1850
1851 } else if (data_size > sb_data_size) {
1852 r = dm_pool_resize_data_dev(pool->pmd, data_size);
1853 if (r) {
1854 DMERR("failed to resize data device");
1855 return r;
1856 }
1857
1858 r = dm_pool_commit_metadata(pool->pmd);
1859 if (r) {
1860 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1861 __func__, r);
1862 return r;
1863 }
1864 }
1865
1866 return 0;
1867}
1868
1869static void pool_resume(struct dm_target *ti)
1870{
1871 struct pool_c *pt = ti->private;
1872 struct pool *pool = pt->pool;
1873 unsigned long flags;
1874
1875 spin_lock_irqsave(&pool->lock, flags);
1876 pool->low_water_triggered = 0;
1877 pool->no_free_space = 0;
1878 __requeue_bios(pool);
1879 spin_unlock_irqrestore(&pool->lock, flags);
1880
1881 wake_worker(pool);
1882}
1883
1884static void pool_postsuspend(struct dm_target *ti)
1885{
1886 int r;
1887 struct pool_c *pt = ti->private;
1888 struct pool *pool = pt->pool;
1889
1890 flush_workqueue(pool->wq);
1891
1892 r = dm_pool_commit_metadata(pool->pmd);
1893 if (r < 0) {
1894 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1895 __func__, r);
1896 /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
1897 }
1898}
1899
1900static int check_arg_count(unsigned argc, unsigned args_required)
1901{
1902 if (argc != args_required) {
1903 DMWARN("Message received with %u arguments instead of %u.",
1904 argc, args_required);
1905 return -EINVAL;
1906 }
1907
1908 return 0;
1909}
1910
1911static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
1912{
1913 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
1914 *dev_id <= MAX_DEV_ID)
1915 return 0;
1916
1917 if (warning)
1918 DMWARN("Message received with invalid device id: %s", arg);
1919
1920 return -EINVAL;
1921}
1922
1923static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
1924{
1925 dm_thin_id dev_id;
1926 int r;
1927
1928 r = check_arg_count(argc, 2);
1929 if (r)
1930 return r;
1931
1932 r = read_dev_id(argv[1], &dev_id, 1);
1933 if (r)
1934 return r;
1935
1936 r = dm_pool_create_thin(pool->pmd, dev_id);
1937 if (r) {
1938 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
1939 argv[1]);
1940 return r;
1941 }
1942
1943 return 0;
1944}
1945
1946static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
1947{
1948 dm_thin_id dev_id;
1949 dm_thin_id origin_dev_id;
1950 int r;
1951
1952 r = check_arg_count(argc, 3);
1953 if (r)
1954 return r;
1955
1956 r = read_dev_id(argv[1], &dev_id, 1);
1957 if (r)
1958 return r;
1959
1960 r = read_dev_id(argv[2], &origin_dev_id, 1);
1961 if (r)
1962 return r;
1963
1964 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
1965 if (r) {
1966 DMWARN("Creation of new snapshot %s of device %s failed.",
1967 argv[1], argv[2]);
1968 return r;
1969 }
1970
1971 return 0;
1972}
1973
1974static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
1975{
1976 dm_thin_id dev_id;
1977 int r;
1978
1979 r = check_arg_count(argc, 2);
1980 if (r)
1981 return r;
1982
1983 r = read_dev_id(argv[1], &dev_id, 1);
1984 if (r)
1985 return r;
1986
1987 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
1988 if (r)
1989 DMWARN("Deletion of thin device %s failed.", argv[1]);
1990
1991 return r;
1992}
1993
1994static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
1995{
1996 dm_thin_id old_id, new_id;
1997 int r;
1998
1999 r = check_arg_count(argc, 3);
2000 if (r)
2001 return r;
2002
2003 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2004 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2005 return -EINVAL;
2006 }
2007
2008 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2009 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2010 return -EINVAL;
2011 }
2012
2013 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2014 if (r) {
2015 DMWARN("Failed to change transaction id from %s to %s.",
2016 argv[1], argv[2]);
2017 return r;
2018 }
2019
2020 return 0;
2021}
2022
2023/*
2024 * Messages supported:
2025 * create_thin <dev_id>
2026 * create_snap <dev_id> <origin_id>
2027 * delete <dev_id>
2028 * trim <dev_id> <new_size_in_sectors> (listed here but not yet handled below)
2029 * set_transaction_id <current_trans_id> <new_trans_id>
2030 */
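/*
 * For example (illustrative ids), "dmsetup message <pool> 0 create_thin 0"
 * provisions a new virtual device with id 0, and a subsequent
 * "dmsetup message <pool> 0 create_snap 1 0" creates device 1 as a
 * snapshot of device 0.
 */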
2031static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2032{
2033 int r = -EINVAL;
2034 struct pool_c *pt = ti->private;
2035 struct pool *pool = pt->pool;
2036
2037 if (!strcasecmp(argv[0], "create_thin"))
2038 r = process_create_thin_mesg(argc, argv, pool);
2039
2040 else if (!strcasecmp(argv[0], "create_snap"))
2041 r = process_create_snap_mesg(argc, argv, pool);
2042
2043 else if (!strcasecmp(argv[0], "delete"))
2044 r = process_delete_mesg(argc, argv, pool);
2045
2046 else if (!strcasecmp(argv[0], "set_transaction_id"))
2047 r = process_set_transaction_id_mesg(argc, argv, pool);
2048
2049 else
2050 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2051
2052 if (!r) {
2053 r = dm_pool_commit_metadata(pool->pmd);
2054 if (r)
2055 DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
2056 argv[0], r);
2057 }
2058
2059 return r;
2060}
2061
2062/*
2063 * Status line is:
2064 *    <transaction id> <used metadata blocks>/<total metadata blocks>
2065 *    <used data blocks>/<total data blocks> <held metadata root>
2066 */
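/*
 * A made-up example: a pool at transaction id 1 that has used 126 of
 * 4096 metadata blocks and 257 of 1048576 data blocks, with no held
 * metadata root, would report:
 *
 *   1 126/4096 257/1048576 -
 */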
2067static int pool_status(struct dm_target *ti, status_type_t type,
2068 char *result, unsigned maxlen)
2069{
2070 int r;
2071 unsigned sz = 0;
2072 uint64_t transaction_id;
2073 dm_block_t nr_free_blocks_data;
2074 dm_block_t nr_free_blocks_metadata;
2075 dm_block_t nr_blocks_data;
2076 dm_block_t nr_blocks_metadata;
2077 dm_block_t held_root;
2078 char buf[BDEVNAME_SIZE];
2079 char buf2[BDEVNAME_SIZE];
2080 struct pool_c *pt = ti->private;
2081 struct pool *pool = pt->pool;
2082
2083 switch (type) {
2084 case STATUSTYPE_INFO:
2085 r = dm_pool_get_metadata_transaction_id(pool->pmd,
2086 &transaction_id);
2087 if (r)
2088 return r;
2089
2090 r = dm_pool_get_free_metadata_block_count(pool->pmd,
2091 &nr_free_blocks_metadata);
2092 if (r)
2093 return r;
2094
2095 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2096 if (r)
2097 return r;
2098
2099 r = dm_pool_get_free_block_count(pool->pmd,
2100 &nr_free_blocks_data);
2101 if (r)
2102 return r;
2103
2104 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2105 if (r)
2106 return r;
2107
2108 r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
2109 if (r)
2110 return r;
2111
2112 DMEMIT("%llu %llu/%llu %llu/%llu ",
2113 (unsigned long long)transaction_id,
2114 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2115 (unsigned long long)nr_blocks_metadata,
2116 (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2117 (unsigned long long)nr_blocks_data);
2118
2119 if (held_root)
2120 DMEMIT("%llu", held_root);
2121 else
2122 DMEMIT("-");
2123
2124 break;
2125
2126 case STATUSTYPE_TABLE:
2127 DMEMIT("%s %s %lu %llu ",
2128 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2129 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2130 (unsigned long)pool->sectors_per_block,
2131 (unsigned long long)pt->low_water_blocks);
2132
2133 DMEMIT("%u ", !pool->zero_new_blocks);
2134
2135 if (!pool->zero_new_blocks)
2136 DMEMIT("skip_block_zeroing ");
2137 break;
2138 }
2139
2140 return 0;
2141}
2142
2143static int pool_iterate_devices(struct dm_target *ti,
2144 iterate_devices_callout_fn fn, void *data)
2145{
2146 struct pool_c *pt = ti->private;
2147
2148 return fn(ti, pt->data_dev, 0, ti->len, data);
2149}
2150
2151static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2152 struct bio_vec *biovec, int max_size)
2153{
2154 struct pool_c *pt = ti->private;
2155 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2156
2157 if (!q->merge_bvec_fn)
2158 return max_size;
2159
2160 bvm->bi_bdev = pt->data_dev->bdev;
2161
2162 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2163}
2164
2165static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2166{
2167 struct pool_c *pt = ti->private;
2168 struct pool *pool = pt->pool;
2169
2170 blk_limits_io_min(limits, 0);
2171 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2172}
2173
2174static struct target_type pool_target = {
2175 .name = "thin-pool",
2176 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2177 DM_TARGET_IMMUTABLE,
2178 .version = {1, 0, 0},
2179 .module = THIS_MODULE,
2180 .ctr = pool_ctr,
2181 .dtr = pool_dtr,
2182 .map = pool_map,
2183 .postsuspend = pool_postsuspend,
2184 .preresume = pool_preresume,
2185 .resume = pool_resume,
2186 .message = pool_message,
2187 .status = pool_status,
2188 .merge = pool_merge,
2189 .iterate_devices = pool_iterate_devices,
2190 .io_hints = pool_io_hints,
2191};
2192
2193/*----------------------------------------------------------------
2194 * Thin target methods
2195 *--------------------------------------------------------------*/
2196static void thin_dtr(struct dm_target *ti)
2197{
2198 struct thin_c *tc = ti->private;
2199
2200 mutex_lock(&dm_thin_pool_table.mutex);
2201
2202 __pool_dec(tc->pool);
2203 dm_pool_close_thin_device(tc->td);
2204 dm_put_device(ti, tc->pool_dev);
2205 kfree(tc);
2206
2207 mutex_unlock(&dm_thin_pool_table.mutex);
2208}
2209
2210/*
2211 * Thin target parameters:
2212 *
2213 * <pool_dev> <dev_id>
2214 *
2215 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2216 * dev_id: the internal device identifier
2217 */
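/*
 * Illustrative construction (a sketch; the names and sizes are
 * assumptions): a 1 GiB (2097152-sector) thin volume backed by internal
 * device 0 of the pool at /dev/mapper/my_pool could be created with:
 *
 *   dmsetup create my_thin --table "0 2097152 thin /dev/mapper/my_pool 0"
 */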
2218static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2219{
2220 int r;
2221 struct thin_c *tc;
2222 struct dm_dev *pool_dev;
2223 struct mapped_device *pool_md;
2224
2225 mutex_lock(&dm_thin_pool_table.mutex);
2226
2227 if (argc != 2) {
2228 ti->error = "Invalid argument count";
2229 r = -EINVAL;
2230 goto out_unlock;
2231 }
2232
2233 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2234 if (!tc) {
2235 ti->error = "Out of memory";
2236 r = -ENOMEM;
2237 goto out_unlock;
2238 }
2239
2240 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2241 if (r) {
2242 ti->error = "Error opening pool device";
2243 goto bad_pool_dev;
2244 }
2245 tc->pool_dev = pool_dev;
2246
2247	if (read_dev_id(argv[1], &tc->dev_id, 0)) {
2248 ti->error = "Invalid device id";
2249 r = -EINVAL;
2250 goto bad_common;
2251 }
2252
2253 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2254 if (!pool_md) {
2255 ti->error = "Couldn't get pool mapped device";
2256 r = -EINVAL;
2257 goto bad_common;
2258 }
2259
2260 tc->pool = __pool_table_lookup(pool_md);
2261 if (!tc->pool) {
2262 ti->error = "Couldn't find pool object";
2263 r = -EINVAL;
2264 goto bad_pool_lookup;
2265 }
2266 __pool_inc(tc->pool);
2267
2268 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2269 if (r) {
2270 ti->error = "Couldn't open thin internal device";
2271 goto bad_thin_open;
2272 }
2273
2274 ti->split_io = tc->pool->sectors_per_block;
2275 ti->num_flush_requests = 1;
2276 ti->num_discard_requests = 0;
2277 ti->discards_supported = 0;
2278
2279 dm_put(pool_md);
2280
2281 mutex_unlock(&dm_thin_pool_table.mutex);
2282
2283 return 0;
2284
2285bad_thin_open:
2286 __pool_dec(tc->pool);
2287bad_pool_lookup:
2288 dm_put(pool_md);
2289bad_common:
2290 dm_put_device(ti, tc->pool_dev);
2291bad_pool_dev:
2292 kfree(tc);
2293out_unlock:
2294 mutex_unlock(&dm_thin_pool_table.mutex);
2295
2296 return r;
2297}
2298
2299static int thin_map(struct dm_target *ti, struct bio *bio,
2300 union map_info *map_context)
2301{
2302 bio->bi_sector -= ti->begin;
2303
2304 return thin_bio_map(ti, bio, map_context);
2305}
2306
2307static void thin_postsuspend(struct dm_target *ti)
2308{
2309 if (dm_noflush_suspending(ti))
2310 requeue_io((struct thin_c *)ti->private);
2311}
2312
2313/*
2314 * <nr mapped sectors> <highest mapped sector>
2315 */
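/*
 * For example (illustrative numbers), a device with 2048 sectors mapped
 * whose highest mapped sector is 1048575 reports "2048 1048575"; a
 * device with no mappings yet reports "0 -".
 */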
2316static int thin_status(struct dm_target *ti, status_type_t type,
2317 char *result, unsigned maxlen)
2318{
2319 int r;
2320 ssize_t sz = 0;
2321 dm_block_t mapped, highest;
2322 char buf[BDEVNAME_SIZE];
2323 struct thin_c *tc = ti->private;
2324
2325 if (!tc->td)
2326 DMEMIT("-");
2327 else {
2328 switch (type) {
2329 case STATUSTYPE_INFO:
2330 r = dm_thin_get_mapped_count(tc->td, &mapped);
2331 if (r)
2332 return r;
2333
2334 r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2335 if (r < 0)
2336 return r;
2337
2338 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2339 if (r)
2340 DMEMIT("%llu", ((highest + 1) *
2341 tc->pool->sectors_per_block) - 1);
2342 else
2343 DMEMIT("-");
2344 break;
2345
2346 case STATUSTYPE_TABLE:
2347 DMEMIT("%s %lu",
2348 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2349 (unsigned long) tc->dev_id);
2350 break;
2351 }
2352 }
2353
2354 return 0;
2355}
2356
2357static int thin_iterate_devices(struct dm_target *ti,
2358 iterate_devices_callout_fn fn, void *data)
2359{
2360 dm_block_t blocks;
2361 struct thin_c *tc = ti->private;
2362
2363 /*
2364 * We can't call dm_pool_get_data_dev_size() since that blocks. So
2365 * we follow a more convoluted path through to the pool's target.
2366 */
2367 if (!tc->pool->ti)
2368 return 0; /* nothing is bound */
2369
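	/*
	 * Worked example (assumed numbers): with 64 KiB pool blocks,
	 * sectors_per_block = 128 and block_shift = 7, so a bound pool
	 * target of 2097152 sectors gives 16384 whole blocks and we pass
	 * 16384 * 128 = 2097152 sectors to the callout below.
	 */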
2370 blocks = tc->pool->ti->len >> tc->pool->block_shift;
2371 if (blocks)
2372 return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);
2373
2374 return 0;
2375}
2376
2377static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2378{
2379 struct thin_c *tc = ti->private;
2380
2381 blk_limits_io_min(limits, 0);
2382 blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
2383}
2384
2385static struct target_type thin_target = {
2386 .name = "thin",
2387 .version = {1, 0, 0},
2388 .module = THIS_MODULE,
2389 .ctr = thin_ctr,
2390 .dtr = thin_dtr,
2391 .map = thin_map,
2392 .postsuspend = thin_postsuspend,
2393 .status = thin_status,
2394 .iterate_devices = thin_iterate_devices,
2395 .io_hints = thin_io_hints,
2396};
2397
2398/*----------------------------------------------------------------*/
2399
2400static int __init dm_thin_init(void)
2401{
2402 int r;
2403
2404 pool_table_init();
2405
2406 r = dm_register_target(&thin_target);
2407 if (r)
2408 return r;
2409
2410 r = dm_register_target(&pool_target);
2411 if (r)
2412 dm_unregister_target(&thin_target);
2413
2414 return r;
2415}
2416
2417static void dm_thin_exit(void)
2418{
2419 dm_unregister_target(&thin_target);
2420 dm_unregister_target(&pool_target);
2421}
2422
2423module_init(dm_thin_init);
2424module_exit(dm_thin_exit);
2425
2426 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
2427MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2428MODULE_LICENSE("GPL");