Diffstat (limited to 'drivers/md/dm-thin.c')
-rw-r--r--  drivers/md/dm-thin.c | 760
1 file changed, 607 insertions(+), 153 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 0f86d802b533..8735543eacdb 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -11,11 +11,13 @@
11#include <linux/device-mapper.h> 11#include <linux/device-mapper.h>
12#include <linux/dm-io.h> 12#include <linux/dm-io.h>
13#include <linux/dm-kcopyd.h> 13#include <linux/dm-kcopyd.h>
14#include <linux/log2.h>
14#include <linux/list.h> 15#include <linux/list.h>
15#include <linux/rculist.h> 16#include <linux/rculist.h>
16#include <linux/init.h> 17#include <linux/init.h>
17#include <linux/module.h> 18#include <linux/module.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/sort.h>
19#include <linux/rbtree.h> 21#include <linux/rbtree.h>
20 22
21#define DM_MSG_PREFIX "thin" 23#define DM_MSG_PREFIX "thin"
@@ -25,7 +27,6 @@
25 */ 27 */
26#define ENDIO_HOOK_POOL_SIZE 1024 28#define ENDIO_HOOK_POOL_SIZE 1024
27#define MAPPING_POOL_SIZE 1024 29#define MAPPING_POOL_SIZE 1024
28#define PRISON_CELLS 1024
29#define COMMIT_PERIOD HZ 30#define COMMIT_PERIOD HZ
30#define NO_SPACE_TIMEOUT_SECS 60 31#define NO_SPACE_TIMEOUT_SECS 60
31 32
@@ -114,7 +115,8 @@ static void build_data_key(struct dm_thin_device *td,
114{ 115{
115 key->virtual = 0; 116 key->virtual = 0;
116 key->dev = dm_thin_dev_id(td); 117 key->dev = dm_thin_dev_id(td);
117 key->block = b; 118 key->block_begin = b;
119 key->block_end = b + 1ULL;
118} 120}
119 121
120static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, 122static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
@@ -122,7 +124,55 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
122{ 124{
123 key->virtual = 1; 125 key->virtual = 1;
124 key->dev = dm_thin_dev_id(td); 126 key->dev = dm_thin_dev_id(td);
125 key->block = b; 127 key->block_begin = b;
128 key->block_end = b + 1ULL;
129}
130
131/*----------------------------------------------------------------*/
132
133#define THROTTLE_THRESHOLD (1 * HZ)
134
135struct throttle {
136 struct rw_semaphore lock;
137 unsigned long threshold;
138 bool throttle_applied;
139};
140
141static void throttle_init(struct throttle *t)
142{
143 init_rwsem(&t->lock);
144 t->throttle_applied = false;
145}
146
147static void throttle_work_start(struct throttle *t)
148{
149 t->threshold = jiffies + THROTTLE_THRESHOLD;
150}
151
152static void throttle_work_update(struct throttle *t)
153{
154 if (!t->throttle_applied && jiffies > t->threshold) {
155 down_write(&t->lock);
156 t->throttle_applied = true;
157 }
158}
159
160static void throttle_work_complete(struct throttle *t)
161{
162 if (t->throttle_applied) {
163 t->throttle_applied = false;
164 up_write(&t->lock);
165 }
166}
167
168static void throttle_lock(struct throttle *t)
169{
170 down_read(&t->lock);
171}
172
173static void throttle_unlock(struct throttle *t)
174{
175 up_read(&t->lock);
126} 176}
127 177
128/*----------------------------------------------------------------*/ 178/*----------------------------------------------------------------*/
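The throttle helpers above have no callers until much later in this patch: do_worker() brackets its passes with throttle_work_start/update/complete, and the bio/cell deferral paths wrap their queueing in throttle_lock/unlock. A minimal sketch of that calling pattern, using illustrative function names that are not part of the patch:

static void example_worker(struct throttle *t)
{
	throttle_work_start(t);		/* arm the 1 second threshold */
	/* ... first chunk of work ... */
	throttle_work_update(t);	/* past the threshold? take the rwsem for writing */
	/* ... remaining work ... */
	throttle_work_complete(t);	/* drop the write lock if it was taken */
}

static void example_submitter(struct throttle *t)
{
	throttle_lock(t);	/* down_read(): only blocks while the worker is throttled */
	/* ... queue deferred bios or cells ... */
	throttle_unlock(t);
}

Until the worker has been running for THROTTLE_THRESHOLD jiffies the rwsem is never taken for writing, so submitters proceed unhindered; once it has, new submissions back off until throttle_work_complete().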
@@ -155,8 +205,11 @@ struct pool_features {
155 205
156struct thin_c; 206struct thin_c;
157typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); 207typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
208typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
158typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); 209typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
159 210
211#define CELL_SORT_ARRAY_SIZE 8192
212
160struct pool { 213struct pool {
161 struct list_head list; 214 struct list_head list;
162 struct dm_target *ti; /* Only set if a pool target is bound */ 215 struct dm_target *ti; /* Only set if a pool target is bound */
@@ -171,11 +224,13 @@ struct pool {
171 224
172 struct pool_features pf; 225 struct pool_features pf;
173 bool low_water_triggered:1; /* A dm event has been sent */ 226 bool low_water_triggered:1; /* A dm event has been sent */
227 bool suspended:1;
174 228
175 struct dm_bio_prison *prison; 229 struct dm_bio_prison *prison;
176 struct dm_kcopyd_client *copier; 230 struct dm_kcopyd_client *copier;
177 231
178 struct workqueue_struct *wq; 232 struct workqueue_struct *wq;
233 struct throttle throttle;
179 struct work_struct worker; 234 struct work_struct worker;
180 struct delayed_work waker; 235 struct delayed_work waker;
181 struct delayed_work no_space_timeout; 236 struct delayed_work no_space_timeout;
@@ -198,8 +253,13 @@ struct pool {
198 process_bio_fn process_bio; 253 process_bio_fn process_bio;
199 process_bio_fn process_discard; 254 process_bio_fn process_discard;
200 255
256 process_cell_fn process_cell;
257 process_cell_fn process_discard_cell;
258
201 process_mapping_fn process_prepared_mapping; 259 process_mapping_fn process_prepared_mapping;
202 process_mapping_fn process_prepared_discard; 260 process_mapping_fn process_prepared_discard;
261
262 struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE];
203}; 263};
204 264
205static enum pool_mode get_pool_mode(struct pool *pool); 265static enum pool_mode get_pool_mode(struct pool *pool);
@@ -232,8 +292,11 @@ struct thin_c {
232 292
233 struct pool *pool; 293 struct pool *pool;
234 struct dm_thin_device *td; 294 struct dm_thin_device *td;
295 struct mapped_device *thin_md;
296
235 bool requeue_mode:1; 297 bool requeue_mode:1;
236 spinlock_t lock; 298 spinlock_t lock;
299 struct list_head deferred_cells;
237 struct bio_list deferred_bio_list; 300 struct bio_list deferred_bio_list;
238 struct bio_list retry_on_resume_list; 301 struct bio_list retry_on_resume_list;
239 struct rb_root sort_bio_list; /* sorted list of deferred bios */ 302 struct rb_root sort_bio_list; /* sorted list of deferred bios */
@@ -290,6 +353,15 @@ static void cell_release(struct pool *pool,
290 dm_bio_prison_free_cell(pool->prison, cell); 353 dm_bio_prison_free_cell(pool->prison, cell);
291} 354}
292 355
356static void cell_visit_release(struct pool *pool,
357 void (*fn)(void *, struct dm_bio_prison_cell *),
358 void *context,
359 struct dm_bio_prison_cell *cell)
360{
361 dm_cell_visit_release(pool->prison, fn, context, cell);
362 dm_bio_prison_free_cell(pool->prison, cell);
363}
364
293static void cell_release_no_holder(struct pool *pool, 365static void cell_release_no_holder(struct pool *pool,
294 struct dm_bio_prison_cell *cell, 366 struct dm_bio_prison_cell *cell,
295 struct bio_list *bios) 367 struct bio_list *bios)
@@ -298,19 +370,6 @@ static void cell_release_no_holder(struct pool *pool,
298 dm_bio_prison_free_cell(pool->prison, cell); 370 dm_bio_prison_free_cell(pool->prison, cell);
299} 371}
300 372
301static void cell_defer_no_holder_no_free(struct thin_c *tc,
302 struct dm_bio_prison_cell *cell)
303{
304 struct pool *pool = tc->pool;
305 unsigned long flags;
306
307 spin_lock_irqsave(&tc->lock, flags);
308 dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list);
309 spin_unlock_irqrestore(&tc->lock, flags);
310
311 wake_worker(pool);
312}
313
314static void cell_error_with_code(struct pool *pool, 373static void cell_error_with_code(struct pool *pool,
315 struct dm_bio_prison_cell *cell, int error_code) 374 struct dm_bio_prison_cell *cell, int error_code)
316{ 375{
@@ -323,6 +382,16 @@ static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
323 cell_error_with_code(pool, cell, -EIO); 382 cell_error_with_code(pool, cell, -EIO);
324} 383}
325 384
385static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
386{
387 cell_error_with_code(pool, cell, 0);
388}
389
390static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
391{
392 cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE);
393}
394
326/*----------------------------------------------------------------*/ 395/*----------------------------------------------------------------*/
327 396
328/* 397/*
@@ -393,44 +462,65 @@ struct dm_thin_endio_hook {
393 struct rb_node rb_node; 462 struct rb_node rb_node;
394}; 463};
395 464
396static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) 465static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
466{
467 bio_list_merge(bios, master);
468 bio_list_init(master);
469}
470
471static void error_bio_list(struct bio_list *bios, int error)
397{ 472{
398 struct bio *bio; 473 struct bio *bio;
474
475 while ((bio = bio_list_pop(bios)))
476 bio_endio(bio, error);
477}
478
479static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
480{
399 struct bio_list bios; 481 struct bio_list bios;
400 unsigned long flags; 482 unsigned long flags;
401 483
402 bio_list_init(&bios); 484 bio_list_init(&bios);
403 485
404 spin_lock_irqsave(&tc->lock, flags); 486 spin_lock_irqsave(&tc->lock, flags);
405 bio_list_merge(&bios, master); 487 __merge_bio_list(&bios, master);
406 bio_list_init(master);
407 spin_unlock_irqrestore(&tc->lock, flags); 488 spin_unlock_irqrestore(&tc->lock, flags);
408 489
409 while ((bio = bio_list_pop(&bios))) 490 error_bio_list(&bios, error);
410 bio_endio(bio, DM_ENDIO_REQUEUE);
411} 491}
412 492
413static void requeue_io(struct thin_c *tc) 493static void requeue_deferred_cells(struct thin_c *tc)
414{ 494{
415 requeue_bio_list(tc, &tc->deferred_bio_list); 495 struct pool *pool = tc->pool;
416 requeue_bio_list(tc, &tc->retry_on_resume_list); 496 unsigned long flags;
497 struct list_head cells;
498 struct dm_bio_prison_cell *cell, *tmp;
499
500 INIT_LIST_HEAD(&cells);
501
502 spin_lock_irqsave(&tc->lock, flags);
503 list_splice_init(&tc->deferred_cells, &cells);
504 spin_unlock_irqrestore(&tc->lock, flags);
505
506 list_for_each_entry_safe(cell, tmp, &cells, user_list)
507 cell_requeue(pool, cell);
417} 508}
418 509
419static void error_thin_retry_list(struct thin_c *tc) 510static void requeue_io(struct thin_c *tc)
420{ 511{
421 struct bio *bio;
422 unsigned long flags;
423 struct bio_list bios; 512 struct bio_list bios;
513 unsigned long flags;
424 514
425 bio_list_init(&bios); 515 bio_list_init(&bios);
426 516
427 spin_lock_irqsave(&tc->lock, flags); 517 spin_lock_irqsave(&tc->lock, flags);
428 bio_list_merge(&bios, &tc->retry_on_resume_list); 518 __merge_bio_list(&bios, &tc->deferred_bio_list);
429 bio_list_init(&tc->retry_on_resume_list); 519 __merge_bio_list(&bios, &tc->retry_on_resume_list);
430 spin_unlock_irqrestore(&tc->lock, flags); 520 spin_unlock_irqrestore(&tc->lock, flags);
431 521
432 while ((bio = bio_list_pop(&bios))) 522 error_bio_list(&bios, DM_ENDIO_REQUEUE);
433 bio_io_error(bio); 523 requeue_deferred_cells(tc);
434} 524}
435 525
436static void error_retry_list(struct pool *pool) 526static void error_retry_list(struct pool *pool)
@@ -439,7 +529,7 @@ static void error_retry_list(struct pool *pool)
439 529
440 rcu_read_lock(); 530 rcu_read_lock();
441 list_for_each_entry_rcu(tc, &pool->active_thins, list) 531 list_for_each_entry_rcu(tc, &pool->active_thins, list)
442 error_thin_retry_list(tc); 532 error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO);
443 rcu_read_unlock(); 533 rcu_read_unlock();
444} 534}
445 535
@@ -629,33 +719,75 @@ static void overwrite_endio(struct bio *bio, int err)
629 */ 719 */
630 720
631/* 721/*
632 * This sends the bios in the cell back to the deferred_bios list. 722 * This sends the bios in the cell, except the original holder, back
723 * to the deferred_bios list.
633 */ 724 */
634static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell) 725static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
635{ 726{
636 struct pool *pool = tc->pool; 727 struct pool *pool = tc->pool;
637 unsigned long flags; 728 unsigned long flags;
638 729
639 spin_lock_irqsave(&tc->lock, flags); 730 spin_lock_irqsave(&tc->lock, flags);
640 cell_release(pool, cell, &tc->deferred_bio_list); 731 cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
641 spin_unlock_irqrestore(&tc->lock, flags); 732 spin_unlock_irqrestore(&tc->lock, flags);
642 733
643 wake_worker(pool); 734 wake_worker(pool);
644} 735}
645 736
646/* 737static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
647 * Same as cell_defer above, except it omits the original holder of the cell. 738
648 */ 739struct remap_info {
649static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) 740 struct thin_c *tc;
741 struct bio_list defer_bios;
742 struct bio_list issue_bios;
743};
744
745static void __inc_remap_and_issue_cell(void *context,
746 struct dm_bio_prison_cell *cell)
650{ 747{
651 struct pool *pool = tc->pool; 748 struct remap_info *info = context;
652 unsigned long flags; 749 struct bio *bio;
653 750
654 spin_lock_irqsave(&tc->lock, flags); 751 while ((bio = bio_list_pop(&cell->bios))) {
655 cell_release_no_holder(pool, cell, &tc->deferred_bio_list); 752 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))
656 spin_unlock_irqrestore(&tc->lock, flags); 753 bio_list_add(&info->defer_bios, bio);
754 else {
755 inc_all_io_entry(info->tc->pool, bio);
657 756
658 wake_worker(pool); 757 /*
758 * We can't issue the bios with the bio prison lock
759 * held, so we add them to a list to issue on
760 * return from this function.
761 */
762 bio_list_add(&info->issue_bios, bio);
763 }
764 }
765}
766
767static void inc_remap_and_issue_cell(struct thin_c *tc,
768 struct dm_bio_prison_cell *cell,
769 dm_block_t block)
770{
771 struct bio *bio;
772 struct remap_info info;
773
774 info.tc = tc;
775 bio_list_init(&info.defer_bios);
776 bio_list_init(&info.issue_bios);
777
778 /*
779 * We have to be careful to inc any bios we're about to issue
780 * before the cell is released, and avoid a race with new bios
781 * being added to the cell.
782 */
783 cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
784 &info, cell);
785
786 while ((bio = bio_list_pop(&info.defer_bios)))
787 thin_defer_bio(tc, bio);
788
789 while ((bio = bio_list_pop(&info.issue_bios)))
790 remap_and_issue(info.tc, bio, block);
659} 791}
660 792
661static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) 793static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
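cell_visit_release() (added earlier in this patch) and the __inc_remap_and_issue_cell() visitor above establish the pattern used throughout the rest of the patch: the visitor runs while the bio prison lock is held, so it may only sort bios into lists, and any actual submission happens after the visit returns and the cell has been freed. A hedged sketch of that contract, with a made-up visitor that simply drains a cell into a private bio list:

struct drain_info {
	struct bio_list bios;
};

/* Illustrative only, not part of the patch; runs under the prison lock. */
static void __drain_cell(void *context, struct dm_bio_prison_cell *cell)
{
	struct drain_info *info = context;

	bio_list_add(&info->bios, cell->holder);
	bio_list_merge(&info->bios, &cell->bios);
}

static void drain_cell(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	struct drain_info info;

	bio_list_init(&info.bios);
	cell_visit_release(pool, __drain_cell, &info, cell);
	/* The cell is now freed; info.bios can be issued or deferred safely. */
}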
@@ -706,10 +838,13 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
706 * the bios in the cell. 838 * the bios in the cell.
707 */ 839 */
708 if (bio) { 840 if (bio) {
709 cell_defer_no_holder(tc, m->cell); 841 inc_remap_and_issue_cell(tc, m->cell, m->data_block);
710 bio_endio(bio, 0); 842 bio_endio(bio, 0);
711 } else 843 } else {
712 cell_defer(tc, m->cell); 844 inc_all_io_entry(tc->pool, m->cell->holder);
845 remap_and_issue(tc, m->cell->holder, m->data_block);
846 inc_remap_and_issue_cell(tc, m->cell, m->data_block);
847 }
713 848
714out: 849out:
715 list_del(&m->list); 850 list_del(&m->list);
@@ -842,6 +977,20 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
842 } 977 }
843} 978}
844 979
980static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
981 dm_block_t data_block,
982 struct dm_thin_new_mapping *m)
983{
984 struct pool *pool = tc->pool;
985 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
986
987 h->overwrite_mapping = m;
988 m->bio = bio;
989 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
990 inc_all_io_entry(pool, bio);
991 remap_and_issue(tc, bio, data_block);
992}
993
845/* 994/*
846 * A partial copy also needs to zero the uncopied region. 995 * A partial copy also needs to zero the uncopied region.
847 */ 996 */
@@ -876,15 +1025,9 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
876 * If the whole block of data is being overwritten, we can issue the 1025 * If the whole block of data is being overwritten, we can issue the
877 * bio immediately. Otherwise we use kcopyd to clone the data first. 1026 * bio immediately. Otherwise we use kcopyd to clone the data first.
878 */ 1027 */
879 if (io_overwrites_block(pool, bio)) { 1028 if (io_overwrites_block(pool, bio))
880 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1029 remap_and_issue_overwrite(tc, bio, data_dest, m);
881 1030 else {
882 h->overwrite_mapping = m;
883 m->bio = bio;
884 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
885 inc_all_io_entry(pool, bio);
886 remap_and_issue(tc, bio, data_dest);
887 } else {
888 struct dm_io_region from, to; 1031 struct dm_io_region from, to;
889 1032
890 from.bdev = origin->bdev; 1033 from.bdev = origin->bdev;
@@ -953,16 +1096,10 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
953 if (!pool->pf.zero_new_blocks) 1096 if (!pool->pf.zero_new_blocks)
954 process_prepared_mapping(m); 1097 process_prepared_mapping(m);
955 1098
956 else if (io_overwrites_block(pool, bio)) { 1099 else if (io_overwrites_block(pool, bio))
957 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1100 remap_and_issue_overwrite(tc, bio, data_block, m);
958
959 h->overwrite_mapping = m;
960 m->bio = bio;
961 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
962 inc_all_io_entry(pool, bio);
963 remap_and_issue(tc, bio, data_block);
964 1101
965 } else 1102 else
966 ll_zero(tc, m, 1103 ll_zero(tc, m,
967 data_block * pool->sectors_per_block, 1104 data_block * pool->sectors_per_block,
968 (data_block + 1) * pool->sectors_per_block); 1105 (data_block + 1) * pool->sectors_per_block);
@@ -1134,29 +1271,25 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
1134 bio_list_init(&bios); 1271 bio_list_init(&bios);
1135 cell_release(pool, cell, &bios); 1272 cell_release(pool, cell, &bios);
1136 1273
1137 error = should_error_unserviceable_bio(pool); 1274 while ((bio = bio_list_pop(&bios)))
1138 if (error) 1275 retry_on_resume(bio);
1139 while ((bio = bio_list_pop(&bios)))
1140 bio_endio(bio, error);
1141 else
1142 while ((bio = bio_list_pop(&bios)))
1143 retry_on_resume(bio);
1144} 1276}
1145 1277
1146static void process_discard(struct thin_c *tc, struct bio *bio) 1278static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1147{ 1279{
1148 int r; 1280 int r;
1149 unsigned long flags; 1281 struct bio *bio = cell->holder;
1150 struct pool *pool = tc->pool; 1282 struct pool *pool = tc->pool;
1151 struct dm_bio_prison_cell *cell, *cell2; 1283 struct dm_bio_prison_cell *cell2;
1152 struct dm_cell_key key, key2; 1284 struct dm_cell_key key2;
1153 dm_block_t block = get_bio_block(tc, bio); 1285 dm_block_t block = get_bio_block(tc, bio);
1154 struct dm_thin_lookup_result lookup_result; 1286 struct dm_thin_lookup_result lookup_result;
1155 struct dm_thin_new_mapping *m; 1287 struct dm_thin_new_mapping *m;
1156 1288
1157 build_virtual_key(tc->td, block, &key); 1289 if (tc->requeue_mode) {
1158 if (bio_detain(tc->pool, &key, bio, &cell)) 1290 cell_requeue(pool, cell);
1159 return; 1291 return;
1292 }
1160 1293
1161 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1294 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1162 switch (r) { 1295 switch (r) {
@@ -1187,12 +1320,9 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
1187 m->cell2 = cell2; 1320 m->cell2 = cell2;
1188 m->bio = bio; 1321 m->bio = bio;
1189 1322
1190 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { 1323 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1191 spin_lock_irqsave(&pool->lock, flags); 1324 pool->process_prepared_discard(m);
1192 list_add_tail(&m->list, &pool->prepared_discards); 1325
1193 spin_unlock_irqrestore(&pool->lock, flags);
1194 wake_worker(pool);
1195 }
1196 } else { 1326 } else {
1197 inc_all_io_entry(pool, bio); 1327 inc_all_io_entry(pool, bio);
1198 cell_defer_no_holder(tc, cell); 1328 cell_defer_no_holder(tc, cell);
@@ -1227,6 +1357,19 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
1227 } 1357 }
1228} 1358}
1229 1359
1360static void process_discard_bio(struct thin_c *tc, struct bio *bio)
1361{
1362 struct dm_bio_prison_cell *cell;
1363 struct dm_cell_key key;
1364 dm_block_t block = get_bio_block(tc, bio);
1365
1366 build_virtual_key(tc->td, block, &key);
1367 if (bio_detain(tc->pool, &key, bio, &cell))
1368 return;
1369
1370 process_discard_cell(tc, cell);
1371}
1372
1230static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1373static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1231 struct dm_cell_key *key, 1374 struct dm_cell_key *key,
1232 struct dm_thin_lookup_result *lookup_result, 1375 struct dm_thin_lookup_result *lookup_result,
@@ -1255,11 +1398,53 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1255 } 1398 }
1256} 1399}
1257 1400
1401static void __remap_and_issue_shared_cell(void *context,
1402 struct dm_bio_prison_cell *cell)
1403{
1404 struct remap_info *info = context;
1405 struct bio *bio;
1406
1407 while ((bio = bio_list_pop(&cell->bios))) {
1408 if ((bio_data_dir(bio) == WRITE) ||
1409 (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)))
1410 bio_list_add(&info->defer_bios, bio);
1411 else {
1412 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));;
1413
1414 h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
1415 inc_all_io_entry(info->tc->pool, bio);
1416 bio_list_add(&info->issue_bios, bio);
1417 }
1418 }
1419}
1420
1421static void remap_and_issue_shared_cell(struct thin_c *tc,
1422 struct dm_bio_prison_cell *cell,
1423 dm_block_t block)
1424{
1425 struct bio *bio;
1426 struct remap_info info;
1427
1428 info.tc = tc;
1429 bio_list_init(&info.defer_bios);
1430 bio_list_init(&info.issue_bios);
1431
1432 cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
1433 &info, cell);
1434
1435 while ((bio = bio_list_pop(&info.defer_bios)))
1436 thin_defer_bio(tc, bio);
1437
1438 while ((bio = bio_list_pop(&info.issue_bios)))
1439 remap_and_issue(tc, bio, block);
1440}
1441
1258static void process_shared_bio(struct thin_c *tc, struct bio *bio, 1442static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1259 dm_block_t block, 1443 dm_block_t block,
1260 struct dm_thin_lookup_result *lookup_result) 1444 struct dm_thin_lookup_result *lookup_result,
1445 struct dm_bio_prison_cell *virt_cell)
1261{ 1446{
1262 struct dm_bio_prison_cell *cell; 1447 struct dm_bio_prison_cell *data_cell;
1263 struct pool *pool = tc->pool; 1448 struct pool *pool = tc->pool;
1264 struct dm_cell_key key; 1449 struct dm_cell_key key;
1265 1450
@@ -1268,19 +1453,23 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1268 * of being broken so we have nothing further to do here. 1453 * of being broken so we have nothing further to do here.
1269 */ 1454 */
1270 build_data_key(tc->td, lookup_result->block, &key); 1455 build_data_key(tc->td, lookup_result->block, &key);
1271 if (bio_detain(pool, &key, bio, &cell)) 1456 if (bio_detain(pool, &key, bio, &data_cell)) {
1457 cell_defer_no_holder(tc, virt_cell);
1272 return; 1458 return;
1459 }
1273 1460
1274 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) 1461 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
1275 break_sharing(tc, bio, block, &key, lookup_result, cell); 1462 break_sharing(tc, bio, block, &key, lookup_result, data_cell);
1276 else { 1463 cell_defer_no_holder(tc, virt_cell);
1464 } else {
1277 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1465 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1278 1466
1279 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds); 1467 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1280 inc_all_io_entry(pool, bio); 1468 inc_all_io_entry(pool, bio);
1281 cell_defer_no_holder(tc, cell);
1282
1283 remap_and_issue(tc, bio, lookup_result->block); 1469 remap_and_issue(tc, bio, lookup_result->block);
1470
1471 remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
1472 remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
1284 } 1473 }
1285} 1474}
1286 1475
@@ -1333,34 +1522,28 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1333 } 1522 }
1334} 1523}
1335 1524
1336static void process_bio(struct thin_c *tc, struct bio *bio) 1525static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1337{ 1526{
1338 int r; 1527 int r;
1339 struct pool *pool = tc->pool; 1528 struct pool *pool = tc->pool;
1529 struct bio *bio = cell->holder;
1340 dm_block_t block = get_bio_block(tc, bio); 1530 dm_block_t block = get_bio_block(tc, bio);
1341 struct dm_bio_prison_cell *cell;
1342 struct dm_cell_key key;
1343 struct dm_thin_lookup_result lookup_result; 1531 struct dm_thin_lookup_result lookup_result;
1344 1532
1345 /* 1533 if (tc->requeue_mode) {
1346 * If cell is already occupied, then the block is already 1534 cell_requeue(pool, cell);
1347 * being provisioned so we have nothing further to do here.
1348 */
1349 build_virtual_key(tc->td, block, &key);
1350 if (bio_detain(pool, &key, bio, &cell))
1351 return; 1535 return;
1536 }
1352 1537
1353 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1538 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1354 switch (r) { 1539 switch (r) {
1355 case 0: 1540 case 0:
1356 if (lookup_result.shared) { 1541 if (lookup_result.shared)
1357 process_shared_bio(tc, bio, block, &lookup_result); 1542 process_shared_bio(tc, bio, block, &lookup_result, cell);
1358 cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */ 1543 else {
1359 } else {
1360 inc_all_io_entry(pool, bio); 1544 inc_all_io_entry(pool, bio);
1361 cell_defer_no_holder(tc, cell);
1362
1363 remap_and_issue(tc, bio, lookup_result.block); 1545 remap_and_issue(tc, bio, lookup_result.block);
1546 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
1364 } 1547 }
1365 break; 1548 break;
1366 1549
@@ -1394,7 +1577,26 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
1394 } 1577 }
1395} 1578}
1396 1579
1397static void process_bio_read_only(struct thin_c *tc, struct bio *bio) 1580static void process_bio(struct thin_c *tc, struct bio *bio)
1581{
1582 struct pool *pool = tc->pool;
1583 dm_block_t block = get_bio_block(tc, bio);
1584 struct dm_bio_prison_cell *cell;
1585 struct dm_cell_key key;
1586
1587 /*
1588 * If cell is already occupied, then the block is already
1589 * being provisioned so we have nothing further to do here.
1590 */
1591 build_virtual_key(tc->td, block, &key);
1592 if (bio_detain(pool, &key, bio, &cell))
1593 return;
1594
1595 process_cell(tc, cell);
1596}
1597
1598static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
1599 struct dm_bio_prison_cell *cell)
1398{ 1600{
1399 int r; 1601 int r;
1400 int rw = bio_data_dir(bio); 1602 int rw = bio_data_dir(bio);
@@ -1404,15 +1606,21 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1404 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1606 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1405 switch (r) { 1607 switch (r) {
1406 case 0: 1608 case 0:
1407 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) 1609 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
1408 handle_unserviceable_bio(tc->pool, bio); 1610 handle_unserviceable_bio(tc->pool, bio);
1409 else { 1611 if (cell)
1612 cell_defer_no_holder(tc, cell);
1613 } else {
1410 inc_all_io_entry(tc->pool, bio); 1614 inc_all_io_entry(tc->pool, bio);
1411 remap_and_issue(tc, bio, lookup_result.block); 1615 remap_and_issue(tc, bio, lookup_result.block);
1616 if (cell)
1617 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
1412 } 1618 }
1413 break; 1619 break;
1414 1620
1415 case -ENODATA: 1621 case -ENODATA:
1622 if (cell)
1623 cell_defer_no_holder(tc, cell);
1416 if (rw != READ) { 1624 if (rw != READ) {
1417 handle_unserviceable_bio(tc->pool, bio); 1625 handle_unserviceable_bio(tc->pool, bio);
1418 break; 1626 break;
@@ -1431,11 +1639,23 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1431 default: 1639 default:
1432 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", 1640 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1433 __func__, r); 1641 __func__, r);
1642 if (cell)
1643 cell_defer_no_holder(tc, cell);
1434 bio_io_error(bio); 1644 bio_io_error(bio);
1435 break; 1645 break;
1436 } 1646 }
1437} 1647}
1438 1648
1649static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1650{
1651 __process_bio_read_only(tc, bio, NULL);
1652}
1653
1654static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1655{
1656 __process_bio_read_only(tc, cell->holder, cell);
1657}
1658
1439static void process_bio_success(struct thin_c *tc, struct bio *bio) 1659static void process_bio_success(struct thin_c *tc, struct bio *bio)
1440{ 1660{
1441 bio_endio(bio, 0); 1661 bio_endio(bio, 0);
@@ -1446,6 +1666,16 @@ static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1446 bio_io_error(bio); 1666 bio_io_error(bio);
1447} 1667}
1448 1668
1669static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1670{
1671 cell_success(tc->pool, cell);
1672}
1673
1674static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1675{
1676 cell_error(tc->pool, cell);
1677}
1678
1449/* 1679/*
1450 * FIXME: should we also commit due to size of transaction, measured in 1680 * FIXME: should we also commit due to size of transaction, measured in
1451 * metadata blocks? 1681 * metadata blocks?
@@ -1527,9 +1757,10 @@ static void process_thin_deferred_bios(struct thin_c *tc)
1527 struct bio *bio; 1757 struct bio *bio;
1528 struct bio_list bios; 1758 struct bio_list bios;
1529 struct blk_plug plug; 1759 struct blk_plug plug;
1760 unsigned count = 0;
1530 1761
1531 if (tc->requeue_mode) { 1762 if (tc->requeue_mode) {
1532 requeue_bio_list(tc, &tc->deferred_bio_list); 1763 error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE);
1533 return; 1764 return;
1534 } 1765 }
1535 1766
@@ -1568,10 +1799,97 @@ static void process_thin_deferred_bios(struct thin_c *tc)
1568 pool->process_discard(tc, bio); 1799 pool->process_discard(tc, bio);
1569 else 1800 else
1570 pool->process_bio(tc, bio); 1801 pool->process_bio(tc, bio);
1802
1803 if ((count++ & 127) == 0) {
1804 throttle_work_update(&pool->throttle);
1805 dm_pool_issue_prefetches(pool->pmd);
1806 }
1571 } 1807 }
1572 blk_finish_plug(&plug); 1808 blk_finish_plug(&plug);
1573} 1809}
1574 1810
1811static int cmp_cells(const void *lhs, const void *rhs)
1812{
1813 struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs);
1814 struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs);
1815
1816 BUG_ON(!lhs_cell->holder);
1817 BUG_ON(!rhs_cell->holder);
1818
1819 if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
1820 return -1;
1821
1822 if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
1823 return 1;
1824
1825 return 0;
1826}
1827
1828static unsigned sort_cells(struct pool *pool, struct list_head *cells)
1829{
1830 unsigned count = 0;
1831 struct dm_bio_prison_cell *cell, *tmp;
1832
1833 list_for_each_entry_safe(cell, tmp, cells, user_list) {
1834 if (count >= CELL_SORT_ARRAY_SIZE)
1835 break;
1836
1837 pool->cell_sort_array[count++] = cell;
1838 list_del(&cell->user_list);
1839 }
1840
1841 sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
1842
1843 return count;
1844}
1845
1846static void process_thin_deferred_cells(struct thin_c *tc)
1847{
1848 struct pool *pool = tc->pool;
1849 unsigned long flags;
1850 struct list_head cells;
1851 struct dm_bio_prison_cell *cell;
1852 unsigned i, j, count;
1853
1854 INIT_LIST_HEAD(&cells);
1855
1856 spin_lock_irqsave(&tc->lock, flags);
1857 list_splice_init(&tc->deferred_cells, &cells);
1858 spin_unlock_irqrestore(&tc->lock, flags);
1859
1860 if (list_empty(&cells))
1861 return;
1862
1863 do {
1864 count = sort_cells(tc->pool, &cells);
1865
1866 for (i = 0; i < count; i++) {
1867 cell = pool->cell_sort_array[i];
1868 BUG_ON(!cell->holder);
1869
1870 /*
1871 * If we've got no free new_mapping structs, and processing
1872 * this bio might require one, we pause until there are some
1873 * prepared mappings to process.
1874 */
1875 if (ensure_next_mapping(pool)) {
1876 for (j = i; j < count; j++)
1877 list_add(&pool->cell_sort_array[j]->user_list, &cells);
1878
1879 spin_lock_irqsave(&tc->lock, flags);
1880 list_splice(&cells, &tc->deferred_cells);
1881 spin_unlock_irqrestore(&tc->lock, flags);
1882 return;
1883 }
1884
1885 if (cell->holder->bi_rw & REQ_DISCARD)
1886 pool->process_discard_cell(tc, cell);
1887 else
1888 pool->process_cell(tc, cell);
1889 }
1890 } while (!list_empty(&cells));
1891}
1892
1575static void thin_get(struct thin_c *tc); 1893static void thin_get(struct thin_c *tc);
1576static void thin_put(struct thin_c *tc); 1894static void thin_put(struct thin_c *tc);
1577 1895
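process_thin_deferred_cells() above is the consumer half of the new cell-deferral path: thin_defer_cell(), added further down, queues prisoner cells on tc->deferred_cells, and the worker drains them in batches of at most CELL_SORT_ARRAY_SIZE, sorted by the holder's bi_sector so bios are issued in roughly ascending order. Note that sort_cells() passes sizeof(cell), the size of a cell pointer, which matches the pointer elements of cell_sort_array. The same sort()-of-pointers idiom reduced to plain integers, for illustration only:

static int cmp_intp(const void *lhs, const void *rhs)
{
	int l = **(int * const *)lhs;
	int r = **(int * const *)rhs;

	return (l > r) - (l < r);
}

static void sort_int_pointers(int **array, unsigned count)
{
	/* The element size is the pointer size, as with sizeof(cell) in sort_cells(). */
	sort(array, count, sizeof(int *), cmp_intp, NULL);
}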
@@ -1620,6 +1938,7 @@ static void process_deferred_bios(struct pool *pool)
1620 1938
1621 tc = get_first_thin(pool); 1939 tc = get_first_thin(pool);
1622 while (tc) { 1940 while (tc) {
1941 process_thin_deferred_cells(tc);
1623 process_thin_deferred_bios(tc); 1942 process_thin_deferred_bios(tc);
1624 tc = get_next_thin(pool, tc); 1943 tc = get_next_thin(pool, tc);
1625 } 1944 }
@@ -1653,9 +1972,15 @@ static void do_worker(struct work_struct *ws)
1653{ 1972{
1654 struct pool *pool = container_of(ws, struct pool, worker); 1973 struct pool *pool = container_of(ws, struct pool, worker);
1655 1974
1975 throttle_work_start(&pool->throttle);
1976 dm_pool_issue_prefetches(pool->pmd);
1977 throttle_work_update(&pool->throttle);
1656 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); 1978 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1979 throttle_work_update(&pool->throttle);
1657 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); 1980 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1981 throttle_work_update(&pool->throttle);
1658 process_deferred_bios(pool); 1982 process_deferred_bios(pool);
1983 throttle_work_complete(&pool->throttle);
1659} 1984}
1660 1985
1661/* 1986/*
@@ -1792,6 +2117,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1792 dm_pool_metadata_read_only(pool->pmd); 2117 dm_pool_metadata_read_only(pool->pmd);
1793 pool->process_bio = process_bio_fail; 2118 pool->process_bio = process_bio_fail;
1794 pool->process_discard = process_bio_fail; 2119 pool->process_discard = process_bio_fail;
2120 pool->process_cell = process_cell_fail;
2121 pool->process_discard_cell = process_cell_fail;
1795 pool->process_prepared_mapping = process_prepared_mapping_fail; 2122 pool->process_prepared_mapping = process_prepared_mapping_fail;
1796 pool->process_prepared_discard = process_prepared_discard_fail; 2123 pool->process_prepared_discard = process_prepared_discard_fail;
1797 2124
@@ -1804,6 +2131,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1804 dm_pool_metadata_read_only(pool->pmd); 2131 dm_pool_metadata_read_only(pool->pmd);
1805 pool->process_bio = process_bio_read_only; 2132 pool->process_bio = process_bio_read_only;
1806 pool->process_discard = process_bio_success; 2133 pool->process_discard = process_bio_success;
2134 pool->process_cell = process_cell_read_only;
2135 pool->process_discard_cell = process_cell_success;
1807 pool->process_prepared_mapping = process_prepared_mapping_fail; 2136 pool->process_prepared_mapping = process_prepared_mapping_fail;
1808 pool->process_prepared_discard = process_prepared_discard_passdown; 2137 pool->process_prepared_discard = process_prepared_discard_passdown;
1809 2138
@@ -1822,7 +2151,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1822 if (old_mode != new_mode) 2151 if (old_mode != new_mode)
1823 notify_of_pool_mode_change(pool, "out-of-data-space"); 2152 notify_of_pool_mode_change(pool, "out-of-data-space");
1824 pool->process_bio = process_bio_read_only; 2153 pool->process_bio = process_bio_read_only;
1825 pool->process_discard = process_discard; 2154 pool->process_discard = process_discard_bio;
2155 pool->process_cell = process_cell_read_only;
2156 pool->process_discard_cell = process_discard_cell;
1826 pool->process_prepared_mapping = process_prepared_mapping; 2157 pool->process_prepared_mapping = process_prepared_mapping;
1827 pool->process_prepared_discard = process_prepared_discard_passdown; 2158 pool->process_prepared_discard = process_prepared_discard_passdown;
1828 2159
@@ -1835,7 +2166,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1835 notify_of_pool_mode_change(pool, "write"); 2166 notify_of_pool_mode_change(pool, "write");
1836 dm_pool_metadata_read_write(pool->pmd); 2167 dm_pool_metadata_read_write(pool->pmd);
1837 pool->process_bio = process_bio; 2168 pool->process_bio = process_bio;
1838 pool->process_discard = process_discard; 2169 pool->process_discard = process_discard_bio;
2170 pool->process_cell = process_cell;
2171 pool->process_discard_cell = process_discard_cell;
1839 pool->process_prepared_mapping = process_prepared_mapping; 2172 pool->process_prepared_mapping = process_prepared_mapping;
1840 pool->process_prepared_discard = process_prepared_discard; 2173 pool->process_prepared_discard = process_prepared_discard;
1841 break; 2174 break;
@@ -1895,6 +2228,29 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1895 wake_worker(pool); 2228 wake_worker(pool);
1896} 2229}
1897 2230
2231static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
2232{
2233 struct pool *pool = tc->pool;
2234
2235 throttle_lock(&pool->throttle);
2236 thin_defer_bio(tc, bio);
2237 throttle_unlock(&pool->throttle);
2238}
2239
2240static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2241{
2242 unsigned long flags;
2243 struct pool *pool = tc->pool;
2244
2245 throttle_lock(&pool->throttle);
2246 spin_lock_irqsave(&tc->lock, flags);
2247 list_add_tail(&cell->user_list, &tc->deferred_cells);
2248 spin_unlock_irqrestore(&tc->lock, flags);
2249 throttle_unlock(&pool->throttle);
2250
2251 wake_worker(pool);
2252}
2253
1898static void thin_hook_bio(struct thin_c *tc, struct bio *bio) 2254static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
1899{ 2255{
1900 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 2256 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -1915,8 +2271,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1915 dm_block_t block = get_bio_block(tc, bio); 2271 dm_block_t block = get_bio_block(tc, bio);
1916 struct dm_thin_device *td = tc->td; 2272 struct dm_thin_device *td = tc->td;
1917 struct dm_thin_lookup_result result; 2273 struct dm_thin_lookup_result result;
1918 struct dm_bio_prison_cell cell1, cell2; 2274 struct dm_bio_prison_cell *virt_cell, *data_cell;
1919 struct dm_bio_prison_cell *cell_result;
1920 struct dm_cell_key key; 2275 struct dm_cell_key key;
1921 2276
1922 thin_hook_bio(tc, bio); 2277 thin_hook_bio(tc, bio);
@@ -1932,7 +2287,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1932 } 2287 }
1933 2288
1934 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { 2289 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1935 thin_defer_bio(tc, bio); 2290 thin_defer_bio_with_throttle(tc, bio);
1936 return DM_MAPIO_SUBMITTED; 2291 return DM_MAPIO_SUBMITTED;
1937 } 2292 }
1938 2293
@@ -1941,7 +2296,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1941 * there's a race with discard. 2296 * there's a race with discard.
1942 */ 2297 */
1943 build_virtual_key(tc->td, block, &key); 2298 build_virtual_key(tc->td, block, &key);
1944 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) 2299 if (bio_detain(tc->pool, &key, bio, &virt_cell))
1945 return DM_MAPIO_SUBMITTED; 2300 return DM_MAPIO_SUBMITTED;
1946 2301
1947 r = dm_thin_find_block(td, block, 0, &result); 2302 r = dm_thin_find_block(td, block, 0, &result);
@@ -1966,20 +2321,19 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1966 * More distant ancestors are irrelevant. The 2321 * More distant ancestors are irrelevant. The
1967 * shared flag will be set in their case. 2322 * shared flag will be set in their case.
1968 */ 2323 */
1969 thin_defer_bio(tc, bio); 2324 thin_defer_cell(tc, virt_cell);
1970 cell_defer_no_holder_no_free(tc, &cell1);
1971 return DM_MAPIO_SUBMITTED; 2325 return DM_MAPIO_SUBMITTED;
1972 } 2326 }
1973 2327
1974 build_data_key(tc->td, result.block, &key); 2328 build_data_key(tc->td, result.block, &key);
1975 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) { 2329 if (bio_detain(tc->pool, &key, bio, &data_cell)) {
1976 cell_defer_no_holder_no_free(tc, &cell1); 2330 cell_defer_no_holder(tc, virt_cell);
1977 return DM_MAPIO_SUBMITTED; 2331 return DM_MAPIO_SUBMITTED;
1978 } 2332 }
1979 2333
1980 inc_all_io_entry(tc->pool, bio); 2334 inc_all_io_entry(tc->pool, bio);
1981 cell_defer_no_holder_no_free(tc, &cell2); 2335 cell_defer_no_holder(tc, data_cell);
1982 cell_defer_no_holder_no_free(tc, &cell1); 2336 cell_defer_no_holder(tc, virt_cell);
1983 2337
1984 remap(tc, bio, result.block); 2338 remap(tc, bio, result.block);
1985 return DM_MAPIO_REMAPPED; 2339 return DM_MAPIO_REMAPPED;
@@ -1991,18 +2345,13 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1991 * of doing so. 2345 * of doing so.
1992 */ 2346 */
1993 handle_unserviceable_bio(tc->pool, bio); 2347 handle_unserviceable_bio(tc->pool, bio);
1994 cell_defer_no_holder_no_free(tc, &cell1); 2348 cell_defer_no_holder(tc, virt_cell);
1995 return DM_MAPIO_SUBMITTED; 2349 return DM_MAPIO_SUBMITTED;
1996 } 2350 }
1997 /* fall through */ 2351 /* fall through */
1998 2352
1999 case -EWOULDBLOCK: 2353 case -EWOULDBLOCK:
2000 /* 2354 thin_defer_cell(tc, virt_cell);
2001 * In future, the failed dm_thin_find_block above could
2002 * provide the hint to load the metadata into cache.
2003 */
2004 thin_defer_bio(tc, bio);
2005 cell_defer_no_holder_no_free(tc, &cell1);
2006 return DM_MAPIO_SUBMITTED; 2355 return DM_MAPIO_SUBMITTED;
2007 2356
2008 default: 2357 default:
@@ -2012,7 +2361,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
2012 * pool is switched to fail-io mode. 2361 * pool is switched to fail-io mode.
2013 */ 2362 */
2014 bio_io_error(bio); 2363 bio_io_error(bio);
2015 cell_defer_no_holder_no_free(tc, &cell1); 2364 cell_defer_no_holder(tc, virt_cell);
2016 return DM_MAPIO_SUBMITTED; 2365 return DM_MAPIO_SUBMITTED;
2017 } 2366 }
2018} 2367}
@@ -2193,7 +2542,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
2193 pool->sectors_per_block_shift = __ffs(block_size); 2542 pool->sectors_per_block_shift = __ffs(block_size);
2194 pool->low_water_blocks = 0; 2543 pool->low_water_blocks = 0;
2195 pool_features_init(&pool->pf); 2544 pool_features_init(&pool->pf);
2196 pool->prison = dm_bio_prison_create(PRISON_CELLS); 2545 pool->prison = dm_bio_prison_create();
2197 if (!pool->prison) { 2546 if (!pool->prison) {
2198 *error = "Error creating pool's bio prison"; 2547 *error = "Error creating pool's bio prison";
2199 err_p = ERR_PTR(-ENOMEM); 2548 err_p = ERR_PTR(-ENOMEM);
@@ -2219,6 +2568,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
2219 goto bad_wq; 2568 goto bad_wq;
2220 } 2569 }
2221 2570
2571 throttle_init(&pool->throttle);
2222 INIT_WORK(&pool->worker, do_worker); 2572 INIT_WORK(&pool->worker, do_worker);
2223 INIT_DELAYED_WORK(&pool->waker, do_waker); 2573 INIT_DELAYED_WORK(&pool->waker, do_waker);
2224 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); 2574 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
@@ -2228,6 +2578,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
2228 INIT_LIST_HEAD(&pool->prepared_discards); 2578 INIT_LIST_HEAD(&pool->prepared_discards);
2229 INIT_LIST_HEAD(&pool->active_thins); 2579 INIT_LIST_HEAD(&pool->active_thins);
2230 pool->low_water_triggered = false; 2580 pool->low_water_triggered = false;
2581 pool->suspended = true;
2231 2582
2232 pool->shared_read_ds = dm_deferred_set_create(); 2583 pool->shared_read_ds = dm_deferred_set_create();
2233 if (!pool->shared_read_ds) { 2584 if (!pool->shared_read_ds) {
@@ -2764,20 +3115,77 @@ static int pool_preresume(struct dm_target *ti)
2764 return 0; 3115 return 0;
2765} 3116}
2766 3117
3118static void pool_suspend_active_thins(struct pool *pool)
3119{
3120 struct thin_c *tc;
3121
3122 /* Suspend all active thin devices */
3123 tc = get_first_thin(pool);
3124 while (tc) {
3125 dm_internal_suspend_noflush(tc->thin_md);
3126 tc = get_next_thin(pool, tc);
3127 }
3128}
3129
3130static void pool_resume_active_thins(struct pool *pool)
3131{
3132 struct thin_c *tc;
3133
3134 /* Resume all active thin devices */
3135 tc = get_first_thin(pool);
3136 while (tc) {
3137 dm_internal_resume(tc->thin_md);
3138 tc = get_next_thin(pool, tc);
3139 }
3140}
3141
2767static void pool_resume(struct dm_target *ti) 3142static void pool_resume(struct dm_target *ti)
2768{ 3143{
2769 struct pool_c *pt = ti->private; 3144 struct pool_c *pt = ti->private;
2770 struct pool *pool = pt->pool; 3145 struct pool *pool = pt->pool;
2771 unsigned long flags; 3146 unsigned long flags;
2772 3147
3148 /*
3149 * Must requeue active_thins' bios and then resume
3150 * active_thins _before_ clearing 'suspend' flag.
3151 */
3152 requeue_bios(pool);
3153 pool_resume_active_thins(pool);
3154
2773 spin_lock_irqsave(&pool->lock, flags); 3155 spin_lock_irqsave(&pool->lock, flags);
2774 pool->low_water_triggered = false; 3156 pool->low_water_triggered = false;
3157 pool->suspended = false;
2775 spin_unlock_irqrestore(&pool->lock, flags); 3158 spin_unlock_irqrestore(&pool->lock, flags);
2776 requeue_bios(pool);
2777 3159
2778 do_waker(&pool->waker.work); 3160 do_waker(&pool->waker.work);
2779} 3161}
2780 3162
3163static void pool_presuspend(struct dm_target *ti)
3164{
3165 struct pool_c *pt = ti->private;
3166 struct pool *pool = pt->pool;
3167 unsigned long flags;
3168
3169 spin_lock_irqsave(&pool->lock, flags);
3170 pool->suspended = true;
3171 spin_unlock_irqrestore(&pool->lock, flags);
3172
3173 pool_suspend_active_thins(pool);
3174}
3175
3176static void pool_presuspend_undo(struct dm_target *ti)
3177{
3178 struct pool_c *pt = ti->private;
3179 struct pool *pool = pt->pool;
3180 unsigned long flags;
3181
3182 pool_resume_active_thins(pool);
3183
3184 spin_lock_irqsave(&pool->lock, flags);
3185 pool->suspended = false;
3186 spin_unlock_irqrestore(&pool->lock, flags);
3187}
3188
2781static void pool_postsuspend(struct dm_target *ti) 3189static void pool_postsuspend(struct dm_target *ti)
2782{ 3190{
2783 struct pool_c *pt = ti->private; 3191 struct pool_c *pt = ti->private;
@@ -2949,7 +3357,6 @@ static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct
2949 * create_thin <dev_id> 3357 * create_thin <dev_id>
2950 * create_snap <dev_id> <origin_id> 3358 * create_snap <dev_id> <origin_id>
2951 * delete <dev_id> 3359 * delete <dev_id>
2952 * trim <dev_id> <new_size_in_sectors>
2953 * set_transaction_id <current_trans_id> <new_trans_id> 3360 * set_transaction_id <current_trans_id> <new_trans_id>
2954 * reserve_metadata_snap 3361 * reserve_metadata_snap
2955 * release_metadata_snap 3362 * release_metadata_snap
@@ -3177,15 +3584,35 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
3177{ 3584{
3178 struct pool_c *pt = ti->private; 3585 struct pool_c *pt = ti->private;
3179 struct pool *pool = pt->pool; 3586 struct pool *pool = pt->pool;
3180 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3587 sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3588
3589 /*
3590 * If max_sectors is smaller than pool->sectors_per_block adjust it
3591 * to the highest possible power-of-2 factor of pool->sectors_per_block.
3592 * This is especially beneficial when the pool's data device is a RAID
3593 * device that has a full stripe width that matches pool->sectors_per_block
3594 * -- because even though partial RAID stripe-sized IOs will be issued to a
3595 * single RAID stripe; when aggregated they will end on a full RAID stripe
3596 * boundary.. which avoids additional partial RAID stripe writes cascading
3597 */
3598 if (limits->max_sectors < pool->sectors_per_block) {
3599 while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
3600 if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
3601 limits->max_sectors--;
3602 limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
3603 }
3604 }
3181 3605
3182 /* 3606 /*
3183 * If the system-determined stacked limits are compatible with the 3607 * If the system-determined stacked limits are compatible with the
3184 * pool's blocksize (io_opt is a factor) do not override them. 3608 * pool's blocksize (io_opt is a factor) do not override them.
3185 */ 3609 */
3186 if (io_opt_sectors < pool->sectors_per_block || 3610 if (io_opt_sectors < pool->sectors_per_block ||
3187 do_div(io_opt_sectors, pool->sectors_per_block)) { 3611 !is_factor(io_opt_sectors, pool->sectors_per_block)) {
3188 blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); 3612 if (is_factor(pool->sectors_per_block, limits->max_sectors))
3613 blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
3614 else
3615 blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
3189 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 3616 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
3190 } 3617 }
3191 3618
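A worked example of the new max_sectors adjustment in pool_io_hints(), assuming a 192KiB thin-pool block (384 sectors, a legal non-power-of-two block size), a stacked max_sectors of 320, and that is_factor(a, b) tests whether b divides a:

/* Sketch of the loop above with concrete values; not part of the patch. */
static unsigned adjust_max_sectors(unsigned sectors_per_block, unsigned max_sectors)
{
	while (sectors_per_block % max_sectors) {		/* !is_factor(...) */
		if ((max_sectors & (max_sectors - 1)) == 0)	/* already a power of two */
			max_sectors--;
		max_sectors = rounddown_pow_of_two(max_sectors);
	}
	return max_sectors;
}

adjust_max_sectors(384, 320): 384 % 320 != 0 and 320 is not a power of two, so it rounds down to 256; 384 % 256 != 0 and 256 is a power of two, so it is decremented to 255 and rounded down to 128; 384 % 128 == 0, so the loop exits with max_sectors = 128 (64KiB). If the stacked io_opt is then not a multiple of the block size, io_min becomes 128 sectors (since 128 divides 384) and io_opt is set to the full 384-sector block.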
@@ -3214,11 +3641,13 @@ static struct target_type pool_target = {
3214 .name = "thin-pool", 3641 .name = "thin-pool",
3215 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 3642 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
3216 DM_TARGET_IMMUTABLE, 3643 DM_TARGET_IMMUTABLE,
3217 .version = {1, 13, 0}, 3644 .version = {1, 14, 0},
3218 .module = THIS_MODULE, 3645 .module = THIS_MODULE,
3219 .ctr = pool_ctr, 3646 .ctr = pool_ctr,
3220 .dtr = pool_dtr, 3647 .dtr = pool_dtr,
3221 .map = pool_map, 3648 .map = pool_map,
3649 .presuspend = pool_presuspend,
3650 .presuspend_undo = pool_presuspend_undo,
3222 .postsuspend = pool_postsuspend, 3651 .postsuspend = pool_postsuspend,
3223 .preresume = pool_preresume, 3652 .preresume = pool_preresume,
3224 .resume = pool_resume, 3653 .resume = pool_resume,
@@ -3248,14 +3677,14 @@ static void thin_dtr(struct dm_target *ti)
3248 struct thin_c *tc = ti->private; 3677 struct thin_c *tc = ti->private;
3249 unsigned long flags; 3678 unsigned long flags;
3250 3679
3251 thin_put(tc);
3252 wait_for_completion(&tc->can_destroy);
3253
3254 spin_lock_irqsave(&tc->pool->lock, flags); 3680 spin_lock_irqsave(&tc->pool->lock, flags);
3255 list_del_rcu(&tc->list); 3681 list_del_rcu(&tc->list);
3256 spin_unlock_irqrestore(&tc->pool->lock, flags); 3682 spin_unlock_irqrestore(&tc->pool->lock, flags);
3257 synchronize_rcu(); 3683 synchronize_rcu();
3258 3684
3685 thin_put(tc);
3686 wait_for_completion(&tc->can_destroy);
3687
3259 mutex_lock(&dm_thin_pool_table.mutex); 3688 mutex_lock(&dm_thin_pool_table.mutex);
3260 3689
3261 __pool_dec(tc->pool); 3690 __pool_dec(tc->pool);
@@ -3302,7 +3731,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3302 r = -ENOMEM; 3731 r = -ENOMEM;
3303 goto out_unlock; 3732 goto out_unlock;
3304 } 3733 }
3734 tc->thin_md = dm_table_get_md(ti->table);
3305 spin_lock_init(&tc->lock); 3735 spin_lock_init(&tc->lock);
3736 INIT_LIST_HEAD(&tc->deferred_cells);
3306 bio_list_init(&tc->deferred_bio_list); 3737 bio_list_init(&tc->deferred_bio_list);
3307 bio_list_init(&tc->retry_on_resume_list); 3738 bio_list_init(&tc->retry_on_resume_list);
3308 tc->sort_bio_list = RB_ROOT; 3739 tc->sort_bio_list = RB_ROOT;
@@ -3347,18 +3778,18 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3347 if (get_pool_mode(tc->pool) == PM_FAIL) { 3778 if (get_pool_mode(tc->pool) == PM_FAIL) {
3348 ti->error = "Couldn't open thin device, Pool is in fail mode"; 3779 ti->error = "Couldn't open thin device, Pool is in fail mode";
3349 r = -EINVAL; 3780 r = -EINVAL;
3350 goto bad_thin_open; 3781 goto bad_pool;
3351 } 3782 }
3352 3783
3353 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); 3784 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
3354 if (r) { 3785 if (r) {
3355 ti->error = "Couldn't open thin internal device"; 3786 ti->error = "Couldn't open thin internal device";
3356 goto bad_thin_open; 3787 goto bad_pool;
3357 } 3788 }
3358 3789
3359 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); 3790 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
3360 if (r) 3791 if (r)
3361 goto bad_target_max_io_len; 3792 goto bad;
3362 3793
3363 ti->num_flush_bios = 1; 3794 ti->num_flush_bios = 1;
3364 ti->flush_supported = true; 3795 ti->flush_supported = true;
@@ -3373,14 +3804,16 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3373 ti->split_discard_bios = true; 3804 ti->split_discard_bios = true;
3374 } 3805 }
3375 3806
3376 dm_put(pool_md);
3377
3378 mutex_unlock(&dm_thin_pool_table.mutex); 3807 mutex_unlock(&dm_thin_pool_table.mutex);
3379 3808
3380 atomic_set(&tc->refcount, 1);
3381 init_completion(&tc->can_destroy);
3382
3383 spin_lock_irqsave(&tc->pool->lock, flags); 3809 spin_lock_irqsave(&tc->pool->lock, flags);
3810 if (tc->pool->suspended) {
3811 spin_unlock_irqrestore(&tc->pool->lock, flags);
3812 mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
3813 ti->error = "Unable to activate thin device while pool is suspended";
3814 r = -EINVAL;
3815 goto bad;
3816 }
3384 list_add_tail_rcu(&tc->list, &tc->pool->active_thins); 3817 list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
3385 spin_unlock_irqrestore(&tc->pool->lock, flags); 3818 spin_unlock_irqrestore(&tc->pool->lock, flags);
3386 /* 3819 /*
@@ -3391,11 +3824,16 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3391 */ 3824 */
3392 synchronize_rcu(); 3825 synchronize_rcu();
3393 3826
3827 dm_put(pool_md);
3828
3829 atomic_set(&tc->refcount, 1);
3830 init_completion(&tc->can_destroy);
3831
3394 return 0; 3832 return 0;
3395 3833
3396bad_target_max_io_len: 3834bad:
3397 dm_pool_close_thin_device(tc->td); 3835 dm_pool_close_thin_device(tc->td);
3398bad_thin_open: 3836bad_pool:
3399 __pool_dec(tc->pool); 3837 __pool_dec(tc->pool);
3400bad_pool_lookup: 3838bad_pool_lookup:
3401 dm_put(pool_md); 3839 dm_put(pool_md);
@@ -3541,6 +3979,21 @@ err:
3541 DMEMIT("Error"); 3979 DMEMIT("Error");
3542} 3980}
3543 3981
3982static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
3983 struct bio_vec *biovec, int max_size)
3984{
3985 struct thin_c *tc = ti->private;
3986 struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev);
3987
3988 if (!q->merge_bvec_fn)
3989 return max_size;
3990
3991 bvm->bi_bdev = tc->pool_dev->bdev;
3992 bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
3993
3994 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3995}
3996
3544static int thin_iterate_devices(struct dm_target *ti, 3997static int thin_iterate_devices(struct dm_target *ti,
3545 iterate_devices_callout_fn fn, void *data) 3998 iterate_devices_callout_fn fn, void *data)
3546{ 3999{
@@ -3565,7 +4018,7 @@ static int thin_iterate_devices(struct dm_target *ti,
3565 4018
3566static struct target_type thin_target = { 4019static struct target_type thin_target = {
3567 .name = "thin", 4020 .name = "thin",
3568 .version = {1, 13, 0}, 4021 .version = {1, 14, 0},
3569 .module = THIS_MODULE, 4022 .module = THIS_MODULE,
3570 .ctr = thin_ctr, 4023 .ctr = thin_ctr,
3571 .dtr = thin_dtr, 4024 .dtr = thin_dtr,
@@ -3575,6 +4028,7 @@ static struct target_type thin_target = {
3575 .presuspend = thin_presuspend, 4028 .presuspend = thin_presuspend,
3576 .postsuspend = thin_postsuspend, 4029 .postsuspend = thin_postsuspend,
3577 .status = thin_status, 4030 .status = thin_status,
4031 .merge = thin_merge,
3578 .iterate_devices = thin_iterate_devices, 4032 .iterate_devices = thin_iterate_devices,
3579}; 4033};
3580 4034