aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig10
-rw-r--r--drivers/md/dm-cache-policy-mq.c4
-rw-r--r--drivers/md/dm-cache-target.c24
-rw-r--r--drivers/md/dm-io.c23
-rw-r--r--drivers/md/dm-log-userspace-transfer.c2
-rw-r--r--drivers/md/dm-mpath.c7
-rw-r--r--drivers/md/dm-raid1.c3
-rw-r--r--drivers/md/dm-snap-persistent.c3
-rw-r--r--drivers/md/dm-thin-metadata.c58
-rw-r--r--drivers/md/dm-thin-metadata.h21
-rw-r--r--drivers/md/dm-thin.c343
-rw-r--r--drivers/md/persistent-data/Kconfig10
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c115
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.h11
14 files changed, 484 insertions, 150 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 9a06fe883766..95ad936e6048 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -254,16 +254,6 @@ config DM_THIN_PROVISIONING
254 ---help--- 254 ---help---
255 Provides thin provisioning and snapshots that share a data store. 255 Provides thin provisioning and snapshots that share a data store.
256 256
257config DM_DEBUG_BLOCK_STACK_TRACING
258 boolean "Keep stack trace of persistent data block lock holders"
259 depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
260 select STACKTRACE
261 ---help---
262 Enable this for messages that may help debug problems with the
263 block manager locking used by thin provisioning and caching.
264
265 If unsure, say N.
266
267config DM_CACHE 257config DM_CACHE
268 tristate "Cache target (EXPERIMENTAL)" 258 tristate "Cache target (EXPERIMENTAL)"
269 depends on BLK_DEV_DM 259 depends on BLK_DEV_DM
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 1e018e986610..0e385e40909e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -872,7 +872,7 @@ static void mq_destroy(struct dm_cache_policy *p)
872{ 872{
873 struct mq_policy *mq = to_mq_policy(p); 873 struct mq_policy *mq = to_mq_policy(p);
874 874
875 kfree(mq->table); 875 vfree(mq->table);
876 epool_exit(&mq->cache_pool); 876 epool_exit(&mq->cache_pool);
877 epool_exit(&mq->pre_cache_pool); 877 epool_exit(&mq->pre_cache_pool);
878 kfree(mq); 878 kfree(mq);
@@ -1245,7 +1245,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1245 1245
1246 mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); 1246 mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
1247 mq->hash_bits = ffs(mq->nr_buckets) - 1; 1247 mq->hash_bits = ffs(mq->nr_buckets) - 1;
1248 mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL); 1248 mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
1249 if (!mq->table) 1249 if (!mq->table)
1250 goto bad_alloc_table; 1250 goto bad_alloc_table;
1251 1251
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index ffd472e015ca..074b9c8e4cf0 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -289,6 +289,7 @@ struct per_bio_data {
289 bool tick:1; 289 bool tick:1;
290 unsigned req_nr:2; 290 unsigned req_nr:2;
291 struct dm_deferred_entry *all_io_entry; 291 struct dm_deferred_entry *all_io_entry;
292 struct dm_hook_info hook_info;
292 293
293 /* 294 /*
294 * writethrough fields. These MUST remain at the end of this 295 * writethrough fields. These MUST remain at the end of this
@@ -297,7 +298,6 @@ struct per_bio_data {
297 */ 298 */
298 struct cache *cache; 299 struct cache *cache;
299 dm_cblock_t cblock; 300 dm_cblock_t cblock;
300 struct dm_hook_info hook_info;
301 struct dm_bio_details bio_details; 301 struct dm_bio_details bio_details;
302}; 302};
303 303
@@ -671,15 +671,16 @@ static void remap_to_cache(struct cache *cache, struct bio *bio,
671 dm_cblock_t cblock) 671 dm_cblock_t cblock)
672{ 672{
673 sector_t bi_sector = bio->bi_iter.bi_sector; 673 sector_t bi_sector = bio->bi_iter.bi_sector;
674 sector_t block = from_cblock(cblock);
674 675
675 bio->bi_bdev = cache->cache_dev->bdev; 676 bio->bi_bdev = cache->cache_dev->bdev;
676 if (!block_size_is_power_of_two(cache)) 677 if (!block_size_is_power_of_two(cache))
677 bio->bi_iter.bi_sector = 678 bio->bi_iter.bi_sector =
678 (from_cblock(cblock) * cache->sectors_per_block) + 679 (block * cache->sectors_per_block) +
679 sector_div(bi_sector, cache->sectors_per_block); 680 sector_div(bi_sector, cache->sectors_per_block);
680 else 681 else
681 bio->bi_iter.bi_sector = 682 bio->bi_iter.bi_sector =
682 (from_cblock(cblock) << cache->sectors_per_block_shift) | 683 (block << cache->sectors_per_block_shift) |
683 (bi_sector & (cache->sectors_per_block - 1)); 684 (bi_sector & (cache->sectors_per_block - 1));
684} 685}
685 686
@@ -978,12 +979,13 @@ static void issue_copy_real(struct dm_cache_migration *mg)
978 int r; 979 int r;
979 struct dm_io_region o_region, c_region; 980 struct dm_io_region o_region, c_region;
980 struct cache *cache = mg->cache; 981 struct cache *cache = mg->cache;
982 sector_t cblock = from_cblock(mg->cblock);
981 983
982 o_region.bdev = cache->origin_dev->bdev; 984 o_region.bdev = cache->origin_dev->bdev;
983 o_region.count = cache->sectors_per_block; 985 o_region.count = cache->sectors_per_block;
984 986
985 c_region.bdev = cache->cache_dev->bdev; 987 c_region.bdev = cache->cache_dev->bdev;
986 c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block; 988 c_region.sector = cblock * cache->sectors_per_block;
987 c_region.count = cache->sectors_per_block; 989 c_region.count = cache->sectors_per_block;
988 990
989 if (mg->writeback || mg->demote) { 991 if (mg->writeback || mg->demote) {
@@ -1010,13 +1012,15 @@ static void overwrite_endio(struct bio *bio, int err)
1010 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1012 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1011 unsigned long flags; 1013 unsigned long flags;
1012 1014
1015 dm_unhook_bio(&pb->hook_info, bio);
1016
1013 if (err) 1017 if (err)
1014 mg->err = true; 1018 mg->err = true;
1015 1019
1020 mg->requeue_holder = false;
1021
1016 spin_lock_irqsave(&cache->lock, flags); 1022 spin_lock_irqsave(&cache->lock, flags);
1017 list_add_tail(&mg->list, &cache->completed_migrations); 1023 list_add_tail(&mg->list, &cache->completed_migrations);
1018 dm_unhook_bio(&pb->hook_info, bio);
1019 mg->requeue_holder = false;
1020 spin_unlock_irqrestore(&cache->lock, flags); 1024 spin_unlock_irqrestore(&cache->lock, flags);
1021 1025
1022 wake_worker(cache); 1026 wake_worker(cache);
@@ -2461,20 +2465,18 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
2461 bool discarded_block; 2465 bool discarded_block;
2462 struct dm_bio_prison_cell *cell; 2466 struct dm_bio_prison_cell *cell;
2463 struct policy_result lookup_result; 2467 struct policy_result lookup_result;
2464 struct per_bio_data *pb; 2468 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
2465 2469
2466 if (from_oblock(block) > from_oblock(cache->origin_blocks)) { 2470 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2467 /* 2471 /*
2468 * This can only occur if the io goes to a partial block at 2472 * This can only occur if the io goes to a partial block at
2469 * the end of the origin device. We don't cache these. 2473 * the end of the origin device. We don't cache these.
2470 * Just remap to the origin and carry on. 2474 * Just remap to the origin and carry on.
2471 */ 2475 */
2472 remap_to_origin_clear_discard(cache, bio, block); 2476 remap_to_origin(cache, bio);
2473 return DM_MAPIO_REMAPPED; 2477 return DM_MAPIO_REMAPPED;
2474 } 2478 }
2475 2479
2476 pb = init_per_bio_data(bio, pb_data_size);
2477
2478 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) { 2480 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2479 defer_bio(cache, bio); 2481 defer_bio(cache, bio);
2480 return DM_MAPIO_SUBMITTED; 2482 return DM_MAPIO_SUBMITTED;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index b2b8a10e8427..3842ac738f98 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -201,29 +201,28 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse
201/* 201/*
202 * Functions for getting the pages from a bvec. 202 * Functions for getting the pages from a bvec.
203 */ 203 */
204static void bio_get_page(struct dpages *dp, 204static void bio_get_page(struct dpages *dp, struct page **p,
205 struct page **p, unsigned long *len, unsigned *offset) 205 unsigned long *len, unsigned *offset)
206{ 206{
207 struct bio *bio = dp->context_ptr; 207 struct bio_vec *bvec = dp->context_ptr;
208 struct bio_vec bvec = bio_iovec(bio); 208 *p = bvec->bv_page;
209 *p = bvec.bv_page; 209 *len = bvec->bv_len - dp->context_u;
210 *len = bvec.bv_len; 210 *offset = bvec->bv_offset + dp->context_u;
211 *offset = bvec.bv_offset;
212} 211}
213 212
214static void bio_next_page(struct dpages *dp) 213static void bio_next_page(struct dpages *dp)
215{ 214{
216 struct bio *bio = dp->context_ptr; 215 struct bio_vec *bvec = dp->context_ptr;
217 struct bio_vec bvec = bio_iovec(bio); 216 dp->context_ptr = bvec + 1;
218 217 dp->context_u = 0;
219 bio_advance(bio, bvec.bv_len);
220} 218}
221 219
222static void bio_dp_init(struct dpages *dp, struct bio *bio) 220static void bio_dp_init(struct dpages *dp, struct bio *bio)
223{ 221{
224 dp->get_page = bio_get_page; 222 dp->get_page = bio_get_page;
225 dp->next_page = bio_next_page; 223 dp->next_page = bio_next_page;
226 dp->context_ptr = bio; 224 dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
225 dp->context_u = bio->bi_iter.bi_bvec_done;
227} 226}
228 227
229/* 228/*
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 08d9a207259a..b428c0ae63d5 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -66,7 +66,7 @@ static int dm_ulog_sendto_server(struct dm_ulog_request *tfr)
66 msg->seq = tfr->seq; 66 msg->seq = tfr->seq;
67 msg->len = sizeof(struct dm_ulog_request) + tfr->data_size; 67 msg->len = sizeof(struct dm_ulog_request) + tfr->data_size;
68 68
69 r = cn_netlink_send(msg, 0, gfp_any()); 69 r = cn_netlink_send(msg, 0, 0, gfp_any());
70 70
71 return r; 71 return r;
72} 72}
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6eb9dc9ef8f3..422a9fdeb53e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1626,8 +1626,11 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
1626 /* 1626 /*
1627 * Only pass ioctls through if the device sizes match exactly. 1627 * Only pass ioctls through if the device sizes match exactly.
1628 */ 1628 */
1629 if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) 1629 if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) {
1630 r = scsi_verify_blk_ioctl(NULL, cmd); 1630 int err = scsi_verify_blk_ioctl(NULL, cmd);
1631 if (err)
1632 r = err;
1633 }
1631 1634
1632 if (r == -ENOTCONN && !fatal_signal_pending(current)) 1635 if (r == -ENOTCONN && !fatal_signal_pending(current))
1633 queue_work(kmultipathd, &m->process_queued_ios); 1636 queue_work(kmultipathd, &m->process_queued_ios);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index f284e0bfb25f..7dfdb5c746d6 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1244,6 +1244,9 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
1244 1244
1245 dm_bio_restore(bd, bio); 1245 dm_bio_restore(bd, bio);
1246 bio_record->details.bi_bdev = NULL; 1246 bio_record->details.bi_bdev = NULL;
1247
1248 atomic_inc(&bio->bi_remaining);
1249
1247 queue_bio(ms, bio, rw); 1250 queue_bio(ms, bio, rw);
1248 return DM_ENDIO_INCOMPLETE; 1251 return DM_ENDIO_INCOMPLETE;
1249 } 1252 }
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index afc3d017de4c..d6e88178d22c 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -546,6 +546,9 @@ static int read_exceptions(struct pstore *ps,
546 r = insert_exceptions(ps, area, callback, callback_context, 546 r = insert_exceptions(ps, area, callback, callback_context,
547 &full); 547 &full);
548 548
549 if (!full)
550 memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT);
551
549 dm_bufio_release(bp); 552 dm_bufio_release(bp);
550 553
551 dm_bufio_forget(client, chunk); 554 dm_bufio_forget(client, chunk);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 7da347665552..fb9efc829182 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -76,7 +76,7 @@
76 76
77#define THIN_SUPERBLOCK_MAGIC 27022010 77#define THIN_SUPERBLOCK_MAGIC 27022010
78#define THIN_SUPERBLOCK_LOCATION 0 78#define THIN_SUPERBLOCK_LOCATION 0
79#define THIN_VERSION 1 79#define THIN_VERSION 2
80#define THIN_METADATA_CACHE_SIZE 64 80#define THIN_METADATA_CACHE_SIZE 64
81#define SECTOR_TO_BLOCK_SHIFT 3 81#define SECTOR_TO_BLOCK_SHIFT 3
82 82
@@ -483,7 +483,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
483 483
484 disk_super->data_mapping_root = cpu_to_le64(pmd->root); 484 disk_super->data_mapping_root = cpu_to_le64(pmd->root);
485 disk_super->device_details_root = cpu_to_le64(pmd->details_root); 485 disk_super->device_details_root = cpu_to_le64(pmd->details_root);
486 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); 486 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
487 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); 487 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
488 disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); 488 disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
489 489
@@ -651,7 +651,7 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f
651{ 651{
652 int r; 652 int r;
653 653
654 pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE, 654 pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
655 THIN_METADATA_CACHE_SIZE, 655 THIN_METADATA_CACHE_SIZE,
656 THIN_MAX_CONCURRENT_LOCKS); 656 THIN_MAX_CONCURRENT_LOCKS);
657 if (IS_ERR(pmd->bm)) { 657 if (IS_ERR(pmd->bm)) {
@@ -1489,6 +1489,23 @@ bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1489 return r; 1489 return r;
1490} 1490}
1491 1491
1492bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
1493{
1494 bool r = false;
1495 struct dm_thin_device *td, *tmp;
1496
1497 down_read(&pmd->root_lock);
1498 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
1499 if (td->changed) {
1500 r = td->changed;
1501 break;
1502 }
1503 }
1504 up_read(&pmd->root_lock);
1505
1506 return r;
1507}
1508
1492bool dm_thin_aborted_changes(struct dm_thin_device *td) 1509bool dm_thin_aborted_changes(struct dm_thin_device *td)
1493{ 1510{
1494 bool r; 1511 bool r;
@@ -1738,3 +1755,38 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
1738 1755
1739 return r; 1756 return r;
1740} 1757}
1758
1759int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
1760{
1761 int r;
1762 struct dm_block *sblock;
1763 struct thin_disk_superblock *disk_super;
1764
1765 down_write(&pmd->root_lock);
1766 pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
1767
1768 r = superblock_lock(pmd, &sblock);
1769 if (r) {
1770 DMERR("couldn't read superblock");
1771 goto out;
1772 }
1773
1774 disk_super = dm_block_data(sblock);
1775 disk_super->flags = cpu_to_le32(pmd->flags);
1776
1777 dm_bm_unlock(sblock);
1778out:
1779 up_write(&pmd->root_lock);
1780 return r;
1781}
1782
1783bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
1784{
1785 bool needs_check;
1786
1787 down_read(&pmd->root_lock);
1788 needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
1789 up_read(&pmd->root_lock);
1790
1791 return needs_check;
1792}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 9a368567632f..e3c857db195a 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -9,16 +9,14 @@
9 9
10#include "persistent-data/dm-block-manager.h" 10#include "persistent-data/dm-block-manager.h"
11#include "persistent-data/dm-space-map.h" 11#include "persistent-data/dm-space-map.h"
12#include "persistent-data/dm-space-map-metadata.h"
12 13
13#define THIN_METADATA_BLOCK_SIZE 4096 14#define THIN_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
14 15
15/* 16/*
16 * The metadata device is currently limited in size. 17 * The metadata device is currently limited in size.
17 *
18 * We have one block of index, which can hold 255 index entries. Each
19 * index entry contains allocation info about 16k metadata blocks.
20 */ 18 */
21#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) 19#define THIN_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
22 20
23/* 21/*
24 * A metadata device larger than 16GB triggers a warning. 22 * A metadata device larger than 16GB triggers a warning.
@@ -27,6 +25,11 @@
27 25
28/*----------------------------------------------------------------*/ 26/*----------------------------------------------------------------*/
29 27
28/*
29 * Thin metadata superblock flags.
30 */
31#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0)
32
30struct dm_pool_metadata; 33struct dm_pool_metadata;
31struct dm_thin_device; 34struct dm_thin_device;
32 35
@@ -161,6 +164,8 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
161 */ 164 */
162bool dm_thin_changed_this_transaction(struct dm_thin_device *td); 165bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
163 166
167bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd);
168
164bool dm_thin_aborted_changes(struct dm_thin_device *td); 169bool dm_thin_aborted_changes(struct dm_thin_device *td);
165 170
166int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, 171int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
@@ -202,6 +207,12 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
202 dm_sm_threshold_fn fn, 207 dm_sm_threshold_fn fn,
203 void *context); 208 void *context);
204 209
210/*
211 * Updates the superblock immediately.
212 */
213int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
214bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
215
205/*----------------------------------------------------------------*/ 216/*----------------------------------------------------------------*/
206 217
207#endif 218#endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index faaf944597ab..be70d38745f7 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
130struct dm_thin_new_mapping; 130struct dm_thin_new_mapping;
131 131
132/* 132/*
133 * The pool runs in 3 modes. Ordered in degraded order for comparisons. 133 * The pool runs in 4 modes. Ordered in degraded order for comparisons.
134 */ 134 */
135enum pool_mode { 135enum pool_mode {
136 PM_WRITE, /* metadata may be changed */ 136 PM_WRITE, /* metadata may be changed */
137 PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
137 PM_READ_ONLY, /* metadata may not be changed */ 138 PM_READ_ONLY, /* metadata may not be changed */
138 PM_FAIL, /* all I/O fails */ 139 PM_FAIL, /* all I/O fails */
139}; 140};
@@ -198,7 +199,6 @@ struct pool {
198}; 199};
199 200
200static enum pool_mode get_pool_mode(struct pool *pool); 201static enum pool_mode get_pool_mode(struct pool *pool);
201static void out_of_data_space(struct pool *pool);
202static void metadata_operation_failed(struct pool *pool, const char *op, int r); 202static void metadata_operation_failed(struct pool *pool, const char *op, int r);
203 203
204/* 204/*
@@ -226,6 +226,7 @@ struct thin_c {
226 226
227 struct pool *pool; 227 struct pool *pool;
228 struct dm_thin_device *td; 228 struct dm_thin_device *td;
229 bool requeue_mode:1;
229}; 230};
230 231
231/*----------------------------------------------------------------*/ 232/*----------------------------------------------------------------*/
@@ -369,14 +370,18 @@ struct dm_thin_endio_hook {
369 struct dm_thin_new_mapping *overwrite_mapping; 370 struct dm_thin_new_mapping *overwrite_mapping;
370}; 371};
371 372
372static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) 373static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
373{ 374{
374 struct bio *bio; 375 struct bio *bio;
375 struct bio_list bios; 376 struct bio_list bios;
377 unsigned long flags;
376 378
377 bio_list_init(&bios); 379 bio_list_init(&bios);
380
381 spin_lock_irqsave(&tc->pool->lock, flags);
378 bio_list_merge(&bios, master); 382 bio_list_merge(&bios, master);
379 bio_list_init(master); 383 bio_list_init(master);
384 spin_unlock_irqrestore(&tc->pool->lock, flags);
380 385
381 while ((bio = bio_list_pop(&bios))) { 386 while ((bio = bio_list_pop(&bios))) {
382 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 387 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -391,12 +396,26 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
391static void requeue_io(struct thin_c *tc) 396static void requeue_io(struct thin_c *tc)
392{ 397{
393 struct pool *pool = tc->pool; 398 struct pool *pool = tc->pool;
399
400 requeue_bio_list(tc, &pool->deferred_bios);
401 requeue_bio_list(tc, &pool->retry_on_resume_list);
402}
403
404static void error_retry_list(struct pool *pool)
405{
406 struct bio *bio;
394 unsigned long flags; 407 unsigned long flags;
408 struct bio_list bios;
409
410 bio_list_init(&bios);
395 411
396 spin_lock_irqsave(&pool->lock, flags); 412 spin_lock_irqsave(&pool->lock, flags);
397 __requeue_bio_list(tc, &pool->deferred_bios); 413 bio_list_merge(&bios, &pool->retry_on_resume_list);
398 __requeue_bio_list(tc, &pool->retry_on_resume_list); 414 bio_list_init(&pool->retry_on_resume_list);
399 spin_unlock_irqrestore(&pool->lock, flags); 415 spin_unlock_irqrestore(&pool->lock, flags);
416
417 while ((bio = bio_list_pop(&bios)))
418 bio_io_error(bio);
400} 419}
401 420
402/* 421/*
@@ -925,13 +944,15 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
925 } 944 }
926} 945}
927 946
947static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
948
928static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 949static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
929{ 950{
930 int r; 951 int r;
931 dm_block_t free_blocks; 952 dm_block_t free_blocks;
932 struct pool *pool = tc->pool; 953 struct pool *pool = tc->pool;
933 954
934 if (get_pool_mode(pool) != PM_WRITE) 955 if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
935 return -EINVAL; 956 return -EINVAL;
936 957
937 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 958 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
@@ -958,7 +979,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
958 } 979 }
959 980
960 if (!free_blocks) { 981 if (!free_blocks) {
961 out_of_data_space(pool); 982 set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
962 return -ENOSPC; 983 return -ENOSPC;
963 } 984 }
964 } 985 }
@@ -988,15 +1009,32 @@ static void retry_on_resume(struct bio *bio)
988 spin_unlock_irqrestore(&pool->lock, flags); 1009 spin_unlock_irqrestore(&pool->lock, flags);
989} 1010}
990 1011
991static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) 1012static bool should_error_unserviceable_bio(struct pool *pool)
992{ 1013{
993 /* 1014 enum pool_mode m = get_pool_mode(pool);
994 * When pool is read-only, no cell locking is needed because
995 * nothing is changing.
996 */
997 WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
998 1015
999 if (pool->pf.error_if_no_space) 1016 switch (m) {
1017 case PM_WRITE:
1018 /* Shouldn't get here */
1019 DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1020 return true;
1021
1022 case PM_OUT_OF_DATA_SPACE:
1023 return pool->pf.error_if_no_space;
1024
1025 case PM_READ_ONLY:
1026 case PM_FAIL:
1027 return true;
1028 default:
1029 /* Shouldn't get here */
1030 DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1031 return true;
1032 }
1033}
1034
1035static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1036{
1037 if (should_error_unserviceable_bio(pool))
1000 bio_io_error(bio); 1038 bio_io_error(bio);
1001 else 1039 else
1002 retry_on_resume(bio); 1040 retry_on_resume(bio);
@@ -1007,11 +1045,20 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
1007 struct bio *bio; 1045 struct bio *bio;
1008 struct bio_list bios; 1046 struct bio_list bios;
1009 1047
1048 if (should_error_unserviceable_bio(pool)) {
1049 cell_error(pool, cell);
1050 return;
1051 }
1052
1010 bio_list_init(&bios); 1053 bio_list_init(&bios);
1011 cell_release(pool, cell, &bios); 1054 cell_release(pool, cell, &bios);
1012 1055
1013 while ((bio = bio_list_pop(&bios))) 1056 if (should_error_unserviceable_bio(pool))
1014 handle_unserviceable_bio(pool, bio); 1057 while ((bio = bio_list_pop(&bios)))
1058 bio_io_error(bio);
1059 else
1060 while ((bio = bio_list_pop(&bios)))
1061 retry_on_resume(bio);
1015} 1062}
1016 1063
1017static void process_discard(struct thin_c *tc, struct bio *bio) 1064static void process_discard(struct thin_c *tc, struct bio *bio)
@@ -1296,6 +1343,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1296 } 1343 }
1297} 1344}
1298 1345
1346static void process_bio_success(struct thin_c *tc, struct bio *bio)
1347{
1348 bio_endio(bio, 0);
1349}
1350
1299static void process_bio_fail(struct thin_c *tc, struct bio *bio) 1351static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1300{ 1352{
1301 bio_io_error(bio); 1353 bio_io_error(bio);
@@ -1328,6 +1380,11 @@ static void process_deferred_bios(struct pool *pool)
1328 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1380 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1329 struct thin_c *tc = h->tc; 1381 struct thin_c *tc = h->tc;
1330 1382
1383 if (tc->requeue_mode) {
1384 bio_endio(bio, DM_ENDIO_REQUEUE);
1385 continue;
1386 }
1387
1331 /* 1388 /*
1332 * If we've got no free new_mapping structs, and processing 1389 * If we've got no free new_mapping structs, and processing
1333 * this bio might require one, we pause until there are some 1390 * this bio might require one, we pause until there are some
@@ -1357,7 +1414,8 @@ static void process_deferred_bios(struct pool *pool)
1357 bio_list_init(&pool->deferred_flush_bios); 1414 bio_list_init(&pool->deferred_flush_bios);
1358 spin_unlock_irqrestore(&pool->lock, flags); 1415 spin_unlock_irqrestore(&pool->lock, flags);
1359 1416
1360 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) 1417 if (bio_list_empty(&bios) &&
1418 !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
1361 return; 1419 return;
1362 1420
1363 if (commit(pool)) { 1421 if (commit(pool)) {
@@ -1393,51 +1451,134 @@ static void do_waker(struct work_struct *ws)
1393 1451
1394/*----------------------------------------------------------------*/ 1452/*----------------------------------------------------------------*/
1395 1453
1454struct noflush_work {
1455 struct work_struct worker;
1456 struct thin_c *tc;
1457
1458 atomic_t complete;
1459 wait_queue_head_t wait;
1460};
1461
1462static void complete_noflush_work(struct noflush_work *w)
1463{
1464 atomic_set(&w->complete, 1);
1465 wake_up(&w->wait);
1466}
1467
1468static void do_noflush_start(struct work_struct *ws)
1469{
1470 struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1471 w->tc->requeue_mode = true;
1472 requeue_io(w->tc);
1473 complete_noflush_work(w);
1474}
1475
1476static void do_noflush_stop(struct work_struct *ws)
1477{
1478 struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1479 w->tc->requeue_mode = false;
1480 complete_noflush_work(w);
1481}
1482
1483static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
1484{
1485 struct noflush_work w;
1486
1487 INIT_WORK(&w.worker, fn);
1488 w.tc = tc;
1489 atomic_set(&w.complete, 0);
1490 init_waitqueue_head(&w.wait);
1491
1492 queue_work(tc->pool->wq, &w.worker);
1493
1494 wait_event(w.wait, atomic_read(&w.complete));
1495}
1496
1497/*----------------------------------------------------------------*/
1498
1396static enum pool_mode get_pool_mode(struct pool *pool) 1499static enum pool_mode get_pool_mode(struct pool *pool)
1397{ 1500{
1398 return pool->pf.mode; 1501 return pool->pf.mode;
1399} 1502}
1400 1503
1504static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
1505{
1506 dm_table_event(pool->ti->table);
1507 DMINFO("%s: switching pool to %s mode",
1508 dm_device_name(pool->pool_md), new_mode);
1509}
1510
1401static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) 1511static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1402{ 1512{
1403 int r; 1513 struct pool_c *pt = pool->ti->private;
1404 enum pool_mode old_mode = pool->pf.mode; 1514 bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
1515 enum pool_mode old_mode = get_pool_mode(pool);
1516
1517 /*
1518 * Never allow the pool to transition to PM_WRITE mode if user
1519 * intervention is required to verify metadata and data consistency.
1520 */
1521 if (new_mode == PM_WRITE && needs_check) {
1522 DMERR("%s: unable to switch pool to write mode until repaired.",
1523 dm_device_name(pool->pool_md));
1524 if (old_mode != new_mode)
1525 new_mode = old_mode;
1526 else
1527 new_mode = PM_READ_ONLY;
1528 }
1529 /*
1530 * If we were in PM_FAIL mode, rollback of metadata failed. We're
1531 * not going to recover without a thin_repair. So we never let the
1532 * pool move out of the old mode.
1533 */
1534 if (old_mode == PM_FAIL)
1535 new_mode = old_mode;
1405 1536
1406 switch (new_mode) { 1537 switch (new_mode) {
1407 case PM_FAIL: 1538 case PM_FAIL:
1408 if (old_mode != new_mode) 1539 if (old_mode != new_mode)
1409 DMERR("%s: switching pool to failure mode", 1540 notify_of_pool_mode_change(pool, "failure");
1410 dm_device_name(pool->pool_md));
1411 dm_pool_metadata_read_only(pool->pmd); 1541 dm_pool_metadata_read_only(pool->pmd);
1412 pool->process_bio = process_bio_fail; 1542 pool->process_bio = process_bio_fail;
1413 pool->process_discard = process_bio_fail; 1543 pool->process_discard = process_bio_fail;
1414 pool->process_prepared_mapping = process_prepared_mapping_fail; 1544 pool->process_prepared_mapping = process_prepared_mapping_fail;
1415 pool->process_prepared_discard = process_prepared_discard_fail; 1545 pool->process_prepared_discard = process_prepared_discard_fail;
1546
1547 error_retry_list(pool);
1416 break; 1548 break;
1417 1549
1418 case PM_READ_ONLY: 1550 case PM_READ_ONLY:
1419 if (old_mode != new_mode) 1551 if (old_mode != new_mode)
1420 DMERR("%s: switching pool to read-only mode", 1552 notify_of_pool_mode_change(pool, "read-only");
1421 dm_device_name(pool->pool_md)); 1553 dm_pool_metadata_read_only(pool->pmd);
1422 r = dm_pool_abort_metadata(pool->pmd); 1554 pool->process_bio = process_bio_read_only;
1423 if (r) { 1555 pool->process_discard = process_bio_success;
1424 DMERR("%s: aborting transaction failed", 1556 pool->process_prepared_mapping = process_prepared_mapping_fail;
1425 dm_device_name(pool->pool_md)); 1557 pool->process_prepared_discard = process_prepared_discard_passdown;
1426 new_mode = PM_FAIL; 1558
1427 set_pool_mode(pool, new_mode); 1559 error_retry_list(pool);
1428 } else { 1560 break;
1429 dm_pool_metadata_read_only(pool->pmd); 1561
1430 pool->process_bio = process_bio_read_only; 1562 case PM_OUT_OF_DATA_SPACE:
1431 pool->process_discard = process_discard; 1563 /*
1432 pool->process_prepared_mapping = process_prepared_mapping_fail; 1564 * Ideally we'd never hit this state; the low water mark
1433 pool->process_prepared_discard = process_prepared_discard_passdown; 1565 * would trigger userland to extend the pool before we
1434 } 1566 * completely run out of data space. However, many small
1567 * IOs to unprovisioned space can consume data space at an
1568 * alarming rate. Adjust your low water mark if you're
1569 * frequently seeing this mode.
1570 */
1571 if (old_mode != new_mode)
1572 notify_of_pool_mode_change(pool, "out-of-data-space");
1573 pool->process_bio = process_bio_read_only;
1574 pool->process_discard = process_discard;
1575 pool->process_prepared_mapping = process_prepared_mapping;
1576 pool->process_prepared_discard = process_prepared_discard_passdown;
1435 break; 1577 break;
1436 1578
1437 case PM_WRITE: 1579 case PM_WRITE:
1438 if (old_mode != new_mode) 1580 if (old_mode != new_mode)
1439 DMINFO("%s: switching pool to write mode", 1581 notify_of_pool_mode_change(pool, "write");
1440 dm_device_name(pool->pool_md));
1441 dm_pool_metadata_read_write(pool->pmd); 1582 dm_pool_metadata_read_write(pool->pmd);
1442 pool->process_bio = process_bio; 1583 pool->process_bio = process_bio;
1443 pool->process_discard = process_discard; 1584 pool->process_discard = process_discard;
@@ -1447,32 +1588,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1447 } 1588 }
1448 1589
1449 pool->pf.mode = new_mode; 1590 pool->pf.mode = new_mode;
1591 /*
1592 * The pool mode may have changed, sync it so bind_control_target()
1593 * doesn't cause an unexpected mode transition on resume.
1594 */
1595 pt->adjusted_pf.mode = new_mode;
1450} 1596}
1451 1597
1452/* 1598static void abort_transaction(struct pool *pool)
1453 * Rather than calling set_pool_mode directly, use these which describe the
1454 * reason for mode degradation.
1455 */
1456static void out_of_data_space(struct pool *pool)
1457{ 1599{
1458 DMERR_LIMIT("%s: no free data space available.", 1600 const char *dev_name = dm_device_name(pool->pool_md);
1459 dm_device_name(pool->pool_md)); 1601
1460 set_pool_mode(pool, PM_READ_ONLY); 1602 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1603 if (dm_pool_abort_metadata(pool->pmd)) {
1604 DMERR("%s: failed to abort metadata transaction", dev_name);
1605 set_pool_mode(pool, PM_FAIL);
1606 }
1607
1608 if (dm_pool_metadata_set_needs_check(pool->pmd)) {
1609 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1610 set_pool_mode(pool, PM_FAIL);
1611 }
1461} 1612}
1462 1613
1463static void metadata_operation_failed(struct pool *pool, const char *op, int r) 1614static void metadata_operation_failed(struct pool *pool, const char *op, int r)
1464{ 1615{
1465 dm_block_t free_blocks;
1466
1467 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 1616 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1468 dm_device_name(pool->pool_md), op, r); 1617 dm_device_name(pool->pool_md), op, r);
1469 1618
1470 if (r == -ENOSPC && 1619 abort_transaction(pool);
1471 !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
1472 !free_blocks)
1473 DMERR_LIMIT("%s: no free metadata space available.",
1474 dm_device_name(pool->pool_md));
1475
1476 set_pool_mode(pool, PM_READ_ONLY); 1620 set_pool_mode(pool, PM_READ_ONLY);
1477} 1621}
1478 1622
@@ -1523,6 +1667,11 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1523 1667
1524 thin_hook_bio(tc, bio); 1668 thin_hook_bio(tc, bio);
1525 1669
1670 if (tc->requeue_mode) {
1671 bio_endio(bio, DM_ENDIO_REQUEUE);
1672 return DM_MAPIO_SUBMITTED;
1673 }
1674
1526 if (get_pool_mode(tc->pool) == PM_FAIL) { 1675 if (get_pool_mode(tc->pool) == PM_FAIL) {
1527 bio_io_error(bio); 1676 bio_io_error(bio);
1528 return DM_MAPIO_SUBMITTED; 1677 return DM_MAPIO_SUBMITTED;
@@ -1686,7 +1835,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1686 /* 1835 /*
1687 * We want to make sure that a pool in PM_FAIL mode is never upgraded. 1836 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
1688 */ 1837 */
1689 enum pool_mode old_mode = pool->pf.mode; 1838 enum pool_mode old_mode = get_pool_mode(pool);
1690 enum pool_mode new_mode = pt->adjusted_pf.mode; 1839 enum pool_mode new_mode = pt->adjusted_pf.mode;
1691 1840
1692 /* 1841 /*
@@ -1700,16 +1849,6 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1700 pool->pf = pt->adjusted_pf; 1849 pool->pf = pt->adjusted_pf;
1701 pool->low_water_blocks = pt->low_water_blocks; 1850 pool->low_water_blocks = pt->low_water_blocks;
1702 1851
1703 /*
1704 * If we were in PM_FAIL mode, rollback of metadata failed. We're
1705 * not going to recover without a thin_repair. So we never let the
1706 * pool move out of the old mode. On the other hand a PM_READ_ONLY
1707 * may have been due to a lack of metadata or data space, and may
1708 * now work (ie. if the underlying devices have been resized).
1709 */
1710 if (old_mode == PM_FAIL)
1711 new_mode = old_mode;
1712
1713 set_pool_mode(pool, new_mode); 1852 set_pool_mode(pool, new_mode);
1714 1853
1715 return 0; 1854 return 0;
@@ -1999,16 +2138,27 @@ static void metadata_low_callback(void *context)
1999 dm_table_event(pool->ti->table); 2138 dm_table_event(pool->ti->table);
2000} 2139}
2001 2140
2002static sector_t get_metadata_dev_size(struct block_device *bdev) 2141static sector_t get_dev_size(struct block_device *bdev)
2142{
2143 return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
2144}
2145
2146static void warn_if_metadata_device_too_big(struct block_device *bdev)
2003{ 2147{
2004 sector_t metadata_dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; 2148 sector_t metadata_dev_size = get_dev_size(bdev);
2005 char buffer[BDEVNAME_SIZE]; 2149 char buffer[BDEVNAME_SIZE];
2006 2150
2007 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) { 2151 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
2008 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2152 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2009 bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS); 2153 bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
2010 metadata_dev_size = THIN_METADATA_MAX_SECTORS_WARNING; 2154}
2011 } 2155
2156static sector_t get_metadata_dev_size(struct block_device *bdev)
2157{
2158 sector_t metadata_dev_size = get_dev_size(bdev);
2159
2160 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
2161 metadata_dev_size = THIN_METADATA_MAX_SECTORS;
2012 2162
2013 return metadata_dev_size; 2163 return metadata_dev_size;
2014} 2164}
@@ -2017,7 +2167,7 @@ static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
2017{ 2167{
2018 sector_t metadata_dev_size = get_metadata_dev_size(bdev); 2168 sector_t metadata_dev_size = get_metadata_dev_size(bdev);
2019 2169
2020 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); 2170 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
2021 2171
2022 return metadata_dev_size; 2172 return metadata_dev_size;
2023} 2173}
@@ -2095,12 +2245,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2095 ti->error = "Error opening metadata block device"; 2245 ti->error = "Error opening metadata block device";
2096 goto out_unlock; 2246 goto out_unlock;
2097 } 2247 }
2098 2248 warn_if_metadata_device_too_big(metadata_dev->bdev);
2099 /*
2100 * Run for the side-effect of possibly issuing a warning if the
2101 * device is too big.
2102 */
2103 (void) get_metadata_dev_size(metadata_dev->bdev);
2104 2249
2105 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 2250 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
2106 if (r) { 2251 if (r) {
@@ -2246,6 +2391,12 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
2246 return -EINVAL; 2391 return -EINVAL;
2247 2392
2248 } else if (data_size > sb_data_size) { 2393 } else if (data_size > sb_data_size) {
2394 if (dm_pool_metadata_needs_check(pool->pmd)) {
2395 DMERR("%s: unable to grow the data device until repaired.",
2396 dm_device_name(pool->pool_md));
2397 return 0;
2398 }
2399
2249 if (sb_data_size) 2400 if (sb_data_size)
2250 DMINFO("%s: growing the data device from %llu to %llu blocks", 2401 DMINFO("%s: growing the data device from %llu to %llu blocks",
2251 dm_device_name(pool->pool_md), 2402 dm_device_name(pool->pool_md),
@@ -2287,6 +2438,13 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
2287 return -EINVAL; 2438 return -EINVAL;
2288 2439
2289 } else if (metadata_dev_size > sb_metadata_dev_size) { 2440 } else if (metadata_dev_size > sb_metadata_dev_size) {
2441 if (dm_pool_metadata_needs_check(pool->pmd)) {
2442 DMERR("%s: unable to grow the metadata device until repaired.",
2443 dm_device_name(pool->pool_md));
2444 return 0;
2445 }
2446
2447 warn_if_metadata_device_too_big(pool->md_dev);
2290 DMINFO("%s: growing the metadata device from %llu to %llu blocks", 2448 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
2291 dm_device_name(pool->pool_md), 2449 dm_device_name(pool->pool_md),
2292 sb_metadata_dev_size, metadata_dev_size); 2450 sb_metadata_dev_size, metadata_dev_size);
@@ -2673,7 +2831,9 @@ static void pool_status(struct dm_target *ti, status_type_t type,
2673 else 2831 else
2674 DMEMIT("- "); 2832 DMEMIT("- ");
2675 2833
2676 if (pool->pf.mode == PM_READ_ONLY) 2834 if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
2835 DMEMIT("out_of_data_space ");
2836 else if (pool->pf.mode == PM_READ_ONLY)
2677 DMEMIT("ro "); 2837 DMEMIT("ro ");
2678 else 2838 else
2679 DMEMIT("rw "); 2839 DMEMIT("rw ");
@@ -2787,7 +2947,7 @@ static struct target_type pool_target = {
2787 .name = "thin-pool", 2947 .name = "thin-pool",
2788 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2948 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2789 DM_TARGET_IMMUTABLE, 2949 DM_TARGET_IMMUTABLE,
2790 .version = {1, 10, 0}, 2950 .version = {1, 11, 0},
2791 .module = THIS_MODULE, 2951 .module = THIS_MODULE,
2792 .ctr = pool_ctr, 2952 .ctr = pool_ctr,
2793 .dtr = pool_dtr, 2953 .dtr = pool_dtr,
@@ -2894,6 +3054,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2894 3054
2895 if (get_pool_mode(tc->pool) == PM_FAIL) { 3055 if (get_pool_mode(tc->pool) == PM_FAIL) {
2896 ti->error = "Couldn't open thin device, Pool is in fail mode"; 3056 ti->error = "Couldn't open thin device, Pool is in fail mode";
3057 r = -EINVAL;
2897 goto bad_thin_open; 3058 goto bad_thin_open;
2898 } 3059 }
2899 3060
@@ -2905,7 +3066,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2905 3066
2906 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); 3067 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2907 if (r) 3068 if (r)
2908 goto bad_thin_open; 3069 goto bad_target_max_io_len;
2909 3070
2910 ti->num_flush_bios = 1; 3071 ti->num_flush_bios = 1;
2911 ti->flush_supported = true; 3072 ti->flush_supported = true;
@@ -2926,6 +3087,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2926 3087
2927 return 0; 3088 return 0;
2928 3089
3090bad_target_max_io_len:
3091 dm_pool_close_thin_device(tc->td);
2929bad_thin_open: 3092bad_thin_open:
2930 __pool_dec(tc->pool); 3093 __pool_dec(tc->pool);
2931bad_pool_lookup: 3094bad_pool_lookup:
@@ -2986,10 +3149,23 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
2986 return 0; 3149 return 0;
2987} 3150}
2988 3151
2989static void thin_postsuspend(struct dm_target *ti) 3152static void thin_presuspend(struct dm_target *ti)
2990{ 3153{
3154 struct thin_c *tc = ti->private;
3155
2991 if (dm_noflush_suspending(ti)) 3156 if (dm_noflush_suspending(ti))
2992 requeue_io((struct thin_c *)ti->private); 3157 noflush_work(tc, do_noflush_start);
3158}
3159
3160static void thin_postsuspend(struct dm_target *ti)
3161{
3162 struct thin_c *tc = ti->private;
3163
3164 /*
3165 * The dm_noflush_suspending flag has been cleared by now, so
3166 * unfortunately we must always run this.
3167 */
3168 noflush_work(tc, do_noflush_stop);
2993} 3169}
2994 3170
2995/* 3171/*
@@ -3074,12 +3250,13 @@ static int thin_iterate_devices(struct dm_target *ti,
3074 3250
3075static struct target_type thin_target = { 3251static struct target_type thin_target = {
3076 .name = "thin", 3252 .name = "thin",
3077 .version = {1, 10, 0}, 3253 .version = {1, 11, 0},
3078 .module = THIS_MODULE, 3254 .module = THIS_MODULE,
3079 .ctr = thin_ctr, 3255 .ctr = thin_ctr,
3080 .dtr = thin_dtr, 3256 .dtr = thin_dtr,
3081 .map = thin_map, 3257 .map = thin_map,
3082 .end_io = thin_endio, 3258 .end_io = thin_endio,
3259 .presuspend = thin_presuspend,
3083 .postsuspend = thin_postsuspend, 3260 .postsuspend = thin_postsuspend,
3084 .status = thin_status, 3261 .status = thin_status,
3085 .iterate_devices = thin_iterate_devices, 3262 .iterate_devices = thin_iterate_devices,
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index 19b268795415..0c2dec7aec20 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -6,3 +6,13 @@ config DM_PERSISTENT_DATA
6 ---help--- 6 ---help---
7 Library providing immutable on-disk data structure support for 7 Library providing immutable on-disk data structure support for
8 device-mapper targets such as the thin provisioning target. 8 device-mapper targets such as the thin provisioning target.
9
10config DM_DEBUG_BLOCK_STACK_TRACING
11 boolean "Keep stack trace of persistent data block lock holders"
12 depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
13 select STACKTRACE
14 ---help---
15 Enable this for messages that may help debug problems with the
16 block manager locking used by thin provisioning and caching.
17
18 If unsure, say N.
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 536782e3bcb7..786b689bdfc7 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -91,6 +91,69 @@ struct block_op {
91 dm_block_t block; 91 dm_block_t block;
92}; 92};
93 93
94struct bop_ring_buffer {
95 unsigned begin;
96 unsigned end;
97 struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
98};
99
100static void brb_init(struct bop_ring_buffer *brb)
101{
102 brb->begin = 0;
103 brb->end = 0;
104}
105
106static bool brb_empty(struct bop_ring_buffer *brb)
107{
108 return brb->begin == brb->end;
109}
110
111static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
112{
113 unsigned r = old + 1;
114 return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
115}
116
117static int brb_push(struct bop_ring_buffer *brb,
118 enum block_op_type type, dm_block_t b)
119{
120 struct block_op *bop;
121 unsigned next = brb_next(brb, brb->end);
122
123 /*
124 * We don't allow the last bop to be filled, this way we can
125 * differentiate between full and empty.
126 */
127 if (next == brb->begin)
128 return -ENOMEM;
129
130 bop = brb->bops + brb->end;
131 bop->type = type;
132 bop->block = b;
133
134 brb->end = next;
135
136 return 0;
137}
138
139static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
140{
141 struct block_op *bop;
142
143 if (brb_empty(brb))
144 return -ENODATA;
145
146 bop = brb->bops + brb->begin;
147 result->type = bop->type;
148 result->block = bop->block;
149
150 brb->begin = brb_next(brb, brb->begin);
151
152 return 0;
153}
154
155/*----------------------------------------------------------------*/
156
94struct sm_metadata { 157struct sm_metadata {
95 struct dm_space_map sm; 158 struct dm_space_map sm;
96 159
@@ -101,25 +164,20 @@ struct sm_metadata {
101 164
102 unsigned recursion_count; 165 unsigned recursion_count;
103 unsigned allocated_this_transaction; 166 unsigned allocated_this_transaction;
104 unsigned nr_uncommitted; 167 struct bop_ring_buffer uncommitted;
105 struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
106 168
107 struct threshold threshold; 169 struct threshold threshold;
108}; 170};
109 171
110static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) 172static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
111{ 173{
112 struct block_op *op; 174 int r = brb_push(&smm->uncommitted, type, b);
113 175
114 if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) { 176 if (r) {
115 DMERR("too many recursive allocations"); 177 DMERR("too many recursive allocations");
116 return -ENOMEM; 178 return -ENOMEM;
117 } 179 }
118 180
119 op = smm->uncommitted + smm->nr_uncommitted++;
120 op->type = type;
121 op->block = b;
122
123 return 0; 181 return 0;
124} 182}
125 183
@@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm)
158 return -ENOMEM; 216 return -ENOMEM;
159 } 217 }
160 218
161 if (smm->recursion_count == 1 && smm->nr_uncommitted) { 219 if (smm->recursion_count == 1) {
162 while (smm->nr_uncommitted && !r) { 220 while (!brb_empty(&smm->uncommitted)) {
163 smm->nr_uncommitted--; 221 struct block_op bop;
164 r = commit_bop(smm, smm->uncommitted + 222
165 smm->nr_uncommitted); 223 r = brb_pop(&smm->uncommitted, &bop);
224 if (r) {
225 DMERR("bug in bop ring buffer");
226 break;
227 }
228
229 r = commit_bop(smm, &bop);
166 if (r) 230 if (r)
167 break; 231 break;
168 } 232 }
@@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
217static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, 281static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
218 uint32_t *result) 282 uint32_t *result)
219{ 283{
220 int r, i; 284 int r;
285 unsigned i;
221 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); 286 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
222 unsigned adjustment = 0; 287 unsigned adjustment = 0;
223 288
@@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
225 * We may have some uncommitted adjustments to add. This list 290 * We may have some uncommitted adjustments to add. This list
226 * should always be really short. 291 * should always be really short.
227 */ 292 */
228 for (i = 0; i < smm->nr_uncommitted; i++) { 293 for (i = smm->uncommitted.begin;
229 struct block_op *op = smm->uncommitted + i; 294 i != smm->uncommitted.end;
295 i = brb_next(&smm->uncommitted, i)) {
296 struct block_op *op = smm->uncommitted.bops + i;
230 297
231 if (op->block != b) 298 if (op->block != b)
232 continue; 299 continue;
@@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
254static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, 321static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
255 dm_block_t b, int *result) 322 dm_block_t b, int *result)
256{ 323{
257 int r, i, adjustment = 0; 324 int r, adjustment = 0;
325 unsigned i;
258 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); 326 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
259 uint32_t rc; 327 uint32_t rc;
260 328
@@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
262 * We may have some uncommitted adjustments to add. This list 330 * We may have some uncommitted adjustments to add. This list
263 * should always be really short. 331 * should always be really short.
264 */ 332 */
265 for (i = 0; i < smm->nr_uncommitted; i++) { 333 for (i = smm->uncommitted.begin;
266 struct block_op *op = smm->uncommitted + i; 334 i != smm->uncommitted.end;
335 i = brb_next(&smm->uncommitted, i)) {
336
337 struct block_op *op = smm->uncommitted.bops + i;
267 338
268 if (op->block != b) 339 if (op->block != b)
269 continue; 340 continue;
@@ -671,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
671 smm->begin = superblock + 1; 742 smm->begin = superblock + 1;
672 smm->recursion_count = 0; 743 smm->recursion_count = 0;
673 smm->allocated_this_transaction = 0; 744 smm->allocated_this_transaction = 0;
674 smm->nr_uncommitted = 0; 745 brb_init(&smm->uncommitted);
675 threshold_init(&smm->threshold); 746 threshold_init(&smm->threshold);
676 747
677 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); 748 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
@@ -680,6 +751,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
680 if (r) 751 if (r)
681 return r; 752 return r;
682 753
754 if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS)
755 nr_blocks = DM_SM_METADATA_MAX_BLOCKS;
683 r = sm_ll_extend(&smm->ll, nr_blocks); 756 r = sm_ll_extend(&smm->ll, nr_blocks);
684 if (r) 757 if (r)
685 return r; 758 return r;
@@ -713,7 +786,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
713 smm->begin = 0; 786 smm->begin = 0;
714 smm->recursion_count = 0; 787 smm->recursion_count = 0;
715 smm->allocated_this_transaction = 0; 788 smm->allocated_this_transaction = 0;
716 smm->nr_uncommitted = 0; 789 brb_init(&smm->uncommitted);
717 threshold_init(&smm->threshold); 790 threshold_init(&smm->threshold);
718 791
719 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); 792 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h
index 39bba0801cf2..64df923974d8 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.h
+++ b/drivers/md/persistent-data/dm-space-map-metadata.h
@@ -9,6 +9,17 @@
9 9
10#include "dm-transaction-manager.h" 10#include "dm-transaction-manager.h"
11 11
12#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT)
13
14/*
15 * The metadata device is currently limited in size.
16 *
17 * We have one block of index, which can hold 255 index entries. Each
18 * index entry contains allocation info about ~16k metadata blocks.
19 */
20#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64))
21#define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE)
22
12/* 23/*
13 * Unfortunately we have to use two-phase construction due to the cycle 24 * Unfortunately we have to use two-phase construction due to the cycle
14 * between the tm and sm. 25 * between the tm and sm.