author     Linus Torvalds <torvalds@linux-foundation.org>  2014-03-07 18:17:36 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-03-07 18:17:36 -0500
commit     2ef176f11a40d21e8d6c5d98a3c09d57c861fac6 (patch)
tree       bfac92c332d83f77a97514648e8b45b096d2a35c
parent     b053940df41808f0f27568eb36820d10a8a987f8 (diff)
parent     cebc2de44d3bce53e46476e774126c298ca2c8a9 (diff)
Merge tag 'dm-3.14-fixes-3' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper fixes from Mike Snitzer:
- dm-cache memory allocation failure fix
- fix DM's Kconfig indentation
- dm-snapshot metadata corruption fix for bug introduced in 3.14-rc1
- important refcount < 0 fix for the DM persistent data library's space
map metadata interface which fixes corruption reported by a few
dm-thinp users
and last but not least:
- more extensive fixes than ideal for dm-thinp's data resize capability
(which has had growing pains much like we've seen from the -ENOSPC
handling of maturing filesystems).
The end result is that dm-thinp now handles metadata operation failures
and out-of-data-space conditions much better than before.
* tag 'dm-3.14-fixes-3' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
dm space map metadata: fix refcount decrement below 0 which caused corruption
dm thin: fix Documentation for held metadata root feature
dm thin: fix noflush suspend IO queueing
dm thin: fix deadlock in __requeue_bio_list
dm thin: fix out of data space handling
dm thin: ensure user takes action to validate data and metadata consistency
dm thin: synchronize the pool mode during suspend
dm snapshot: fix metadata corruption
dm: fix Kconfig indentation
dm cache mq: fix memory allocation failure for large cache devices
-rw-r--r--   Documentation/device-mapper/cache.txt               |  11
-rw-r--r--   Documentation/device-mapper/thin-provisioning.txt   |  34
-rw-r--r--   drivers/md/Kconfig                                   |  10
-rw-r--r--   drivers/md/dm-cache-policy-mq.c                      |   4
-rw-r--r--   drivers/md/dm-snap-persistent.c                      |   3
-rw-r--r--   drivers/md/dm-thin-metadata.c                        |  37
-rw-r--r--   drivers/md/dm-thin-metadata.h                        |  11
-rw-r--r--   drivers/md/dm-thin.c                                 | 304
-rw-r--r--   drivers/md/persistent-data/Kconfig                   |  10
-rw-r--r--   drivers/md/persistent-data/dm-space-map-metadata.c   | 113
10 files changed, 425 insertions, 112 deletions
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
index e6b72d355151..68c0f517c60e 100644
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
| @@ -124,12 +124,11 @@ the default being 204800 sectors (or 100MB). | |||
| 124 | Updating on-disk metadata | 124 | Updating on-disk metadata |
| 125 | ------------------------- | 125 | ------------------------- |
| 126 | 126 | ||
| 127 | On-disk metadata is committed every time a REQ_SYNC or REQ_FUA bio is | 127 | On-disk metadata is committed every time a FLUSH or FUA bio is written. |
| 128 | written. If no such requests are made then commits will occur every | 128 | If no such requests are made then commits will occur every second. This |
| 129 | second. This means the cache behaves like a physical disk that has a | 129 | means the cache behaves like a physical disk that has a volatile write |
| 130 | write cache (the same is true of the thin-provisioning target). If | 130 | cache. If power is lost you may lose some recent writes. The metadata |
| 131 | power is lost you may lose some recent writes. The metadata should | 131 | should always be consistent in spite of any crash. |
| 132 | always be consistent in spite of any crash. | ||
| 133 | 132 | ||
| 134 | The 'dirty' state for a cache block changes far too frequently for us | 133 | The 'dirty' state for a cache block changes far too frequently for us |
| 135 | to keep updating it on the fly. So we treat it as a hint. In normal | 134 | to keep updating it on the fly. So we treat it as a hint. In normal |
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
index 8a7a3d46e0da..05a27e9442bd 100644
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
| @@ -116,6 +116,35 @@ Resuming a device with a new table itself triggers an event so the | |||
| 116 | userspace daemon can use this to detect a situation where a new table | 116 | userspace daemon can use this to detect a situation where a new table |
| 117 | already exceeds the threshold. | 117 | already exceeds the threshold. |
| 118 | 118 | ||
| 119 | A low water mark for the metadata device is maintained in the kernel and | ||
| 120 | will trigger a dm event if free space on the metadata device drops below | ||
| 121 | it. | ||
| 122 | |||
| 123 | Updating on-disk metadata | ||
| 124 | ------------------------- | ||
| 125 | |||
| 126 | On-disk metadata is committed every time a FLUSH or FUA bio is written. | ||
| 127 | If no such requests are made then commits will occur every second. This | ||
| 128 | means the thin-provisioning target behaves like a physical disk that has | ||
| 129 | a volatile write cache. If power is lost you may lose some recent | ||
| 130 | writes. The metadata should always be consistent in spite of any crash. | ||
| 131 | |||
| 132 | If data space is exhausted the pool will either error or queue IO | ||
| 133 | according to the configuration (see: error_if_no_space). If metadata | ||
| 134 | space is exhausted or a metadata operation fails: the pool will error IO | ||
| 135 | until the pool is taken offline and repair is performed to 1) fix any | ||
| 136 | potential inconsistencies and 2) clear the flag that imposes repair. | ||
| 137 | Once the pool's metadata device is repaired it may be resized, which | ||
| 138 | will allow the pool to return to normal operation. Note that if a pool | ||
| 139 | is flagged as needing repair, the pool's data and metadata devices | ||
| 140 | cannot be resized until repair is performed. It should also be noted | ||
| 141 | that when the pool's metadata space is exhausted the current metadata | ||
| 142 | transaction is aborted. Given that the pool will cache IO whose | ||
| 143 | completion may have already been acknowledged to upper IO layers | ||
| 144 | (e.g. filesystem) it is strongly suggested that consistency checks | ||
| 145 | (e.g. fsck) be performed on those layers when repair of the pool is | ||
| 146 | required. | ||
| 147 | |||
| 119 | Thin provisioning | 148 | Thin provisioning |
| 120 | ----------------- | 149 | ----------------- |
| 121 | 150 | ||
| @@ -258,10 +287,9 @@ ii) Status | |||
| 258 | should register for the event and then check the target's status. | 287 | should register for the event and then check the target's status. |
| 259 | 288 | ||
| 260 | held metadata root: | 289 | held metadata root: |
| 261 | The location, in sectors, of the metadata root that has been | 290 | The location, in blocks, of the metadata root that has been |
| 262 | 'held' for userspace read access. '-' indicates there is no | 291 | 'held' for userspace read access. '-' indicates there is no |
| 263 | held root. This feature is not yet implemented so '-' is | 292 | held root. |
| 264 | always returned. | ||
| 265 | 293 | ||
| 266 | discard_passdown|no_discard_passdown | 294 | discard_passdown|no_discard_passdown |
| 267 | Whether or not discards are actually being passed down to the | 295 | Whether or not discards are actually being passed down to the |
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 9a06fe883766..95ad936e6048 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
| @@ -254,16 +254,6 @@ config DM_THIN_PROVISIONING | |||
| 254 | ---help--- | 254 | ---help--- |
| 255 | Provides thin provisioning and snapshots that share a data store. | 255 | Provides thin provisioning and snapshots that share a data store. |
| 256 | 256 | ||
| 257 | config DM_DEBUG_BLOCK_STACK_TRACING | ||
| 258 | boolean "Keep stack trace of persistent data block lock holders" | ||
| 259 | depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA | ||
| 260 | select STACKTRACE | ||
| 261 | ---help--- | ||
| 262 | Enable this for messages that may help debug problems with the | ||
| 263 | block manager locking used by thin provisioning and caching. | ||
| 264 | |||
| 265 | If unsure, say N. | ||
| 266 | |||
| 267 | config DM_CACHE | 257 | config DM_CACHE |
| 268 | tristate "Cache target (EXPERIMENTAL)" | 258 | tristate "Cache target (EXPERIMENTAL)" |
| 269 | depends on BLK_DEV_DM | 259 | depends on BLK_DEV_DM |
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 1e018e986610..0e385e40909e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
| @@ -872,7 +872,7 @@ static void mq_destroy(struct dm_cache_policy *p) | |||
| 872 | { | 872 | { |
| 873 | struct mq_policy *mq = to_mq_policy(p); | 873 | struct mq_policy *mq = to_mq_policy(p); |
| 874 | 874 | ||
| 875 | kfree(mq->table); | 875 | vfree(mq->table); |
| 876 | epool_exit(&mq->cache_pool); | 876 | epool_exit(&mq->cache_pool); |
| 877 | epool_exit(&mq->pre_cache_pool); | 877 | epool_exit(&mq->pre_cache_pool); |
| 878 | kfree(mq); | 878 | kfree(mq); |
| @@ -1245,7 +1245,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, | |||
| 1245 | 1245 | ||
| 1246 | mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); | 1246 | mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); |
| 1247 | mq->hash_bits = ffs(mq->nr_buckets) - 1; | 1247 | mq->hash_bits = ffs(mq->nr_buckets) - 1; |
| 1248 | mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL); | 1248 | mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets); |
| 1249 | if (!mq->table) | 1249 | if (!mq->table) |
| 1250 | goto bad_alloc_table; | 1250 | goto bad_alloc_table; |
| 1251 | 1251 | ||
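The mq policy sizes its hash table from the cache size, so on a very large cache device the table alone can run to many megabytes of physically contiguous memory, which is why the change above swaps kzalloc()/kfree() for vzalloc()/vfree(). Below is a minimal userspace sketch of the same sizing arithmetic (round up to a power of two, then derive the hash bits with ffs()); next_power() and the example cache size are illustrative, not the kernel's code.

```c
/* Sketch of the mq policy's hash-table sizing math; illustrative only. */
#include <stdio.h>
#include <strings.h>   /* ffs() */

static unsigned next_power(unsigned n, unsigned min)
{
	unsigned r = min;
	while (r < n)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned long long cache_blocks = 8ULL * 1024 * 1024; /* a large cache device */
	unsigned nr_buckets = next_power(cache_blocks / 2, 16);
	unsigned hash_bits = ffs(nr_buckets) - 1;
	size_t table_bytes = (size_t)nr_buckets * sizeof(void *);

	/* A multi-megabyte, physically contiguous kzalloc() can fail on a
	 * fragmented machine; vzalloc() only needs virtually contiguous pages. */
	printf("buckets=%u hash_bits=%u table=%zu bytes\n",
	       nr_buckets, hash_bits, table_bytes);
	return 0;
}
```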
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index afc3d017de4c..d6e88178d22c 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
| @@ -546,6 +546,9 @@ static int read_exceptions(struct pstore *ps, | |||
| 546 | r = insert_exceptions(ps, area, callback, callback_context, | 546 | r = insert_exceptions(ps, area, callback, callback_context, |
| 547 | &full); | 547 | &full); |
| 548 | 548 | ||
| 549 | if (!full) | ||
| 550 | memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT); | ||
| 551 | |||
| 549 | dm_bufio_release(bp); | 552 | dm_bufio_release(bp); |
| 550 | 553 | ||
| 551 | dm_bufio_forget(client, chunk); | 554 | dm_bufio_forget(client, chunk); |
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index baa87ff12816..fb9efc829182 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
| @@ -76,7 +76,7 @@ | |||
| 76 | 76 | ||
| 77 | #define THIN_SUPERBLOCK_MAGIC 27022010 | 77 | #define THIN_SUPERBLOCK_MAGIC 27022010 |
| 78 | #define THIN_SUPERBLOCK_LOCATION 0 | 78 | #define THIN_SUPERBLOCK_LOCATION 0 |
| 79 | #define THIN_VERSION 1 | 79 | #define THIN_VERSION 2 |
| 80 | #define THIN_METADATA_CACHE_SIZE 64 | 80 | #define THIN_METADATA_CACHE_SIZE 64 |
| 81 | #define SECTOR_TO_BLOCK_SHIFT 3 | 81 | #define SECTOR_TO_BLOCK_SHIFT 3 |
| 82 | 82 | ||
| @@ -1755,3 +1755,38 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, | |||
| 1755 | 1755 | ||
| 1756 | return r; | 1756 | return r; |
| 1757 | } | 1757 | } |
| 1758 | |||
| 1759 | int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) | ||
| 1760 | { | ||
| 1761 | int r; | ||
| 1762 | struct dm_block *sblock; | ||
| 1763 | struct thin_disk_superblock *disk_super; | ||
| 1764 | |||
| 1765 | down_write(&pmd->root_lock); | ||
| 1766 | pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG; | ||
| 1767 | |||
| 1768 | r = superblock_lock(pmd, &sblock); | ||
| 1769 | if (r) { | ||
| 1770 | DMERR("couldn't read superblock"); | ||
| 1771 | goto out; | ||
| 1772 | } | ||
| 1773 | |||
| 1774 | disk_super = dm_block_data(sblock); | ||
| 1775 | disk_super->flags = cpu_to_le32(pmd->flags); | ||
| 1776 | |||
| 1777 | dm_bm_unlock(sblock); | ||
| 1778 | out: | ||
| 1779 | up_write(&pmd->root_lock); | ||
| 1780 | return r; | ||
| 1781 | } | ||
| 1782 | |||
| 1783 | bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd) | ||
| 1784 | { | ||
| 1785 | bool needs_check; | ||
| 1786 | |||
| 1787 | down_read(&pmd->root_lock); | ||
| 1788 | needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG; | ||
| 1789 | up_read(&pmd->root_lock); | ||
| 1790 | |||
| 1791 | return needs_check; | ||
| 1792 | } | ||
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 82ea384d36ff..e3c857db195a 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
| @@ -25,6 +25,11 @@ | |||
| 25 | 25 | ||
| 26 | /*----------------------------------------------------------------*/ | 26 | /*----------------------------------------------------------------*/ |
| 27 | 27 | ||
| 28 | /* | ||
| 29 | * Thin metadata superblock flags. | ||
| 30 | */ | ||
| 31 | #define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0) | ||
| 32 | |||
| 28 | struct dm_pool_metadata; | 33 | struct dm_pool_metadata; |
| 29 | struct dm_thin_device; | 34 | struct dm_thin_device; |
| 30 | 35 | ||
| @@ -202,6 +207,12 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, | |||
| 202 | dm_sm_threshold_fn fn, | 207 | dm_sm_threshold_fn fn, |
| 203 | void *context); | 208 | void *context); |
| 204 | 209 | ||
| 210 | /* | ||
| 211 | * Updates the superblock immediately. | ||
| 212 | */ | ||
| 213 | int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd); | ||
| 214 | bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd); | ||
| 215 | |||
| 205 | /*----------------------------------------------------------------*/ | 216 | /*----------------------------------------------------------------*/ |
| 206 | 217 | ||
| 207 | #endif | 218 | #endif |
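The new needs_check flag is set and read under the pool metadata's root_lock, and once set it is only cleared by an offline repair. A rough userspace analogue of that pattern, using a pthread rwlock in place of the kernel rwsem, might look like the sketch below; the names and locking primitives are illustrative only, and the real code also persists the flag into the on-disk superblock.

```c
/* Userspace analogue of the needs_check flag handling; illustrative only. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NEEDS_CHECK_FLAG (1u << 0)

struct pool_metadata {
	pthread_rwlock_t root_lock;
	unsigned flags;
};

static int metadata_set_needs_check(struct pool_metadata *pmd)
{
	pthread_rwlock_wrlock(&pmd->root_lock);
	pmd->flags |= NEEDS_CHECK_FLAG;
	/* the kernel code also writes pmd->flags into the superblock here */
	pthread_rwlock_unlock(&pmd->root_lock);
	return 0;
}

static bool metadata_needs_check(struct pool_metadata *pmd)
{
	bool needs_check;

	pthread_rwlock_rdlock(&pmd->root_lock);
	needs_check = pmd->flags & NEEDS_CHECK_FLAG;
	pthread_rwlock_unlock(&pmd->root_lock);
	return needs_check;
}

int main(void)
{
	struct pool_metadata pmd = { PTHREAD_RWLOCK_INITIALIZER, 0 };

	metadata_set_needs_check(&pmd);
	printf("needs_check=%d\n", metadata_needs_check(&pmd));
	return 0;
}
```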
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 7e84baccf0ad..be70d38745f7 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
| @@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, | |||
| 130 | struct dm_thin_new_mapping; | 130 | struct dm_thin_new_mapping; |
| 131 | 131 | ||
| 132 | /* | 132 | /* |
| 133 | * The pool runs in 3 modes. Ordered in degraded order for comparisons. | 133 | * The pool runs in 4 modes. Ordered in degraded order for comparisons. |
| 134 | */ | 134 | */ |
| 135 | enum pool_mode { | 135 | enum pool_mode { |
| 136 | PM_WRITE, /* metadata may be changed */ | 136 | PM_WRITE, /* metadata may be changed */ |
| 137 | PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */ | ||
| 137 | PM_READ_ONLY, /* metadata may not be changed */ | 138 | PM_READ_ONLY, /* metadata may not be changed */ |
| 138 | PM_FAIL, /* all I/O fails */ | 139 | PM_FAIL, /* all I/O fails */ |
| 139 | }; | 140 | }; |
| @@ -198,7 +199,6 @@ struct pool { | |||
| 198 | }; | 199 | }; |
| 199 | 200 | ||
| 200 | static enum pool_mode get_pool_mode(struct pool *pool); | 201 | static enum pool_mode get_pool_mode(struct pool *pool); |
| 201 | static void out_of_data_space(struct pool *pool); | ||
| 202 | static void metadata_operation_failed(struct pool *pool, const char *op, int r); | 202 | static void metadata_operation_failed(struct pool *pool, const char *op, int r); |
| 203 | 203 | ||
| 204 | /* | 204 | /* |
| @@ -226,6 +226,7 @@ struct thin_c { | |||
| 226 | 226 | ||
| 227 | struct pool *pool; | 227 | struct pool *pool; |
| 228 | struct dm_thin_device *td; | 228 | struct dm_thin_device *td; |
| 229 | bool requeue_mode:1; | ||
| 229 | }; | 230 | }; |
| 230 | 231 | ||
| 231 | /*----------------------------------------------------------------*/ | 232 | /*----------------------------------------------------------------*/ |
| @@ -369,14 +370,18 @@ struct dm_thin_endio_hook { | |||
| 369 | struct dm_thin_new_mapping *overwrite_mapping; | 370 | struct dm_thin_new_mapping *overwrite_mapping; |
| 370 | }; | 371 | }; |
| 371 | 372 | ||
| 372 | static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) | 373 | static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) |
| 373 | { | 374 | { |
| 374 | struct bio *bio; | 375 | struct bio *bio; |
| 375 | struct bio_list bios; | 376 | struct bio_list bios; |
| 377 | unsigned long flags; | ||
| 376 | 378 | ||
| 377 | bio_list_init(&bios); | 379 | bio_list_init(&bios); |
| 380 | |||
| 381 | spin_lock_irqsave(&tc->pool->lock, flags); | ||
| 378 | bio_list_merge(&bios, master); | 382 | bio_list_merge(&bios, master); |
| 379 | bio_list_init(master); | 383 | bio_list_init(master); |
| 384 | spin_unlock_irqrestore(&tc->pool->lock, flags); | ||
| 380 | 385 | ||
| 381 | while ((bio = bio_list_pop(&bios))) { | 386 | while ((bio = bio_list_pop(&bios))) { |
| 382 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); | 387 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); |
| @@ -391,12 +396,26 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) | |||
| 391 | static void requeue_io(struct thin_c *tc) | 396 | static void requeue_io(struct thin_c *tc) |
| 392 | { | 397 | { |
| 393 | struct pool *pool = tc->pool; | 398 | struct pool *pool = tc->pool; |
| 399 | |||
| 400 | requeue_bio_list(tc, &pool->deferred_bios); | ||
| 401 | requeue_bio_list(tc, &pool->retry_on_resume_list); | ||
| 402 | } | ||
| 403 | |||
| 404 | static void error_retry_list(struct pool *pool) | ||
| 405 | { | ||
| 406 | struct bio *bio; | ||
| 394 | unsigned long flags; | 407 | unsigned long flags; |
| 408 | struct bio_list bios; | ||
| 409 | |||
| 410 | bio_list_init(&bios); | ||
| 395 | 411 | ||
| 396 | spin_lock_irqsave(&pool->lock, flags); | 412 | spin_lock_irqsave(&pool->lock, flags); |
| 397 | __requeue_bio_list(tc, &pool->deferred_bios); | 413 | bio_list_merge(&bios, &pool->retry_on_resume_list); |
| 398 | __requeue_bio_list(tc, &pool->retry_on_resume_list); | 414 | bio_list_init(&pool->retry_on_resume_list); |
| 399 | spin_unlock_irqrestore(&pool->lock, flags); | 415 | spin_unlock_irqrestore(&pool->lock, flags); |
| 416 | |||
| 417 | while ((bio = bio_list_pop(&bios))) | ||
| 418 | bio_io_error(bio); | ||
| 400 | } | 419 | } |
| 401 | 420 | ||
| 402 | /* | 421 | /* |
| @@ -925,13 +944,15 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks) | |||
| 925 | } | 944 | } |
| 926 | } | 945 | } |
| 927 | 946 | ||
| 947 | static void set_pool_mode(struct pool *pool, enum pool_mode new_mode); | ||
| 948 | |||
| 928 | static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | 949 | static int alloc_data_block(struct thin_c *tc, dm_block_t *result) |
| 929 | { | 950 | { |
| 930 | int r; | 951 | int r; |
| 931 | dm_block_t free_blocks; | 952 | dm_block_t free_blocks; |
| 932 | struct pool *pool = tc->pool; | 953 | struct pool *pool = tc->pool; |
| 933 | 954 | ||
| 934 | if (get_pool_mode(pool) != PM_WRITE) | 955 | if (WARN_ON(get_pool_mode(pool) != PM_WRITE)) |
| 935 | return -EINVAL; | 956 | return -EINVAL; |
| 936 | 957 | ||
| 937 | r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); | 958 | r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); |
| @@ -958,7 +979,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | |||
| 958 | } | 979 | } |
| 959 | 980 | ||
| 960 | if (!free_blocks) { | 981 | if (!free_blocks) { |
| 961 | out_of_data_space(pool); | 982 | set_pool_mode(pool, PM_OUT_OF_DATA_SPACE); |
| 962 | return -ENOSPC; | 983 | return -ENOSPC; |
| 963 | } | 984 | } |
| 964 | } | 985 | } |
| @@ -988,15 +1009,32 @@ static void retry_on_resume(struct bio *bio) | |||
| 988 | spin_unlock_irqrestore(&pool->lock, flags); | 1009 | spin_unlock_irqrestore(&pool->lock, flags); |
| 989 | } | 1010 | } |
| 990 | 1011 | ||
| 991 | static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) | 1012 | static bool should_error_unserviceable_bio(struct pool *pool) |
| 992 | { | 1013 | { |
| 993 | /* | 1014 | enum pool_mode m = get_pool_mode(pool); |
| 994 | * When pool is read-only, no cell locking is needed because | 1015 | |
| 995 | * nothing is changing. | 1016 | switch (m) { |
| 996 | */ | 1017 | case PM_WRITE: |
| 997 | WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY); | 1018 | /* Shouldn't get here */ |
| 1019 | DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode"); | ||
| 1020 | return true; | ||
| 1021 | |||
| 1022 | case PM_OUT_OF_DATA_SPACE: | ||
| 1023 | return pool->pf.error_if_no_space; | ||
| 998 | 1024 | ||
| 999 | if (pool->pf.error_if_no_space) | 1025 | case PM_READ_ONLY: |
| 1026 | case PM_FAIL: | ||
| 1027 | return true; | ||
| 1028 | default: | ||
| 1029 | /* Shouldn't get here */ | ||
| 1030 | DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode"); | ||
| 1031 | return true; | ||
| 1032 | } | ||
| 1033 | } | ||
| 1034 | |||
| 1035 | static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) | ||
| 1036 | { | ||
| 1037 | if (should_error_unserviceable_bio(pool)) | ||
| 1000 | bio_io_error(bio); | 1038 | bio_io_error(bio); |
| 1001 | else | 1039 | else |
| 1002 | retry_on_resume(bio); | 1040 | retry_on_resume(bio); |
| @@ -1007,11 +1045,20 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c | |||
| 1007 | struct bio *bio; | 1045 | struct bio *bio; |
| 1008 | struct bio_list bios; | 1046 | struct bio_list bios; |
| 1009 | 1047 | ||
| 1048 | if (should_error_unserviceable_bio(pool)) { | ||
| 1049 | cell_error(pool, cell); | ||
| 1050 | return; | ||
| 1051 | } | ||
| 1052 | |||
| 1010 | bio_list_init(&bios); | 1053 | bio_list_init(&bios); |
| 1011 | cell_release(pool, cell, &bios); | 1054 | cell_release(pool, cell, &bios); |
| 1012 | 1055 | ||
| 1013 | while ((bio = bio_list_pop(&bios))) | 1056 | if (should_error_unserviceable_bio(pool)) |
| 1014 | handle_unserviceable_bio(pool, bio); | 1057 | while ((bio = bio_list_pop(&bios))) |
| 1058 | bio_io_error(bio); | ||
| 1059 | else | ||
| 1060 | while ((bio = bio_list_pop(&bios))) | ||
| 1061 | retry_on_resume(bio); | ||
| 1015 | } | 1062 | } |
| 1016 | 1063 | ||
| 1017 | static void process_discard(struct thin_c *tc, struct bio *bio) | 1064 | static void process_discard(struct thin_c *tc, struct bio *bio) |
| @@ -1296,6 +1343,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) | |||
| 1296 | } | 1343 | } |
| 1297 | } | 1344 | } |
| 1298 | 1345 | ||
| 1346 | static void process_bio_success(struct thin_c *tc, struct bio *bio) | ||
| 1347 | { | ||
| 1348 | bio_endio(bio, 0); | ||
| 1349 | } | ||
| 1350 | |||
| 1299 | static void process_bio_fail(struct thin_c *tc, struct bio *bio) | 1351 | static void process_bio_fail(struct thin_c *tc, struct bio *bio) |
| 1300 | { | 1352 | { |
| 1301 | bio_io_error(bio); | 1353 | bio_io_error(bio); |
| @@ -1328,6 +1380,11 @@ static void process_deferred_bios(struct pool *pool) | |||
| 1328 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); | 1380 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); |
| 1329 | struct thin_c *tc = h->tc; | 1381 | struct thin_c *tc = h->tc; |
| 1330 | 1382 | ||
| 1383 | if (tc->requeue_mode) { | ||
| 1384 | bio_endio(bio, DM_ENDIO_REQUEUE); | ||
| 1385 | continue; | ||
| 1386 | } | ||
| 1387 | |||
| 1331 | /* | 1388 | /* |
| 1332 | * If we've got no free new_mapping structs, and processing | 1389 | * If we've got no free new_mapping structs, and processing |
| 1333 | * this bio might require one, we pause until there are some | 1390 | * this bio might require one, we pause until there are some |
| @@ -1394,51 +1451,134 @@ static void do_waker(struct work_struct *ws) | |||
| 1394 | 1451 | ||
| 1395 | /*----------------------------------------------------------------*/ | 1452 | /*----------------------------------------------------------------*/ |
| 1396 | 1453 | ||
| 1454 | struct noflush_work { | ||
| 1455 | struct work_struct worker; | ||
| 1456 | struct thin_c *tc; | ||
| 1457 | |||
| 1458 | atomic_t complete; | ||
| 1459 | wait_queue_head_t wait; | ||
| 1460 | }; | ||
| 1461 | |||
| 1462 | static void complete_noflush_work(struct noflush_work *w) | ||
| 1463 | { | ||
| 1464 | atomic_set(&w->complete, 1); | ||
| 1465 | wake_up(&w->wait); | ||
| 1466 | } | ||
| 1467 | |||
| 1468 | static void do_noflush_start(struct work_struct *ws) | ||
| 1469 | { | ||
| 1470 | struct noflush_work *w = container_of(ws, struct noflush_work, worker); | ||
| 1471 | w->tc->requeue_mode = true; | ||
| 1472 | requeue_io(w->tc); | ||
| 1473 | complete_noflush_work(w); | ||
| 1474 | } | ||
| 1475 | |||
| 1476 | static void do_noflush_stop(struct work_struct *ws) | ||
| 1477 | { | ||
| 1478 | struct noflush_work *w = container_of(ws, struct noflush_work, worker); | ||
| 1479 | w->tc->requeue_mode = false; | ||
| 1480 | complete_noflush_work(w); | ||
| 1481 | } | ||
| 1482 | |||
| 1483 | static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *)) | ||
| 1484 | { | ||
| 1485 | struct noflush_work w; | ||
| 1486 | |||
| 1487 | INIT_WORK(&w.worker, fn); | ||
| 1488 | w.tc = tc; | ||
| 1489 | atomic_set(&w.complete, 0); | ||
| 1490 | init_waitqueue_head(&w.wait); | ||
| 1491 | |||
| 1492 | queue_work(tc->pool->wq, &w.worker); | ||
| 1493 | |||
| 1494 | wait_event(w.wait, atomic_read(&w.complete)); | ||
| 1495 | } | ||
| 1496 | |||
| 1497 | /*----------------------------------------------------------------*/ | ||
| 1498 | |||
| 1397 | static enum pool_mode get_pool_mode(struct pool *pool) | 1499 | static enum pool_mode get_pool_mode(struct pool *pool) |
| 1398 | { | 1500 | { |
| 1399 | return pool->pf.mode; | 1501 | return pool->pf.mode; |
| 1400 | } | 1502 | } |
| 1401 | 1503 | ||
| 1504 | static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode) | ||
| 1505 | { | ||
| 1506 | dm_table_event(pool->ti->table); | ||
| 1507 | DMINFO("%s: switching pool to %s mode", | ||
| 1508 | dm_device_name(pool->pool_md), new_mode); | ||
| 1509 | } | ||
| 1510 | |||
| 1402 | static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) | 1511 | static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) |
| 1403 | { | 1512 | { |
| 1404 | int r; | 1513 | struct pool_c *pt = pool->ti->private; |
| 1405 | enum pool_mode old_mode = pool->pf.mode; | 1514 | bool needs_check = dm_pool_metadata_needs_check(pool->pmd); |
| 1515 | enum pool_mode old_mode = get_pool_mode(pool); | ||
| 1516 | |||
| 1517 | /* | ||
| 1518 | * Never allow the pool to transition to PM_WRITE mode if user | ||
| 1519 | * intervention is required to verify metadata and data consistency. | ||
| 1520 | */ | ||
| 1521 | if (new_mode == PM_WRITE && needs_check) { | ||
| 1522 | DMERR("%s: unable to switch pool to write mode until repaired.", | ||
| 1523 | dm_device_name(pool->pool_md)); | ||
| 1524 | if (old_mode != new_mode) | ||
| 1525 | new_mode = old_mode; | ||
| 1526 | else | ||
| 1527 | new_mode = PM_READ_ONLY; | ||
| 1528 | } | ||
| 1529 | /* | ||
| 1530 | * If we were in PM_FAIL mode, rollback of metadata failed. We're | ||
| 1531 | * not going to recover without a thin_repair. So we never let the | ||
| 1532 | * pool move out of the old mode. | ||
| 1533 | */ | ||
| 1534 | if (old_mode == PM_FAIL) | ||
| 1535 | new_mode = old_mode; | ||
| 1406 | 1536 | ||
| 1407 | switch (new_mode) { | 1537 | switch (new_mode) { |
| 1408 | case PM_FAIL: | 1538 | case PM_FAIL: |
| 1409 | if (old_mode != new_mode) | 1539 | if (old_mode != new_mode) |
| 1410 | DMERR("%s: switching pool to failure mode", | 1540 | notify_of_pool_mode_change(pool, "failure"); |
| 1411 | dm_device_name(pool->pool_md)); | ||
| 1412 | dm_pool_metadata_read_only(pool->pmd); | 1541 | dm_pool_metadata_read_only(pool->pmd); |
| 1413 | pool->process_bio = process_bio_fail; | 1542 | pool->process_bio = process_bio_fail; |
| 1414 | pool->process_discard = process_bio_fail; | 1543 | pool->process_discard = process_bio_fail; |
| 1415 | pool->process_prepared_mapping = process_prepared_mapping_fail; | 1544 | pool->process_prepared_mapping = process_prepared_mapping_fail; |
| 1416 | pool->process_prepared_discard = process_prepared_discard_fail; | 1545 | pool->process_prepared_discard = process_prepared_discard_fail; |
| 1546 | |||
| 1547 | error_retry_list(pool); | ||
| 1417 | break; | 1548 | break; |
| 1418 | 1549 | ||
| 1419 | case PM_READ_ONLY: | 1550 | case PM_READ_ONLY: |
| 1420 | if (old_mode != new_mode) | 1551 | if (old_mode != new_mode) |
| 1421 | DMERR("%s: switching pool to read-only mode", | 1552 | notify_of_pool_mode_change(pool, "read-only"); |
| 1422 | dm_device_name(pool->pool_md)); | 1553 | dm_pool_metadata_read_only(pool->pmd); |
| 1423 | r = dm_pool_abort_metadata(pool->pmd); | 1554 | pool->process_bio = process_bio_read_only; |
| 1424 | if (r) { | 1555 | pool->process_discard = process_bio_success; |
| 1425 | DMERR("%s: aborting transaction failed", | 1556 | pool->process_prepared_mapping = process_prepared_mapping_fail; |
| 1426 | dm_device_name(pool->pool_md)); | 1557 | pool->process_prepared_discard = process_prepared_discard_passdown; |
| 1427 | new_mode = PM_FAIL; | 1558 | |
| 1428 | set_pool_mode(pool, new_mode); | 1559 | error_retry_list(pool); |
| 1429 | } else { | 1560 | break; |
| 1430 | dm_pool_metadata_read_only(pool->pmd); | 1561 | |
| 1431 | pool->process_bio = process_bio_read_only; | 1562 | case PM_OUT_OF_DATA_SPACE: |
| 1432 | pool->process_discard = process_discard; | 1563 | /* |
| 1433 | pool->process_prepared_mapping = process_prepared_mapping_fail; | 1564 | * Ideally we'd never hit this state; the low water mark |
| 1434 | pool->process_prepared_discard = process_prepared_discard_passdown; | 1565 | * would trigger userland to extend the pool before we |
| 1435 | } | 1566 | * completely run out of data space. However, many small |
| 1567 | * IOs to unprovisioned space can consume data space at an | ||
| 1568 | * alarming rate. Adjust your low water mark if you're | ||
| 1569 | * frequently seeing this mode. | ||
| 1570 | */ | ||
| 1571 | if (old_mode != new_mode) | ||
| 1572 | notify_of_pool_mode_change(pool, "out-of-data-space"); | ||
| 1573 | pool->process_bio = process_bio_read_only; | ||
| 1574 | pool->process_discard = process_discard; | ||
| 1575 | pool->process_prepared_mapping = process_prepared_mapping; | ||
| 1576 | pool->process_prepared_discard = process_prepared_discard_passdown; | ||
| 1436 | break; | 1577 | break; |
| 1437 | 1578 | ||
| 1438 | case PM_WRITE: | 1579 | case PM_WRITE: |
| 1439 | if (old_mode != new_mode) | 1580 | if (old_mode != new_mode) |
| 1440 | DMINFO("%s: switching pool to write mode", | 1581 | notify_of_pool_mode_change(pool, "write"); |
| 1441 | dm_device_name(pool->pool_md)); | ||
| 1442 | dm_pool_metadata_read_write(pool->pmd); | 1582 | dm_pool_metadata_read_write(pool->pmd); |
| 1443 | pool->process_bio = process_bio; | 1583 | pool->process_bio = process_bio; |
| 1444 | pool->process_discard = process_discard; | 1584 | pool->process_discard = process_discard; |
| @@ -1448,32 +1588,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) | |||
| 1448 | } | 1588 | } |
| 1449 | 1589 | ||
| 1450 | pool->pf.mode = new_mode; | 1590 | pool->pf.mode = new_mode; |
| 1591 | /* | ||
| 1592 | * The pool mode may have changed, sync it so bind_control_target() | ||
| 1593 | * doesn't cause an unexpected mode transition on resume. | ||
| 1594 | */ | ||
| 1595 | pt->adjusted_pf.mode = new_mode; | ||
| 1451 | } | 1596 | } |
| 1452 | 1597 | ||
| 1453 | /* | 1598 | static void abort_transaction(struct pool *pool) |
| 1454 | * Rather than calling set_pool_mode directly, use these which describe the | ||
| 1455 | * reason for mode degradation. | ||
| 1456 | */ | ||
| 1457 | static void out_of_data_space(struct pool *pool) | ||
| 1458 | { | 1599 | { |
| 1459 | DMERR_LIMIT("%s: no free data space available.", | 1600 | const char *dev_name = dm_device_name(pool->pool_md); |
| 1460 | dm_device_name(pool->pool_md)); | 1601 | |
| 1461 | set_pool_mode(pool, PM_READ_ONLY); | 1602 | DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); |
| 1603 | if (dm_pool_abort_metadata(pool->pmd)) { | ||
| 1604 | DMERR("%s: failed to abort metadata transaction", dev_name); | ||
| 1605 | set_pool_mode(pool, PM_FAIL); | ||
| 1606 | } | ||
| 1607 | |||
| 1608 | if (dm_pool_metadata_set_needs_check(pool->pmd)) { | ||
| 1609 | DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); | ||
| 1610 | set_pool_mode(pool, PM_FAIL); | ||
| 1611 | } | ||
| 1462 | } | 1612 | } |
| 1463 | 1613 | ||
| 1464 | static void metadata_operation_failed(struct pool *pool, const char *op, int r) | 1614 | static void metadata_operation_failed(struct pool *pool, const char *op, int r) |
| 1465 | { | 1615 | { |
| 1466 | dm_block_t free_blocks; | ||
| 1467 | |||
| 1468 | DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", | 1616 | DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", |
| 1469 | dm_device_name(pool->pool_md), op, r); | 1617 | dm_device_name(pool->pool_md), op, r); |
| 1470 | 1618 | ||
| 1471 | if (r == -ENOSPC && | 1619 | abort_transaction(pool); |
| 1472 | !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) && | ||
| 1473 | !free_blocks) | ||
| 1474 | DMERR_LIMIT("%s: no free metadata space available.", | ||
| 1475 | dm_device_name(pool->pool_md)); | ||
| 1476 | |||
| 1477 | set_pool_mode(pool, PM_READ_ONLY); | 1620 | set_pool_mode(pool, PM_READ_ONLY); |
| 1478 | } | 1621 | } |
| 1479 | 1622 | ||
| @@ -1524,6 +1667,11 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) | |||
| 1524 | 1667 | ||
| 1525 | thin_hook_bio(tc, bio); | 1668 | thin_hook_bio(tc, bio); |
| 1526 | 1669 | ||
| 1670 | if (tc->requeue_mode) { | ||
| 1671 | bio_endio(bio, DM_ENDIO_REQUEUE); | ||
| 1672 | return DM_MAPIO_SUBMITTED; | ||
| 1673 | } | ||
| 1674 | |||
| 1527 | if (get_pool_mode(tc->pool) == PM_FAIL) { | 1675 | if (get_pool_mode(tc->pool) == PM_FAIL) { |
| 1528 | bio_io_error(bio); | 1676 | bio_io_error(bio); |
| 1529 | return DM_MAPIO_SUBMITTED; | 1677 | return DM_MAPIO_SUBMITTED; |
| @@ -1687,7 +1835,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) | |||
| 1687 | /* | 1835 | /* |
| 1688 | * We want to make sure that a pool in PM_FAIL mode is never upgraded. | 1836 | * We want to make sure that a pool in PM_FAIL mode is never upgraded. |
| 1689 | */ | 1837 | */ |
| 1690 | enum pool_mode old_mode = pool->pf.mode; | 1838 | enum pool_mode old_mode = get_pool_mode(pool); |
| 1691 | enum pool_mode new_mode = pt->adjusted_pf.mode; | 1839 | enum pool_mode new_mode = pt->adjusted_pf.mode; |
| 1692 | 1840 | ||
| 1693 | /* | 1841 | /* |
| @@ -1701,16 +1849,6 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) | |||
| 1701 | pool->pf = pt->adjusted_pf; | 1849 | pool->pf = pt->adjusted_pf; |
| 1702 | pool->low_water_blocks = pt->low_water_blocks; | 1850 | pool->low_water_blocks = pt->low_water_blocks; |
| 1703 | 1851 | ||
| 1704 | /* | ||
| 1705 | * If we were in PM_FAIL mode, rollback of metadata failed. We're | ||
| 1706 | * not going to recover without a thin_repair. So we never let the | ||
| 1707 | * pool move out of the old mode. On the other hand a PM_READ_ONLY | ||
| 1708 | * may have been due to a lack of metadata or data space, and may | ||
| 1709 | * now work (ie. if the underlying devices have been resized). | ||
| 1710 | */ | ||
| 1711 | if (old_mode == PM_FAIL) | ||
| 1712 | new_mode = old_mode; | ||
| 1713 | |||
| 1714 | set_pool_mode(pool, new_mode); | 1852 | set_pool_mode(pool, new_mode); |
| 1715 | 1853 | ||
| 1716 | return 0; | 1854 | return 0; |
| @@ -2253,6 +2391,12 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit) | |||
| 2253 | return -EINVAL; | 2391 | return -EINVAL; |
| 2254 | 2392 | ||
| 2255 | } else if (data_size > sb_data_size) { | 2393 | } else if (data_size > sb_data_size) { |
| 2394 | if (dm_pool_metadata_needs_check(pool->pmd)) { | ||
| 2395 | DMERR("%s: unable to grow the data device until repaired.", | ||
| 2396 | dm_device_name(pool->pool_md)); | ||
| 2397 | return 0; | ||
| 2398 | } | ||
| 2399 | |||
| 2256 | if (sb_data_size) | 2400 | if (sb_data_size) |
| 2257 | DMINFO("%s: growing the data device from %llu to %llu blocks", | 2401 | DMINFO("%s: growing the data device from %llu to %llu blocks", |
| 2258 | dm_device_name(pool->pool_md), | 2402 | dm_device_name(pool->pool_md), |
| @@ -2294,6 +2438,12 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) | |||
| 2294 | return -EINVAL; | 2438 | return -EINVAL; |
| 2295 | 2439 | ||
| 2296 | } else if (metadata_dev_size > sb_metadata_dev_size) { | 2440 | } else if (metadata_dev_size > sb_metadata_dev_size) { |
| 2441 | if (dm_pool_metadata_needs_check(pool->pmd)) { | ||
| 2442 | DMERR("%s: unable to grow the metadata device until repaired.", | ||
| 2443 | dm_device_name(pool->pool_md)); | ||
| 2444 | return 0; | ||
| 2445 | } | ||
| 2446 | |||
| 2297 | warn_if_metadata_device_too_big(pool->md_dev); | 2447 | warn_if_metadata_device_too_big(pool->md_dev); |
| 2298 | DMINFO("%s: growing the metadata device from %llu to %llu blocks", | 2448 | DMINFO("%s: growing the metadata device from %llu to %llu blocks", |
| 2299 | dm_device_name(pool->pool_md), | 2449 | dm_device_name(pool->pool_md), |
| @@ -2681,7 +2831,9 @@ static void pool_status(struct dm_target *ti, status_type_t type, | |||
| 2681 | else | 2831 | else |
| 2682 | DMEMIT("- "); | 2832 | DMEMIT("- "); |
| 2683 | 2833 | ||
| 2684 | if (pool->pf.mode == PM_READ_ONLY) | 2834 | if (pool->pf.mode == PM_OUT_OF_DATA_SPACE) |
| 2835 | DMEMIT("out_of_data_space "); | ||
| 2836 | else if (pool->pf.mode == PM_READ_ONLY) | ||
| 2685 | DMEMIT("ro "); | 2837 | DMEMIT("ro "); |
| 2686 | else | 2838 | else |
| 2687 | DMEMIT("rw "); | 2839 | DMEMIT("rw "); |
| @@ -2795,7 +2947,7 @@ static struct target_type pool_target = { | |||
| 2795 | .name = "thin-pool", | 2947 | .name = "thin-pool", |
| 2796 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | | 2948 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | |
| 2797 | DM_TARGET_IMMUTABLE, | 2949 | DM_TARGET_IMMUTABLE, |
| 2798 | .version = {1, 10, 0}, | 2950 | .version = {1, 11, 0}, |
| 2799 | .module = THIS_MODULE, | 2951 | .module = THIS_MODULE, |
| 2800 | .ctr = pool_ctr, | 2952 | .ctr = pool_ctr, |
| 2801 | .dtr = pool_dtr, | 2953 | .dtr = pool_dtr, |
| @@ -2997,10 +3149,23 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err) | |||
| 2997 | return 0; | 3149 | return 0; |
| 2998 | } | 3150 | } |
| 2999 | 3151 | ||
| 3000 | static void thin_postsuspend(struct dm_target *ti) | 3152 | static void thin_presuspend(struct dm_target *ti) |
| 3001 | { | 3153 | { |
| 3154 | struct thin_c *tc = ti->private; | ||
| 3155 | |||
| 3002 | if (dm_noflush_suspending(ti)) | 3156 | if (dm_noflush_suspending(ti)) |
| 3003 | requeue_io((struct thin_c *)ti->private); | 3157 | noflush_work(tc, do_noflush_start); |
| 3158 | } | ||
| 3159 | |||
| 3160 | static void thin_postsuspend(struct dm_target *ti) | ||
| 3161 | { | ||
| 3162 | struct thin_c *tc = ti->private; | ||
| 3163 | |||
| 3164 | /* | ||
| 3165 | * The dm_noflush_suspending flag has been cleared by now, so | ||
| 3166 | * unfortunately we must always run this. | ||
| 3167 | */ | ||
| 3168 | noflush_work(tc, do_noflush_stop); | ||
| 3004 | } | 3169 | } |
| 3005 | 3170 | ||
| 3006 | /* | 3171 | /* |
| @@ -3085,12 +3250,13 @@ static int thin_iterate_devices(struct dm_target *ti, | |||
| 3085 | 3250 | ||
| 3086 | static struct target_type thin_target = { | 3251 | static struct target_type thin_target = { |
| 3087 | .name = "thin", | 3252 | .name = "thin", |
| 3088 | .version = {1, 10, 0}, | 3253 | .version = {1, 11, 0}, |
| 3089 | .module = THIS_MODULE, | 3254 | .module = THIS_MODULE, |
| 3090 | .ctr = thin_ctr, | 3255 | .ctr = thin_ctr, |
| 3091 | .dtr = thin_dtr, | 3256 | .dtr = thin_dtr, |
| 3092 | .map = thin_map, | 3257 | .map = thin_map, |
| 3093 | .end_io = thin_endio, | 3258 | .end_io = thin_endio, |
| 3259 | .presuspend = thin_presuspend, | ||
| 3094 | .postsuspend = thin_postsuspend, | 3260 | .postsuspend = thin_postsuspend, |
| 3095 | .status = thin_status, | 3261 | .status = thin_status, |
| 3096 | .iterate_devices = thin_iterate_devices, | 3262 | .iterate_devices = thin_iterate_devices, |
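The noflush-suspend fix above works by handing a small job to the pool's workqueue and blocking until it has run, so the requeue_mode flip always happens in the worker's context instead of racing with it. A simplified userspace sketch of that "queue a job and wait for completion" pattern, using pthreads rather than the kernel workqueue and wait-queue APIs, could look like this (everything here is illustrative, not the driver code):

```c
/* Simplified "queue work and wait" pattern; illustrative only. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct noflush_work {
	void (*fn)(struct noflush_work *w);
	bool complete;
	pthread_mutex_t lock;
	pthread_cond_t wait;
};

static bool requeue_mode;   /* stands in for tc->requeue_mode */

static void do_noflush_start(struct noflush_work *w) { (void)w; requeue_mode = true; }
static void do_noflush_stop(struct noflush_work *w)  { (void)w; requeue_mode = false; }

static void *worker(void *arg)
{
	struct noflush_work *w = arg;

	w->fn(w);                       /* run the job on the worker thread */
	pthread_mutex_lock(&w->lock);
	w->complete = true;
	pthread_cond_signal(&w->wait);  /* complete_noflush_work() equivalent */
	pthread_mutex_unlock(&w->lock);
	return NULL;
}

static void noflush_work(void (*fn)(struct noflush_work *))
{
	struct noflush_work w = {
		.fn = fn,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.wait = PTHREAD_COND_INITIALIZER,
	};
	pthread_t t;

	pthread_create(&t, NULL, worker, &w);
	pthread_mutex_lock(&w.lock);
	while (!w.complete)             /* wait_event() equivalent */
		pthread_cond_wait(&w.wait, &w.lock);
	pthread_mutex_unlock(&w.lock);
	pthread_join(t, NULL);
}

int main(void)
{
	noflush_work(do_noflush_start);
	printf("requeue_mode=%d\n", requeue_mode);
	noflush_work(do_noflush_stop);
	printf("requeue_mode=%d\n", requeue_mode);
	return 0;
}
```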
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index 19b268795415..0c2dec7aec20 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
| @@ -6,3 +6,13 @@ config DM_PERSISTENT_DATA | |||
| 6 | ---help--- | 6 | ---help--- |
| 7 | Library providing immutable on-disk data structure support for | 7 | Library providing immutable on-disk data structure support for |
| 8 | device-mapper targets such as the thin provisioning target. | 8 | device-mapper targets such as the thin provisioning target. |
| 9 | |||
| 10 | config DM_DEBUG_BLOCK_STACK_TRACING | ||
| 11 | boolean "Keep stack trace of persistent data block lock holders" | ||
| 12 | depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA | ||
| 13 | select STACKTRACE | ||
| 14 | ---help--- | ||
| 15 | Enable this for messages that may help debug problems with the | ||
| 16 | block manager locking used by thin provisioning and caching. | ||
| 17 | |||
| 18 | If unsure, say N. | ||
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index e9bdd462f4f5..786b689bdfc7 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
| @@ -91,6 +91,69 @@ struct block_op { | |||
| 91 | dm_block_t block; | 91 | dm_block_t block; |
| 92 | }; | 92 | }; |
| 93 | 93 | ||
| 94 | struct bop_ring_buffer { | ||
| 95 | unsigned begin; | ||
| 96 | unsigned end; | ||
| 97 | struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1]; | ||
| 98 | }; | ||
| 99 | |||
| 100 | static void brb_init(struct bop_ring_buffer *brb) | ||
| 101 | { | ||
| 102 | brb->begin = 0; | ||
| 103 | brb->end = 0; | ||
| 104 | } | ||
| 105 | |||
| 106 | static bool brb_empty(struct bop_ring_buffer *brb) | ||
| 107 | { | ||
| 108 | return brb->begin == brb->end; | ||
| 109 | } | ||
| 110 | |||
| 111 | static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old) | ||
| 112 | { | ||
| 113 | unsigned r = old + 1; | ||
| 114 | return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r; | ||
| 115 | } | ||
| 116 | |||
| 117 | static int brb_push(struct bop_ring_buffer *brb, | ||
| 118 | enum block_op_type type, dm_block_t b) | ||
| 119 | { | ||
| 120 | struct block_op *bop; | ||
| 121 | unsigned next = brb_next(brb, brb->end); | ||
| 122 | |||
| 123 | /* | ||
| 124 | * We don't allow the last bop to be filled, this way we can | ||
| 125 | * differentiate between full and empty. | ||
| 126 | */ | ||
| 127 | if (next == brb->begin) | ||
| 128 | return -ENOMEM; | ||
| 129 | |||
| 130 | bop = brb->bops + brb->end; | ||
| 131 | bop->type = type; | ||
| 132 | bop->block = b; | ||
| 133 | |||
| 134 | brb->end = next; | ||
| 135 | |||
| 136 | return 0; | ||
| 137 | } | ||
| 138 | |||
| 139 | static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result) | ||
| 140 | { | ||
| 141 | struct block_op *bop; | ||
| 142 | |||
| 143 | if (brb_empty(brb)) | ||
| 144 | return -ENODATA; | ||
| 145 | |||
| 146 | bop = brb->bops + brb->begin; | ||
| 147 | result->type = bop->type; | ||
| 148 | result->block = bop->block; | ||
| 149 | |||
| 150 | brb->begin = brb_next(brb, brb->begin); | ||
| 151 | |||
| 152 | return 0; | ||
| 153 | } | ||
| 154 | |||
| 155 | /*----------------------------------------------------------------*/ | ||
| 156 | |||
| 94 | struct sm_metadata { | 157 | struct sm_metadata { |
| 95 | struct dm_space_map sm; | 158 | struct dm_space_map sm; |
| 96 | 159 | ||
| @@ -101,25 +164,20 @@ struct sm_metadata { | |||
| 101 | 164 | ||
| 102 | unsigned recursion_count; | 165 | unsigned recursion_count; |
| 103 | unsigned allocated_this_transaction; | 166 | unsigned allocated_this_transaction; |
| 104 | unsigned nr_uncommitted; | 167 | struct bop_ring_buffer uncommitted; |
| 105 | struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS]; | ||
| 106 | 168 | ||
| 107 | struct threshold threshold; | 169 | struct threshold threshold; |
| 108 | }; | 170 | }; |
| 109 | 171 | ||
| 110 | static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) | 172 | static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) |
| 111 | { | 173 | { |
| 112 | struct block_op *op; | 174 | int r = brb_push(&smm->uncommitted, type, b); |
| 113 | 175 | ||
| 114 | if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) { | 176 | if (r) { |
| 115 | DMERR("too many recursive allocations"); | 177 | DMERR("too many recursive allocations"); |
| 116 | return -ENOMEM; | 178 | return -ENOMEM; |
| 117 | } | 179 | } |
| 118 | 180 | ||
| 119 | op = smm->uncommitted + smm->nr_uncommitted++; | ||
| 120 | op->type = type; | ||
| 121 | op->block = b; | ||
| 122 | |||
| 123 | return 0; | 181 | return 0; |
| 124 | } | 182 | } |
| 125 | 183 | ||
| @@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm) | |||
| 158 | return -ENOMEM; | 216 | return -ENOMEM; |
| 159 | } | 217 | } |
| 160 | 218 | ||
| 161 | if (smm->recursion_count == 1 && smm->nr_uncommitted) { | 219 | if (smm->recursion_count == 1) { |
| 162 | while (smm->nr_uncommitted && !r) { | 220 | while (!brb_empty(&smm->uncommitted)) { |
| 163 | smm->nr_uncommitted--; | 221 | struct block_op bop; |
| 164 | r = commit_bop(smm, smm->uncommitted + | 222 | |
| 165 | smm->nr_uncommitted); | 223 | r = brb_pop(&smm->uncommitted, &bop); |
| 224 | if (r) { | ||
| 225 | DMERR("bug in bop ring buffer"); | ||
| 226 | break; | ||
| 227 | } | ||
| 228 | |||
| 229 | r = commit_bop(smm, &bop); | ||
| 166 | if (r) | 230 | if (r) |
| 167 | break; | 231 | break; |
| 168 | } | 232 | } |
| @@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count) | |||
| 217 | static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, | 281 | static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, |
| 218 | uint32_t *result) | 282 | uint32_t *result) |
| 219 | { | 283 | { |
| 220 | int r, i; | 284 | int r; |
| 285 | unsigned i; | ||
| 221 | struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); | 286 | struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); |
| 222 | unsigned adjustment = 0; | 287 | unsigned adjustment = 0; |
| 223 | 288 | ||
| @@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, | |||
| 225 | * We may have some uncommitted adjustments to add. This list | 290 | * We may have some uncommitted adjustments to add. This list |
| 226 | * should always be really short. | 291 | * should always be really short. |
| 227 | */ | 292 | */ |
| 228 | for (i = 0; i < smm->nr_uncommitted; i++) { | 293 | for (i = smm->uncommitted.begin; |
| 229 | struct block_op *op = smm->uncommitted + i; | 294 | i != smm->uncommitted.end; |
| 295 | i = brb_next(&smm->uncommitted, i)) { | ||
| 296 | struct block_op *op = smm->uncommitted.bops + i; | ||
| 230 | 297 | ||
| 231 | if (op->block != b) | 298 | if (op->block != b) |
| 232 | continue; | 299 | continue; |
| @@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, | |||
| 254 | static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, | 321 | static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, |
| 255 | dm_block_t b, int *result) | 322 | dm_block_t b, int *result) |
| 256 | { | 323 | { |
| 257 | int r, i, adjustment = 0; | 324 | int r, adjustment = 0; |
| 325 | unsigned i; | ||
| 258 | struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); | 326 | struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); |
| 259 | uint32_t rc; | 327 | uint32_t rc; |
| 260 | 328 | ||
| @@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, | |||
| 262 | * We may have some uncommitted adjustments to add. This list | 330 | * We may have some uncommitted adjustments to add. This list |
| 263 | * should always be really short. | 331 | * should always be really short. |
| 264 | */ | 332 | */ |
| 265 | for (i = 0; i < smm->nr_uncommitted; i++) { | 333 | for (i = smm->uncommitted.begin; |
| 266 | struct block_op *op = smm->uncommitted + i; | 334 | i != smm->uncommitted.end; |
| 335 | i = brb_next(&smm->uncommitted, i)) { | ||
| 336 | |||
| 337 | struct block_op *op = smm->uncommitted.bops + i; | ||
| 267 | 338 | ||
| 268 | if (op->block != b) | 339 | if (op->block != b) |
| 269 | continue; | 340 | continue; |
| @@ -671,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm, | |||
| 671 | smm->begin = superblock + 1; | 742 | smm->begin = superblock + 1; |
| 672 | smm->recursion_count = 0; | 743 | smm->recursion_count = 0; |
| 673 | smm->allocated_this_transaction = 0; | 744 | smm->allocated_this_transaction = 0; |
| 674 | smm->nr_uncommitted = 0; | 745 | brb_init(&smm->uncommitted); |
| 675 | threshold_init(&smm->threshold); | 746 | threshold_init(&smm->threshold); |
| 676 | 747 | ||
| 677 | memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); | 748 | memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); |
| @@ -715,7 +786,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm, | |||
| 715 | smm->begin = 0; | 786 | smm->begin = 0; |
| 716 | smm->recursion_count = 0; | 787 | smm->recursion_count = 0; |
| 717 | smm->allocated_this_transaction = 0; | 788 | smm->allocated_this_transaction = 0; |
| 718 | smm->nr_uncommitted = 0; | 789 | brb_init(&smm->uncommitted); |
| 719 | threshold_init(&smm->threshold); | 790 | threshold_init(&smm->threshold); |
| 720 | 791 | ||
| 721 | memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); | 792 | memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); |
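The refcount fix above replaces the fixed uncommitted[] array with a small ring buffer that deliberately keeps one slot free, so "full" and "empty" remain distinguishable without a separate count. Here is a standalone userspace sketch of the same ring-buffer idea; the element type, RING_SIZE, and the demo in main() are illustrative, not the kernel structures.

```c
/* One-slot-spare ring buffer in the style of bop_ring_buffer; illustrative only. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define RING_SIZE 8   /* the kernel uses MAX_RECURSIVE_ALLOCATIONS + 1 */

struct ring {
	unsigned begin, end;
	int slots[RING_SIZE];
};

static bool ring_empty(const struct ring *r) { return r->begin == r->end; }

static unsigned ring_next(unsigned i) { return (i + 1 >= RING_SIZE) ? 0 : i + 1; }

static int ring_push(struct ring *r, int v)
{
	unsigned next = ring_next(r->end);

	if (next == r->begin)
		return -ENOMEM;        /* full: the last slot is never filled */
	r->slots[r->end] = v;
	r->end = next;
	return 0;
}

static int ring_pop(struct ring *r, int *v)
{
	if (ring_empty(r))
		return -ENODATA;
	*v = r->slots[r->begin];
	r->begin = ring_next(r->begin);
	return 0;
}

int main(void)
{
	struct ring r = { 0, 0, {0} };
	int v;

	for (int i = 0; i < 10; i++)
		if (ring_push(&r, i))
			printf("push %d rejected (ring full)\n", i);
	while (!ring_pop(&r, &v))
		printf("popped %d\n", v);
	return 0;
}
```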
