path: root/drivers
author     Linus Torvalds <torvalds@linux-foundation.org>  2014-01-22 23:17:48 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-01-22 23:17:48 -0500
commit     fe41c2c018b8af9b370a40845f547e22894ff68a (patch)
tree       3573a10912e24ffcd48177785043e0de17b8e9d0 /drivers
parent     194e57fd1835564735fd0ba5e3870230861cacd2 (diff)
parent     5066a4df1f427faac8372d20494483bb09a4a1cd (diff)
Merge tag 'dm-3.14-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device-mapper changes from Mike Snitzer:
 "A lot of attention was paid to improving the thin-provisioning target's
  handling of metadata operation failures and running out of space.  A new
  'error_if_no_space' feature was added to allow users to error IOs rather
  than queue them when either the data or metadata space is exhausted.

  Additional fixes/features include:

   - a few fixes to properly support thin metadata device resizing

   - a solution for reliably waiting for a DM device's embedded kobject
     to be released before destroying the device

   - old dm-snapshot is updated to use the dm-bufio interface to take
     advantage of readahead capabilities that improve snapshot activation

   - new dm-cache target tunables to control how quickly data is promoted
     to the cache (fast) device

   - improved write efficiency of cluster mirror target by combining
     userspace flush and mark requests"

* tag 'dm-3.14-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (35 commits)
  dm log userspace: allow mark requests to piggyback on flush requests
  dm space map metadata: fix bug in resizing of thin metadata
  dm cache: add policy name to status output
  dm thin: fix pool feature parsing
  dm sysfs: fix a module unload race
  dm snapshot: use dm-bufio prefetch
  dm snapshot: use dm-bufio
  dm snapshot: prepare for switch to using dm-bufio
  dm snapshot: use GFP_KERNEL when initializing exceptions
  dm cache: add block sizes and total cache blocks to status output
  dm btree: add dm_btree_find_lowest_key
  dm space map metadata: fix extending the space map
  dm space map common: make sure new space is used during extend
  dm: wait until embedded kobject is released before destroying a device
  dm: remove pointless kobject comparison in dm_get_from_kobject
  dm snapshot: call destroy_work_on_stack() to pair with INIT_WORK_ONSTACK()
  dm cache policy mq: introduce three promotion threshold tunables
  dm cache policy mq: use list_del_init instead of list_del + INIT_LIST_HEAD
  dm thin: fix set_pool_mode exposed pool operation races
  dm thin: eliminate the no_free_space flag
  ...
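A tiny illustration of what the new 'error_if_no_space' flag controls (this is not dm-thin's actual out-of-space code path; retry_bios_when_space_is_added() is a hypothetical stand-in for the pool's real requeue machinery, and only the pf.error_if_no_space flag comes from this merge):

/* Illustrative sketch only -- not dm-thin's actual out-of-space path. */
static void handle_out_of_space(struct pool *pool, struct bio *bio)
{
	if (pool->pf.error_if_no_space)
		bio_io_error(bio);	/* new behaviour: fail the I/O immediately */
	else
		retry_bios_when_space_is_added(pool, bio);	/* hypothetical helper: default behaviour, queue the I/O */
}

Queueing remains the default; the feature only changes what happens once the data or metadata space is exhausted.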
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/Kconfig | 11
-rw-r--r--  drivers/md/Makefile | 1
-rw-r--r--  drivers/md/dm-bufio.c | 36
-rw-r--r--  drivers/md/dm-bufio.h | 12
-rw-r--r--  drivers/md/dm-builtin.c | 48
-rw-r--r--  drivers/md/dm-cache-policy-mq.c | 70
-rw-r--r--  drivers/md/dm-cache-policy.c | 4
-rw-r--r--  drivers/md/dm-cache-policy.h | 6
-rw-r--r--  drivers/md/dm-cache-target.c | 20
-rw-r--r--  drivers/md/dm-delay.c | 35
-rw-r--r--  drivers/md/dm-log-userspace-base.c | 206
-rw-r--r--  drivers/md/dm-snap-persistent.c | 87
-rw-r--r--  drivers/md/dm-snap.c | 10
-rw-r--r--  drivers/md/dm-sysfs.c | 5
-rw-r--r--  drivers/md/dm-table.c | 22
-rw-r--r--  drivers/md/dm-thin-metadata.c | 20
-rw-r--r--  drivers/md/dm-thin-metadata.h | 4
-rw-r--r--  drivers/md/dm-thin.c | 284
-rw-r--r--  drivers/md/dm.c | 15
-rw-r--r--  drivers/md/dm.h | 17
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c | 2
-rw-r--r--  drivers/md/persistent-data/dm-btree.c | 33
-rw-r--r--  drivers/md/persistent-data/dm-btree.h | 8
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c | 6
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c | 32
25 files changed, 700 insertions(+), 294 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index f2ccbc3b9fe4..9a06fe883766 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -176,8 +176,12 @@ config MD_FAULTY
176 176
177source "drivers/md/bcache/Kconfig" 177source "drivers/md/bcache/Kconfig"
178 178
179config BLK_DEV_DM_BUILTIN
180 boolean
181
179config BLK_DEV_DM 182config BLK_DEV_DM
180 tristate "Device mapper support" 183 tristate "Device mapper support"
184 select BLK_DEV_DM_BUILTIN
181 ---help--- 185 ---help---
182 Device-mapper is a low level volume manager. It works by allowing 186 Device-mapper is a low level volume manager. It works by allowing
183 people to specify mappings for ranges of logical sectors. Various 187 people to specify mappings for ranges of logical sectors. Various
@@ -238,6 +242,7 @@ config DM_CRYPT
238config DM_SNAPSHOT 242config DM_SNAPSHOT
239 tristate "Snapshot target" 243 tristate "Snapshot target"
240 depends on BLK_DEV_DM 244 depends on BLK_DEV_DM
245 select DM_BUFIO
241 ---help--- 246 ---help---
242 Allow volume managers to take writable snapshots of a device. 247 Allow volume managers to take writable snapshots of a device.
243 248
@@ -250,12 +255,12 @@ config DM_THIN_PROVISIONING
250 Provides thin provisioning and snapshots that share a data store. 255 Provides thin provisioning and snapshots that share a data store.
251 256
252config DM_DEBUG_BLOCK_STACK_TRACING 257config DM_DEBUG_BLOCK_STACK_TRACING
253 boolean "Keep stack trace of thin provisioning block lock holders" 258 boolean "Keep stack trace of persistent data block lock holders"
254 depends on STACKTRACE_SUPPORT && DM_THIN_PROVISIONING 259 depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
255 select STACKTRACE 260 select STACKTRACE
256 ---help--- 261 ---help---
257 Enable this for messages that may help debug problems with the 262 Enable this for messages that may help debug problems with the
258 block manager locking used by thin provisioning. 263 block manager locking used by thin provisioning and caching.
259 264
260 If unsure, say N. 265 If unsure, say N.
261 266
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 2acc43fe0229..f26d83292579 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_MD_FAULTY) += faulty.o
32obj-$(CONFIG_BCACHE) += bcache/ 32obj-$(CONFIG_BCACHE) += bcache/
33obj-$(CONFIG_BLK_DEV_MD) += md-mod.o 33obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
34obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o 34obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
35obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o
35obj-$(CONFIG_DM_BUFIO) += dm-bufio.o 36obj-$(CONFIG_DM_BUFIO) += dm-bufio.o
36obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o 37obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o
37obj-$(CONFIG_DM_CRYPT) += dm-crypt.o 38obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 54bdd923316f..9ed42125514b 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -104,6 +104,8 @@ struct dm_bufio_client {
104 struct list_head reserved_buffers; 104 struct list_head reserved_buffers;
105 unsigned need_reserved_buffers; 105 unsigned need_reserved_buffers;
106 106
107 unsigned minimum_buffers;
108
107 struct hlist_head *cache_hash; 109 struct hlist_head *cache_hash;
108 wait_queue_head_t free_buffer_wait; 110 wait_queue_head_t free_buffer_wait;
109 111
@@ -861,8 +863,8 @@ static void __get_memory_limit(struct dm_bufio_client *c,
861 buffers = dm_bufio_cache_size_per_client >> 863 buffers = dm_bufio_cache_size_per_client >>
862 (c->sectors_per_block_bits + SECTOR_SHIFT); 864 (c->sectors_per_block_bits + SECTOR_SHIFT);
863 865
864 if (buffers < DM_BUFIO_MIN_BUFFERS) 866 if (buffers < c->minimum_buffers)
865 buffers = DM_BUFIO_MIN_BUFFERS; 867 buffers = c->minimum_buffers;
866 868
867 *limit_buffers = buffers; 869 *limit_buffers = buffers;
868 *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100; 870 *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
@@ -1350,6 +1352,34 @@ retry:
1350} 1352}
1351EXPORT_SYMBOL_GPL(dm_bufio_release_move); 1353EXPORT_SYMBOL_GPL(dm_bufio_release_move);
1352 1354
1355/*
1356 * Free the given buffer.
1357 *
1358 * This is just a hint, if the buffer is in use or dirty, this function
1359 * does nothing.
1360 */
1361void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
1362{
1363 struct dm_buffer *b;
1364
1365 dm_bufio_lock(c);
1366
1367 b = __find(c, block);
1368 if (b && likely(!b->hold_count) && likely(!b->state)) {
1369 __unlink_buffer(b);
1370 __free_buffer_wake(b);
1371 }
1372
1373 dm_bufio_unlock(c);
1374}
1375EXPORT_SYMBOL(dm_bufio_forget);
1376
1377void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
1378{
1379 c->minimum_buffers = n;
1380}
1381EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);
1382
1353unsigned dm_bufio_get_block_size(struct dm_bufio_client *c) 1383unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1354{ 1384{
1355 return c->block_size; 1385 return c->block_size;
@@ -1546,6 +1576,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
1546 INIT_LIST_HEAD(&c->reserved_buffers); 1576 INIT_LIST_HEAD(&c->reserved_buffers);
1547 c->need_reserved_buffers = reserved_buffers; 1577 c->need_reserved_buffers = reserved_buffers;
1548 1578
1579 c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;
1580
1549 init_waitqueue_head(&c->free_buffer_wait); 1581 init_waitqueue_head(&c->free_buffer_wait);
1550 c->async_write_error = 0; 1582 c->async_write_error = 0;
1551 1583
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index b142946a9e32..c096779a7292 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -108,6 +108,18 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c);
108 */ 108 */
109void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block); 109void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block);
110 110
111/*
112 * Free the given buffer.
113 * This is just a hint, if the buffer is in use or dirty, this function
114 * does nothing.
115 */
116void dm_bufio_forget(struct dm_bufio_client *c, sector_t block);
117
118/*
119 * Set the minimum number of buffers before cleanup happens.
120 */
121void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n);
122
111unsigned dm_bufio_get_block_size(struct dm_bufio_client *c); 123unsigned dm_bufio_get_block_size(struct dm_bufio_client *c);
112sector_t dm_bufio_get_device_size(struct dm_bufio_client *c); 124sector_t dm_bufio_get_device_size(struct dm_bufio_client *c);
113sector_t dm_bufio_get_block_number(struct dm_buffer *b); 125sector_t dm_bufio_get_block_number(struct dm_buffer *b);
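The two new dm-bufio calls above slot into the usual client lifecycle roughly as follows. This is only a sketch, modelled on the dm-snap-persistent conversion later in this diff; the 4KiB block size and readahead depth are illustrative values, not taken from any real caller:

/* Sketch of a dm-bufio client using the new calls; assumes "dm-bufio.h" and the usual kernel headers. */
static int read_one_block(struct block_device *bdev, sector_t block)
{
	struct dm_bufio_client *c;
	struct dm_buffer *bp;
	void *data;
	int r = 0;

	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
	if (IS_ERR(c))
		return PTR_ERR(c);

	/* Keep one current buffer plus the desired readahead buffers resident. */
	dm_bufio_set_minimum_buffers(c, 1 + 12);

	/* Start readahead for the following blocks (a real caller would bound
	 * this by dm_bufio_get_device_size(), as dm-snap-persistent does). */
	dm_bufio_prefetch(c, block + 1, 12);

	data = dm_bufio_read(c, block, &bp);
	if (IS_ERR(data)) {
		r = PTR_ERR(data);
		goto out;
	}

	/* ... use 'data' ... */

	dm_bufio_release(bp);
	dm_bufio_forget(c, block);	/* hint: this block will not be needed again */
out:
	dm_bufio_client_destroy(c);
	return r;
}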
diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c
new file mode 100644
index 000000000000..6c9049c51b2b
--- /dev/null
+++ b/drivers/md/dm-builtin.c
@@ -0,0 +1,48 @@
1#include "dm.h"
2
3/*
4 * The kobject release method must not be placed in the module itself,
5 * otherwise we are subject to module unload races.
6 *
7 * The release method is called when the last reference to the kobject is
8 * dropped. It may be called by any other kernel code that drops the last
9 * reference.
10 *
11 * The release method suffers from module unload race. We may prevent the
12 * module from being unloaded at the start of the release method (using
13 * increased module reference count or synchronizing against the release
14 * method), however there is no way to prevent the module from being
15 * unloaded at the end of the release method.
16 *
17 * If this code were placed in the dm module, the following race may
18 * happen:
19 * 1. Some other process takes a reference to dm kobject
20 * 2. The user issues ioctl function to unload the dm device
21 * 3. dm_sysfs_exit calls kobject_put, however the object is not released
22 * because of the other reference taken at step 1
23 * 4. dm_sysfs_exit waits on the completion
24 * 5. The other process that took the reference in step 1 drops it,
25 * dm_kobject_release is called from this process
26 * 6. dm_kobject_release calls complete()
27 * 7. a reschedule happens before dm_kobject_release returns
28 * 8. dm_sysfs_exit continues, the dm device is unloaded, module reference
29 * count is decremented
30 * 9. The user unloads the dm module
31 * 10. The other process that was rescheduled in step 7 continues to run,
32 * it is now executing code in unloaded module, so it crashes
33 *
34 * Note that if the process that takes the foreign reference to dm kobject
35 * has a low priority and the system is sufficiently loaded with
36 * higher-priority processes that prevent the low-priority process from
37 * being scheduled long enough, this bug may really happen.
38 *
39 * In order to fix this module unload race, we place the release method
40 * into a helper code that is compiled directly into the kernel.
41 */
42
43void dm_kobject_release(struct kobject *kobj)
44{
45 complete(dm_get_completion_from_kobject(kobj));
46}
47
48EXPORT_SYMBOL(dm_kobject_release);
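A self-contained sketch of the pattern the comment above describes: the release callback lives in built-in code and only signals a completion, while the module-side teardown drops its reference and then waits for that completion before freeing anything. All names below (struct my_dev, my_dev_teardown, ...) are hypothetical, not dm's:

/* Hypothetical illustration of the release-in-builtin-code pattern. */
struct my_dev {
	struct kobject kobj;		/* init with kobject_init_and_add() at setup */
	struct completion kobj_done;	/* init with init_completion() at setup */
};

/* Compiled into the kernel (not into the module), so it may run at any time. */
static void my_dev_kobj_release(struct kobject *kobj)
{
	struct my_dev *d = container_of(kobj, struct my_dev, kobj);

	complete(&d->kobj_done);
}

static struct kobj_type my_dev_ktype = {
	.release = my_dev_kobj_release,
};

/* Module-side teardown: drop our reference, then wait for the last one to go. */
static void my_dev_teardown(struct my_dev *d)
{
	kobject_put(&d->kobj);
	wait_for_completion(&d->kobj_done);
	/* Only now is it safe to free 'd' and let the module be unloaded. */
	kfree(d);
}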
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 64780ad73bb0..930e8c3d73e9 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -287,9 +287,8 @@ static struct entry *alloc_entry(struct entry_pool *ep)
287static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) 287static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
288{ 288{
289 struct entry *e = ep->entries + from_cblock(cblock); 289 struct entry *e = ep->entries + from_cblock(cblock);
290 list_del(&e->list);
291 290
292 INIT_LIST_HEAD(&e->list); 291 list_del_init(&e->list);
293 INIT_HLIST_NODE(&e->hlist); 292 INIT_HLIST_NODE(&e->hlist);
294 ep->nr_allocated++; 293 ep->nr_allocated++;
295 294
@@ -391,6 +390,10 @@ struct mq_policy {
391 */ 390 */
392 unsigned promote_threshold; 391 unsigned promote_threshold;
393 392
393 unsigned discard_promote_adjustment;
394 unsigned read_promote_adjustment;
395 unsigned write_promote_adjustment;
396
394 /* 397 /*
395 * The hash table allows us to quickly find an entry by origin 398 * The hash table allows us to quickly find an entry by origin
396 * block. Both pre_cache and cache entries are in here. 399 * block. Both pre_cache and cache entries are in here.
@@ -400,6 +403,10 @@ struct mq_policy {
400 struct hlist_head *table; 403 struct hlist_head *table;
401}; 404};
402 405
406#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
407#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
408#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
409
403/*----------------------------------------------------------------*/ 410/*----------------------------------------------------------------*/
404 411
405/* 412/*
@@ -642,25 +649,21 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
642 * We bias towards reads, since they can be demoted at no cost if they 649 * We bias towards reads, since they can be demoted at no cost if they
643 * haven't been dirtied. 650 * haven't been dirtied.
644 */ 651 */
645#define DISCARDED_PROMOTE_THRESHOLD 1
646#define READ_PROMOTE_THRESHOLD 4
647#define WRITE_PROMOTE_THRESHOLD 8
648
649static unsigned adjusted_promote_threshold(struct mq_policy *mq, 652static unsigned adjusted_promote_threshold(struct mq_policy *mq,
650 bool discarded_oblock, int data_dir) 653 bool discarded_oblock, int data_dir)
651{ 654{
652 if (data_dir == READ) 655 if (data_dir == READ)
653 return mq->promote_threshold + READ_PROMOTE_THRESHOLD; 656 return mq->promote_threshold + mq->read_promote_adjustment;
654 657
655 if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { 658 if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
656 /* 659 /*
657 * We don't need to do any copying at all, so give this a 660 * We don't need to do any copying at all, so give this a
658 * very low threshold. 661 * very low threshold.
659 */ 662 */
660 return DISCARDED_PROMOTE_THRESHOLD; 663 return mq->discard_promote_adjustment;
661 } 664 }
662 665
663 return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD; 666 return mq->promote_threshold + mq->write_promote_adjustment;
664} 667}
665 668
666static bool should_promote(struct mq_policy *mq, struct entry *e, 669static bool should_promote(struct mq_policy *mq, struct entry *e,
@@ -809,7 +812,7 @@ static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
809 bool can_migrate, bool discarded_oblock, 812 bool can_migrate, bool discarded_oblock,
810 int data_dir, struct policy_result *result) 813 int data_dir, struct policy_result *result)
811{ 814{
812 if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) { 815 if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
813 if (can_migrate) 816 if (can_migrate)
814 insert_in_cache(mq, oblock, result); 817 insert_in_cache(mq, oblock, result);
815 else 818 else
@@ -1135,20 +1138,28 @@ static int mq_set_config_value(struct dm_cache_policy *p,
1135 const char *key, const char *value) 1138 const char *key, const char *value)
1136{ 1139{
1137 struct mq_policy *mq = to_mq_policy(p); 1140 struct mq_policy *mq = to_mq_policy(p);
1138 enum io_pattern pattern;
1139 unsigned long tmp; 1141 unsigned long tmp;
1140 1142
1141 if (!strcasecmp(key, "random_threshold"))
1142 pattern = PATTERN_RANDOM;
1143 else if (!strcasecmp(key, "sequential_threshold"))
1144 pattern = PATTERN_SEQUENTIAL;
1145 else
1146 return -EINVAL;
1147
1148 if (kstrtoul(value, 10, &tmp)) 1143 if (kstrtoul(value, 10, &tmp))
1149 return -EINVAL; 1144 return -EINVAL;
1150 1145
1151 mq->tracker.thresholds[pattern] = tmp; 1146 if (!strcasecmp(key, "random_threshold")) {
1147 mq->tracker.thresholds[PATTERN_RANDOM] = tmp;
1148
1149 } else if (!strcasecmp(key, "sequential_threshold")) {
1150 mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;
1151
1152 } else if (!strcasecmp(key, "discard_promote_adjustment"))
1153 mq->discard_promote_adjustment = tmp;
1154
1155 else if (!strcasecmp(key, "read_promote_adjustment"))
1156 mq->read_promote_adjustment = tmp;
1157
1158 else if (!strcasecmp(key, "write_promote_adjustment"))
1159 mq->write_promote_adjustment = tmp;
1160
1161 else
1162 return -EINVAL;
1152 1163
1153 return 0; 1164 return 0;
1154} 1165}
@@ -1158,9 +1169,16 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsign
1158 ssize_t sz = 0; 1169 ssize_t sz = 0;
1159 struct mq_policy *mq = to_mq_policy(p); 1170 struct mq_policy *mq = to_mq_policy(p);
1160 1171
1161 DMEMIT("4 random_threshold %u sequential_threshold %u", 1172 DMEMIT("10 random_threshold %u "
1173 "sequential_threshold %u "
1174 "discard_promote_adjustment %u "
1175 "read_promote_adjustment %u "
1176 "write_promote_adjustment %u",
1162 mq->tracker.thresholds[PATTERN_RANDOM], 1177 mq->tracker.thresholds[PATTERN_RANDOM],
1163 mq->tracker.thresholds[PATTERN_SEQUENTIAL]); 1178 mq->tracker.thresholds[PATTERN_SEQUENTIAL],
1179 mq->discard_promote_adjustment,
1180 mq->read_promote_adjustment,
1181 mq->write_promote_adjustment);
1164 1182
1165 return 0; 1183 return 0;
1166} 1184}
@@ -1213,6 +1231,9 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1213 mq->hit_count = 0; 1231 mq->hit_count = 0;
1214 mq->generation = 0; 1232 mq->generation = 0;
1215 mq->promote_threshold = 0; 1233 mq->promote_threshold = 0;
1234 mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
1235 mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
1236 mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
1216 mutex_init(&mq->lock); 1237 mutex_init(&mq->lock);
1217 spin_lock_init(&mq->tick_lock); 1238 spin_lock_init(&mq->tick_lock);
1218 1239
@@ -1244,7 +1265,7 @@ bad_pre_cache_init:
1244 1265
1245static struct dm_cache_policy_type mq_policy_type = { 1266static struct dm_cache_policy_type mq_policy_type = {
1246 .name = "mq", 1267 .name = "mq",
1247 .version = {1, 1, 0}, 1268 .version = {1, 2, 0},
1248 .hint_size = 4, 1269 .hint_size = 4,
1249 .owner = THIS_MODULE, 1270 .owner = THIS_MODULE,
1250 .create = mq_create 1271 .create = mq_create
@@ -1252,10 +1273,11 @@ static struct dm_cache_policy_type mq_policy_type = {
1252 1273
1253static struct dm_cache_policy_type default_policy_type = { 1274static struct dm_cache_policy_type default_policy_type = {
1254 .name = "default", 1275 .name = "default",
1255 .version = {1, 1, 0}, 1276 .version = {1, 2, 0},
1256 .hint_size = 4, 1277 .hint_size = 4,
1257 .owner = THIS_MODULE, 1278 .owner = THIS_MODULE,
1258 .create = mq_create 1279 .create = mq_create,
1280 .real = &mq_policy_type
1259}; 1281};
1260 1282
1261static int __init mq_init(void) 1283static int __init mq_init(void)
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
index d80057968407..c1a3cee99b44 100644
--- a/drivers/md/dm-cache-policy.c
+++ b/drivers/md/dm-cache-policy.c
@@ -146,6 +146,10 @@ const char *dm_cache_policy_get_name(struct dm_cache_policy *p)
146{ 146{
147 struct dm_cache_policy_type *t = p->private; 147 struct dm_cache_policy_type *t = p->private;
148 148
149 /* if t->real is set then an alias was used (e.g. "default") */
150 if (t->real)
151 return t->real->name;
152
149 return t->name; 153 return t->name;
150} 154}
151EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); 155EXPORT_SYMBOL_GPL(dm_cache_policy_get_name);
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 052c00a84a5c..f50fe360c546 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -223,6 +223,12 @@ struct dm_cache_policy_type {
223 unsigned version[CACHE_POLICY_VERSION_SIZE]; 223 unsigned version[CACHE_POLICY_VERSION_SIZE];
224 224
225 /* 225 /*
226 * For use by an alias dm_cache_policy_type to point to the
227 * real dm_cache_policy_type.
228 */
229 struct dm_cache_policy_type *real;
230
231 /*
226 * Policies may store a hint for each each cache block. 232 * Policies may store a hint for each each cache block.
227 * Currently the size of this hint must be 0 or 4 bytes but we 233 * Currently the size of this hint must be 0 or 4 bytes but we
228 * expect to relax this in future. 234 * expect to relax this in future.
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1b1469ebe5cb..09334c275c79 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2826,12 +2826,13 @@ static void cache_resume(struct dm_target *ti)
2826/* 2826/*
2827 * Status format: 2827 * Status format:
2828 * 2828 *
2829 * <#used metadata blocks>/<#total metadata blocks> 2829 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
2830 * <cache block size> <#used cache blocks>/<#total cache blocks>
2830 * <#read hits> <#read misses> <#write hits> <#write misses> 2831 * <#read hits> <#read misses> <#write hits> <#write misses>
2831 * <#demotions> <#promotions> <#blocks in cache> <#dirty> 2832 * <#demotions> <#promotions> <#dirty>
2832 * <#features> <features>* 2833 * <#features> <features>*
2833 * <#core args> <core args> 2834 * <#core args> <core args>
2834 * <#policy args> <policy args>* 2835 * <policy name> <#policy args> <policy args>*
2835 */ 2836 */
2836static void cache_status(struct dm_target *ti, status_type_t type, 2837static void cache_status(struct dm_target *ti, status_type_t type,
2837 unsigned status_flags, char *result, unsigned maxlen) 2838 unsigned status_flags, char *result, unsigned maxlen)
@@ -2869,17 +2870,20 @@ static void cache_status(struct dm_target *ti, status_type_t type,
2869 2870
2870 residency = policy_residency(cache->policy); 2871 residency = policy_residency(cache->policy);
2871 2872
2872 DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ", 2873 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %llu ",
2874 (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
2873 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 2875 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2874 (unsigned long long)nr_blocks_metadata, 2876 (unsigned long long)nr_blocks_metadata,
2877 cache->sectors_per_block,
2878 (unsigned long long) from_cblock(residency),
2879 (unsigned long long) from_cblock(cache->cache_size),
2875 (unsigned) atomic_read(&cache->stats.read_hit), 2880 (unsigned) atomic_read(&cache->stats.read_hit),
2876 (unsigned) atomic_read(&cache->stats.read_miss), 2881 (unsigned) atomic_read(&cache->stats.read_miss),
2877 (unsigned) atomic_read(&cache->stats.write_hit), 2882 (unsigned) atomic_read(&cache->stats.write_hit),
2878 (unsigned) atomic_read(&cache->stats.write_miss), 2883 (unsigned) atomic_read(&cache->stats.write_miss),
2879 (unsigned) atomic_read(&cache->stats.demotion), 2884 (unsigned) atomic_read(&cache->stats.demotion),
2880 (unsigned) atomic_read(&cache->stats.promotion), 2885 (unsigned) atomic_read(&cache->stats.promotion),
2881 (unsigned long long) from_cblock(residency), 2886 (unsigned long long) from_cblock(cache->nr_dirty));
2882 cache->nr_dirty);
2883 2887
2884 if (writethrough_mode(&cache->features)) 2888 if (writethrough_mode(&cache->features))
2885 DMEMIT("1 writethrough "); 2889 DMEMIT("1 writethrough ");
@@ -2896,6 +2900,8 @@ static void cache_status(struct dm_target *ti, status_type_t type,
2896 } 2900 }
2897 2901
2898 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 2902 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2903
2904 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
2899 if (sz < maxlen) { 2905 if (sz < maxlen) {
2900 r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz); 2906 r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2901 if (r) 2907 if (r)
@@ -3129,7 +3135,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3129 3135
3130static struct target_type cache_target = { 3136static struct target_type cache_target = {
3131 .name = "cache", 3137 .name = "cache",
3132 .version = {1, 2, 0}, 3138 .version = {1, 3, 0},
3133 .module = THIS_MODULE, 3139 .module = THIS_MODULE,
3134 .ctr = cache_ctr, 3140 .ctr = cache_ctr,
3135 .dtr = cache_dtr, 3141 .dtr = cache_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 2f91d6d4a2cc..a8a511c053a5 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -24,7 +24,6 @@ struct delay_c {
24 struct work_struct flush_expired_bios; 24 struct work_struct flush_expired_bios;
25 struct list_head delayed_bios; 25 struct list_head delayed_bios;
26 atomic_t may_delay; 26 atomic_t may_delay;
27 mempool_t *delayed_pool;
28 27
29 struct dm_dev *dev_read; 28 struct dm_dev *dev_read;
30 sector_t start_read; 29 sector_t start_read;
@@ -40,14 +39,11 @@ struct delay_c {
40struct dm_delay_info { 39struct dm_delay_info {
41 struct delay_c *context; 40 struct delay_c *context;
42 struct list_head list; 41 struct list_head list;
43 struct bio *bio;
44 unsigned long expires; 42 unsigned long expires;
45}; 43};
46 44
47static DEFINE_MUTEX(delayed_bios_lock); 45static DEFINE_MUTEX(delayed_bios_lock);
48 46
49static struct kmem_cache *delayed_cache;
50
51static void handle_delayed_timer(unsigned long data) 47static void handle_delayed_timer(unsigned long data)
52{ 48{
53 struct delay_c *dc = (struct delay_c *)data; 49 struct delay_c *dc = (struct delay_c *)data;
@@ -87,13 +83,14 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
87 mutex_lock(&delayed_bios_lock); 83 mutex_lock(&delayed_bios_lock);
88 list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { 84 list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
89 if (flush_all || time_after_eq(jiffies, delayed->expires)) { 85 if (flush_all || time_after_eq(jiffies, delayed->expires)) {
86 struct bio *bio = dm_bio_from_per_bio_data(delayed,
87 sizeof(struct dm_delay_info));
90 list_del(&delayed->list); 88 list_del(&delayed->list);
91 bio_list_add(&flush_bios, delayed->bio); 89 bio_list_add(&flush_bios, bio);
92 if ((bio_data_dir(delayed->bio) == WRITE)) 90 if ((bio_data_dir(bio) == WRITE))
93 delayed->context->writes--; 91 delayed->context->writes--;
94 else 92 else
95 delayed->context->reads--; 93 delayed->context->reads--;
96 mempool_free(delayed, dc->delayed_pool);
97 continue; 94 continue;
98 } 95 }
99 96
@@ -185,12 +182,6 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
185 } 182 }
186 183
187out: 184out:
188 dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache);
189 if (!dc->delayed_pool) {
190 DMERR("Couldn't create delayed bio pool.");
191 goto bad_dev_write;
192 }
193
194 dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); 185 dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
195 if (!dc->kdelayd_wq) { 186 if (!dc->kdelayd_wq) {
196 DMERR("Couldn't start kdelayd"); 187 DMERR("Couldn't start kdelayd");
@@ -206,12 +197,11 @@ out:
206 197
207 ti->num_flush_bios = 1; 198 ti->num_flush_bios = 1;
208 ti->num_discard_bios = 1; 199 ti->num_discard_bios = 1;
200 ti->per_bio_data_size = sizeof(struct dm_delay_info);
209 ti->private = dc; 201 ti->private = dc;
210 return 0; 202 return 0;
211 203
212bad_queue: 204bad_queue:
213 mempool_destroy(dc->delayed_pool);
214bad_dev_write:
215 if (dc->dev_write) 205 if (dc->dev_write)
216 dm_put_device(ti, dc->dev_write); 206 dm_put_device(ti, dc->dev_write);
217bad_dev_read: 207bad_dev_read:
@@ -232,7 +222,6 @@ static void delay_dtr(struct dm_target *ti)
232 if (dc->dev_write) 222 if (dc->dev_write)
233 dm_put_device(ti, dc->dev_write); 223 dm_put_device(ti, dc->dev_write);
234 224
235 mempool_destroy(dc->delayed_pool);
236 kfree(dc); 225 kfree(dc);
237} 226}
238 227
@@ -244,10 +233,9 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
244 if (!delay || !atomic_read(&dc->may_delay)) 233 if (!delay || !atomic_read(&dc->may_delay))
245 return 1; 234 return 1;
246 235
247 delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO); 236 delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
248 237
249 delayed->context = dc; 238 delayed->context = dc;
250 delayed->bio = bio;
251 delayed->expires = expires = jiffies + (delay * HZ / 1000); 239 delayed->expires = expires = jiffies + (delay * HZ / 1000);
252 240
253 mutex_lock(&delayed_bios_lock); 241 mutex_lock(&delayed_bios_lock);
@@ -356,13 +344,7 @@ static struct target_type delay_target = {
356 344
357static int __init dm_delay_init(void) 345static int __init dm_delay_init(void)
358{ 346{
359 int r = -ENOMEM; 347 int r;
360
361 delayed_cache = KMEM_CACHE(dm_delay_info, 0);
362 if (!delayed_cache) {
363 DMERR("Couldn't create delayed bio cache.");
364 goto bad_memcache;
365 }
366 348
367 r = dm_register_target(&delay_target); 349 r = dm_register_target(&delay_target);
368 if (r < 0) { 350 if (r < 0) {
@@ -373,15 +355,12 @@ static int __init dm_delay_init(void)
373 return 0; 355 return 0;
374 356
375bad_register: 357bad_register:
376 kmem_cache_destroy(delayed_cache);
377bad_memcache:
378 return r; 358 return r;
379} 359}
380 360
381static void __exit dm_delay_exit(void) 361static void __exit dm_delay_exit(void)
382{ 362{
383 dm_unregister_target(&delay_target); 363 dm_unregister_target(&delay_target);
384 kmem_cache_destroy(delayed_cache);
385} 364}
386 365
387/* Module hooks */ 366/* Module hooks */
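The conversion above replaces a private mempool with DM's per-bio data, a generic pattern: the target declares ti->per_bio_data_size in its constructor, reaches its per-bio state with dm_per_bio_data(), and maps back to the owning bio with dm_bio_from_per_bio_data(). A minimal sketch with a hypothetical target (struct my_info, my_ctr, my_map are made-up names):

/* Generic per-bio-data pattern used above; 'my_info' is a hypothetical example. */
struct my_info {
	struct list_head list;
	unsigned long expires;
};

static int my_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	/* DM allocates this much extra space alongside every bio cloned for the target. */
	ti->per_bio_data_size = sizeof(struct my_info);
	return 0;
}

static int my_map(struct dm_target *ti, struct bio *bio)
{
	struct my_info *info = dm_per_bio_data(bio, sizeof(struct my_info));

	info->expires = jiffies + HZ;
	/* ... queue 'info' somewhere; later recover the bio with:
	 * struct bio *b = dm_bio_from_per_bio_data(info, sizeof(struct my_info)); */
	return DM_MAPIO_SUBMITTED;
}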
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 9429159d9ee3..b953db6cc229 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -10,10 +10,11 @@
10#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
11#include <linux/dm-log-userspace.h> 11#include <linux/dm-log-userspace.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/workqueue.h>
13 14
14#include "dm-log-userspace-transfer.h" 15#include "dm-log-userspace-transfer.h"
15 16
16#define DM_LOG_USERSPACE_VSN "1.1.0" 17#define DM_LOG_USERSPACE_VSN "1.3.0"
17 18
18struct flush_entry { 19struct flush_entry {
19 int type; 20 int type;
@@ -58,6 +59,18 @@ struct log_c {
58 spinlock_t flush_lock; 59 spinlock_t flush_lock;
59 struct list_head mark_list; 60 struct list_head mark_list;
60 struct list_head clear_list; 61 struct list_head clear_list;
62
63 /*
64 * Workqueue for flush of clear region requests.
65 */
66 struct workqueue_struct *dmlog_wq;
67 struct delayed_work flush_log_work;
68 atomic_t sched_flush;
69
70 /*
71 * Combine userspace flush and mark requests for efficiency.
72 */
73 uint32_t integrated_flush;
61}; 74};
62 75
63static mempool_t *flush_entry_pool; 76static mempool_t *flush_entry_pool;
@@ -122,6 +135,9 @@ static int build_constructor_string(struct dm_target *ti,
122 135
123 *ctr_str = NULL; 136 *ctr_str = NULL;
124 137
138 /*
139 * Determine overall size of the string.
140 */
125 for (i = 0, str_size = 0; i < argc; i++) 141 for (i = 0, str_size = 0; i < argc; i++)
126 str_size += strlen(argv[i]) + 1; /* +1 for space between args */ 142 str_size += strlen(argv[i]) + 1; /* +1 for space between args */
127 143
@@ -141,18 +157,39 @@ static int build_constructor_string(struct dm_target *ti,
141 return str_size; 157 return str_size;
142} 158}
143 159
160static void do_flush(struct work_struct *work)
161{
162 int r;
163 struct log_c *lc = container_of(work, struct log_c, flush_log_work.work);
164
165 atomic_set(&lc->sched_flush, 0);
166
167 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL);
168
169 if (r)
170 dm_table_event(lc->ti->table);
171}
172
144/* 173/*
145 * userspace_ctr 174 * userspace_ctr
146 * 175 *
147 * argv contains: 176 * argv contains:
148 * <UUID> <other args> 177 * <UUID> [integrated_flush] <other args>
149 * Where 'other args' is the userspace implementation specific log 178 * Where 'other args' are the userspace implementation-specific log
150 * arguments. An example might be: 179 * arguments.
151 * <UUID> clustered-disk <arg count> <log dev> <region_size> [[no]sync] 180 *
181 * Example:
182 * <UUID> [integrated_flush] clustered-disk <arg count> <log dev>
183 * <region_size> [[no]sync]
184 *
185 * This module strips off the <UUID> and uses it for identification
186 * purposes when communicating with userspace about a log.
152 * 187 *
153 * So, this module will strip off the <UUID> for identification purposes 188 * If integrated_flush is defined, the kernel combines flush
154 * when communicating with userspace about a log; but will pass on everything 189 * and mark requests.
155 * else. 190 *
191 * The rest of the line, beginning with 'clustered-disk', is passed
192 * to the userspace ctr function.
156 */ 193 */
157static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, 194static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
158 unsigned argc, char **argv) 195 unsigned argc, char **argv)
@@ -188,12 +225,22 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
188 return -EINVAL; 225 return -EINVAL;
189 } 226 }
190 227
228 lc->usr_argc = argc;
229
191 strncpy(lc->uuid, argv[0], DM_UUID_LEN); 230 strncpy(lc->uuid, argv[0], DM_UUID_LEN);
231 argc--;
232 argv++;
192 spin_lock_init(&lc->flush_lock); 233 spin_lock_init(&lc->flush_lock);
193 INIT_LIST_HEAD(&lc->mark_list); 234 INIT_LIST_HEAD(&lc->mark_list);
194 INIT_LIST_HEAD(&lc->clear_list); 235 INIT_LIST_HEAD(&lc->clear_list);
195 236
196 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); 237 if (!strcasecmp(argv[0], "integrated_flush")) {
238 lc->integrated_flush = 1;
239 argc--;
240 argv++;
241 }
242
243 str_size = build_constructor_string(ti, argc, argv, &ctr_str);
197 if (str_size < 0) { 244 if (str_size < 0) {
198 kfree(lc); 245 kfree(lc);
199 return str_size; 246 return str_size;
@@ -246,6 +293,19 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
246 DMERR("Failed to register %s with device-mapper", 293 DMERR("Failed to register %s with device-mapper",
247 devices_rdata); 294 devices_rdata);
248 } 295 }
296
297 if (lc->integrated_flush) {
298 lc->dmlog_wq = alloc_workqueue("dmlogd", WQ_MEM_RECLAIM, 0);
299 if (!lc->dmlog_wq) {
300 DMERR("couldn't start dmlogd");
301 r = -ENOMEM;
302 goto out;
303 }
304
305 INIT_DELAYED_WORK(&lc->flush_log_work, do_flush);
306 atomic_set(&lc->sched_flush, 0);
307 }
308
249out: 309out:
250 kfree(devices_rdata); 310 kfree(devices_rdata);
251 if (r) { 311 if (r) {
@@ -253,7 +313,6 @@ out:
253 kfree(ctr_str); 313 kfree(ctr_str);
254 } else { 314 } else {
255 lc->usr_argv_str = ctr_str; 315 lc->usr_argv_str = ctr_str;
256 lc->usr_argc = argc;
257 log->context = lc; 316 log->context = lc;
258 } 317 }
259 318
@@ -264,9 +323,16 @@ static void userspace_dtr(struct dm_dirty_log *log)
264{ 323{
265 struct log_c *lc = log->context; 324 struct log_c *lc = log->context;
266 325
326 if (lc->integrated_flush) {
327 /* flush workqueue */
328 if (atomic_read(&lc->sched_flush))
329 flush_delayed_work(&lc->flush_log_work);
330
331 destroy_workqueue(lc->dmlog_wq);
332 }
333
267 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, 334 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
268 NULL, 0, 335 NULL, 0, NULL, NULL);
269 NULL, NULL);
270 336
271 if (lc->log_dev) 337 if (lc->log_dev)
272 dm_put_device(lc->ti, lc->log_dev); 338 dm_put_device(lc->ti, lc->log_dev);
@@ -283,8 +349,7 @@ static int userspace_presuspend(struct dm_dirty_log *log)
283 struct log_c *lc = log->context; 349 struct log_c *lc = log->context;
284 350
285 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, 351 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND,
286 NULL, 0, 352 NULL, 0, NULL, NULL);
287 NULL, NULL);
288 353
289 return r; 354 return r;
290} 355}
@@ -294,9 +359,14 @@ static int userspace_postsuspend(struct dm_dirty_log *log)
294 int r; 359 int r;
295 struct log_c *lc = log->context; 360 struct log_c *lc = log->context;
296 361
362 /*
363 * Run planned flush earlier.
364 */
365 if (lc->integrated_flush && atomic_read(&lc->sched_flush))
366 flush_delayed_work(&lc->flush_log_work);
367
297 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, 368 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,
298 NULL, 0, 369 NULL, 0, NULL, NULL);
299 NULL, NULL);
300 370
301 return r; 371 return r;
302} 372}
@@ -308,8 +378,7 @@ static int userspace_resume(struct dm_dirty_log *log)
308 378
309 lc->in_sync_hint = 0; 379 lc->in_sync_hint = 0;
310 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, 380 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,
311 NULL, 0, 381 NULL, 0, NULL, NULL);
312 NULL, NULL);
313 382
314 return r; 383 return r;
315} 384}
@@ -405,7 +474,8 @@ static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
405 return r; 474 return r;
406} 475}
407 476
408static int flush_by_group(struct log_c *lc, struct list_head *flush_list) 477static int flush_by_group(struct log_c *lc, struct list_head *flush_list,
478 int flush_with_payload)
409{ 479{
410 int r = 0; 480 int r = 0;
411 int count; 481 int count;
@@ -431,15 +501,29 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
431 break; 501 break;
432 } 502 }
433 503
434 r = userspace_do_request(lc, lc->uuid, type, 504 if (flush_with_payload) {
435 (char *)(group), 505 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
436 count * sizeof(uint64_t), 506 (char *)(group),
437 NULL, NULL); 507 count * sizeof(uint64_t),
438 if (r) { 508 NULL, NULL);
439 /* Group send failed. Attempt one-by-one. */ 509 /*
440 list_splice_init(&tmp_list, flush_list); 510 * Integrated flush failed.
441 r = flush_one_by_one(lc, flush_list); 511 */
442 break; 512 if (r)
513 break;
514 } else {
515 r = userspace_do_request(lc, lc->uuid, type,
516 (char *)(group),
517 count * sizeof(uint64_t),
518 NULL, NULL);
519 if (r) {
520 /*
521 * Group send failed. Attempt one-by-one.
522 */
523 list_splice_init(&tmp_list, flush_list);
524 r = flush_one_by_one(lc, flush_list);
525 break;
526 }
443 } 527 }
444 } 528 }
445 529
@@ -476,6 +560,8 @@ static int userspace_flush(struct dm_dirty_log *log)
476 struct log_c *lc = log->context; 560 struct log_c *lc = log->context;
477 LIST_HEAD(mark_list); 561 LIST_HEAD(mark_list);
478 LIST_HEAD(clear_list); 562 LIST_HEAD(clear_list);
563 int mark_list_is_empty;
564 int clear_list_is_empty;
479 struct flush_entry *fe, *tmp_fe; 565 struct flush_entry *fe, *tmp_fe;
480 566
481 spin_lock_irqsave(&lc->flush_lock, flags); 567 spin_lock_irqsave(&lc->flush_lock, flags);
@@ -483,23 +569,51 @@ static int userspace_flush(struct dm_dirty_log *log)
483 list_splice_init(&lc->clear_list, &clear_list); 569 list_splice_init(&lc->clear_list, &clear_list);
484 spin_unlock_irqrestore(&lc->flush_lock, flags); 570 spin_unlock_irqrestore(&lc->flush_lock, flags);
485 571
486 if (list_empty(&mark_list) && list_empty(&clear_list)) 572 mark_list_is_empty = list_empty(&mark_list);
573 clear_list_is_empty = list_empty(&clear_list);
574
575 if (mark_list_is_empty && clear_list_is_empty)
487 return 0; 576 return 0;
488 577
489 r = flush_by_group(lc, &mark_list); 578 r = flush_by_group(lc, &clear_list, 0);
490 if (r) 579 if (r)
491 goto fail; 580 goto out;
492 581
493 r = flush_by_group(lc, &clear_list); 582 if (!lc->integrated_flush) {
583 r = flush_by_group(lc, &mark_list, 0);
584 if (r)
585 goto out;
586 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
587 NULL, 0, NULL, NULL);
588 goto out;
589 }
590
591 /*
592 * Send integrated flush request with mark_list as payload.
593 */
594 r = flush_by_group(lc, &mark_list, 1);
494 if (r) 595 if (r)
495 goto fail; 596 goto out;
496 597
497 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, 598 if (mark_list_is_empty && !atomic_read(&lc->sched_flush)) {
498 NULL, 0, NULL, NULL); 599 /*
600 * When there are only clear region requests,
601 * we schedule a flush in the future.
602 */
603 queue_delayed_work(lc->dmlog_wq, &lc->flush_log_work, 3 * HZ);
604 atomic_set(&lc->sched_flush, 1);
605 } else {
606 /*
607 * Cancel pending flush because we
608 * have already flushed in mark_region.
609 */
610 cancel_delayed_work(&lc->flush_log_work);
611 atomic_set(&lc->sched_flush, 0);
612 }
499 613
500fail: 614out:
501 /* 615 /*
502 * We can safely remove these entries, even if failure. 616 * We can safely remove these entries, even after failure.
503 * Calling code will receive an error and will know that 617 * Calling code will receive an error and will know that
504 * the log facility has failed. 618 * the log facility has failed.
505 */ 619 */
@@ -603,8 +717,7 @@ static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
603 717
604 rdata_size = sizeof(pkg); 718 rdata_size = sizeof(pkg);
605 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, 719 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
606 NULL, 0, 720 NULL, 0, (char *)&pkg, &rdata_size);
607 (char *)&pkg, &rdata_size);
608 721
609 *region = pkg.r; 722 *region = pkg.r;
610 return (r) ? r : (int)pkg.i; 723 return (r) ? r : (int)pkg.i;
@@ -630,8 +743,7 @@ static void userspace_set_region_sync(struct dm_dirty_log *log,
630 pkg.i = (int64_t)in_sync; 743 pkg.i = (int64_t)in_sync;
631 744
632 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, 745 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
633 (char *)&pkg, sizeof(pkg), 746 (char *)&pkg, sizeof(pkg), NULL, NULL);
634 NULL, NULL);
635 747
636 /* 748 /*
637 * It would be nice to be able to report failures. 749 * It would be nice to be able to report failures.
@@ -657,8 +769,7 @@ static region_t userspace_get_sync_count(struct dm_dirty_log *log)
657 769
658 rdata_size = sizeof(sync_count); 770 rdata_size = sizeof(sync_count);
659 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, 771 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
660 NULL, 0, 772 NULL, 0, (char *)&sync_count, &rdata_size);
661 (char *)&sync_count, &rdata_size);
662 773
663 if (r) 774 if (r)
664 return 0; 775 return 0;
@@ -685,8 +796,7 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
685 switch (status_type) { 796 switch (status_type) {
686 case STATUSTYPE_INFO: 797 case STATUSTYPE_INFO:
687 r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, 798 r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
688 NULL, 0, 799 NULL, 0, result, &sz);
689 result, &sz);
690 800
691 if (r) { 801 if (r) {
692 sz = 0; 802 sz = 0;
@@ -699,8 +809,10 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
699 BUG_ON(!table_args); /* There will always be a ' ' */ 809 BUG_ON(!table_args); /* There will always be a ' ' */
700 table_args++; 810 table_args++;
701 811
702 DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc, 812 DMEMIT("%s %u %s ", log->type->name, lc->usr_argc, lc->uuid);
703 lc->uuid, table_args); 813 if (lc->integrated_flush)
814 DMEMIT("integrated_flush ");
815 DMEMIT("%s ", table_args);
704 break; 816 break;
705 } 817 }
706 return (r) ? 0 : (int)sz; 818 return (r) ? 0 : (int)sz;
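The flush handling above boils down to a small coalescing scheme: mark requests carry the flush with them as payload, while clear-only flushes are deferred a few seconds so several can be folded into one userspace round trip. A stripped-down sketch of that scheme; send_flush_to_userspace() is a hypothetical stand-in for userspace_do_request(..., DM_ULOG_FLUSH, ...):

/* Stripped-down sketch of the flush-coalescing scheme; helper names are hypothetical. */
static void deferred_flush_fn(struct work_struct *work)
{
	struct log_c *lc = container_of(work, struct log_c, flush_log_work.work);

	atomic_set(&lc->sched_flush, 0);
	send_flush_to_userspace(lc);	/* hypothetical stand-in for the DM_ULOG_FLUSH request */
}

static void coalesced_flush(struct log_c *lc, bool only_clear_requests)
{
	if (only_clear_requests && !atomic_read(&lc->sched_flush)) {
		/* Nothing urgent: flush at most once every few seconds. */
		queue_delayed_work(lc->dmlog_wq, &lc->flush_log_work, 3 * HZ);
		atomic_set(&lc->sched_flush, 1);
	} else {
		/* Mark requests were already flushed inline; drop any pending deferred flush. */
		cancel_delayed_work(&lc->flush_log_work);
		atomic_set(&lc->sched_flush, 0);
	}
}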
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2d2b1b7588d7..afc3d017de4c 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -13,10 +13,13 @@
13#include <linux/export.h> 13#include <linux/export.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/dm-io.h> 15#include <linux/dm-io.h>
16#include "dm-bufio.h"
16 17
17#define DM_MSG_PREFIX "persistent snapshot" 18#define DM_MSG_PREFIX "persistent snapshot"
18#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ 19#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */
19 20
21#define DM_PREFETCH_CHUNKS 12
22
20/*----------------------------------------------------------------- 23/*-----------------------------------------------------------------
21 * Persistent snapshots, by persistent we mean that the snapshot 24 * Persistent snapshots, by persistent we mean that the snapshot
22 * will survive a reboot. 25 * will survive a reboot.
@@ -257,6 +260,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
257 INIT_WORK_ONSTACK(&req.work, do_metadata); 260 INIT_WORK_ONSTACK(&req.work, do_metadata);
258 queue_work(ps->metadata_wq, &req.work); 261 queue_work(ps->metadata_wq, &req.work);
259 flush_workqueue(ps->metadata_wq); 262 flush_workqueue(ps->metadata_wq);
263 destroy_work_on_stack(&req.work);
260 264
261 return req.result; 265 return req.result;
262} 266}
@@ -401,17 +405,18 @@ static int write_header(struct pstore *ps)
401/* 405/*
402 * Access functions for the disk exceptions, these do the endian conversions. 406 * Access functions for the disk exceptions, these do the endian conversions.
403 */ 407 */
404static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) 408static struct disk_exception *get_exception(struct pstore *ps, void *ps_area,
409 uint32_t index)
405{ 410{
406 BUG_ON(index >= ps->exceptions_per_area); 411 BUG_ON(index >= ps->exceptions_per_area);
407 412
408 return ((struct disk_exception *) ps->area) + index; 413 return ((struct disk_exception *) ps_area) + index;
409} 414}
410 415
411static void read_exception(struct pstore *ps, 416static void read_exception(struct pstore *ps, void *ps_area,
412 uint32_t index, struct core_exception *result) 417 uint32_t index, struct core_exception *result)
413{ 418{
414 struct disk_exception *de = get_exception(ps, index); 419 struct disk_exception *de = get_exception(ps, ps_area, index);
415 420
416 /* copy it */ 421 /* copy it */
417 result->old_chunk = le64_to_cpu(de->old_chunk); 422 result->old_chunk = le64_to_cpu(de->old_chunk);
@@ -421,7 +426,7 @@ static void read_exception(struct pstore *ps,
421static void write_exception(struct pstore *ps, 426static void write_exception(struct pstore *ps,
422 uint32_t index, struct core_exception *e) 427 uint32_t index, struct core_exception *e)
423{ 428{
424 struct disk_exception *de = get_exception(ps, index); 429 struct disk_exception *de = get_exception(ps, ps->area, index);
425 430
426 /* copy it */ 431 /* copy it */
427 de->old_chunk = cpu_to_le64(e->old_chunk); 432 de->old_chunk = cpu_to_le64(e->old_chunk);
@@ -430,7 +435,7 @@ static void write_exception(struct pstore *ps,
430 435
431static void clear_exception(struct pstore *ps, uint32_t index) 436static void clear_exception(struct pstore *ps, uint32_t index)
432{ 437{
433 struct disk_exception *de = get_exception(ps, index); 438 struct disk_exception *de = get_exception(ps, ps->area, index);
434 439
435 /* clear it */ 440 /* clear it */
436 de->old_chunk = 0; 441 de->old_chunk = 0;
@@ -442,7 +447,7 @@ static void clear_exception(struct pstore *ps, uint32_t index)
442 * 'full' is filled in to indicate if the area has been 447 * 'full' is filled in to indicate if the area has been
443 * filled. 448 * filled.
444 */ 449 */
445static int insert_exceptions(struct pstore *ps, 450static int insert_exceptions(struct pstore *ps, void *ps_area,
446 int (*callback)(void *callback_context, 451 int (*callback)(void *callback_context,
447 chunk_t old, chunk_t new), 452 chunk_t old, chunk_t new),
448 void *callback_context, 453 void *callback_context,
@@ -456,7 +461,7 @@ static int insert_exceptions(struct pstore *ps,
456 *full = 1; 461 *full = 1;
457 462
458 for (i = 0; i < ps->exceptions_per_area; i++) { 463 for (i = 0; i < ps->exceptions_per_area; i++) {
459 read_exception(ps, i, &e); 464 read_exception(ps, ps_area, i, &e);
460 465
461 /* 466 /*
462 * If the new_chunk is pointing at the start of 467 * If the new_chunk is pointing at the start of
@@ -493,26 +498,72 @@ static int read_exceptions(struct pstore *ps,
493 void *callback_context) 498 void *callback_context)
494{ 499{
495 int r, full = 1; 500 int r, full = 1;
501 struct dm_bufio_client *client;
502 chunk_t prefetch_area = 0;
503
504 client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev,
505 ps->store->chunk_size << SECTOR_SHIFT,
506 1, 0, NULL, NULL);
507
508 if (IS_ERR(client))
509 return PTR_ERR(client);
510
511 /*
512 * Setup for one current buffer + desired readahead buffers.
513 */
514 dm_bufio_set_minimum_buffers(client, 1 + DM_PREFETCH_CHUNKS);
496 515
497 /* 516 /*
498 * Keeping reading chunks and inserting exceptions until 517 * Keeping reading chunks and inserting exceptions until
499 * we find a partially full area. 518 * we find a partially full area.
500 */ 519 */
501 for (ps->current_area = 0; full; ps->current_area++) { 520 for (ps->current_area = 0; full; ps->current_area++) {
502 r = area_io(ps, READ); 521 struct dm_buffer *bp;
503 if (r) 522 void *area;
504 return r; 523 chunk_t chunk;
524
525 if (unlikely(prefetch_area < ps->current_area))
526 prefetch_area = ps->current_area;
527
528 if (DM_PREFETCH_CHUNKS) do {
529 chunk_t pf_chunk = area_location(ps, prefetch_area);
530 if (unlikely(pf_chunk >= dm_bufio_get_device_size(client)))
531 break;
532 dm_bufio_prefetch(client, pf_chunk, 1);
533 prefetch_area++;
534 if (unlikely(!prefetch_area))
535 break;
536 } while (prefetch_area <= ps->current_area + DM_PREFETCH_CHUNKS);
537
538 chunk = area_location(ps, ps->current_area);
539
540 area = dm_bufio_read(client, chunk, &bp);
541 if (unlikely(IS_ERR(area))) {
542 r = PTR_ERR(area);
543 goto ret_destroy_bufio;
544 }
505 545
506 r = insert_exceptions(ps, callback, callback_context, &full); 546 r = insert_exceptions(ps, area, callback, callback_context,
507 if (r) 547 &full);
508 return r; 548
549 dm_bufio_release(bp);
550
551 dm_bufio_forget(client, chunk);
552
553 if (unlikely(r))
554 goto ret_destroy_bufio;
509 } 555 }
510 556
511 ps->current_area--; 557 ps->current_area--;
512 558
513 skip_metadata(ps); 559 skip_metadata(ps);
514 560
515 return 0; 561 r = 0;
562
563ret_destroy_bufio:
564 dm_bufio_client_destroy(client);
565
566 return r;
516} 567}
517 568
518static struct pstore *get_info(struct dm_exception_store *store) 569static struct pstore *get_info(struct dm_exception_store *store)
@@ -733,7 +784,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
733 ps->current_committed = ps->exceptions_per_area; 784 ps->current_committed = ps->exceptions_per_area;
734 } 785 }
735 786
736 read_exception(ps, ps->current_committed - 1, &ce); 787 read_exception(ps, ps->area, ps->current_committed - 1, &ce);
737 *last_old_chunk = ce.old_chunk; 788 *last_old_chunk = ce.old_chunk;
738 *last_new_chunk = ce.new_chunk; 789 *last_new_chunk = ce.new_chunk;
739 790
@@ -743,8 +794,8 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
743 */ 794 */
744 for (nr_consecutive = 1; nr_consecutive < ps->current_committed; 795 for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
745 nr_consecutive++) { 796 nr_consecutive++) {
746 read_exception(ps, ps->current_committed - 1 - nr_consecutive, 797 read_exception(ps, ps->area,
747 &ce); 798 ps->current_committed - 1 - nr_consecutive, &ce);
748 if (ce.old_chunk != *last_old_chunk - nr_consecutive || 799 if (ce.old_chunk != *last_old_chunk - nr_consecutive ||
749 ce.new_chunk != *last_new_chunk - nr_consecutive) 800 ce.new_chunk != *last_new_chunk - nr_consecutive)
750 break; 801 break;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 944690bafd93..717718558bd9 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -610,12 +610,12 @@ static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
610 return NULL; 610 return NULL;
611} 611}
612 612
613static struct dm_exception *alloc_completed_exception(void) 613static struct dm_exception *alloc_completed_exception(gfp_t gfp)
614{ 614{
615 struct dm_exception *e; 615 struct dm_exception *e;
616 616
617 e = kmem_cache_alloc(exception_cache, GFP_NOIO); 617 e = kmem_cache_alloc(exception_cache, gfp);
618 if (!e) 618 if (!e && gfp == GFP_NOIO)
619 e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); 619 e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
620 620
621 return e; 621 return e;
@@ -697,7 +697,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
697 struct dm_snapshot *s = context; 697 struct dm_snapshot *s = context;
698 struct dm_exception *e; 698 struct dm_exception *e;
699 699
700 e = alloc_completed_exception(); 700 e = alloc_completed_exception(GFP_KERNEL);
701 if (!e) 701 if (!e)
702 return -ENOMEM; 702 return -ENOMEM;
703 703
@@ -1405,7 +1405,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
1405 goto out; 1405 goto out;
1406 } 1406 }
1407 1407
1408 e = alloc_completed_exception(); 1408 e = alloc_completed_exception(GFP_NOIO);
1409 if (!e) { 1409 if (!e) {
1410 down_write(&s->lock); 1410 down_write(&s->lock);
1411 __invalidate_snapshot(s, -ENOMEM); 1411 __invalidate_snapshot(s, -ENOMEM);
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index 84d2b91e4efb..c62c5ab6aed5 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -86,6 +86,7 @@ static const struct sysfs_ops dm_sysfs_ops = {
86static struct kobj_type dm_ktype = { 86static struct kobj_type dm_ktype = {
87 .sysfs_ops = &dm_sysfs_ops, 87 .sysfs_ops = &dm_sysfs_ops,
88 .default_attrs = dm_attrs, 88 .default_attrs = dm_attrs,
89 .release = dm_kobject_release,
89}; 90};
90 91
91/* 92/*
@@ -104,5 +105,7 @@ int dm_sysfs_init(struct mapped_device *md)
104 */ 105 */
105void dm_sysfs_exit(struct mapped_device *md) 106void dm_sysfs_exit(struct mapped_device *md)
106{ 107{
107 kobject_put(dm_kobject(md)); 108 struct kobject *kobj = dm_kobject(md);
109 kobject_put(kobj);
110 wait_for_completion(dm_get_completion_from_kobject(kobj));
108} 111}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3ba6a3859ce3..6a7f2b83a126 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -155,7 +155,6 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
155{ 155{
156 sector_t *n_highs; 156 sector_t *n_highs;
157 struct dm_target *n_targets; 157 struct dm_target *n_targets;
158 int n = t->num_targets;
159 158
160 /* 159 /*
161 * Allocate both the target array and offset array at once. 160 * Allocate both the target array and offset array at once.
@@ -169,12 +168,7 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
169 168
170 n_targets = (struct dm_target *) (n_highs + num); 169 n_targets = (struct dm_target *) (n_highs + num);
171 170
172 if (n) { 171 memset(n_highs, -1, sizeof(*n_highs) * num);
173 memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
174 memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
175 }
176
177 memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
178 vfree(t->highs); 172 vfree(t->highs);
179 173
180 t->num_allocated = num; 174 t->num_allocated = num;
@@ -261,17 +255,6 @@ void dm_table_destroy(struct dm_table *t)
261} 255}
262 256
263/* 257/*
264 * Checks to see if we need to extend highs or targets.
265 */
266static inline int check_space(struct dm_table *t)
267{
268 if (t->num_targets >= t->num_allocated)
269 return alloc_targets(t, t->num_allocated * 2);
270
271 return 0;
272}
273
274/*
275 * See if we've already got a device in the list. 258 * See if we've already got a device in the list.
276 */ 259 */
277static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev) 260static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
@@ -731,8 +714,7 @@ int dm_table_add_target(struct dm_table *t, const char *type,
731 return -EINVAL; 714 return -EINVAL;
732 } 715 }
733 716
734 if ((r = check_space(t))) 717 BUG_ON(t->num_targets >= t->num_allocated);
735 return r;
736 718
737 tgt = t->targets + t->num_targets; 719 tgt = t->targets + t->num_targets;
738 memset(tgt, 0, sizeof(*tgt)); 720 memset(tgt, 0, sizeof(*tgt));
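With check_space() removed, a table can no longer grow on demand: alloc_targets() is only called when the table is created with its final size, and dm_table_add_target() now asserts that invariant instead of resizing. A sketch of the implied caller contract, using a hypothetical target_spec/num_targets/md (the real caller is the DM ioctl code, which is not part of this hunk):

/* Hypothetical helper illustrating the fixed-size table contract. */
static int load_table(struct mapped_device *md, struct target_spec *spec,
		      unsigned num_targets, struct dm_table **result)
{
	struct dm_table *t;
	unsigned i;
	int r;

	/* The target array is sized once, up front, for num_targets. */
	r = dm_table_create(&t, FMODE_READ | FMODE_WRITE, num_targets, md);
	if (r)
		return r;

	for (i = 0; i < num_targets; i++) {
		/* Never exceeds the count handed to dm_table_create(). */
		r = dm_table_add_target(t, spec[i].type, spec[i].start,
					spec[i].length, spec[i].params);
		if (r) {
			dm_table_destroy(t);
			return r;
		}
	}

	*result = t;
	return 0;
}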
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 8a30ad54bd46..7da347665552 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1349,6 +1349,12 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1349 return td->id; 1349 return td->id;
1350} 1350}
1351 1351
1352/*
1353 * Check whether @time (of block creation) is older than @td's last snapshot.
1354 * If so then the associated block is shared with the last snapshot device.
1355 * Any block on a device created *after* the device last got snapshotted is
1356 * necessarily not shared.
1357 */
1352static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) 1358static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1353{ 1359{
1354 return td->snapshotted_time > time; 1360 return td->snapshotted_time > time;
@@ -1458,6 +1464,20 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
1458 return r; 1464 return r;
1459} 1465}
1460 1466
1467int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
1468{
1469 int r;
1470 uint32_t ref_count;
1471
1472 down_read(&pmd->root_lock);
1473 r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
1474 if (!r)
1475 *result = (ref_count != 0);
1476 up_read(&pmd->root_lock);
1477
1478 return r;
1479}
1480
1461bool dm_thin_changed_this_transaction(struct dm_thin_device *td) 1481bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1462{ 1482{
1463 int r; 1483 int r;
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 7bcc0e1d6238..9a368567632f 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -131,7 +131,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td);
131 131
132struct dm_thin_lookup_result { 132struct dm_thin_lookup_result {
133 dm_block_t block; 133 dm_block_t block;
134 unsigned shared:1; 134 bool shared:1;
135}; 135};
136 136
137/* 137/*
@@ -181,6 +181,8 @@ int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result);
181 181
182int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); 182int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
183 183
184int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
185
184/* 186/*
185 * Returns -ENOSPC if the new size is too small and already allocated 187 * Returns -ENOSPC if the new size is too small and already allocated
186 * blocks would be lost. 188 * blocks would be lost.
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index ee29037ffc2e..726228b33a01 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -144,6 +144,7 @@ struct pool_features {
144 bool zero_new_blocks:1; 144 bool zero_new_blocks:1;
145 bool discard_enabled:1; 145 bool discard_enabled:1;
146 bool discard_passdown:1; 146 bool discard_passdown:1;
147 bool error_if_no_space:1;
147}; 148};
148 149
149struct thin_c; 150struct thin_c;
@@ -163,8 +164,7 @@ struct pool {
163 int sectors_per_block_shift; 164 int sectors_per_block_shift;
164 165
165 struct pool_features pf; 166 struct pool_features pf;
166 unsigned low_water_triggered:1; /* A dm event has been sent */ 167 bool low_water_triggered:1; /* A dm event has been sent */
167 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
168 168
169 struct dm_bio_prison *prison; 169 struct dm_bio_prison *prison;
170 struct dm_kcopyd_client *copier; 170 struct dm_kcopyd_client *copier;
@@ -198,7 +198,8 @@ struct pool {
198}; 198};
199 199
200static enum pool_mode get_pool_mode(struct pool *pool); 200static enum pool_mode get_pool_mode(struct pool *pool);
201static void set_pool_mode(struct pool *pool, enum pool_mode mode); 201static void out_of_data_space(struct pool *pool);
202static void metadata_operation_failed(struct pool *pool, const char *op, int r);
202 203
203/* 204/*
204 * Target context for a pool. 205 * Target context for a pool.
@@ -509,15 +510,16 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
509struct dm_thin_new_mapping { 510struct dm_thin_new_mapping {
510 struct list_head list; 511 struct list_head list;
511 512
512 unsigned quiesced:1; 513 bool quiesced:1;
513 unsigned prepared:1; 514 bool prepared:1;
514 unsigned pass_discard:1; 515 bool pass_discard:1;
516 bool definitely_not_shared:1;
515 517
518 int err;
516 struct thin_c *tc; 519 struct thin_c *tc;
517 dm_block_t virt_block; 520 dm_block_t virt_block;
518 dm_block_t data_block; 521 dm_block_t data_block;
519 struct dm_bio_prison_cell *cell, *cell2; 522 struct dm_bio_prison_cell *cell, *cell2;
520 int err;
521 523
522 /* 524 /*
523 * If the bio covers the whole area of a block then we can avoid 525 * If the bio covers the whole area of a block then we can avoid
@@ -534,7 +536,7 @@ static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
534 struct pool *pool = m->tc->pool; 536 struct pool *pool = m->tc->pool;
535 537
536 if (m->quiesced && m->prepared) { 538 if (m->quiesced && m->prepared) {
537 list_add(&m->list, &pool->prepared_mappings); 539 list_add_tail(&m->list, &pool->prepared_mappings);
538 wake_worker(pool); 540 wake_worker(pool);
539 } 541 }
540} 542}
@@ -548,7 +550,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
548 m->err = read_err || write_err ? -EIO : 0; 550 m->err = read_err || write_err ? -EIO : 0;
549 551
550 spin_lock_irqsave(&pool->lock, flags); 552 spin_lock_irqsave(&pool->lock, flags);
551 m->prepared = 1; 553 m->prepared = true;
552 __maybe_add_mapping(m); 554 __maybe_add_mapping(m);
553 spin_unlock_irqrestore(&pool->lock, flags); 555 spin_unlock_irqrestore(&pool->lock, flags);
554} 556}
@@ -563,7 +565,7 @@ static void overwrite_endio(struct bio *bio, int err)
563 m->err = err; 565 m->err = err;
564 566
565 spin_lock_irqsave(&pool->lock, flags); 567 spin_lock_irqsave(&pool->lock, flags);
566 m->prepared = 1; 568 m->prepared = true;
567 __maybe_add_mapping(m); 569 __maybe_add_mapping(m);
568 spin_unlock_irqrestore(&pool->lock, flags); 570 spin_unlock_irqrestore(&pool->lock, flags);
569} 571}
@@ -640,9 +642,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
640 */ 642 */
641 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); 643 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
642 if (r) { 644 if (r) {
643 DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d", 645 metadata_operation_failed(pool, "dm_thin_insert_block", r);
644 dm_device_name(pool->pool_md), r);
645 set_pool_mode(pool, PM_READ_ONLY);
646 cell_error(pool, m->cell); 646 cell_error(pool, m->cell);
647 goto out; 647 goto out;
648 } 648 }
@@ -683,7 +683,15 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
683 cell_defer_no_holder(tc, m->cell2); 683 cell_defer_no_holder(tc, m->cell2);
684 684
685 if (m->pass_discard) 685 if (m->pass_discard)
686 remap_and_issue(tc, m->bio, m->data_block); 686 if (m->definitely_not_shared)
687 remap_and_issue(tc, m->bio, m->data_block);
688 else {
689 bool used = false;
690 if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
691 bio_endio(m->bio, 0);
692 else
693 remap_and_issue(tc, m->bio, m->data_block);
694 }
687 else 695 else
688 bio_endio(m->bio, 0); 696 bio_endio(m->bio, 0);
689 697
@@ -751,13 +759,17 @@ static int ensure_next_mapping(struct pool *pool)
751 759
752static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) 760static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
753{ 761{
754 struct dm_thin_new_mapping *r = pool->next_mapping; 762 struct dm_thin_new_mapping *m = pool->next_mapping;
755 763
756 BUG_ON(!pool->next_mapping); 764 BUG_ON(!pool->next_mapping);
757 765
766 memset(m, 0, sizeof(struct dm_thin_new_mapping));
767 INIT_LIST_HEAD(&m->list);
768 m->bio = NULL;
769
758 pool->next_mapping = NULL; 770 pool->next_mapping = NULL;
759 771
760 return r; 772 return m;
761} 773}
762 774
763static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 775static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -769,18 +781,13 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
769 struct pool *pool = tc->pool; 781 struct pool *pool = tc->pool;
770 struct dm_thin_new_mapping *m = get_next_mapping(pool); 782 struct dm_thin_new_mapping *m = get_next_mapping(pool);
771 783
772 INIT_LIST_HEAD(&m->list);
773 m->quiesced = 0;
774 m->prepared = 0;
775 m->tc = tc; 784 m->tc = tc;
776 m->virt_block = virt_block; 785 m->virt_block = virt_block;
777 m->data_block = data_dest; 786 m->data_block = data_dest;
778 m->cell = cell; 787 m->cell = cell;
779 m->err = 0;
780 m->bio = NULL;
781 788
782 if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) 789 if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
783 m->quiesced = 1; 790 m->quiesced = true;
784 791
785 /* 792 /*
786 * IO to pool_dev remaps to the pool target's data_dev. 793 * IO to pool_dev remaps to the pool target's data_dev.
@@ -840,15 +847,12 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
840 struct pool *pool = tc->pool; 847 struct pool *pool = tc->pool;
841 struct dm_thin_new_mapping *m = get_next_mapping(pool); 848 struct dm_thin_new_mapping *m = get_next_mapping(pool);
842 849
843 INIT_LIST_HEAD(&m->list); 850 m->quiesced = true;
844 m->quiesced = 1; 851 m->prepared = false;
845 m->prepared = 0;
846 m->tc = tc; 852 m->tc = tc;
847 m->virt_block = virt_block; 853 m->virt_block = virt_block;
848 m->data_block = data_block; 854 m->data_block = data_block;
849 m->cell = cell; 855 m->cell = cell;
850 m->err = 0;
851 m->bio = NULL;
852 856
853 /* 857 /*
854 * If the whole block of data is being overwritten or we are not 858 * If the whole block of data is being overwritten or we are not
@@ -895,41 +899,42 @@ static int commit(struct pool *pool)
895 return -EINVAL; 899 return -EINVAL;
896 900
897 r = dm_pool_commit_metadata(pool->pmd); 901 r = dm_pool_commit_metadata(pool->pmd);
898 if (r) { 902 if (r)
899 DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d", 903 metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
900 dm_device_name(pool->pool_md), r);
901 set_pool_mode(pool, PM_READ_ONLY);
902 }
903 904
904 return r; 905 return r;
905} 906}
906 907
907static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 908static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
908{ 909{
909 int r;
910 dm_block_t free_blocks;
911 unsigned long flags; 910 unsigned long flags;
912 struct pool *pool = tc->pool;
913
914 /*
915 * Once no_free_space is set we must not allow allocation to succeed.
916 * Otherwise it is difficult to explain, debug, test and support.
917 */
918 if (pool->no_free_space)
919 return -ENOSPC;
920
921 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
922 if (r)
923 return r;
924 911
925 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { 912 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
926 DMWARN("%s: reached low water mark for data device: sending event.", 913 DMWARN("%s: reached low water mark for data device: sending event.",
927 dm_device_name(pool->pool_md)); 914 dm_device_name(pool->pool_md));
928 spin_lock_irqsave(&pool->lock, flags); 915 spin_lock_irqsave(&pool->lock, flags);
929 pool->low_water_triggered = 1; 916 pool->low_water_triggered = true;
930 spin_unlock_irqrestore(&pool->lock, flags); 917 spin_unlock_irqrestore(&pool->lock, flags);
931 dm_table_event(pool->ti->table); 918 dm_table_event(pool->ti->table);
932 } 919 }
920}
921
922static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
923{
924 int r;
925 dm_block_t free_blocks;
926 struct pool *pool = tc->pool;
927
928 if (get_pool_mode(pool) != PM_WRITE)
929 return -EINVAL;
930
931 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
932 if (r) {
933 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
934 return r;
935 }
936
937 check_low_water_mark(pool, free_blocks);
933 938
934 if (!free_blocks) { 939 if (!free_blocks) {
935 /* 940 /*
@@ -941,35 +946,20 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
941 return r; 946 return r;
942 947
943 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 948 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
944 if (r) 949 if (r) {
950 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
945 return r; 951 return r;
952 }
946 953
947 /*
948 * If we still have no space we set a flag to avoid
949 * doing all this checking and return -ENOSPC. This
950 * flag serves as a latch that disallows allocations from
951 * this pool until the admin takes action (e.g. resize or
952 * table reload).
953 */
954 if (!free_blocks) { 954 if (!free_blocks) {
955 DMWARN("%s: no free data space available.", 955 out_of_data_space(pool);
956 dm_device_name(pool->pool_md));
957 spin_lock_irqsave(&pool->lock, flags);
958 pool->no_free_space = 1;
959 spin_unlock_irqrestore(&pool->lock, flags);
960 return -ENOSPC; 956 return -ENOSPC;
961 } 957 }
962 } 958 }
963 959
964 r = dm_pool_alloc_data_block(pool->pmd, result); 960 r = dm_pool_alloc_data_block(pool->pmd, result);
965 if (r) { 961 if (r) {
966 if (r == -ENOSPC && 962 metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
967 !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
968 !free_blocks) {
969 DMWARN("%s: no free metadata space available.",
970 dm_device_name(pool->pool_md));
971 set_pool_mode(pool, PM_READ_ONLY);
972 }
973 return r; 963 return r;
974 } 964 }
975 965
@@ -992,7 +982,21 @@ static void retry_on_resume(struct bio *bio)
992 spin_unlock_irqrestore(&pool->lock, flags); 982 spin_unlock_irqrestore(&pool->lock, flags);
993} 983}
994 984
995static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell) 985static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
986{
987 /*
988 * When pool is read-only, no cell locking is needed because
989 * nothing is changing.
990 */
991 WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
992
993 if (pool->pf.error_if_no_space)
994 bio_io_error(bio);
995 else
996 retry_on_resume(bio);
997}
998
999static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
996{ 1000{
997 struct bio *bio; 1001 struct bio *bio;
998 struct bio_list bios; 1002 struct bio_list bios;
@@ -1001,7 +1005,7 @@ static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell)
1001 cell_release(pool, cell, &bios); 1005 cell_release(pool, cell, &bios);
1002 1006
1003 while ((bio = bio_list_pop(&bios))) 1007 while ((bio = bio_list_pop(&bios)))
1004 retry_on_resume(bio); 1008 handle_unserviceable_bio(pool, bio);
1005} 1009}
1006 1010
1007static void process_discard(struct thin_c *tc, struct bio *bio) 1011static void process_discard(struct thin_c *tc, struct bio *bio)
@@ -1040,17 +1044,17 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
1040 */ 1044 */
1041 m = get_next_mapping(pool); 1045 m = get_next_mapping(pool);
1042 m->tc = tc; 1046 m->tc = tc;
1043 m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown; 1047 m->pass_discard = pool->pf.discard_passdown;
1048 m->definitely_not_shared = !lookup_result.shared;
1044 m->virt_block = block; 1049 m->virt_block = block;
1045 m->data_block = lookup_result.block; 1050 m->data_block = lookup_result.block;
1046 m->cell = cell; 1051 m->cell = cell;
1047 m->cell2 = cell2; 1052 m->cell2 = cell2;
1048 m->err = 0;
1049 m->bio = bio; 1053 m->bio = bio;
1050 1054
1051 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { 1055 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
1052 spin_lock_irqsave(&pool->lock, flags); 1056 spin_lock_irqsave(&pool->lock, flags);
1053 list_add(&m->list, &pool->prepared_discards); 1057 list_add_tail(&m->list, &pool->prepared_discards);
1054 spin_unlock_irqrestore(&pool->lock, flags); 1058 spin_unlock_irqrestore(&pool->lock, flags);
1055 wake_worker(pool); 1059 wake_worker(pool);
1056 } 1060 }
@@ -1105,13 +1109,12 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1105 break; 1109 break;
1106 1110
1107 case -ENOSPC: 1111 case -ENOSPC:
1108 no_space(pool, cell); 1112 retry_bios_on_resume(pool, cell);
1109 break; 1113 break;
1110 1114
1111 default: 1115 default:
1112 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", 1116 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1113 __func__, r); 1117 __func__, r);
1114 set_pool_mode(pool, PM_READ_ONLY);
1115 cell_error(pool, cell); 1118 cell_error(pool, cell);
1116 break; 1119 break;
1117 } 1120 }
@@ -1184,13 +1187,12 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1184 break; 1187 break;
1185 1188
1186 case -ENOSPC: 1189 case -ENOSPC:
1187 no_space(pool, cell); 1190 retry_bios_on_resume(pool, cell);
1188 break; 1191 break;
1189 1192
1190 default: 1193 default:
1191 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", 1194 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1192 __func__, r); 1195 __func__, r);
1193 set_pool_mode(pool, PM_READ_ONLY);
1194 cell_error(pool, cell); 1196 cell_error(pool, cell);
1195 break; 1197 break;
1196 } 1198 }
@@ -1257,7 +1259,7 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1257 switch (r) { 1259 switch (r) {
1258 case 0: 1260 case 0:
1259 if (lookup_result.shared && (rw == WRITE) && bio->bi_size) 1261 if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
1260 bio_io_error(bio); 1262 handle_unserviceable_bio(tc->pool, bio);
1261 else { 1263 else {
1262 inc_all_io_entry(tc->pool, bio); 1264 inc_all_io_entry(tc->pool, bio);
1263 remap_and_issue(tc, bio, lookup_result.block); 1265 remap_and_issue(tc, bio, lookup_result.block);
@@ -1266,7 +1268,7 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1266 1268
1267 case -ENODATA: 1269 case -ENODATA:
1268 if (rw != READ) { 1270 if (rw != READ) {
1269 bio_io_error(bio); 1271 handle_unserviceable_bio(tc->pool, bio);
1270 break; 1272 break;
1271 } 1273 }
1272 1274
@@ -1390,16 +1392,16 @@ static enum pool_mode get_pool_mode(struct pool *pool)
1390 return pool->pf.mode; 1392 return pool->pf.mode;
1391} 1393}
1392 1394
1393static void set_pool_mode(struct pool *pool, enum pool_mode mode) 1395static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1394{ 1396{
1395 int r; 1397 int r;
1398 enum pool_mode old_mode = pool->pf.mode;
1396 1399
1397 pool->pf.mode = mode; 1400 switch (new_mode) {
1398
1399 switch (mode) {
1400 case PM_FAIL: 1401 case PM_FAIL:
1401 DMERR("%s: switching pool to failure mode", 1402 if (old_mode != new_mode)
1402 dm_device_name(pool->pool_md)); 1403 DMERR("%s: switching pool to failure mode",
1404 dm_device_name(pool->pool_md));
1403 dm_pool_metadata_read_only(pool->pmd); 1405 dm_pool_metadata_read_only(pool->pmd);
1404 pool->process_bio = process_bio_fail; 1406 pool->process_bio = process_bio_fail;
1405 pool->process_discard = process_bio_fail; 1407 pool->process_discard = process_bio_fail;
@@ -1408,13 +1410,15 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1408 break; 1410 break;
1409 1411
1410 case PM_READ_ONLY: 1412 case PM_READ_ONLY:
1411 DMERR("%s: switching pool to read-only mode", 1413 if (old_mode != new_mode)
1412 dm_device_name(pool->pool_md)); 1414 DMERR("%s: switching pool to read-only mode",
1415 dm_device_name(pool->pool_md));
1413 r = dm_pool_abort_metadata(pool->pmd); 1416 r = dm_pool_abort_metadata(pool->pmd);
1414 if (r) { 1417 if (r) {
1415 DMERR("%s: aborting transaction failed", 1418 DMERR("%s: aborting transaction failed",
1416 dm_device_name(pool->pool_md)); 1419 dm_device_name(pool->pool_md));
1417 set_pool_mode(pool, PM_FAIL); 1420 new_mode = PM_FAIL;
1421 set_pool_mode(pool, new_mode);
1418 } else { 1422 } else {
1419 dm_pool_metadata_read_only(pool->pmd); 1423 dm_pool_metadata_read_only(pool->pmd);
1420 pool->process_bio = process_bio_read_only; 1424 pool->process_bio = process_bio_read_only;
@@ -1425,6 +1429,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1425 break; 1429 break;
1426 1430
1427 case PM_WRITE: 1431 case PM_WRITE:
1432 if (old_mode != new_mode)
1433 DMINFO("%s: switching pool to write mode",
1434 dm_device_name(pool->pool_md));
1428 dm_pool_metadata_read_write(pool->pmd); 1435 dm_pool_metadata_read_write(pool->pmd);
1429 pool->process_bio = process_bio; 1436 pool->process_bio = process_bio;
1430 pool->process_discard = process_discard; 1437 pool->process_discard = process_discard;
@@ -1432,6 +1439,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1432 pool->process_prepared_discard = process_prepared_discard; 1439 pool->process_prepared_discard = process_prepared_discard;
1433 break; 1440 break;
1434 } 1441 }
1442
1443 pool->pf.mode = new_mode;
1444}
1445
1446/*
1447 * Rather than calling set_pool_mode directly, use these which describe the
1448 * reason for mode degradation.
1449 */
1450static void out_of_data_space(struct pool *pool)
1451{
1452 DMERR_LIMIT("%s: no free data space available.",
1453 dm_device_name(pool->pool_md));
1454 set_pool_mode(pool, PM_READ_ONLY);
1455}
1456
1457static void metadata_operation_failed(struct pool *pool, const char *op, int r)
1458{
1459 dm_block_t free_blocks;
1460
1461 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1462 dm_device_name(pool->pool_md), op, r);
1463
1464 if (r == -ENOSPC &&
1465 !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
1466 !free_blocks)
1467 DMERR_LIMIT("%s: no free metadata space available.",
1468 dm_device_name(pool->pool_md));
1469
1470 set_pool_mode(pool, PM_READ_ONLY);
1435} 1471}
1436 1472
1437/*----------------------------------------------------------------*/ 1473/*----------------------------------------------------------------*/
@@ -1538,9 +1574,9 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1538 if (get_pool_mode(tc->pool) == PM_READ_ONLY) { 1574 if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1539 /* 1575 /*
1540 * This block isn't provisioned, and we have no way 1576 * This block isn't provisioned, and we have no way
1541 * of doing so. Just error it. 1577 * of doing so.
1542 */ 1578 */
1543 bio_io_error(bio); 1579 handle_unserviceable_bio(tc->pool, bio);
1544 return DM_MAPIO_SUBMITTED; 1580 return DM_MAPIO_SUBMITTED;
1545 } 1581 }
1546 /* fall through */ 1582 /* fall through */
@@ -1648,6 +1684,17 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1648 enum pool_mode new_mode = pt->adjusted_pf.mode; 1684 enum pool_mode new_mode = pt->adjusted_pf.mode;
1649 1685
1650 /* 1686 /*
1687 * Don't change the pool's mode until set_pool_mode() below.
1688 * Otherwise the pool's process_* function pointers may
1689 * not match the desired pool mode.
1690 */
1691 pt->adjusted_pf.mode = old_mode;
1692
1693 pool->ti = ti;
1694 pool->pf = pt->adjusted_pf;
1695 pool->low_water_blocks = pt->low_water_blocks;
1696
1697 /*
1651 * If we were in PM_FAIL mode, rollback of metadata failed. We're 1698 * If we were in PM_FAIL mode, rollback of metadata failed. We're
1652 * not going to recover without a thin_repair. So we never let the 1699 * not going to recover without a thin_repair. So we never let the
1653 * pool move out of the old mode. On the other hand a PM_READ_ONLY 1700 * pool move out of the old mode. On the other hand a PM_READ_ONLY
@@ -1657,10 +1704,6 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1657 if (old_mode == PM_FAIL) 1704 if (old_mode == PM_FAIL)
1658 new_mode = old_mode; 1705 new_mode = old_mode;
1659 1706
1660 pool->ti = ti;
1661 pool->low_water_blocks = pt->low_water_blocks;
1662 pool->pf = pt->adjusted_pf;
1663
1664 set_pool_mode(pool, new_mode); 1707 set_pool_mode(pool, new_mode);
1665 1708
1666 return 0; 1709 return 0;
@@ -1682,6 +1725,7 @@ static void pool_features_init(struct pool_features *pf)
1682 pf->zero_new_blocks = true; 1725 pf->zero_new_blocks = true;
1683 pf->discard_enabled = true; 1726 pf->discard_enabled = true;
1684 pf->discard_passdown = true; 1727 pf->discard_passdown = true;
1728 pf->error_if_no_space = false;
1685} 1729}
1686 1730
1687static void __pool_destroy(struct pool *pool) 1731static void __pool_destroy(struct pool *pool)
@@ -1772,8 +1816,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1772 bio_list_init(&pool->deferred_flush_bios); 1816 bio_list_init(&pool->deferred_flush_bios);
1773 INIT_LIST_HEAD(&pool->prepared_mappings); 1817 INIT_LIST_HEAD(&pool->prepared_mappings);
1774 INIT_LIST_HEAD(&pool->prepared_discards); 1818 INIT_LIST_HEAD(&pool->prepared_discards);
1775 pool->low_water_triggered = 0; 1819 pool->low_water_triggered = false;
1776 pool->no_free_space = 0;
1777 bio_list_init(&pool->retry_on_resume_list); 1820 bio_list_init(&pool->retry_on_resume_list);
1778 1821
1779 pool->shared_read_ds = dm_deferred_set_create(); 1822 pool->shared_read_ds = dm_deferred_set_create();
@@ -1898,7 +1941,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1898 const char *arg_name; 1941 const char *arg_name;
1899 1942
1900 static struct dm_arg _args[] = { 1943 static struct dm_arg _args[] = {
1901 {0, 3, "Invalid number of pool feature arguments"}, 1944 {0, 4, "Invalid number of pool feature arguments"},
1902 }; 1945 };
1903 1946
1904 /* 1947 /*
@@ -1927,6 +1970,9 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1927 else if (!strcasecmp(arg_name, "read_only")) 1970 else if (!strcasecmp(arg_name, "read_only"))
1928 pf->mode = PM_READ_ONLY; 1971 pf->mode = PM_READ_ONLY;
1929 1972
1973 else if (!strcasecmp(arg_name, "error_if_no_space"))
1974 pf->error_if_no_space = true;
1975
1930 else { 1976 else {
1931 ti->error = "Unrecognised pool feature requested"; 1977 ti->error = "Unrecognised pool feature requested";
1932 r = -EINVAL; 1978 r = -EINVAL;
@@ -1997,6 +2043,8 @@ static dm_block_t calc_metadata_threshold(struct pool_c *pt)
1997 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 2043 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1998 * ignore_discard: disable discard 2044 * ignore_discard: disable discard
1999 * no_discard_passdown: don't pass discards down to the data device 2045 * no_discard_passdown: don't pass discards down to the data device
2046 * read_only: Don't allow any changes to be made to the pool metadata.
2047 * error_if_no_space: error IOs, instead of queueing, if no space.
2000 */ 2048 */
2001static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 2049static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2002{ 2050{
@@ -2192,11 +2240,13 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
2192 return -EINVAL; 2240 return -EINVAL;
2193 2241
2194 } else if (data_size > sb_data_size) { 2242 } else if (data_size > sb_data_size) {
2243 if (sb_data_size)
2244 DMINFO("%s: growing the data device from %llu to %llu blocks",
2245 dm_device_name(pool->pool_md),
2246 sb_data_size, (unsigned long long)data_size);
2195 r = dm_pool_resize_data_dev(pool->pmd, data_size); 2247 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2196 if (r) { 2248 if (r) {
2197 DMERR("%s: failed to resize data device", 2249 metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
2198 dm_device_name(pool->pool_md));
2199 set_pool_mode(pool, PM_READ_ONLY);
2200 return r; 2250 return r;
2201 } 2251 }
2202 2252
@@ -2231,10 +2281,12 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
2231 return -EINVAL; 2281 return -EINVAL;
2232 2282
2233 } else if (metadata_dev_size > sb_metadata_dev_size) { 2283 } else if (metadata_dev_size > sb_metadata_dev_size) {
2284 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
2285 dm_device_name(pool->pool_md),
2286 sb_metadata_dev_size, metadata_dev_size);
2234 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); 2287 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
2235 if (r) { 2288 if (r) {
2236 DMERR("%s: failed to resize metadata device", 2289 metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
2237 dm_device_name(pool->pool_md));
2238 return r; 2290 return r;
2239 } 2291 }
2240 2292
@@ -2290,8 +2342,7 @@ static void pool_resume(struct dm_target *ti)
2290 unsigned long flags; 2342 unsigned long flags;
2291 2343
2292 spin_lock_irqsave(&pool->lock, flags); 2344 spin_lock_irqsave(&pool->lock, flags);
2293 pool->low_water_triggered = 0; 2345 pool->low_water_triggered = false;
2294 pool->no_free_space = 0;
2295 __requeue_bios(pool); 2346 __requeue_bios(pool);
2296 spin_unlock_irqrestore(&pool->lock, flags); 2347 spin_unlock_irqrestore(&pool->lock, flags);
2297 2348
@@ -2510,7 +2561,8 @@ static void emit_flags(struct pool_features *pf, char *result,
2510 unsigned sz, unsigned maxlen) 2561 unsigned sz, unsigned maxlen)
2511{ 2562{
2512 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled + 2563 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2513 !pf->discard_passdown + (pf->mode == PM_READ_ONLY); 2564 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
2565 pf->error_if_no_space;
2514 DMEMIT("%u ", count); 2566 DMEMIT("%u ", count);
2515 2567
2516 if (!pf->zero_new_blocks) 2568 if (!pf->zero_new_blocks)
@@ -2524,6 +2576,9 @@ static void emit_flags(struct pool_features *pf, char *result,
2524 2576
2525 if (pf->mode == PM_READ_ONLY) 2577 if (pf->mode == PM_READ_ONLY)
2526 DMEMIT("read_only "); 2578 DMEMIT("read_only ");
2579
2580 if (pf->error_if_no_space)
2581 DMEMIT("error_if_no_space ");
2527} 2582}
2528 2583
2529/* 2584/*
@@ -2618,11 +2673,16 @@ static void pool_status(struct dm_target *ti, status_type_t type,
2618 DMEMIT("rw "); 2673 DMEMIT("rw ");
2619 2674
2620 if (!pool->pf.discard_enabled) 2675 if (!pool->pf.discard_enabled)
2621 DMEMIT("ignore_discard"); 2676 DMEMIT("ignore_discard ");
2622 else if (pool->pf.discard_passdown) 2677 else if (pool->pf.discard_passdown)
2623 DMEMIT("discard_passdown"); 2678 DMEMIT("discard_passdown ");
2679 else
2680 DMEMIT("no_discard_passdown ");
2681
2682 if (pool->pf.error_if_no_space)
2683 DMEMIT("error_if_no_space ");
2624 else 2684 else
2625 DMEMIT("no_discard_passdown"); 2685 DMEMIT("queue_if_no_space ");
2626 2686
2627 break; 2687 break;
2628 2688
@@ -2721,7 +2781,7 @@ static struct target_type pool_target = {
2721 .name = "thin-pool", 2781 .name = "thin-pool",
2722 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2782 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2723 DM_TARGET_IMMUTABLE, 2783 DM_TARGET_IMMUTABLE,
2724 .version = {1, 9, 0}, 2784 .version = {1, 10, 0},
2725 .module = THIS_MODULE, 2785 .module = THIS_MODULE,
2726 .ctr = pool_ctr, 2786 .ctr = pool_ctr,
2727 .dtr = pool_dtr, 2787 .dtr = pool_dtr,
@@ -2899,7 +2959,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
2899 spin_lock_irqsave(&pool->lock, flags); 2959 spin_lock_irqsave(&pool->lock, flags);
2900 list_for_each_entry_safe(m, tmp, &work, list) { 2960 list_for_each_entry_safe(m, tmp, &work, list) {
2901 list_del(&m->list); 2961 list_del(&m->list);
2902 m->quiesced = 1; 2962 m->quiesced = true;
2903 __maybe_add_mapping(m); 2963 __maybe_add_mapping(m);
2904 } 2964 }
2905 spin_unlock_irqrestore(&pool->lock, flags); 2965 spin_unlock_irqrestore(&pool->lock, flags);
@@ -2911,7 +2971,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
2911 if (!list_empty(&work)) { 2971 if (!list_empty(&work)) {
2912 spin_lock_irqsave(&pool->lock, flags); 2972 spin_lock_irqsave(&pool->lock, flags);
2913 list_for_each_entry_safe(m, tmp, &work, list) 2973 list_for_each_entry_safe(m, tmp, &work, list)
2914 list_add(&m->list, &pool->prepared_discards); 2974 list_add_tail(&m->list, &pool->prepared_discards);
2915 spin_unlock_irqrestore(&pool->lock, flags); 2975 spin_unlock_irqrestore(&pool->lock, flags);
2916 wake_worker(pool); 2976 wake_worker(pool);
2917 } 2977 }
@@ -3008,7 +3068,7 @@ static int thin_iterate_devices(struct dm_target *ti,
3008 3068
3009static struct target_type thin_target = { 3069static struct target_type thin_target = {
3010 .name = "thin", 3070 .name = "thin",
3011 .version = {1, 9, 0}, 3071 .version = {1, 10, 0},
3012 .module = THIS_MODULE, 3072 .module = THIS_MODULE,
3013 .ctr = thin_ctr, 3073 .ctr = thin_ctr,
3014 .dtr = thin_dtr, 3074 .dtr = thin_dtr,
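From userspace the new pool feature is enabled by adding error_if_no_space to the optional feature arguments of the thin-pool table line, next to the existing skip_block_zeroing, ignore_discard, no_discard_passdown and read_only keywords. A hypothetical example (device names and sizes are made up): start, length, metadata device, data device, data block size of 128 sectors, low water mark of 32768 blocks, then one feature argument:

0 20971520 thin-pool /dev/mapper/pool-meta /dev/mapper/pool-data 128 32768 1 error_if_no_space

With the feature set, pool_status() above reports error_if_no_space instead of queue_if_no_space, and handle_unserviceable_bio() errors bios rather than queueing them for retry once the pool can no longer service them.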
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0704c523a76b..b49c76284241 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -200,8 +200,8 @@ struct mapped_device {
200 /* forced geometry settings */ 200 /* forced geometry settings */
201 struct hd_geometry geometry; 201 struct hd_geometry geometry;
202 202
203 /* sysfs handle */ 203 /* kobject and completion */
204 struct kobject kobj; 204 struct dm_kobject_holder kobj_holder;
205 205
206 /* zero-length flush that will be cloned and submitted to targets */ 206 /* zero-length flush that will be cloned and submitted to targets */
207 struct bio flush_bio; 207 struct bio flush_bio;
@@ -2041,6 +2041,7 @@ static struct mapped_device *alloc_dev(int minor)
2041 init_waitqueue_head(&md->wait); 2041 init_waitqueue_head(&md->wait);
2042 INIT_WORK(&md->work, dm_wq_work); 2042 INIT_WORK(&md->work, dm_wq_work);
2043 init_waitqueue_head(&md->eventq); 2043 init_waitqueue_head(&md->eventq);
2044 init_completion(&md->kobj_holder.completion);
2044 2045
2045 md->disk->major = _major; 2046 md->disk->major = _major;
2046 md->disk->first_minor = minor; 2047 md->disk->first_minor = minor;
@@ -2902,20 +2903,14 @@ struct gendisk *dm_disk(struct mapped_device *md)
2902 2903
2903struct kobject *dm_kobject(struct mapped_device *md) 2904struct kobject *dm_kobject(struct mapped_device *md)
2904{ 2905{
2905 return &md->kobj; 2906 return &md->kobj_holder.kobj;
2906} 2907}
2907 2908
2908/*
2909 * struct mapped_device should not be exported outside of dm.c
2910 * so use this check to verify that kobj is part of md structure
2911 */
2912struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 2909struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2913{ 2910{
2914 struct mapped_device *md; 2911 struct mapped_device *md;
2915 2912
2916 md = container_of(kobj, struct mapped_device, kobj); 2913 md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2917 if (&md->kobj != kobj)
2918 return NULL;
2919 2914
2920 if (test_bit(DMF_FREEING, &md->flags) || 2915 if (test_bit(DMF_FREEING, &md->flags) ||
2921 dm_deleting_md(md)) 2916 dm_deleting_md(md))
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index c57ba550f69e..c4569f02f50f 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -15,6 +15,8 @@
15#include <linux/list.h> 15#include <linux/list.h>
16#include <linux/blkdev.h> 16#include <linux/blkdev.h>
17#include <linux/hdreg.h> 17#include <linux/hdreg.h>
18#include <linux/completion.h>
19#include <linux/kobject.h>
18 20
19#include "dm-stats.h" 21#include "dm-stats.h"
20 22
@@ -148,12 +150,27 @@ void dm_interface_exit(void);
148/* 150/*
149 * sysfs interface 151 * sysfs interface
150 */ 152 */
153struct dm_kobject_holder {
154 struct kobject kobj;
155 struct completion completion;
156};
157
158static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
159{
160 return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
161}
162
151int dm_sysfs_init(struct mapped_device *md); 163int dm_sysfs_init(struct mapped_device *md);
152void dm_sysfs_exit(struct mapped_device *md); 164void dm_sysfs_exit(struct mapped_device *md);
153struct kobject *dm_kobject(struct mapped_device *md); 165struct kobject *dm_kobject(struct mapped_device *md);
154struct mapped_device *dm_get_from_kobject(struct kobject *kobj); 166struct mapped_device *dm_get_from_kobject(struct kobject *kobj);
155 167
156/* 168/*
169 * The kobject helper
170 */
171void dm_kobject_release(struct kobject *kobj);
172
173/*
157 * Targets for linear and striped mappings 174 * Targets for linear and striped mappings
158 */ 175 */
159int dm_linear_init(void); 176int dm_linear_init(void);
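The matching release side lives in the new drivers/md/dm-builtin.c (added by this series but not shown here); presumably it just signals the embedded completion, which is what dm_sysfs_exit() above blocks on, so the mapped_device is not torn down while its kobject still has references. A sketch of that pairing:

void dm_kobject_release(struct kobject *kobj)
{
	/* Wakes the wait_for_completion() in dm_sysfs_exit(). */
	complete(dm_get_completion_from_kobject(kobj));
}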
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 064a3c271baa..455f79279a16 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -104,7 +104,7 @@ static int __check_holder(struct block_lock *lock)
104 104
105 for (i = 0; i < MAX_HOLDERS; i++) { 105 for (i = 0; i < MAX_HOLDERS; i++) {
106 if (lock->holders[i] == current) { 106 if (lock->holders[i] == current) {
107 DMERR("recursive lock detected in pool metadata"); 107 DMERR("recursive lock detected in metadata");
108#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING 108#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
109 DMERR("previously held here:"); 109 DMERR("previously held here:");
110 print_stack_trace(lock->traces + i, 4); 110 print_stack_trace(lock->traces + i, 4);
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 468e371ee9b2..416060c25709 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -770,8 +770,8 @@ EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
770 770
771/*----------------------------------------------------------------*/ 771/*----------------------------------------------------------------*/
772 772
773static int find_highest_key(struct ro_spine *s, dm_block_t block, 773static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
774 uint64_t *result_key, dm_block_t *next_block) 774 uint64_t *result_key, dm_block_t *next_block)
775{ 775{
776 int i, r; 776 int i, r;
777 uint32_t flags; 777 uint32_t flags;
@@ -788,7 +788,11 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block,
788 else 788 else
789 i--; 789 i--;
790 790
791 *result_key = le64_to_cpu(ro_node(s)->keys[i]); 791 if (find_highest)
792 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
793 else
794 *result_key = le64_to_cpu(ro_node(s)->keys[0]);
795
792 if (next_block || flags & INTERNAL_NODE) 796 if (next_block || flags & INTERNAL_NODE)
793 block = value64(ro_node(s), i); 797 block = value64(ro_node(s), i);
794 798
@@ -799,16 +803,16 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block,
799 return 0; 803 return 0;
800} 804}
801 805
802int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, 806static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
803 uint64_t *result_keys) 807 bool find_highest, uint64_t *result_keys)
804{ 808{
805 int r = 0, count = 0, level; 809 int r = 0, count = 0, level;
806 struct ro_spine spine; 810 struct ro_spine spine;
807 811
808 init_ro_spine(&spine, info); 812 init_ro_spine(&spine, info);
809 for (level = 0; level < info->levels; level++) { 813 for (level = 0; level < info->levels; level++) {
810 r = find_highest_key(&spine, root, result_keys + level, 814 r = find_key(&spine, root, find_highest, result_keys + level,
811 level == info->levels - 1 ? NULL : &root); 815 level == info->levels - 1 ? NULL : &root);
812 if (r == -ENODATA) { 816 if (r == -ENODATA) {
813 r = 0; 817 r = 0;
814 break; 818 break;
@@ -822,8 +826,23 @@ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
822 826
823 return r ? r : count; 827 return r ? r : count;
824} 828}
829
830int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
831 uint64_t *result_keys)
832{
833 return dm_btree_find_key(info, root, true, result_keys);
834}
825EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); 835EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
826 836
837int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
838 uint64_t *result_keys)
839{
840 return dm_btree_find_key(info, root, false, result_keys);
841}
842EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
843
844/*----------------------------------------------------------------*/
845
827/* 846/*
828 * FIXME: We shouldn't use a recursive algorithm when we have limited stack 847 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
829 * space. Also this only works for single level trees. 848 * space. Also this only works for single level trees.
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index 8672d159e0b5..dacfc34180b4 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -137,6 +137,14 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
137/* 137/*
138 * Returns < 0 on failure. Otherwise the number of key entries that have 138 * Returns < 0 on failure. Otherwise the number of key entries that have
139 * been filled out. Remember trees can have zero entries, and as such have 139 * been filled out. Remember trees can have zero entries, and as such have
140 * no lowest key.
141 */
142int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
143 uint64_t *result_keys);
144
145/*
146 * Returns < 0 on failure. Otherwise the number of key entries that have
147 * been filled out. Remember trees can have zero entries, and as such have
140 * no highest key. 148 * no highest key.
141 */ 149 */
142int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, 150int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
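A caller wanting the full key range of a tree can pair the new helper with the existing dm_btree_find_highest_key(); a minimal sketch assuming a single-level tree, with info and root taken from the caller's metadata:

	uint64_t lowest, highest;
	int r;

	r = dm_btree_find_lowest_key(info, root, &lowest);
	if (r < 0)
		return r;
	if (!r)
		return -ENODATA;	/* tree has no entries */

	r = dm_btree_find_highest_key(info, root, &highest);
	if (r < 0)
		return r;

	/* [lowest, highest] now bounds every key in the single-level tree. */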
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 466a60bbd716..aacbe70c2c2e 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -245,6 +245,10 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
245 return -EINVAL; 245 return -EINVAL;
246 } 246 }
247 247
248 /*
249 * We need to set this before the dm_tm_new_block() call below.
250 */
251 ll->nr_blocks = nr_blocks;
248 for (i = old_blocks; i < blocks; i++) { 252 for (i = old_blocks; i < blocks; i++) {
249 struct dm_block *b; 253 struct dm_block *b;
250 struct disk_index_entry idx; 254 struct disk_index_entry idx;
@@ -252,6 +256,7 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
252 r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b); 256 r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b);
253 if (r < 0) 257 if (r < 0)
254 return r; 258 return r;
259
255 idx.blocknr = cpu_to_le64(dm_block_location(b)); 260 idx.blocknr = cpu_to_le64(dm_block_location(b));
256 261
257 r = dm_tm_unlock(ll->tm, b); 262 r = dm_tm_unlock(ll->tm, b);
@@ -266,7 +271,6 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
266 return r; 271 return r;
267 } 272 }
268 273
269 ll->nr_blocks = nr_blocks;
270 return 0; 274 return 0;
271} 275}
272 276
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 58fc1eef7499..536782e3bcb7 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -385,13 +385,13 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
385 385
386 int r = sm_metadata_new_block_(sm, b); 386 int r = sm_metadata_new_block_(sm, b);
387 if (r) { 387 if (r) {
388 DMERR("unable to allocate new metadata block"); 388 DMERR_LIMIT("unable to allocate new metadata block");
389 return r; 389 return r;
390 } 390 }
391 391
392 r = sm_metadata_get_nr_free(sm, &count); 392 r = sm_metadata_get_nr_free(sm, &count);
393 if (r) { 393 if (r) {
394 DMERR("couldn't get free block count"); 394 DMERR_LIMIT("couldn't get free block count");
395 return r; 395 return r;
396 } 396 }
397 397
@@ -608,20 +608,38 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
608 * Flick into a mode where all blocks get allocated in the new area. 608 * Flick into a mode where all blocks get allocated in the new area.
609 */ 609 */
610 smm->begin = old_len; 610 smm->begin = old_len;
611 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); 611 memcpy(sm, &bootstrap_ops, sizeof(*sm));
612 612
613 /* 613 /*
614 * Extend. 614 * Extend.
615 */ 615 */
616 r = sm_ll_extend(&smm->ll, extra_blocks); 616 r = sm_ll_extend(&smm->ll, extra_blocks);
617 if (r)
618 goto out;
617 619
618 /* 620 /*
619 * Switch back to normal behaviour. 621 * We repeatedly increment then commit until the commit doesn't
622 * allocate any new blocks.
620 */ 623 */
621 memcpy(&smm->sm, &ops, sizeof(smm->sm)); 624 do {
622 for (i = old_len; !r && i < smm->begin; i++) 625 for (i = old_len; !r && i < smm->begin; i++) {
623 r = sm_ll_inc(&smm->ll, i, &ev); 626 r = sm_ll_inc(&smm->ll, i, &ev);
627 if (r)
628 goto out;
629 }
630 old_len = smm->begin;
631
632 r = sm_ll_commit(&smm->ll);
633 if (r)
634 goto out;
635
636 } while (old_len != smm->begin);
624 637
638out:
639 /*
640 * Switch back to normal behaviour.
641 */
642 memcpy(sm, &ops, sizeof(*sm));
625 return r; 643 return r;
626} 644}
627 645