aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'drivers')
-rw-r--r--drivers/md/Kconfig10
-rw-r--r--drivers/md/dm-cache-policy-mq.c4
-rw-r--r--drivers/md/dm-snap-persistent.c3
-rw-r--r--drivers/md/dm-thin-metadata.c37
-rw-r--r--drivers/md/dm-thin-metadata.h11
-rw-r--r--drivers/md/dm-thin.c304
-rw-r--r--drivers/md/persistent-data/Kconfig10
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c113
8 files changed, 389 insertions, 103 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 9a06fe883766..95ad936e6048 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -254,16 +254,6 @@ config DM_THIN_PROVISIONING
254 ---help--- 254 ---help---
255 Provides thin provisioning and snapshots that share a data store. 255 Provides thin provisioning and snapshots that share a data store.
256 256
257config DM_DEBUG_BLOCK_STACK_TRACING
258 boolean "Keep stack trace of persistent data block lock holders"
259 depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
260 select STACKTRACE
261 ---help---
262 Enable this for messages that may help debug problems with the
263 block manager locking used by thin provisioning and caching.
264
265 If unsure, say N.
266
267config DM_CACHE 257config DM_CACHE
268 tristate "Cache target (EXPERIMENTAL)" 258 tristate "Cache target (EXPERIMENTAL)"
269 depends on BLK_DEV_DM 259 depends on BLK_DEV_DM
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 1e018e986610..0e385e40909e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -872,7 +872,7 @@ static void mq_destroy(struct dm_cache_policy *p)
872{ 872{
873 struct mq_policy *mq = to_mq_policy(p); 873 struct mq_policy *mq = to_mq_policy(p);
874 874
875 kfree(mq->table); 875 vfree(mq->table);
876 epool_exit(&mq->cache_pool); 876 epool_exit(&mq->cache_pool);
877 epool_exit(&mq->pre_cache_pool); 877 epool_exit(&mq->pre_cache_pool);
878 kfree(mq); 878 kfree(mq);
@@ -1245,7 +1245,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1245 1245
1246 mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); 1246 mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
1247 mq->hash_bits = ffs(mq->nr_buckets) - 1; 1247 mq->hash_bits = ffs(mq->nr_buckets) - 1;
1248 mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL); 1248 mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
1249 if (!mq->table) 1249 if (!mq->table)
1250 goto bad_alloc_table; 1250 goto bad_alloc_table;
1251 1251
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index afc3d017de4c..d6e88178d22c 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -546,6 +546,9 @@ static int read_exceptions(struct pstore *ps,
546 r = insert_exceptions(ps, area, callback, callback_context, 546 r = insert_exceptions(ps, area, callback, callback_context,
547 &full); 547 &full);
548 548
549 if (!full)
550 memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT);
551
549 dm_bufio_release(bp); 552 dm_bufio_release(bp);
550 553
551 dm_bufio_forget(client, chunk); 554 dm_bufio_forget(client, chunk);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index baa87ff12816..fb9efc829182 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -76,7 +76,7 @@
76 76
77#define THIN_SUPERBLOCK_MAGIC 27022010 77#define THIN_SUPERBLOCK_MAGIC 27022010
78#define THIN_SUPERBLOCK_LOCATION 0 78#define THIN_SUPERBLOCK_LOCATION 0
79#define THIN_VERSION 1 79#define THIN_VERSION 2
80#define THIN_METADATA_CACHE_SIZE 64 80#define THIN_METADATA_CACHE_SIZE 64
81#define SECTOR_TO_BLOCK_SHIFT 3 81#define SECTOR_TO_BLOCK_SHIFT 3
82 82
@@ -1755,3 +1755,38 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
1755 1755
1756 return r; 1756 return r;
1757} 1757}
1758
1759int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
1760{
1761 int r;
1762 struct dm_block *sblock;
1763 struct thin_disk_superblock *disk_super;
1764
1765 down_write(&pmd->root_lock);
1766 pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
1767
1768 r = superblock_lock(pmd, &sblock);
1769 if (r) {
1770 DMERR("couldn't read superblock");
1771 goto out;
1772 }
1773
1774 disk_super = dm_block_data(sblock);
1775 disk_super->flags = cpu_to_le32(pmd->flags);
1776
1777 dm_bm_unlock(sblock);
1778out:
1779 up_write(&pmd->root_lock);
1780 return r;
1781}
1782
1783bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
1784{
1785 bool needs_check;
1786
1787 down_read(&pmd->root_lock);
1788 needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
1789 up_read(&pmd->root_lock);
1790
1791 return needs_check;
1792}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 82ea384d36ff..e3c857db195a 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -25,6 +25,11 @@
25 25
26/*----------------------------------------------------------------*/ 26/*----------------------------------------------------------------*/
27 27
28/*
29 * Thin metadata superblock flags.
30 */
31#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0)
32
28struct dm_pool_metadata; 33struct dm_pool_metadata;
29struct dm_thin_device; 34struct dm_thin_device;
30 35
@@ -202,6 +207,12 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
202 dm_sm_threshold_fn fn, 207 dm_sm_threshold_fn fn,
203 void *context); 208 void *context);
204 209
210/*
211 * Updates the superblock immediately.
212 */
213int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
214bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
215
205/*----------------------------------------------------------------*/ 216/*----------------------------------------------------------------*/
206 217
207#endif 218#endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 7e84baccf0ad..be70d38745f7 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
130struct dm_thin_new_mapping; 130struct dm_thin_new_mapping;
131 131
132/* 132/*
133 * The pool runs in 3 modes. Ordered in degraded order for comparisons. 133 * The pool runs in 4 modes. Ordered in degraded order for comparisons.
134 */ 134 */
135enum pool_mode { 135enum pool_mode {
136 PM_WRITE, /* metadata may be changed */ 136 PM_WRITE, /* metadata may be changed */
137 PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
137 PM_READ_ONLY, /* metadata may not be changed */ 138 PM_READ_ONLY, /* metadata may not be changed */
138 PM_FAIL, /* all I/O fails */ 139 PM_FAIL, /* all I/O fails */
139}; 140};
@@ -198,7 +199,6 @@ struct pool {
198}; 199};
199 200
200static enum pool_mode get_pool_mode(struct pool *pool); 201static enum pool_mode get_pool_mode(struct pool *pool);
201static void out_of_data_space(struct pool *pool);
202static void metadata_operation_failed(struct pool *pool, const char *op, int r); 202static void metadata_operation_failed(struct pool *pool, const char *op, int r);
203 203
204/* 204/*
@@ -226,6 +226,7 @@ struct thin_c {
226 226
227 struct pool *pool; 227 struct pool *pool;
228 struct dm_thin_device *td; 228 struct dm_thin_device *td;
229 bool requeue_mode:1;
229}; 230};
230 231
231/*----------------------------------------------------------------*/ 232/*----------------------------------------------------------------*/
@@ -369,14 +370,18 @@ struct dm_thin_endio_hook {
369 struct dm_thin_new_mapping *overwrite_mapping; 370 struct dm_thin_new_mapping *overwrite_mapping;
370}; 371};
371 372
372static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) 373static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
373{ 374{
374 struct bio *bio; 375 struct bio *bio;
375 struct bio_list bios; 376 struct bio_list bios;
377 unsigned long flags;
376 378
377 bio_list_init(&bios); 379 bio_list_init(&bios);
380
381 spin_lock_irqsave(&tc->pool->lock, flags);
378 bio_list_merge(&bios, master); 382 bio_list_merge(&bios, master);
379 bio_list_init(master); 383 bio_list_init(master);
384 spin_unlock_irqrestore(&tc->pool->lock, flags);
380 385
381 while ((bio = bio_list_pop(&bios))) { 386 while ((bio = bio_list_pop(&bios))) {
382 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 387 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -391,12 +396,26 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
391static void requeue_io(struct thin_c *tc) 396static void requeue_io(struct thin_c *tc)
392{ 397{
393 struct pool *pool = tc->pool; 398 struct pool *pool = tc->pool;
399
400 requeue_bio_list(tc, &pool->deferred_bios);
401 requeue_bio_list(tc, &pool->retry_on_resume_list);
402}
403
404static void error_retry_list(struct pool *pool)
405{
406 struct bio *bio;
394 unsigned long flags; 407 unsigned long flags;
408 struct bio_list bios;
409
410 bio_list_init(&bios);
395 411
396 spin_lock_irqsave(&pool->lock, flags); 412 spin_lock_irqsave(&pool->lock, flags);
397 __requeue_bio_list(tc, &pool->deferred_bios); 413 bio_list_merge(&bios, &pool->retry_on_resume_list);
398 __requeue_bio_list(tc, &pool->retry_on_resume_list); 414 bio_list_init(&pool->retry_on_resume_list);
399 spin_unlock_irqrestore(&pool->lock, flags); 415 spin_unlock_irqrestore(&pool->lock, flags);
416
417 while ((bio = bio_list_pop(&bios)))
418 bio_io_error(bio);
400} 419}
401 420
402/* 421/*
@@ -925,13 +944,15 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
925 } 944 }
926} 945}
927 946
947static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
948
928static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 949static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
929{ 950{
930 int r; 951 int r;
931 dm_block_t free_blocks; 952 dm_block_t free_blocks;
932 struct pool *pool = tc->pool; 953 struct pool *pool = tc->pool;
933 954
934 if (get_pool_mode(pool) != PM_WRITE) 955 if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
935 return -EINVAL; 956 return -EINVAL;
936 957
937 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 958 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
@@ -958,7 +979,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
958 } 979 }
959 980
960 if (!free_blocks) { 981 if (!free_blocks) {
961 out_of_data_space(pool); 982 set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
962 return -ENOSPC; 983 return -ENOSPC;
963 } 984 }
964 } 985 }
@@ -988,15 +1009,32 @@ static void retry_on_resume(struct bio *bio)
988 spin_unlock_irqrestore(&pool->lock, flags); 1009 spin_unlock_irqrestore(&pool->lock, flags);
989} 1010}
990 1011
991static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) 1012static bool should_error_unserviceable_bio(struct pool *pool)
992{ 1013{
993 /* 1014 enum pool_mode m = get_pool_mode(pool);
994 * When pool is read-only, no cell locking is needed because 1015
995 * nothing is changing. 1016 switch (m) {
996 */ 1017 case PM_WRITE:
997 WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY); 1018 /* Shouldn't get here */
1019 DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1020 return true;
1021
1022 case PM_OUT_OF_DATA_SPACE:
1023 return pool->pf.error_if_no_space;
998 1024
999 if (pool->pf.error_if_no_space) 1025 case PM_READ_ONLY:
1026 case PM_FAIL:
1027 return true;
1028 default:
1029 /* Shouldn't get here */
1030 DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1031 return true;
1032 }
1033}
1034
1035static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1036{
1037 if (should_error_unserviceable_bio(pool))
1000 bio_io_error(bio); 1038 bio_io_error(bio);
1001 else 1039 else
1002 retry_on_resume(bio); 1040 retry_on_resume(bio);
@@ -1007,11 +1045,20 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
1007 struct bio *bio; 1045 struct bio *bio;
1008 struct bio_list bios; 1046 struct bio_list bios;
1009 1047
1048 if (should_error_unserviceable_bio(pool)) {
1049 cell_error(pool, cell);
1050 return;
1051 }
1052
1010 bio_list_init(&bios); 1053 bio_list_init(&bios);
1011 cell_release(pool, cell, &bios); 1054 cell_release(pool, cell, &bios);
1012 1055
1013 while ((bio = bio_list_pop(&bios))) 1056 if (should_error_unserviceable_bio(pool))
1014 handle_unserviceable_bio(pool, bio); 1057 while ((bio = bio_list_pop(&bios)))
1058 bio_io_error(bio);
1059 else
1060 while ((bio = bio_list_pop(&bios)))
1061 retry_on_resume(bio);
1015} 1062}
1016 1063
1017static void process_discard(struct thin_c *tc, struct bio *bio) 1064static void process_discard(struct thin_c *tc, struct bio *bio)
@@ -1296,6 +1343,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1296 } 1343 }
1297} 1344}
1298 1345
1346static void process_bio_success(struct thin_c *tc, struct bio *bio)
1347{
1348 bio_endio(bio, 0);
1349}
1350
1299static void process_bio_fail(struct thin_c *tc, struct bio *bio) 1351static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1300{ 1352{
1301 bio_io_error(bio); 1353 bio_io_error(bio);
@@ -1328,6 +1380,11 @@ static void process_deferred_bios(struct pool *pool)
1328 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1380 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1329 struct thin_c *tc = h->tc; 1381 struct thin_c *tc = h->tc;
1330 1382
1383 if (tc->requeue_mode) {
1384 bio_endio(bio, DM_ENDIO_REQUEUE);
1385 continue;
1386 }
1387
1331 /* 1388 /*
1332 * If we've got no free new_mapping structs, and processing 1389 * If we've got no free new_mapping structs, and processing
1333 * this bio might require one, we pause until there are some 1390 * this bio might require one, we pause until there are some
@@ -1394,51 +1451,134 @@ static void do_waker(struct work_struct *ws)
1394 1451
1395/*----------------------------------------------------------------*/ 1452/*----------------------------------------------------------------*/
1396 1453
1454struct noflush_work {
1455 struct work_struct worker;
1456 struct thin_c *tc;
1457
1458 atomic_t complete;
1459 wait_queue_head_t wait;
1460};
1461
1462static void complete_noflush_work(struct noflush_work *w)
1463{
1464 atomic_set(&w->complete, 1);
1465 wake_up(&w->wait);
1466}
1467
1468static void do_noflush_start(struct work_struct *ws)
1469{
1470 struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1471 w->tc->requeue_mode = true;
1472 requeue_io(w->tc);
1473 complete_noflush_work(w);
1474}
1475
1476static void do_noflush_stop(struct work_struct *ws)
1477{
1478 struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1479 w->tc->requeue_mode = false;
1480 complete_noflush_work(w);
1481}
1482
1483static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
1484{
1485 struct noflush_work w;
1486
1487 INIT_WORK(&w.worker, fn);
1488 w.tc = tc;
1489 atomic_set(&w.complete, 0);
1490 init_waitqueue_head(&w.wait);
1491
1492 queue_work(tc->pool->wq, &w.worker);
1493
1494 wait_event(w.wait, atomic_read(&w.complete));
1495}
1496
1497/*----------------------------------------------------------------*/
1498
1397static enum pool_mode get_pool_mode(struct pool *pool) 1499static enum pool_mode get_pool_mode(struct pool *pool)
1398{ 1500{
1399 return pool->pf.mode; 1501 return pool->pf.mode;
1400} 1502}
1401 1503
1504static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
1505{
1506 dm_table_event(pool->ti->table);
1507 DMINFO("%s: switching pool to %s mode",
1508 dm_device_name(pool->pool_md), new_mode);
1509}
1510
1402static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) 1511static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1403{ 1512{
1404 int r; 1513 struct pool_c *pt = pool->ti->private;
1405 enum pool_mode old_mode = pool->pf.mode; 1514 bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
1515 enum pool_mode old_mode = get_pool_mode(pool);
1516
1517 /*
1518 * Never allow the pool to transition to PM_WRITE mode if user
1519 * intervention is required to verify metadata and data consistency.
1520 */
1521 if (new_mode == PM_WRITE && needs_check) {
1522 DMERR("%s: unable to switch pool to write mode until repaired.",
1523 dm_device_name(pool->pool_md));
1524 if (old_mode != new_mode)
1525 new_mode = old_mode;
1526 else
1527 new_mode = PM_READ_ONLY;
1528 }
1529 /*
1530 * If we were in PM_FAIL mode, rollback of metadata failed. We're
1531 * not going to recover without a thin_repair. So we never let the
1532 * pool move out of the old mode.
1533 */
1534 if (old_mode == PM_FAIL)
1535 new_mode = old_mode;
1406 1536
1407 switch (new_mode) { 1537 switch (new_mode) {
1408 case PM_FAIL: 1538 case PM_FAIL:
1409 if (old_mode != new_mode) 1539 if (old_mode != new_mode)
1410 DMERR("%s: switching pool to failure mode", 1540 notify_of_pool_mode_change(pool, "failure");
1411 dm_device_name(pool->pool_md));
1412 dm_pool_metadata_read_only(pool->pmd); 1541 dm_pool_metadata_read_only(pool->pmd);
1413 pool->process_bio = process_bio_fail; 1542 pool->process_bio = process_bio_fail;
1414 pool->process_discard = process_bio_fail; 1543 pool->process_discard = process_bio_fail;
1415 pool->process_prepared_mapping = process_prepared_mapping_fail; 1544 pool->process_prepared_mapping = process_prepared_mapping_fail;
1416 pool->process_prepared_discard = process_prepared_discard_fail; 1545 pool->process_prepared_discard = process_prepared_discard_fail;
1546
1547 error_retry_list(pool);
1417 break; 1548 break;
1418 1549
1419 case PM_READ_ONLY: 1550 case PM_READ_ONLY:
1420 if (old_mode != new_mode) 1551 if (old_mode != new_mode)
1421 DMERR("%s: switching pool to read-only mode", 1552 notify_of_pool_mode_change(pool, "read-only");
1422 dm_device_name(pool->pool_md)); 1553 dm_pool_metadata_read_only(pool->pmd);
1423 r = dm_pool_abort_metadata(pool->pmd); 1554 pool->process_bio = process_bio_read_only;
1424 if (r) { 1555 pool->process_discard = process_bio_success;
1425 DMERR("%s: aborting transaction failed", 1556 pool->process_prepared_mapping = process_prepared_mapping_fail;
1426 dm_device_name(pool->pool_md)); 1557 pool->process_prepared_discard = process_prepared_discard_passdown;
1427 new_mode = PM_FAIL; 1558
1428 set_pool_mode(pool, new_mode); 1559 error_retry_list(pool);
1429 } else { 1560 break;
1430 dm_pool_metadata_read_only(pool->pmd); 1561
1431 pool->process_bio = process_bio_read_only; 1562 case PM_OUT_OF_DATA_SPACE:
1432 pool->process_discard = process_discard; 1563 /*
1433 pool->process_prepared_mapping = process_prepared_mapping_fail; 1564 * Ideally we'd never hit this state; the low water mark
1434 pool->process_prepared_discard = process_prepared_discard_passdown; 1565 * would trigger userland to extend the pool before we
1435 } 1566 * completely run out of data space. However, many small
1567 * IOs to unprovisioned space can consume data space at an
1568 * alarming rate. Adjust your low water mark if you're
1569 * frequently seeing this mode.
1570 */
1571 if (old_mode != new_mode)
1572 notify_of_pool_mode_change(pool, "out-of-data-space");
1573 pool->process_bio = process_bio_read_only;
1574 pool->process_discard = process_discard;
1575 pool->process_prepared_mapping = process_prepared_mapping;
1576 pool->process_prepared_discard = process_prepared_discard_passdown;
1436 break; 1577 break;
1437 1578
1438 case PM_WRITE: 1579 case PM_WRITE:
1439 if (old_mode != new_mode) 1580 if (old_mode != new_mode)
1440 DMINFO("%s: switching pool to write mode", 1581 notify_of_pool_mode_change(pool, "write");
1441 dm_device_name(pool->pool_md));
1442 dm_pool_metadata_read_write(pool->pmd); 1582 dm_pool_metadata_read_write(pool->pmd);
1443 pool->process_bio = process_bio; 1583 pool->process_bio = process_bio;
1444 pool->process_discard = process_discard; 1584 pool->process_discard = process_discard;
@@ -1448,32 +1588,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1448 } 1588 }
1449 1589
1450 pool->pf.mode = new_mode; 1590 pool->pf.mode = new_mode;
1591 /*
1592 * The pool mode may have changed, sync it so bind_control_target()
1593 * doesn't cause an unexpected mode transition on resume.
1594 */
1595 pt->adjusted_pf.mode = new_mode;
1451} 1596}
1452 1597
1453/* 1598static void abort_transaction(struct pool *pool)
1454 * Rather than calling set_pool_mode directly, use these which describe the
1455 * reason for mode degradation.
1456 */
1457static void out_of_data_space(struct pool *pool)
1458{ 1599{
1459 DMERR_LIMIT("%s: no free data space available.", 1600 const char *dev_name = dm_device_name(pool->pool_md);
1460 dm_device_name(pool->pool_md)); 1601
1461 set_pool_mode(pool, PM_READ_ONLY); 1602 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1603 if (dm_pool_abort_metadata(pool->pmd)) {
1604 DMERR("%s: failed to abort metadata transaction", dev_name);
1605 set_pool_mode(pool, PM_FAIL);
1606 }
1607
1608 if (dm_pool_metadata_set_needs_check(pool->pmd)) {
1609 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1610 set_pool_mode(pool, PM_FAIL);
1611 }
1462} 1612}
1463 1613
1464static void metadata_operation_failed(struct pool *pool, const char *op, int r) 1614static void metadata_operation_failed(struct pool *pool, const char *op, int r)
1465{ 1615{
1466 dm_block_t free_blocks;
1467
1468 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 1616 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1469 dm_device_name(pool->pool_md), op, r); 1617 dm_device_name(pool->pool_md), op, r);
1470 1618
1471 if (r == -ENOSPC && 1619 abort_transaction(pool);
1472 !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
1473 !free_blocks)
1474 DMERR_LIMIT("%s: no free metadata space available.",
1475 dm_device_name(pool->pool_md));
1476
1477 set_pool_mode(pool, PM_READ_ONLY); 1620 set_pool_mode(pool, PM_READ_ONLY);
1478} 1621}
1479 1622
@@ -1524,6 +1667,11 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1524 1667
1525 thin_hook_bio(tc, bio); 1668 thin_hook_bio(tc, bio);
1526 1669
1670 if (tc->requeue_mode) {
1671 bio_endio(bio, DM_ENDIO_REQUEUE);
1672 return DM_MAPIO_SUBMITTED;
1673 }
1674
1527 if (get_pool_mode(tc->pool) == PM_FAIL) { 1675 if (get_pool_mode(tc->pool) == PM_FAIL) {
1528 bio_io_error(bio); 1676 bio_io_error(bio);
1529 return DM_MAPIO_SUBMITTED; 1677 return DM_MAPIO_SUBMITTED;
@@ -1687,7 +1835,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1687 /* 1835 /*
1688 * We want to make sure that a pool in PM_FAIL mode is never upgraded. 1836 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
1689 */ 1837 */
1690 enum pool_mode old_mode = pool->pf.mode; 1838 enum pool_mode old_mode = get_pool_mode(pool);
1691 enum pool_mode new_mode = pt->adjusted_pf.mode; 1839 enum pool_mode new_mode = pt->adjusted_pf.mode;
1692 1840
1693 /* 1841 /*
@@ -1701,16 +1849,6 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1701 pool->pf = pt->adjusted_pf; 1849 pool->pf = pt->adjusted_pf;
1702 pool->low_water_blocks = pt->low_water_blocks; 1850 pool->low_water_blocks = pt->low_water_blocks;
1703 1851
1704 /*
1705 * If we were in PM_FAIL mode, rollback of metadata failed. We're
1706 * not going to recover without a thin_repair. So we never let the
1707 * pool move out of the old mode. On the other hand a PM_READ_ONLY
1708 * may have been due to a lack of metadata or data space, and may
1709 * now work (ie. if the underlying devices have been resized).
1710 */
1711 if (old_mode == PM_FAIL)
1712 new_mode = old_mode;
1713
1714 set_pool_mode(pool, new_mode); 1852 set_pool_mode(pool, new_mode);
1715 1853
1716 return 0; 1854 return 0;
@@ -2253,6 +2391,12 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
2253 return -EINVAL; 2391 return -EINVAL;
2254 2392
2255 } else if (data_size > sb_data_size) { 2393 } else if (data_size > sb_data_size) {
2394 if (dm_pool_metadata_needs_check(pool->pmd)) {
2395 DMERR("%s: unable to grow the data device until repaired.",
2396 dm_device_name(pool->pool_md));
2397 return 0;
2398 }
2399
2256 if (sb_data_size) 2400 if (sb_data_size)
2257 DMINFO("%s: growing the data device from %llu to %llu blocks", 2401 DMINFO("%s: growing the data device from %llu to %llu blocks",
2258 dm_device_name(pool->pool_md), 2402 dm_device_name(pool->pool_md),
@@ -2294,6 +2438,12 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
2294 return -EINVAL; 2438 return -EINVAL;
2295 2439
2296 } else if (metadata_dev_size > sb_metadata_dev_size) { 2440 } else if (metadata_dev_size > sb_metadata_dev_size) {
2441 if (dm_pool_metadata_needs_check(pool->pmd)) {
2442 DMERR("%s: unable to grow the metadata device until repaired.",
2443 dm_device_name(pool->pool_md));
2444 return 0;
2445 }
2446
2297 warn_if_metadata_device_too_big(pool->md_dev); 2447 warn_if_metadata_device_too_big(pool->md_dev);
2298 DMINFO("%s: growing the metadata device from %llu to %llu blocks", 2448 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
2299 dm_device_name(pool->pool_md), 2449 dm_device_name(pool->pool_md),
@@ -2681,7 +2831,9 @@ static void pool_status(struct dm_target *ti, status_type_t type,
2681 else 2831 else
2682 DMEMIT("- "); 2832 DMEMIT("- ");
2683 2833
2684 if (pool->pf.mode == PM_READ_ONLY) 2834 if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
2835 DMEMIT("out_of_data_space ");
2836 else if (pool->pf.mode == PM_READ_ONLY)
2685 DMEMIT("ro "); 2837 DMEMIT("ro ");
2686 else 2838 else
2687 DMEMIT("rw "); 2839 DMEMIT("rw ");
@@ -2795,7 +2947,7 @@ static struct target_type pool_target = {
2795 .name = "thin-pool", 2947 .name = "thin-pool",
2796 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2948 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2797 DM_TARGET_IMMUTABLE, 2949 DM_TARGET_IMMUTABLE,
2798 .version = {1, 10, 0}, 2950 .version = {1, 11, 0},
2799 .module = THIS_MODULE, 2951 .module = THIS_MODULE,
2800 .ctr = pool_ctr, 2952 .ctr = pool_ctr,
2801 .dtr = pool_dtr, 2953 .dtr = pool_dtr,
@@ -2997,10 +3149,23 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
2997 return 0; 3149 return 0;
2998} 3150}
2999 3151
3000static void thin_postsuspend(struct dm_target *ti) 3152static void thin_presuspend(struct dm_target *ti)
3001{ 3153{
3154 struct thin_c *tc = ti->private;
3155
3002 if (dm_noflush_suspending(ti)) 3156 if (dm_noflush_suspending(ti))
3003 requeue_io((struct thin_c *)ti->private); 3157 noflush_work(tc, do_noflush_start);
3158}
3159
3160static void thin_postsuspend(struct dm_target *ti)
3161{
3162 struct thin_c *tc = ti->private;
3163
3164 /*
3165 * The dm_noflush_suspending flag has been cleared by now, so
3166 * unfortunately we must always run this.
3167 */
3168 noflush_work(tc, do_noflush_stop);
3004} 3169}
3005 3170
3006/* 3171/*
@@ -3085,12 +3250,13 @@ static int thin_iterate_devices(struct dm_target *ti,
3085 3250
3086static struct target_type thin_target = { 3251static struct target_type thin_target = {
3087 .name = "thin", 3252 .name = "thin",
3088 .version = {1, 10, 0}, 3253 .version = {1, 11, 0},
3089 .module = THIS_MODULE, 3254 .module = THIS_MODULE,
3090 .ctr = thin_ctr, 3255 .ctr = thin_ctr,
3091 .dtr = thin_dtr, 3256 .dtr = thin_dtr,
3092 .map = thin_map, 3257 .map = thin_map,
3093 .end_io = thin_endio, 3258 .end_io = thin_endio,
3259 .presuspend = thin_presuspend,
3094 .postsuspend = thin_postsuspend, 3260 .postsuspend = thin_postsuspend,
3095 .status = thin_status, 3261 .status = thin_status,
3096 .iterate_devices = thin_iterate_devices, 3262 .iterate_devices = thin_iterate_devices,
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index 19b268795415..0c2dec7aec20 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -6,3 +6,13 @@ config DM_PERSISTENT_DATA
6 ---help--- 6 ---help---
7 Library providing immutable on-disk data structure support for 7 Library providing immutable on-disk data structure support for
8 device-mapper targets such as the thin provisioning target. 8 device-mapper targets such as the thin provisioning target.
9
10config DM_DEBUG_BLOCK_STACK_TRACING
11 boolean "Keep stack trace of persistent data block lock holders"
12 depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
13 select STACKTRACE
14 ---help---
15 Enable this for messages that may help debug problems with the
16 block manager locking used by thin provisioning and caching.
17
18 If unsure, say N.
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index e9bdd462f4f5..786b689bdfc7 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -91,6 +91,69 @@ struct block_op {
91 dm_block_t block; 91 dm_block_t block;
92}; 92};
93 93
94struct bop_ring_buffer {
95 unsigned begin;
96 unsigned end;
97 struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
98};
99
100static void brb_init(struct bop_ring_buffer *brb)
101{
102 brb->begin = 0;
103 brb->end = 0;
104}
105
106static bool brb_empty(struct bop_ring_buffer *brb)
107{
108 return brb->begin == brb->end;
109}
110
111static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
112{
113 unsigned r = old + 1;
114 return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
115}
116
117static int brb_push(struct bop_ring_buffer *brb,
118 enum block_op_type type, dm_block_t b)
119{
120 struct block_op *bop;
121 unsigned next = brb_next(brb, brb->end);
122
123 /*
124 * We don't allow the last bop to be filled, this way we can
125 * differentiate between full and empty.
126 */
127 if (next == brb->begin)
128 return -ENOMEM;
129
130 bop = brb->bops + brb->end;
131 bop->type = type;
132 bop->block = b;
133
134 brb->end = next;
135
136 return 0;
137}
138
139static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
140{
141 struct block_op *bop;
142
143 if (brb_empty(brb))
144 return -ENODATA;
145
146 bop = brb->bops + brb->begin;
147 result->type = bop->type;
148 result->block = bop->block;
149
150 brb->begin = brb_next(brb, brb->begin);
151
152 return 0;
153}
154
155/*----------------------------------------------------------------*/
156
94struct sm_metadata { 157struct sm_metadata {
95 struct dm_space_map sm; 158 struct dm_space_map sm;
96 159
@@ -101,25 +164,20 @@ struct sm_metadata {
101 164
102 unsigned recursion_count; 165 unsigned recursion_count;
103 unsigned allocated_this_transaction; 166 unsigned allocated_this_transaction;
104 unsigned nr_uncommitted; 167 struct bop_ring_buffer uncommitted;
105 struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
106 168
107 struct threshold threshold; 169 struct threshold threshold;
108}; 170};
109 171
110static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) 172static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
111{ 173{
112 struct block_op *op; 174 int r = brb_push(&smm->uncommitted, type, b);
113 175
114 if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) { 176 if (r) {
115 DMERR("too many recursive allocations"); 177 DMERR("too many recursive allocations");
116 return -ENOMEM; 178 return -ENOMEM;
117 } 179 }
118 180
119 op = smm->uncommitted + smm->nr_uncommitted++;
120 op->type = type;
121 op->block = b;
122
123 return 0; 181 return 0;
124} 182}
125 183
@@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm)
158 return -ENOMEM; 216 return -ENOMEM;
159 } 217 }
160 218
161 if (smm->recursion_count == 1 && smm->nr_uncommitted) { 219 if (smm->recursion_count == 1) {
162 while (smm->nr_uncommitted && !r) { 220 while (!brb_empty(&smm->uncommitted)) {
163 smm->nr_uncommitted--; 221 struct block_op bop;
164 r = commit_bop(smm, smm->uncommitted + 222
165 smm->nr_uncommitted); 223 r = brb_pop(&smm->uncommitted, &bop);
224 if (r) {
225 DMERR("bug in bop ring buffer");
226 break;
227 }
228
229 r = commit_bop(smm, &bop);
166 if (r) 230 if (r)
167 break; 231 break;
168 } 232 }
@@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
217static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, 281static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
218 uint32_t *result) 282 uint32_t *result)
219{ 283{
220 int r, i; 284 int r;
285 unsigned i;
221 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); 286 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
222 unsigned adjustment = 0; 287 unsigned adjustment = 0;
223 288
@@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
225 * We may have some uncommitted adjustments to add. This list 290 * We may have some uncommitted adjustments to add. This list
226 * should always be really short. 291 * should always be really short.
227 */ 292 */
228 for (i = 0; i < smm->nr_uncommitted; i++) { 293 for (i = smm->uncommitted.begin;
229 struct block_op *op = smm->uncommitted + i; 294 i != smm->uncommitted.end;
295 i = brb_next(&smm->uncommitted, i)) {
296 struct block_op *op = smm->uncommitted.bops + i;
230 297
231 if (op->block != b) 298 if (op->block != b)
232 continue; 299 continue;
@@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
254static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, 321static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
255 dm_block_t b, int *result) 322 dm_block_t b, int *result)
256{ 323{
257 int r, i, adjustment = 0; 324 int r, adjustment = 0;
325 unsigned i;
258 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); 326 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
259 uint32_t rc; 327 uint32_t rc;
260 328
@@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
262 * We may have some uncommitted adjustments to add. This list 330 * We may have some uncommitted adjustments to add. This list
263 * should always be really short. 331 * should always be really short.
264 */ 332 */
265 for (i = 0; i < smm->nr_uncommitted; i++) { 333 for (i = smm->uncommitted.begin;
266 struct block_op *op = smm->uncommitted + i; 334 i != smm->uncommitted.end;
335 i = brb_next(&smm->uncommitted, i)) {
336
337 struct block_op *op = smm->uncommitted.bops + i;
267 338
268 if (op->block != b) 339 if (op->block != b)
269 continue; 340 continue;
@@ -671,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
671 smm->begin = superblock + 1; 742 smm->begin = superblock + 1;
672 smm->recursion_count = 0; 743 smm->recursion_count = 0;
673 smm->allocated_this_transaction = 0; 744 smm->allocated_this_transaction = 0;
674 smm->nr_uncommitted = 0; 745 brb_init(&smm->uncommitted);
675 threshold_init(&smm->threshold); 746 threshold_init(&smm->threshold);
676 747
677 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); 748 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
@@ -715,7 +786,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
715 smm->begin = 0; 786 smm->begin = 0;
716 smm->recursion_count = 0; 787 smm->recursion_count = 0;
717 smm->allocated_this_transaction = 0; 788 smm->allocated_this_transaction = 0;
718 smm->nr_uncommitted = 0; 789 brb_init(&smm->uncommitted);
719 threshold_init(&smm->threshold); 790 threshold_init(&smm->threshold);
720 791
721 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); 792 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));