author		Joe Thornber <ejt@redhat.com>		2014-03-03 11:03:26 -0500
committer	Mike Snitzer <snitzer@redhat.com>	2014-03-05 15:26:58 -0500
commit		3e1a0699095803e53072699a4a1485af7744601d (patch)
tree		17fda0c238e26a828081d5c80b6c15a2d34079f5 /drivers/md
parent		07f2b6e0382ec4c59887d5954683f1a0b265574e (diff)
dm thin: fix out of data space handling
Ideally a thin pool would never run out of data space; the low water mark would trigger userland to extend the pool before we completely run out of space. However, many small random IOs to unprovisioned space can consume data space at an alarming rate. Adjust your low water mark if you're frequently seeing "out-of-data-space" mode.

Before this fix, if data space ran out the pool would be put in PM_READ_ONLY mode, which also aborted the pool's current metadata transaction (losing any changes in that transaction). This had the side effect of needlessly compromising data consistency. And the retry of queued unserviceable bios, once the data pool was resized, could initiate changes to potentially inconsistent pool metadata.

Now when the pool's data space is exhausted, it transitions to a new pool mode (PM_OUT_OF_DATA_SPACE) that allows metadata to be changed but data not to be allocated. This allows users to remove thin volumes or discard data to recover data space.

The pool is no longer put in PM_READ_ONLY mode in response to running out of data space. And PM_READ_ONLY mode no longer aborts the pool's current metadata transaction. Also, set_pool_mode() now notifies userspace when the pool mode changes.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
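For userland, the visible effects of this change are the dm_table_event() raised by set_pool_mode() and the new "out_of_data_space" token in the pool's status line. The sketch below shows how a monitoring tool might classify the mode token reported by "dmsetup status"; classify_pool_mode() and the sample status fragment are hypothetical, and only the three tokens ("out_of_data_space", "ro", "rw") come from this patch's pool_status() change.

#include <stdio.h>
#include <string.h>

/* Hypothetical userland helper: map the mode token that pool_status()
 * emits to an operator-facing description.  The token names come from
 * the DMEMIT() calls in this patch; the parsing itself is illustrative. */
static const char *classify_pool_mode(const char *status_line)
{
        if (strstr(status_line, "out_of_data_space"))
                return "out of data space: extend the pool, delete thin volumes, or discard data";
        if (strstr(status_line, " ro "))
                return "read-only: metadata may not be changed";
        return "read-write: normal operation";
}

int main(void)
{
        /* Example fragment; a real tool would capture the output of
         * "dmsetup status <pool>" after waiting on a table event. */
        const char *line = "0 409600 thin-pool 5 ... out_of_data_space ";

        printf("%s\n", classify_pool_mode(line));
        return 0;
}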
Diffstat (limited to 'drivers/md')
-rw-r--r--	drivers/md/dm-thin.c	147
1 file changed, 102 insertions(+), 45 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index a04eba905922..38a063f7afa4 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 struct dm_thin_new_mapping;
 
 /*
- * The pool runs in 3 modes. Ordered in degraded order for comparisons.
+ * The pool runs in 4 modes. Ordered in degraded order for comparisons.
  */
 enum pool_mode {
         PM_WRITE,               /* metadata may be changed */
+        PM_OUT_OF_DATA_SPACE,   /* metadata may be changed, though data may not be allocated */
         PM_READ_ONLY,           /* metadata may not be changed */
         PM_FAIL,                /* all I/O fails */
 };
@@ -198,7 +199,6 @@ struct pool {
 };
 
 static enum pool_mode get_pool_mode(struct pool *pool);
-static void out_of_data_space(struct pool *pool);
 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
 
 /*
@@ -399,6 +399,23 @@ static void requeue_io(struct thin_c *tc)
         spin_unlock_irqrestore(&pool->lock, flags);
 }
 
+static void error_retry_list(struct pool *pool)
+{
+        struct bio *bio;
+        unsigned long flags;
+        struct bio_list bios;
+
+        bio_list_init(&bios);
+
+        spin_lock_irqsave(&pool->lock, flags);
+        bio_list_merge(&bios, &pool->retry_on_resume_list);
+        bio_list_init(&pool->retry_on_resume_list);
+        spin_unlock_irqrestore(&pool->lock, flags);
+
+        while ((bio = bio_list_pop(&bios)))
+                bio_io_error(bio);
+}
+
 /*
  * This section of code contains the logic for processing a thin device's IO.
  * Much of the code depends on pool object resources (lists, workqueues, etc)
@@ -925,13 +942,15 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
         }
 }
 
+static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
+
 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 {
         int r;
         dm_block_t free_blocks;
         struct pool *pool = tc->pool;
 
-        if (get_pool_mode(pool) != PM_WRITE)
+        if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
                 return -EINVAL;
 
         r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
@@ -958,7 +977,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
                 }
 
                 if (!free_blocks) {
-                        out_of_data_space(pool);
+                        set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
                         return -ENOSPC;
                 }
         }
@@ -988,15 +1007,32 @@ static void retry_on_resume(struct bio *bio)
         spin_unlock_irqrestore(&pool->lock, flags);
 }
 
-static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+static bool should_error_unserviceable_bio(struct pool *pool)
 {
-        /*
-         * When pool is read-only, no cell locking is needed because
-         * nothing is changing.
-         */
-        WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
+        enum pool_mode m = get_pool_mode(pool);
+
+        switch (m) {
+        case PM_WRITE:
+                /* Shouldn't get here */
+                DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
+                return true;
+
+        case PM_OUT_OF_DATA_SPACE:
+                return pool->pf.error_if_no_space;
+
+        case PM_READ_ONLY:
+        case PM_FAIL:
+                return true;
+        default:
+                /* Shouldn't get here */
+                DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
+                return true;
+        }
+}
 
-        if (pool->pf.error_if_no_space)
+static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+{
+        if (should_error_unserviceable_bio(pool))
                 bio_io_error(bio);
         else
                 retry_on_resume(bio);
@@ -1007,11 +1043,20 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
         struct bio *bio;
         struct bio_list bios;
 
+        if (should_error_unserviceable_bio(pool)) {
+                cell_error(pool, cell);
+                return;
+        }
+
         bio_list_init(&bios);
         cell_release(pool, cell, &bios);
 
-        while ((bio = bio_list_pop(&bios)))
-                handle_unserviceable_bio(pool, bio);
+        if (should_error_unserviceable_bio(pool))
+                while ((bio = bio_list_pop(&bios)))
+                        bio_io_error(bio);
+        else
+                while ((bio = bio_list_pop(&bios)))
+                        retry_on_resume(bio);
 }
 
 static void process_discard(struct thin_c *tc, struct bio *bio)
@@ -1296,6 +1341,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
         }
 }
 
+static void process_bio_success(struct thin_c *tc, struct bio *bio)
+{
+        bio_endio(bio, 0);
+}
+
 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
 {
         bio_io_error(bio);
@@ -1399,9 +1449,15 @@ static enum pool_mode get_pool_mode(struct pool *pool)
         return pool->pf.mode;
 }
 
+static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
+{
+        dm_table_event(pool->ti->table);
+        DMINFO("%s: switching pool to %s mode",
+               dm_device_name(pool->pool_md), new_mode);
+}
+
 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 {
-        int r;
         struct pool_c *pt = pool->ti->private;
         bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
         enum pool_mode old_mode = get_pool_mode(pool);
@@ -1429,38 +1485,48 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
         switch (new_mode) {
         case PM_FAIL:
                 if (old_mode != new_mode)
-                        DMERR("%s: switching pool to failure mode",
-                              dm_device_name(pool->pool_md));
+                        notify_of_pool_mode_change(pool, "failure");
                 dm_pool_metadata_read_only(pool->pmd);
                 pool->process_bio = process_bio_fail;
                 pool->process_discard = process_bio_fail;
                 pool->process_prepared_mapping = process_prepared_mapping_fail;
                 pool->process_prepared_discard = process_prepared_discard_fail;
+
+                error_retry_list(pool);
                 break;
 
         case PM_READ_ONLY:
                 if (old_mode != new_mode)
-                        DMERR("%s: switching pool to read-only mode",
-                              dm_device_name(pool->pool_md));
-                r = dm_pool_abort_metadata(pool->pmd);
-                if (r) {
-                        DMERR("%s: aborting transaction failed",
-                              dm_device_name(pool->pool_md));
-                        new_mode = PM_FAIL;
-                        set_pool_mode(pool, new_mode);
-                } else {
-                        dm_pool_metadata_read_only(pool->pmd);
-                        pool->process_bio = process_bio_read_only;
-                        pool->process_discard = process_discard;
-                        pool->process_prepared_mapping = process_prepared_mapping_fail;
-                        pool->process_prepared_discard = process_prepared_discard_passdown;
-                }
+                        notify_of_pool_mode_change(pool, "read-only");
+                dm_pool_metadata_read_only(pool->pmd);
+                pool->process_bio = process_bio_read_only;
+                pool->process_discard = process_bio_success;
+                pool->process_prepared_mapping = process_prepared_mapping_fail;
+                pool->process_prepared_discard = process_prepared_discard_passdown;
+
+                error_retry_list(pool);
+                break;
+
+        case PM_OUT_OF_DATA_SPACE:
+                /*
+                 * Ideally we'd never hit this state; the low water mark
+                 * would trigger userland to extend the pool before we
+                 * completely run out of data space. However, many small
+                 * IOs to unprovisioned space can consume data space at an
+                 * alarming rate. Adjust your low water mark if you're
+                 * frequently seeing this mode.
+                 */
+                if (old_mode != new_mode)
+                        notify_of_pool_mode_change(pool, "out-of-data-space");
+                pool->process_bio = process_bio_read_only;
+                pool->process_discard = process_discard;
+                pool->process_prepared_mapping = process_prepared_mapping;
+                pool->process_prepared_discard = process_prepared_discard_passdown;
                 break;
 
         case PM_WRITE:
                 if (old_mode != new_mode)
-                        DMINFO("%s: switching pool to write mode",
-                               dm_device_name(pool->pool_md));
+                        notify_of_pool_mode_change(pool, "write");
                 dm_pool_metadata_read_write(pool->pmd);
                 pool->process_bio = process_bio;
                 pool->process_discard = process_discard;
@@ -1477,17 +1543,6 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
         pt->adjusted_pf.mode = new_mode;
 }
 
-/*
- * Rather than calling set_pool_mode directly, use these which describe the
- * reason for mode degradation.
- */
-static void out_of_data_space(struct pool *pool)
-{
-        DMERR_LIMIT("%s: no free data space available.",
-                    dm_device_name(pool->pool_md));
-        set_pool_mode(pool, PM_READ_ONLY);
-}
-
 static void abort_transaction(struct pool *pool)
 {
         const char *dev_name = dm_device_name(pool->pool_md);
@@ -2719,7 +2774,9 @@ static void pool_status(struct dm_target *ti, status_type_t type,
         else
                 DMEMIT("- ");
 
-        if (pool->pf.mode == PM_READ_ONLY)
+        if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
+                DMEMIT("out_of_data_space ");
+        else if (pool->pf.mode == PM_READ_ONLY)
                 DMEMIT("ro ");
         else
                 DMEMIT("rw ");
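The enum comment in the first hunk ("Ordered in degraded order for comparisons") is load-bearing: because PM_WRITE < PM_OUT_OF_DATA_SPACE < PM_READ_ONLY < PM_FAIL, a plain numeric comparison tells whether a mode change is a degradation. A standalone sketch under that assumption follows; is_degradation() is a hypothetical helper, not part of this patch.

#include <stdbool.h>
#include <stdio.h>

/* Mirror of the patched enum; order matters, each successive mode is
 * more degraded than the previous one. */
enum pool_mode {
        PM_WRITE,               /* metadata may be changed */
        PM_OUT_OF_DATA_SPACE,   /* metadata may be changed, though data may not be allocated */
        PM_READ_ONLY,           /* metadata may not be changed */
        PM_FAIL,                /* all I/O fails */
};

/* Hypothetical helper: a numerically larger mode is more degraded. */
static bool is_degradation(enum pool_mode old_mode, enum pool_mode new_mode)
{
        return new_mode > old_mode;
}

int main(void)
{
        /* Running out of data space degrades a writable pool ... */
        printf("%d\n", is_degradation(PM_WRITE, PM_OUT_OF_DATA_SPACE)); /* 1 */
        /* ... while returning to PM_WRITE after a resize does not. */
        printf("%d\n", is_degradation(PM_OUT_OF_DATA_SPACE, PM_WRITE)); /* 0 */
        return 0;
}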