aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoe Thornber <ejt@redhat.com>2018-09-10 11:50:09 -0400
committerMike Snitzer <snitzer@redhat.com>2018-09-10 17:03:18 -0400
commit3ab91828166895600efd9cdc3a0eb32001f7204a (patch)
treeb0a357366b35d36d697c8f6f1d2f6fede0074b63
parent5380c05b682991a6818c3755d450a3e87eeac0e5 (diff)
dm thin metadata: try to avoid ever aborting transactions
Committing a transaction can consume some metadata of its own; we now reserve a small amount of metadata to cover this. Free metadata reported by the kernel will not include this reserve. If any of the reserve has been used after a commit we enter a new internal state PM_OUT_OF_METADATA_SPACE. This is reported as PM_READ_ONLY, so no userland changes are needed. If the metadata device is resized the pool will move back to PM_WRITE. These changes mean we never need to abort and rollback a transaction due to running out of metadata space. This is particularly important because there have been a handful of reports of data corruption against DM thin-provisioning that can all be attributed to the thin-pool having run out of metadata space. Signed-off-by: Joe Thornber <ejt@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
-rw-r--r--drivers/md/dm-thin-metadata.c36
-rw-r--r--drivers/md/dm-thin.c73
2 files changed, 100 insertions, 9 deletions
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 72142021b5c9..74f6770c70b1 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -189,6 +189,12 @@ struct dm_pool_metadata {
189 sector_t data_block_size; 189 sector_t data_block_size;
190 190
191 /* 191 /*
192 * We reserve a section of the metadata for commit overhead.
193 * All reported space does *not* include this.
194 */
195 dm_block_t metadata_reserve;
196
197 /*
192 * Set if a transaction has to be aborted but the attempt to roll back 198 * Set if a transaction has to be aborted but the attempt to roll back
193 * to the previous (good) transaction failed. The only pool metadata 199 * to the previous (good) transaction failed. The only pool metadata
194 * operation possible in this state is the closing of the device. 200 * operation possible in this state is the closing of the device.
@@ -816,6 +822,22 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
816 return dm_tm_commit(pmd->tm, sblock); 822 return dm_tm_commit(pmd->tm, sblock);
817} 823}
818 824
825static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
826{
827 int r;
828 dm_block_t total;
829 dm_block_t max_blocks = 4096; /* 16M */
830
831 r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
832 if (r) {
833 DMERR("could not get size of metadata device");
834 pmd->metadata_reserve = max_blocks;
835 } else {
836 sector_div(total, 10);
837 pmd->metadata_reserve = min(max_blocks, total);
838 }
839}
840
819struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, 841struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
820 sector_t data_block_size, 842 sector_t data_block_size,
821 bool format_device) 843 bool format_device)
@@ -849,6 +871,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
849 return ERR_PTR(r); 871 return ERR_PTR(r);
850 } 872 }
851 873
874 __set_metadata_reserve(pmd);
875
852 return pmd; 876 return pmd;
853} 877}
854 878
@@ -1820,6 +1844,13 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
1820 down_read(&pmd->root_lock); 1844 down_read(&pmd->root_lock);
1821 if (!pmd->fail_io) 1845 if (!pmd->fail_io)
1822 r = dm_sm_get_nr_free(pmd->metadata_sm, result); 1846 r = dm_sm_get_nr_free(pmd->metadata_sm, result);
1847
1848 if (!r) {
1849 if (*result < pmd->metadata_reserve)
1850 *result = 0;
1851 else
1852 *result -= pmd->metadata_reserve;
1853 }
1823 up_read(&pmd->root_lock); 1854 up_read(&pmd->root_lock);
1824 1855
1825 return r; 1856 return r;
@@ -1932,8 +1963,11 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
1932 int r = -EINVAL; 1963 int r = -EINVAL;
1933 1964
1934 down_write(&pmd->root_lock); 1965 down_write(&pmd->root_lock);
1935 if (!pmd->fail_io) 1966 if (!pmd->fail_io) {
1936 r = __resize_space_map(pmd->metadata_sm, new_count); 1967 r = __resize_space_map(pmd->metadata_sm, new_count);
1968 if (!r)
1969 __set_metadata_reserve(pmd);
1970 }
1937 up_write(&pmd->root_lock); 1971 up_write(&pmd->root_lock);
1938 1972
1939 return r; 1973 return r;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 7bd60a150f8f..aaf1ad481ee8 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -200,7 +200,13 @@ struct dm_thin_new_mapping;
200enum pool_mode { 200enum pool_mode {
201 PM_WRITE, /* metadata may be changed */ 201 PM_WRITE, /* metadata may be changed */
202 PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */ 202 PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
203
204 /*
205 * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
206 */
207 PM_OUT_OF_METADATA_SPACE,
203 PM_READ_ONLY, /* metadata may not be changed */ 208 PM_READ_ONLY, /* metadata may not be changed */
209
204 PM_FAIL, /* all I/O fails */ 210 PM_FAIL, /* all I/O fails */
205}; 211};
206 212
@@ -1371,7 +1377,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
1371 1377
1372static void requeue_bios(struct pool *pool); 1378static void requeue_bios(struct pool *pool);
1373 1379
1374static void check_for_space(struct pool *pool) 1380static bool is_read_only_pool_mode(enum pool_mode mode)
1381{
1382 return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
1383}
1384
1385static bool is_read_only(struct pool *pool)
1386{
1387 return is_read_only_pool_mode(get_pool_mode(pool));
1388}
1389
1390static void check_for_metadata_space(struct pool *pool)
1391{
1392 int r;
1393 const char *ooms_reason = NULL;
1394 dm_block_t nr_free;
1395
1396 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
1397 if (r)
1398 ooms_reason = "Could not get free metadata blocks";
1399 else if (!nr_free)
1400 ooms_reason = "No free metadata blocks";
1401
1402 if (ooms_reason && !is_read_only(pool)) {
1403 DMERR("%s", ooms_reason);
1404 set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
1405 }
1406}
1407
1408static void check_for_data_space(struct pool *pool)
1375{ 1409{
1376 int r; 1410 int r;
1377 dm_block_t nr_free; 1411 dm_block_t nr_free;
@@ -1397,14 +1431,16 @@ static int commit(struct pool *pool)
1397{ 1431{
1398 int r; 1432 int r;
1399 1433
1400 if (get_pool_mode(pool) >= PM_READ_ONLY) 1434 if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
1401 return -EINVAL; 1435 return -EINVAL;
1402 1436
1403 r = dm_pool_commit_metadata(pool->pmd); 1437 r = dm_pool_commit_metadata(pool->pmd);
1404 if (r) 1438 if (r)
1405 metadata_operation_failed(pool, "dm_pool_commit_metadata", r); 1439 metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
1406 else 1440 else {
1407 check_for_space(pool); 1441 check_for_metadata_space(pool);
1442 check_for_data_space(pool);
1443 }
1408 1444
1409 return r; 1445 return r;
1410} 1446}
@@ -1470,6 +1506,19 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1470 return r; 1506 return r;
1471 } 1507 }
1472 1508
1509 r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
1510 if (r) {
1511 metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
1512 return r;
1513 }
1514
1515 if (!free_blocks) {
1516 /* Let's commit before we use up the metadata reserve. */
1517 r = commit(pool);
1518 if (r)
1519 return r;
1520 }
1521
1473 return 0; 1522 return 0;
1474} 1523}
1475 1524
@@ -1501,6 +1550,7 @@ static blk_status_t should_error_unserviceable_bio(struct pool *pool)
1501 case PM_OUT_OF_DATA_SPACE: 1550 case PM_OUT_OF_DATA_SPACE:
1502 return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0; 1551 return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
1503 1552
1553 case PM_OUT_OF_METADATA_SPACE:
1504 case PM_READ_ONLY: 1554 case PM_READ_ONLY:
1505 case PM_FAIL: 1555 case PM_FAIL:
1506 return BLK_STS_IOERR; 1556 return BLK_STS_IOERR;
@@ -2464,8 +2514,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
2464 error_retry_list(pool); 2514 error_retry_list(pool);
2465 break; 2515 break;
2466 2516
2517 case PM_OUT_OF_METADATA_SPACE:
2467 case PM_READ_ONLY: 2518 case PM_READ_ONLY:
2468 if (old_mode != new_mode) 2519 if (!is_read_only_pool_mode(old_mode))
2469 notify_of_pool_mode_change(pool, "read-only"); 2520 notify_of_pool_mode_change(pool, "read-only");
2470 dm_pool_metadata_read_only(pool->pmd); 2521 dm_pool_metadata_read_only(pool->pmd);
2471 pool->process_bio = process_bio_read_only; 2522 pool->process_bio = process_bio_read_only;
@@ -3403,6 +3454,10 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
3403 DMINFO("%s: growing the metadata device from %llu to %llu blocks", 3454 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
3404 dm_device_name(pool->pool_md), 3455 dm_device_name(pool->pool_md),
3405 sb_metadata_dev_size, metadata_dev_size); 3456 sb_metadata_dev_size, metadata_dev_size);
3457
3458 if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
3459 set_pool_mode(pool, PM_WRITE);
3460
3406 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); 3461 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
3407 if (r) { 3462 if (r) {
3408 metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r); 3463 metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
@@ -3707,7 +3762,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv,
3707 struct pool_c *pt = ti->private; 3762 struct pool_c *pt = ti->private;
3708 struct pool *pool = pt->pool; 3763 struct pool *pool = pt->pool;
3709 3764
3710 if (get_pool_mode(pool) >= PM_READ_ONLY) { 3765 if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
3711 DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode", 3766 DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
3712 dm_device_name(pool->pool_md)); 3767 dm_device_name(pool->pool_md));
3713 return -EOPNOTSUPP; 3768 return -EOPNOTSUPP;
@@ -3781,6 +3836,7 @@ static void pool_status(struct dm_target *ti, status_type_t type,
3781 dm_block_t nr_blocks_data; 3836 dm_block_t nr_blocks_data;
3782 dm_block_t nr_blocks_metadata; 3837 dm_block_t nr_blocks_metadata;
3783 dm_block_t held_root; 3838 dm_block_t held_root;
3839 enum pool_mode mode;
3784 char buf[BDEVNAME_SIZE]; 3840 char buf[BDEVNAME_SIZE];
3785 char buf2[BDEVNAME_SIZE]; 3841 char buf2[BDEVNAME_SIZE];
3786 struct pool_c *pt = ti->private; 3842 struct pool_c *pt = ti->private;
@@ -3851,9 +3907,10 @@ static void pool_status(struct dm_target *ti, status_type_t type,
3851 else 3907 else
3852 DMEMIT("- "); 3908 DMEMIT("- ");
3853 3909
3854 if (pool->pf.mode == PM_OUT_OF_DATA_SPACE) 3910 mode = get_pool_mode(pool);
3911 if (mode == PM_OUT_OF_DATA_SPACE)
3855 DMEMIT("out_of_data_space "); 3912 DMEMIT("out_of_data_space ");
3856 else if (pool->pf.mode == PM_READ_ONLY) 3913 else if (is_read_only_pool_mode(mode))
3857 DMEMIT("ro "); 3914 DMEMIT("ro ");
3858 else 3915 else
3859 DMEMIT("rw "); 3916 DMEMIT("rw ");