diff options
author | Joe Thornber <ejt@redhat.com> | 2018-09-10 11:50:09 -0400 |
---|---|---|
committer | Mike Snitzer <snitzer@redhat.com> | 2018-09-10 17:03:18 -0400 |
commit | 3ab91828166895600efd9cdc3a0eb32001f7204a (patch) | |
tree | b0a357366b35d36d697c8f6f1d2f6fede0074b63 | |
parent | 5380c05b682991a6818c3755d450a3e87eeac0e5 (diff) |
dm thin metadata: try to avoid ever aborting transactions
Committing a transaction can consume some metadata of its own, so we now
reserve a small amount of metadata to cover this. Free metadata
reported by the kernel will not include this reserve.
If any of the reserve has been used after a commit we enter a new
internal state PM_OUT_OF_METADATA_SPACE. This is reported as
PM_READ_ONLY, so no userland changes are needed. If the metadata
device is resized the pool will move back to PM_WRITE.
These changes mean we never need to abort and rollback a transaction due
to running out of metadata space. This is particularly important
because there have been a handful of reports of data corruption against
DM thin-provisioning that can all be attributed to the thin-pool having
run out of metadata space.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
-rw-r--r-- | drivers/md/dm-thin-metadata.c | 36 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 73 |
2 files changed, 100 insertions, 9 deletions
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 72142021b5c9..74f6770c70b1 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c | |||
@@ -189,6 +189,12 @@ struct dm_pool_metadata { | |||
189 | sector_t data_block_size; | 189 | sector_t data_block_size; |
190 | 190 | ||
191 | /* | 191 | /* |
192 | * We reserve a section of the metadata for commit overhead. | ||
193 | * All reported space does *not* include this. | ||
194 | */ | ||
195 | dm_block_t metadata_reserve; | ||
196 | |||
197 | /* | ||
192 | * Set if a transaction has to be aborted but the attempt to roll back | 198 | * Set if a transaction has to be aborted but the attempt to roll back |
193 | * to the previous (good) transaction failed. The only pool metadata | 199 | * to the previous (good) transaction failed. The only pool metadata |
194 | * operation possible in this state is the closing of the device. | 200 | * operation possible in this state is the closing of the device. |
@@ -816,6 +822,22 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) | |||
816 | return dm_tm_commit(pmd->tm, sblock); | 822 | return dm_tm_commit(pmd->tm, sblock); |
817 | } | 823 | } |
818 | 824 | ||
825 | static void __set_metadata_reserve(struct dm_pool_metadata *pmd) | ||
826 | { | ||
827 | int r; | ||
828 | dm_block_t total; | ||
829 | dm_block_t max_blocks = 4096; /* 16M */ | ||
830 | |||
831 | r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total); | ||
832 | if (r) { | ||
833 | DMERR("could not get size of metadata device"); | ||
834 | pmd->metadata_reserve = max_blocks; | ||
835 | } else { | ||
836 | sector_div(total, 10); | ||
837 | pmd->metadata_reserve = min(max_blocks, total); | ||
838 | } | ||
839 | } | ||
840 | |||
819 | struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, | 841 | struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, |
820 | sector_t data_block_size, | 842 | sector_t data_block_size, |
821 | bool format_device) | 843 | bool format_device) |
@@ -849,6 +871,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, | |||
849 | return ERR_PTR(r); | 871 | return ERR_PTR(r); |
850 | } | 872 | } |
851 | 873 | ||
874 | __set_metadata_reserve(pmd); | ||
875 | |||
852 | return pmd; | 876 | return pmd; |
853 | } | 877 | } |
854 | 878 | ||
@@ -1820,6 +1844,13 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, | |||
1820 | down_read(&pmd->root_lock); | 1844 | down_read(&pmd->root_lock); |
1821 | if (!pmd->fail_io) | 1845 | if (!pmd->fail_io) |
1822 | r = dm_sm_get_nr_free(pmd->metadata_sm, result); | 1846 | r = dm_sm_get_nr_free(pmd->metadata_sm, result); |
1847 | |||
1848 | if (!r) { | ||
1849 | if (*result < pmd->metadata_reserve) | ||
1850 | *result = 0; | ||
1851 | else | ||
1852 | *result -= pmd->metadata_reserve; | ||
1853 | } | ||
1823 | up_read(&pmd->root_lock); | 1854 | up_read(&pmd->root_lock); |
1824 | 1855 | ||
1825 | return r; | 1856 | return r; |
@@ -1932,8 +1963,11 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou | |||
1932 | int r = -EINVAL; | 1963 | int r = -EINVAL; |
1933 | 1964 | ||
1934 | down_write(&pmd->root_lock); | 1965 | down_write(&pmd->root_lock); |
1935 | if (!pmd->fail_io) | 1966 | if (!pmd->fail_io) { |
1936 | r = __resize_space_map(pmd->metadata_sm, new_count); | 1967 | r = __resize_space_map(pmd->metadata_sm, new_count); |
1968 | if (!r) | ||
1969 | __set_metadata_reserve(pmd); | ||
1970 | } | ||
1937 | up_write(&pmd->root_lock); | 1971 | up_write(&pmd->root_lock); |
1938 | 1972 | ||
1939 | return r; | 1973 | return r; |
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 7bd60a150f8f..aaf1ad481ee8 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c | |||
@@ -200,7 +200,13 @@ struct dm_thin_new_mapping; | |||
200 | enum pool_mode { | 200 | enum pool_mode { |
201 | PM_WRITE, /* metadata may be changed */ | 201 | PM_WRITE, /* metadata may be changed */ |
202 | PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */ | 202 | PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */ |
203 | |||
204 | /* | ||
205 | * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY. | ||
206 | */ | ||
207 | PM_OUT_OF_METADATA_SPACE, | ||
203 | PM_READ_ONLY, /* metadata may not be changed */ | 208 | PM_READ_ONLY, /* metadata may not be changed */ |
209 | |||
204 | PM_FAIL, /* all I/O fails */ | 210 | PM_FAIL, /* all I/O fails */ |
205 | }; | 211 | }; |
206 | 212 | ||
@@ -1371,7 +1377,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode); | |||
1371 | 1377 | ||
1372 | static void requeue_bios(struct pool *pool); | 1378 | static void requeue_bios(struct pool *pool); |
1373 | 1379 | ||
1374 | static void check_for_space(struct pool *pool) | 1380 | static bool is_read_only_pool_mode(enum pool_mode mode) |
1381 | { | ||
1382 | return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY); | ||
1383 | } | ||
1384 | |||
1385 | static bool is_read_only(struct pool *pool) | ||
1386 | { | ||
1387 | return is_read_only_pool_mode(get_pool_mode(pool)); | ||
1388 | } | ||
1389 | |||
1390 | static void check_for_metadata_space(struct pool *pool) | ||
1391 | { | ||
1392 | int r; | ||
1393 | const char *ooms_reason = NULL; | ||
1394 | dm_block_t nr_free; | ||
1395 | |||
1396 | r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free); | ||
1397 | if (r) | ||
1398 | ooms_reason = "Could not get free metadata blocks"; | ||
1399 | else if (!nr_free) | ||
1400 | ooms_reason = "No free metadata blocks"; | ||
1401 | |||
1402 | if (ooms_reason && !is_read_only(pool)) { | ||
1403 | DMERR("%s", ooms_reason); | ||
1404 | set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE); | ||
1405 | } | ||
1406 | } | ||
1407 | |||
1408 | static void check_for_data_space(struct pool *pool) | ||
1375 | { | 1409 | { |
1376 | int r; | 1410 | int r; |
1377 | dm_block_t nr_free; | 1411 | dm_block_t nr_free; |
@@ -1397,14 +1431,16 @@ static int commit(struct pool *pool) | |||
1397 | { | 1431 | { |
1398 | int r; | 1432 | int r; |
1399 | 1433 | ||
1400 | if (get_pool_mode(pool) >= PM_READ_ONLY) | 1434 | if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) |
1401 | return -EINVAL; | 1435 | return -EINVAL; |
1402 | 1436 | ||
1403 | r = dm_pool_commit_metadata(pool->pmd); | 1437 | r = dm_pool_commit_metadata(pool->pmd); |
1404 | if (r) | 1438 | if (r) |
1405 | metadata_operation_failed(pool, "dm_pool_commit_metadata", r); | 1439 | metadata_operation_failed(pool, "dm_pool_commit_metadata", r); |
1406 | else | 1440 | else { |
1407 | check_for_space(pool); | 1441 | check_for_metadata_space(pool); |
1442 | check_for_data_space(pool); | ||
1443 | } | ||
1408 | 1444 | ||
1409 | return r; | 1445 | return r; |
1410 | } | 1446 | } |
@@ -1470,6 +1506,19 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | |||
1470 | return r; | 1506 | return r; |
1471 | } | 1507 | } |
1472 | 1508 | ||
1509 | r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks); | ||
1510 | if (r) { | ||
1511 | metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r); | ||
1512 | return r; | ||
1513 | } | ||
1514 | |||
1515 | if (!free_blocks) { | ||
1516 | /* Let's commit before we use up the metadata reserve. */ | ||
1517 | r = commit(pool); | ||
1518 | if (r) | ||
1519 | return r; | ||
1520 | } | ||
1521 | |||
1473 | return 0; | 1522 | return 0; |
1474 | } | 1523 | } |
1475 | 1524 | ||
@@ -1501,6 +1550,7 @@ static blk_status_t should_error_unserviceable_bio(struct pool *pool) | |||
1501 | case PM_OUT_OF_DATA_SPACE: | 1550 | case PM_OUT_OF_DATA_SPACE: |
1502 | return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0; | 1551 | return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0; |
1503 | 1552 | ||
1553 | case PM_OUT_OF_METADATA_SPACE: | ||
1504 | case PM_READ_ONLY: | 1554 | case PM_READ_ONLY: |
1505 | case PM_FAIL: | 1555 | case PM_FAIL: |
1506 | return BLK_STS_IOERR; | 1556 | return BLK_STS_IOERR; |
@@ -2464,8 +2514,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) | |||
2464 | error_retry_list(pool); | 2514 | error_retry_list(pool); |
2465 | break; | 2515 | break; |
2466 | 2516 | ||
2517 | case PM_OUT_OF_METADATA_SPACE: | ||
2467 | case PM_READ_ONLY: | 2518 | case PM_READ_ONLY: |
2468 | if (old_mode != new_mode) | 2519 | if (!is_read_only_pool_mode(old_mode)) |
2469 | notify_of_pool_mode_change(pool, "read-only"); | 2520 | notify_of_pool_mode_change(pool, "read-only"); |
2470 | dm_pool_metadata_read_only(pool->pmd); | 2521 | dm_pool_metadata_read_only(pool->pmd); |
2471 | pool->process_bio = process_bio_read_only; | 2522 | pool->process_bio = process_bio_read_only; |
@@ -3403,6 +3454,10 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) | |||
3403 | DMINFO("%s: growing the metadata device from %llu to %llu blocks", | 3454 | DMINFO("%s: growing the metadata device from %llu to %llu blocks", |
3404 | dm_device_name(pool->pool_md), | 3455 | dm_device_name(pool->pool_md), |
3405 | sb_metadata_dev_size, metadata_dev_size); | 3456 | sb_metadata_dev_size, metadata_dev_size); |
3457 | |||
3458 | if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE) | ||
3459 | set_pool_mode(pool, PM_WRITE); | ||
3460 | |||
3406 | r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); | 3461 | r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); |
3407 | if (r) { | 3462 | if (r) { |
3408 | metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r); | 3463 | metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r); |
@@ -3707,7 +3762,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv, | |||
3707 | struct pool_c *pt = ti->private; | 3762 | struct pool_c *pt = ti->private; |
3708 | struct pool *pool = pt->pool; | 3763 | struct pool *pool = pt->pool; |
3709 | 3764 | ||
3710 | if (get_pool_mode(pool) >= PM_READ_ONLY) { | 3765 | if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) { |
3711 | DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode", | 3766 | DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode", |
3712 | dm_device_name(pool->pool_md)); | 3767 | dm_device_name(pool->pool_md)); |
3713 | return -EOPNOTSUPP; | 3768 | return -EOPNOTSUPP; |
@@ -3781,6 +3836,7 @@ static void pool_status(struct dm_target *ti, status_type_t type, | |||
3781 | dm_block_t nr_blocks_data; | 3836 | dm_block_t nr_blocks_data; |
3782 | dm_block_t nr_blocks_metadata; | 3837 | dm_block_t nr_blocks_metadata; |
3783 | dm_block_t held_root; | 3838 | dm_block_t held_root; |
3839 | enum pool_mode mode; | ||
3784 | char buf[BDEVNAME_SIZE]; | 3840 | char buf[BDEVNAME_SIZE]; |
3785 | char buf2[BDEVNAME_SIZE]; | 3841 | char buf2[BDEVNAME_SIZE]; |
3786 | struct pool_c *pt = ti->private; | 3842 | struct pool_c *pt = ti->private; |
@@ -3851,9 +3907,10 @@ static void pool_status(struct dm_target *ti, status_type_t type, | |||
3851 | else | 3907 | else |
3852 | DMEMIT("- "); | 3908 | DMEMIT("- "); |
3853 | 3909 | ||
3854 | if (pool->pf.mode == PM_OUT_OF_DATA_SPACE) | 3910 | mode = get_pool_mode(pool); |
3911 | if (mode == PM_OUT_OF_DATA_SPACE) | ||
3855 | DMEMIT("out_of_data_space "); | 3912 | DMEMIT("out_of_data_space "); |
3856 | else if (pool->pf.mode == PM_READ_ONLY) | 3913 | else if (is_read_only_pool_mode(mode)) |
3857 | DMEMIT("ro "); | 3914 | DMEMIT("ro "); |
3858 | else | 3915 | else |
3859 | DMEMIT("rw "); | 3916 | DMEMIT("rw "); |