author     Mike Snitzer <snitzer@redhat.com>   2014-03-21 18:33:41 -0400
committer  Mike Snitzer <snitzer@redhat.com>   2014-04-04 14:53:03 -0400
commit     67324ea18812bc952ef96892fbd5817b9050413f (patch)
tree       00c8f83280e10b94f3d4cbdf15d468b74cc6438a
parent     c140e1c4e23bdaf0a5c00b6a8b6d18f259d39a00 (diff)
dm thin: sort the per thin deferred bios using an rb_tree
A thin-pool allocates blocks in FIFO order for all thin devices that
share the thin-pool. Because of this simplistic allocation, the
thin-pool's space can become fragmented quite easily, especially when
multiple threads request blocks in parallel.

Sort each thin device's deferred_bio_list by logical sector to help
reduce fragmentation of the thin-pool's on-disk layout.
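
The effect is easy to picture outside the kernel: issuing queued
requests in ascending sector order turns an interleaved scatter of
allocations into a single sweep across the device. Below is a minimal
user-space sketch of the sorting step (the struct and sector values are
hypothetical, and the patch itself inserts bios into an rb_tree one at
a time under the thin device's lock rather than calling qsort):

	/*
	 * Hypothetical user-space analogue: sort pending requests by
	 * start sector so they are submitted as one ascending sweep.
	 */
	#include <stdio.h>
	#include <stdlib.h>

	struct pending_io {
		unsigned long long sector;	/* logical start sector */
	};

	static int cmp_by_sector(const void *a, const void *b)
	{
		const struct pending_io *x = a, *y = b;

		if (x->sector < y->sector)
			return -1;
		return x->sector > y->sector;
	}

	int main(void)
	{
		/* requests arriving interleaved from several writer threads */
		struct pending_io q[] = { {1024}, {8}, {4096}, {16}, {2048} };
		size_t i, n = sizeof(q) / sizeof(q[0]);

		qsort(q, n, sizeof(q[0]), cmp_by_sector);

		for (i = 0; i < n; i++)	/* now issued in ascending sector order */
			printf("submit sector %llu\n", q[i].sector);
		return 0;
	}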
The following tables illustrate the gains realized by sorting each thin
device's deferred_bio_list. A random read of "io size" from the device
would result in "seeks/io" fragments being read, with an average
distance of "distance/seek" between each fragment.
Data was written to a single thin device using multiple threads via
iozone (8 threads, 64K for both the block_size and io_size).
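
To make the two metrics concrete, here is a hypothetical sketch of how
seeks/io and distance/seek could be derived from the physical fragments
backing one read; the fragment sectors are invented and the measurement
tooling used for the tables below is not part of this commit:

	/*
	 * Given the physical start sector of each 64K (128-sector)
	 * fragment read back-to-back, count discontiguities (seeks)
	 * and average the gap between them. Values are invented.
	 */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long frag[] = { 0, 128, 40960, 41088, 90112 };
		size_t i, n = sizeof(frag) / sizeof(frag[0]);
		unsigned long long dist = 0;
		unsigned int seeks = 0;

		for (i = 1; i < n; i++) {
			unsigned long long expected = frag[i - 1] + 128;

			if (frag[i] != expected) {	/* not contiguous: a seek */
				dist += frag[i] > expected ? frag[i] - expected
							   : expected - frag[i];
				seeks++;
			}
		}

		printf("seeks: %u\n", seeks);
		if (seeks)
			printf("avg distance/seek: %llu sectors\n", dist / seeks);
		return 0;
	}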
unsorted:

   io size   seeks/io   distance/seek
   -------------------------------------
      4k       0.000            0b
     16k       0.013           11m
     64k       0.065           11m
    256k       0.274           10m
      1m       1.109           10m
      4m       4.411           10m
     16m      17.097           11m
     64m      60.055           13m
    256m     148.798           25m
      1g     809.929           21m
sorted:

   io size   seeks/io   distance/seek
   -------------------------------------
      4k       0.000            0b
     16k       0.000            1g
     64k       0.001            1g
    256k       0.003            1g
      1m       0.011            1g
      4m       0.045            1g
     16m       0.181            1g
     64m       0.747         1011m
    256m       3.299            1g
      1g      14.373            1g
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>
-rw-r--r--  drivers/md/dm-thin.c | 84
1 file changed, 82 insertions(+), 2 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 08e62aef361d..53728be84dee 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/rbtree.h>
 
 #define DM_MSG_PREFIX "thin"
 
@@ -230,6 +231,7 @@ struct thin_c {
 	spinlock_t lock;
 	struct bio_list deferred_bio_list;
 	struct bio_list retry_on_resume_list;
+	struct rb_root sort_bio_list; /* sorted list of deferred bios */
 };
 
 /*----------------------------------------------------------------*/
@@ -371,6 +373,7 @@ struct dm_thin_endio_hook {
 	struct dm_deferred_entry *shared_read_entry;
 	struct dm_deferred_entry *all_io_entry;
 	struct dm_thin_new_mapping *overwrite_mapping;
+	struct rb_node rb_node;
 };
 
 static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
@@ -1367,12 +1370,77 @@ static int need_commit_due_to_time(struct pool *pool)
 	       jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
 }
 
+#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
+#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
+
+static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
+{
+	struct rb_node **rbp, *parent;
+	struct dm_thin_endio_hook *pbd;
+	sector_t bi_sector = bio->bi_iter.bi_sector;
+
+	rbp = &tc->sort_bio_list.rb_node;
+	parent = NULL;
+	while (*rbp) {
+		parent = *rbp;
+		pbd = thin_pbd(parent);
+
+		if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
+			rbp = &(*rbp)->rb_left;
+		else
+			rbp = &(*rbp)->rb_right;
+	}
+
+	pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+	rb_link_node(&pbd->rb_node, parent, rbp);
+	rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
+}
+
+static void __extract_sorted_bios(struct thin_c *tc)
+{
+	struct rb_node *node;
+	struct dm_thin_endio_hook *pbd;
+	struct bio *bio;
+
+	for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
+		pbd = thin_pbd(node);
+		bio = thin_bio(pbd);
+
+		bio_list_add(&tc->deferred_bio_list, bio);
+		rb_erase(&pbd->rb_node, &tc->sort_bio_list);
+	}
+
+	WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
+}
+
+static void __sort_thin_deferred_bios(struct thin_c *tc)
+{
+	struct bio *bio;
+	struct bio_list bios;
+
+	bio_list_init(&bios);
+	bio_list_merge(&bios, &tc->deferred_bio_list);
+	bio_list_init(&tc->deferred_bio_list);
+
+	/* Sort deferred_bio_list using rb-tree */
+	while ((bio = bio_list_pop(&bios)))
+		__thin_bio_rb_add(tc, bio);
+
+	/*
+	 * Transfer the sorted bios in sort_bio_list back to
+	 * deferred_bio_list to allow lockless submission of
+	 * all bios.
+	 */
+	__extract_sorted_bios(tc);
+}
+
 static void process_thin_deferred_bios(struct thin_c *tc)
 {
 	struct pool *pool = tc->pool;
 	unsigned long flags;
 	struct bio *bio;
 	struct bio_list bios;
+	struct blk_plug plug;
 
 	if (tc->requeue_mode) {
 		requeue_bio_list(tc, &tc->deferred_bio_list);
@@ -1382,10 +1450,20 @@ static void process_thin_deferred_bios(struct thin_c *tc)
 	bio_list_init(&bios);
 
 	spin_lock_irqsave(&tc->lock, flags);
+
+	if (bio_list_empty(&tc->deferred_bio_list)) {
+		spin_unlock_irqrestore(&tc->lock, flags);
+		return;
+	}
+
+	__sort_thin_deferred_bios(tc);
+
 	bio_list_merge(&bios, &tc->deferred_bio_list);
 	bio_list_init(&tc->deferred_bio_list);
+
 	spin_unlock_irqrestore(&tc->lock, flags);
 
+	blk_start_plug(&plug);
 	while ((bio = bio_list_pop(&bios))) {
 		/*
 		 * If we've got no free new_mapping structs, and processing
@@ -1405,6 +1483,7 @@ static void process_thin_deferred_bios(struct thin_c *tc)
 		else
 			pool->process_bio(tc, bio);
 	}
+	blk_finish_plug(&plug);
 }
 
 static void process_deferred_bios(struct pool *pool)
@@ -2964,7 +3043,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 11, 0},
+	.version = {1, 12, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -3040,6 +3119,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	spin_lock_init(&tc->lock);
 	bio_list_init(&tc->deferred_bio_list);
 	bio_list_init(&tc->retry_on_resume_list);
+	tc->sort_bio_list = RB_ROOT;
 
 	if (argc == 3) {
 		r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
@@ -3287,7 +3367,7 @@ static int thin_iterate_devices(struct dm_target *ti,
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 11, 0},
+	.version = {1, 12, 0},
 	.module = THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
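
A closing note on the thin_pbd()/thin_bio() macros added above:
rb_entry() is container_of(), so embedding a struct rb_node in the
per-bio data is enough to recover the surrounding dm_thin_endio_hook
during the tree walk, and dm_bio_from_per_bio_data() then maps that
hook back to its bio. A minimal user-space sketch of the same embedding
pattern (the stub struct names are hypothetical):

	/*
	 * container_of pattern behind thin_pbd(): an rb_node embedded
	 * in a larger struct is mapped back to that struct by
	 * subtracting the member's offset. Stub names are hypothetical.
	 */
	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct rb_node_stub { int color; };	/* stand-in for struct rb_node */

	struct endio_hook_stub {
		unsigned long long sector;
		struct rb_node_stub rb_node;	/* embedded, as in the patch */
	};

	int main(void)
	{
		struct endio_hook_stub hook = { .sector = 42 };
		struct rb_node_stub *node = &hook.rb_node;	/* what rb_first() would hand back */
		struct endio_hook_stub *pbd =
			container_of(node, struct endio_hook_stub, rb_node);

		printf("recovered sector %llu\n", pbd->sector);	/* prints 42 */
		return 0;
	}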