path: root/drivers/md
author     Joe Thornber <ejt@redhat.com>        2012-12-21 15:23:31 -0500
committer  Alasdair G Kergon <agk@redhat.com>   2012-12-21 15:23:31 -0500
commit     e8088073c9610af017fd47fddd104a2c3afb32e8 (patch)
tree       4478742b0e876536f20031faaaeebe5b2bfc135a /drivers/md
parent     b7ca9c9273e5eebd63880dd8a6e4e5c18fc7901d (diff)
dm thin: fix race between simultaneous io and discards to same block
There is a race when discard bios and non-discard bios are issued
simultaneously to the same block.

Discard support is expensive for all thin devices precisely because you
have to be careful to quiesce the area you're discarding.  DM thin must
handle this conflicting IO pattern (simultaneous non-discard vs discard)
even though a sane application shouldn't be issuing such IO.

The race manifests as follows:

1. A non-discard bio is mapped in thin_bio_map.
   This doesn't lock out parallel activity to the same block.

2. A discard bio is issued to the same block as the non-discard bio.

3. The discard bio is locked in a dm_bio_prison_cell in process_discard
   to lock out parallel activity against the same block.

4. The non-discard bio's mapping continues and its all_io_entry is
   incremented so the bio is accounted for in the thin pool's all_io_ds
   which is a dm_deferred_set used to track time locality of non-discard IO.

5. The non-discard bio is finally locked in a dm_bio_prison_cell in
   process_bio.

The race can result in deadlock, leaving the block layer hanging waiting
for completion of a discard bio that never completes, e.g.:

INFO: task ruby:15354 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
ruby            D ffffffff8160f0e0     0 15354  15314 0x00000000
 ffff8802fb08bc58 0000000000000082 ffff8802fb08bfd8 0000000000012900
 ffff8802fb08a010 0000000000012900 0000000000012900 0000000000012900
 ffff8802fb08bfd8 0000000000012900 ffff8803324b9480 ffff88032c6f14c0
Call Trace:
 [<ffffffff814e5a19>] schedule+0x29/0x70
 [<ffffffff814e3d85>] schedule_timeout+0x195/0x220
 [<ffffffffa06b9bc1>] ? _dm_request+0x111/0x160 [dm_mod]
 [<ffffffff814e589e>] wait_for_common+0x11e/0x190
 [<ffffffff8107a170>] ? try_to_wake_up+0x2b0/0x2b0
 [<ffffffff814e59ed>] wait_for_completion+0x1d/0x20
 [<ffffffff81233289>] blkdev_issue_discard+0x219/0x260
 [<ffffffff81233e79>] blkdev_ioctl+0x6e9/0x7b0
 [<ffffffff8119a65c>] block_ioctl+0x3c/0x40
 [<ffffffff8117539c>] do_vfs_ioctl+0x8c/0x340
 [<ffffffff8119a547>] ? block_llseek+0x67/0xb0
 [<ffffffff811756f1>] sys_ioctl+0xa1/0xb0
 [<ffffffff810561f6>] ? sys_rt_sigprocmask+0x86/0xd0
 [<ffffffff814ef099>] system_call_fastpath+0x16/0x1b

The thinp-test-suite's test_discard_random_sectors reliably hits this
deadlock on fast SSD storage.

The fix for this race is that the all_io_entry for a bio must be
incremented whilst the dm_bio_prison_cell is held for the bio's
associated virtual and physical blocks.  That cell locking wasn't
occurring early enough in thin_bio_map.  This patch fixes this.

Care is taken to always call the new function inc_all_io_entry() with
the relevant cells locked, but they are generally unlocked before
calling issue() to try to avoid holding the cells locked across
generic_submit_request.

Also, now that thin_bio_map may lock bios in a cell, process_bio() is
no longer the only thread that will do so.  Because of this we must be
sure to use cell_defer_except() to release all non-holder entries, that
were added by the other thread, because they must be deferred.

This patch depends on "dm thin: replace dm_cell_release_singleton with
cell_defer_except".

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Cc: stable@vger.kernel.org
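In outline, the patch makes thin_bio_map() detain the bio in cells for both
the virtual block and the data block before bumping the pool's all_io_ds
deferred set, and only then releases the cells and remaps the bio.  A
condensed sketch of that ordering, lifted from the thin_bio_map() hunk in the
diff below with explanatory comments added (error handling and the
surrounding switch logic are elided):

	/* Sketch of the new ordering in thin_bio_map(); see the full hunk below. */
	build_virtual_key(tc->td, block, &key);
	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1))
		return DM_MAPIO_SUBMITTED;	/* another bio holds the virtual-block cell: defer */

	build_data_key(tc->td, result.block, &key);
	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2)) {
		cell_defer_except(tc, cell1);	/* back out of the first cell */
		return DM_MAPIO_SUBMITTED;
	}

	inc_all_io_entry(tc->pool, bio);	/* account the bio while both cells are held */
	cell_defer_except(tc, cell2);		/* release before issuing, per the commit message */
	cell_defer_except(tc, cell1);

	remap(tc, bio, result.block);
	return DM_MAPIO_REMAPPED;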
Diffstat (limited to 'drivers/md')
-rw-r--r--   drivers/md/dm-thin.c   84
1 file changed, 59 insertions(+), 25 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 25dfd2311a61..41c9e81ba74a 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -368,6 +368,17 @@ static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
 		dm_thin_changed_this_transaction(tc->td);
 }
 
+static void inc_all_io_entry(struct pool *pool, struct bio *bio)
+{
+	struct dm_thin_endio_hook *h;
+
+	if (bio->bi_rw & REQ_DISCARD)
+		return;
+
+	h = dm_get_mapinfo(bio)->ptr;
+	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
+}
+
 static void issue(struct thin_c *tc, struct bio *bio)
 {
 	struct pool *pool = tc->pool;
@@ -596,13 +607,15 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
 {
 	struct thin_c *tc = m->tc;
 
+	inc_all_io_entry(tc->pool, m->bio);
+	cell_defer_except(tc, m->cell);
+	cell_defer_except(tc, m->cell2);
+
 	if (m->pass_discard)
 		remap_and_issue(tc, m->bio, m->data_block);
 	else
 		bio_endio(m->bio, 0);
 
-	cell_defer_except(tc, m->cell);
-	cell_defer_except(tc, m->cell2);
 	mempool_free(m, tc->pool->mapping_pool);
 }
 
@@ -710,6 +723,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 		h->overwrite_mapping = m;
 		m->bio = bio;
 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
+		inc_all_io_entry(pool, bio);
 		remap_and_issue(tc, bio, data_dest);
 	} else {
 		struct dm_io_region from, to;
@@ -779,6 +793,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 		h->overwrite_mapping = m;
 		m->bio = bio;
 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
+		inc_all_io_entry(pool, bio);
 		remap_and_issue(tc, bio, data_block);
 	} else {
 		int r;
@@ -961,13 +976,15 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 				wake_worker(pool);
 			}
 		} else {
+			inc_all_io_entry(pool, bio);
+			cell_defer_except(tc, cell);
+			cell_defer_except(tc, cell2);
+
 			/*
 			 * The DM core makes sure that the discard doesn't span
 			 * a block boundary.  So we submit the discard of a
 			 * partial block appropriately.
 			 */
-			cell_defer_except(tc, cell);
-			cell_defer_except(tc, cell2);
 			if ((!lookup_result.shared) && pool->pf.discard_passdown)
 				remap_and_issue(tc, bio, lookup_result.block);
 			else
@@ -1039,8 +1056,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
 		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
 
 		h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
-
+		inc_all_io_entry(pool, bio);
 		cell_defer_except(tc, cell);
+
 		remap_and_issue(tc, bio, lookup_result->block);
 	}
 }
@@ -1055,7 +1073,9 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 	 * Remap empty bios (flushes) immediately, without provisioning.
 	 */
 	if (!bio->bi_size) {
+		inc_all_io_entry(tc->pool, bio);
 		cell_defer_except(tc, cell);
+
 		remap_and_issue(tc, bio, 0);
 		return;
 	}
@@ -1110,26 +1130,22 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
 	switch (r) {
 	case 0:
-		/*
-		 * We can release this cell now.  This thread is the only
-		 * one that puts bios into a cell, and we know there were
-		 * no preceding bios.
-		 */
-		/*
-		 * TODO: this will probably have to change when discard goes
-		 * back in.
-		 */
-		cell_defer_except(tc, cell);
-
-		if (lookup_result.shared)
+		if (lookup_result.shared) {
 			process_shared_bio(tc, bio, block, &lookup_result);
-		else
+			cell_defer_except(tc, cell);
+		} else {
+			inc_all_io_entry(tc->pool, bio);
+			cell_defer_except(tc, cell);
+
 			remap_and_issue(tc, bio, lookup_result.block);
+		}
 		break;
 
 	case -ENODATA:
 		if (bio_data_dir(bio) == READ && tc->origin_dev) {
+			inc_all_io_entry(tc->pool, bio);
 			cell_defer_except(tc, cell);
+
 			remap_to_origin_and_issue(tc, bio);
 		} else
 			provision_block(tc, bio, block, cell);
@@ -1155,8 +1171,10 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
 	case 0:
 		if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
 			bio_io_error(bio);
-		else
+		else {
+			inc_all_io_entry(tc->pool, bio);
 			remap_and_issue(tc, bio, lookup_result.block);
+		}
 		break;
 
 	case -ENODATA:
@@ -1166,6 +1184,7 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
 		}
 
 		if (tc->origin_dev) {
+			inc_all_io_entry(tc->pool, bio);
 			remap_to_origin_and_issue(tc, bio);
 			break;
 		}
@@ -1346,7 +1365,7 @@ static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *b
 
 	h->tc = tc;
 	h->shared_read_entry = NULL;
-	h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : dm_deferred_entry_inc(pool->all_io_ds);
+	h->all_io_entry = NULL;
 	h->overwrite_mapping = NULL;
 
 	return h;
@@ -1363,6 +1382,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
 	dm_block_t block = get_bio_block(tc, bio);
 	struct dm_thin_device *td = tc->td;
 	struct dm_thin_lookup_result result;
+	struct dm_bio_prison_cell *cell1, *cell2;
+	struct dm_cell_key key;
 
 	map_context->ptr = thin_hook_bio(tc, bio);
 
@@ -1399,12 +1420,25 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
 			 * shared flag will be set in their case.
 			 */
 			thin_defer_bio(tc, bio);
-			r = DM_MAPIO_SUBMITTED;
-		} else {
-			remap(tc, bio, result.block);
-			r = DM_MAPIO_REMAPPED;
+			return DM_MAPIO_SUBMITTED;
 		}
-		break;
+
+		build_virtual_key(tc->td, block, &key);
+		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1))
+			return DM_MAPIO_SUBMITTED;
+
+		build_data_key(tc->td, result.block, &key);
+		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2)) {
+			cell_defer_except(tc, cell1);
+			return DM_MAPIO_SUBMITTED;
+		}
+
+		inc_all_io_entry(tc->pool, bio);
+		cell_defer_except(tc, cell2);
+		cell_defer_except(tc, cell1);
+
+		remap(tc, bio, result.block);
+		return DM_MAPIO_REMAPPED;
 
 	case -ENODATA:
 		if (get_pool_mode(tc->pool) == PM_READ_ONLY) {