author     Joe Thornber <ejt@redhat.com>       2013-03-20 13:21:27 -0400
committer  Alasdair G Kergon <agk@redhat.com>  2013-03-20 13:21:27 -0400
commit     e2e74d617eadc15f601983270c4f4a6935c5a943
tree       0d1b5d9f8dc1298df947701dd7f50acabdcec689 /drivers/md
parent     79ed9caffc9fff67aa64fd683e791aa70f1bcb51
dm cache: fix race in writethrough implementation
We have found a race in the optimisation used in the dm cache writethrough implementation. Currently, dm core sends the cache target two bios, one for the origin device and one for the cache device, and these are processed in parallel. This patch avoids the race by changing the code back to a simpler (slower) implementation which processes the two writes in series, one after the other, until we can develop a complete fix for the problem.

When the cache is in writethrough mode it needs to send WRITE bios to both the origin and cache devices.

Previously we've been implementing this by having dm core query the cache target on every write to find out how many copies of the bio it wants. The cache will ask for two bios if the block is in the cache, and one otherwise.

The main problem with this is that it's racy. At the time this check is made the bio hasn't yet been submitted and so isn't being taken into account when quiescing a block for migration (promotion or demotion). This means a single bio may be submitted when two were needed because the block has since been promoted to the cache (catastrophic), or two bios may be submitted when only one is needed (harmless).

I really don't want to start entering bios into the quiescing system (deferred_set) in the get_num_write_bios callback. Instead this patch simplifies things: only one bio is submitted by the core, and it is first written to the origin device and then to the cache device, in series. Obviously this will have a latency impact.

deferred_writethrough_bios is introduced to record bios that must later be issued to the cache device from the worker thread. This deferred submission, after the origin bio completes, is required because writethrough_endio runs in interrupt context.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
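As a quick orientation before the diff: the code below is a minimal, single-threaded userspace sketch of the serialized writethrough flow this patch introduces, in which the origin write's completion callback only queues the bio and a later worker pass issues the cache write. Every type and helper in it (struct mock_bio, write_to_origin, write_to_cache) is an illustrative stand-in, not the kernel's struct bio or the dm-cache APIs; only the control flow and the defer/process_deferred naming mirror the actual patch shown further down.

/* Illustrative userspace model only -- not kernel code. */
#include <stdio.h>

struct mock_bio {
        int block;                              /* origin block being written  */
        int cblock;                             /* cache block it maps to      */
        void (*end_io)(struct mock_bio *, int); /* completion callback         */
        struct mock_bio *next;                  /* link for the deferred list  */
};

/* Bios whose origin write completed and which still need the cache write. */
static struct mock_bio *deferred_writethrough_bios;

static void write_to_origin(struct mock_bio *bio)
{
        printf("origin write: block %d\n", bio->block);
        bio->end_io(bio, 0);            /* pretend the write completed OK */
}

static void write_to_cache(struct mock_bio *bio)
{
        printf("cache  write: cblock %d\n", bio->cblock);
}

/* Analogue of writethrough_endio(): runs at origin-write completion, so it
 * only queues the bio instead of issuing the cache write directly. */
static void writethrough_endio(struct mock_bio *bio, int err)
{
        if (err) {
                fprintf(stderr, "origin write failed: %d\n", err);
                return;
        }
        bio->next = deferred_writethrough_bios;
        deferred_writethrough_bios = bio;
}

/* Analogue of process_deferred_writethrough_bios(): the "worker" issues the
 * second (cache) write for every bio queued by the completion path. */
static void process_deferred_writethrough_bios(void)
{
        struct mock_bio *bio;

        while ((bio = deferred_writethrough_bios)) {
                deferred_writethrough_bios = bio->next;
                write_to_cache(bio);
        }
}

int main(void)
{
        struct mock_bio bio = { .block = 7, .cblock = 3,
                                .end_io = writethrough_endio };

        write_to_origin(&bio);                  /* first leg: origin device */
        process_deferred_writethrough_bios();   /* second leg: cache device */
        return 0;
}

In the real target the deferred list is protected by cache->lock and drained by the existing worker thread (do_worker), as the hunks below show.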
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm-cache-target.c  138
1 file changed, 88 insertions, 50 deletions
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 79ac8603644d..ff267db60025 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -142,6 +142,7 @@ struct cache {
         spinlock_t lock;
         struct bio_list deferred_bios;
         struct bio_list deferred_flush_bios;
+        struct bio_list deferred_writethrough_bios;
         struct list_head quiesced_migrations;
         struct list_head completed_migrations;
         struct list_head need_commit_migrations;
@@ -199,6 +200,11 @@ struct per_bio_data {
         bool tick:1;
         unsigned req_nr:2;
         struct dm_deferred_entry *all_io_entry;
+
+        /* writethrough fields */
+        struct cache *cache;
+        dm_cblock_t cblock;
+        bio_end_io_t *saved_bi_end_io;
 };
 
 struct dm_cache_migration {
@@ -616,6 +622,56 @@ static void issue(struct cache *cache, struct bio *bio)
         spin_unlock_irqrestore(&cache->lock, flags);
 }
 
+static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave(&cache->lock, flags);
+        bio_list_add(&cache->deferred_writethrough_bios, bio);
+        spin_unlock_irqrestore(&cache->lock, flags);
+
+        wake_worker(cache);
+}
+
+static void writethrough_endio(struct bio *bio, int err)
+{
+        struct per_bio_data *pb = get_per_bio_data(bio);
+        bio->bi_end_io = pb->saved_bi_end_io;
+
+        if (err) {
+                bio_endio(bio, err);
+                return;
+        }
+
+        remap_to_cache(pb->cache, bio, pb->cblock);
+
+        /*
+         * We can't issue this bio directly, since we're in interrupt
+         * context.  So it gets put on a bio list for processing by the
+         * worker thread.
+         */
+        defer_writethrough_bio(pb->cache, bio);
+}
+
+/*
+ * When running in writethrough mode we need to send writes to clean blocks
+ * to both the cache and origin devices.  In future we'd like to clone the
+ * bio and send them in parallel, but for now we're doing them in
+ * series as this is easier.
+ */
+static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
+                                       dm_oblock_t oblock, dm_cblock_t cblock)
+{
+        struct per_bio_data *pb = get_per_bio_data(bio);
+
+        pb->cache = cache;
+        pb->cblock = cblock;
+        pb->saved_bi_end_io = bio->bi_end_io;
+        bio->bi_end_io = writethrough_endio;
+
+        remap_to_origin_clear_discard(pb->cache, bio, oblock);
+}
+
 /*----------------------------------------------------------------
  * Migration processing
  *
@@ -1077,14 +1133,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
                 inc_hit_counter(cache, bio);
                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 
-                if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
-                        /*
-                         * No need to mark anything dirty in write through mode.
-                         */
-                        pb->req_nr == 0 ?
-                                remap_to_cache(cache, bio, lookup_result.cblock) :
-                                remap_to_origin_clear_discard(cache, bio, block);
-                } else
+                if (is_writethrough_io(cache, bio, lookup_result.cblock))
+                        remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+                else
                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
 
                 issue(cache, bio);
@@ -1093,17 +1144,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
         case POLICY_MISS:
                 inc_miss_counter(cache, bio);
                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-
-                if (pb->req_nr != 0) {
-                        /*
-                         * This is a duplicate writethrough io that is no
-                         * longer needed because the block has been demoted.
-                         */
-                        bio_endio(bio, 0);
-                } else {
-                        remap_to_origin_clear_discard(cache, bio, block);
-                        issue(cache, bio);
-                }
+                remap_to_origin_clear_discard(cache, bio, block);
+                issue(cache, bio);
                 break;
 
         case POLICY_NEW:
@@ -1224,6 +1266,23 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
                 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
 }
 
+static void process_deferred_writethrough_bios(struct cache *cache)
+{
+        unsigned long flags;
+        struct bio_list bios;
+        struct bio *bio;
+
+        bio_list_init(&bios);
+
+        spin_lock_irqsave(&cache->lock, flags);
+        bio_list_merge(&bios, &cache->deferred_writethrough_bios);
+        bio_list_init(&cache->deferred_writethrough_bios);
+        spin_unlock_irqrestore(&cache->lock, flags);
+
+        while ((bio = bio_list_pop(&bios)))
+                generic_make_request(bio);
+}
+
 static void writeback_some_dirty_blocks(struct cache *cache)
 {
         int r = 0;
@@ -1320,6 +1379,7 @@ static int more_work(struct cache *cache)
         else
                 return !bio_list_empty(&cache->deferred_bios) ||
                         !bio_list_empty(&cache->deferred_flush_bios) ||
+                        !bio_list_empty(&cache->deferred_writethrough_bios) ||
                         !list_empty(&cache->quiesced_migrations) ||
                         !list_empty(&cache->completed_migrations) ||
                         !list_empty(&cache->need_commit_migrations);
@@ -1338,6 +1398,8 @@ static void do_worker(struct work_struct *ws)
 
                 writeback_some_dirty_blocks(cache);
 
+                process_deferred_writethrough_bios(cache);
+
                 if (commit_if_needed(cache)) {
                         process_deferred_flush_bios(cache, false);
 
@@ -1803,8 +1865,6 @@ static sector_t calculate_discard_block_size(sector_t cache_block_size,
 
 #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
 
-static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio);
-
 static int cache_create(struct cache_args *ca, struct cache **result)
 {
         int r = 0;
@@ -1831,9 +1891,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 
         memcpy(&cache->features, &ca->features, sizeof(cache->features));
 
-        if (cache->features.write_through)
-                ti->num_write_bios = cache_num_write_bios;
-
         cache->callbacks.congested_fn = cache_is_congested;
         dm_table_add_target_callbacks(ti->table, &cache->callbacks);
 
@@ -1883,6 +1940,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
         spin_lock_init(&cache->lock);
         bio_list_init(&cache->deferred_bios);
         bio_list_init(&cache->deferred_flush_bios);
+        bio_list_init(&cache->deferred_writethrough_bios);
         INIT_LIST_HEAD(&cache->quiesced_migrations);
         INIT_LIST_HEAD(&cache->completed_migrations);
         INIT_LIST_HEAD(&cache->need_commit_migrations);
@@ -2028,20 +2086,6 @@ out:
         return r;
 }
 
-static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio)
-{
-        int r;
-        struct cache *cache = ti->private;
-        dm_oblock_t block = get_bio_block(cache, bio);
-        dm_cblock_t cblock;
-
-        r = policy_lookup(cache->policy, block, &cblock);
-        if (r < 0)
-                return 2;       /* assume the worst */
-
-        return (!r && !is_dirty(cache, cblock)) ? 2 : 1;
-}
-
 static int cache_map(struct dm_target *ti, struct bio *bio)
 {
         struct cache *cache = ti->private;
@@ -2109,18 +2153,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
                 inc_hit_counter(cache, bio);
                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 
-                if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
-                        /*
-                         * No need to mark anything dirty in write through mode.
-                         */
-                        pb->req_nr == 0 ?
-                                remap_to_cache(cache, bio, lookup_result.cblock) :
-                                remap_to_origin_clear_discard(cache, bio, block);
-                        cell_defer(cache, cell, false);
-                } else {
+                if (is_writethrough_io(cache, bio, lookup_result.cblock))
+                        remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+                else
                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
-                        cell_defer(cache, cell, false);
-                }
+
+                cell_defer(cache, cell, false);
                 break;
 
         case POLICY_MISS:
@@ -2547,7 +2585,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type cache_target = {
         .name = "cache",
-        .version = {1, 0, 0},
+        .version = {1, 1, 0},
         .module = THIS_MODULE,
         .ctr = cache_ctr,
         .dtr = cache_dtr,