| author | Linus Walleij <linus.walleij@linaro.org> | 2014-03-14 05:26:45 -0400 |
|---|---|---|
| committer | Linus Walleij <linus.walleij@linaro.org> | 2014-03-14 05:26:45 -0400 |
| commit | 9e294427f6e427dbaf46140303acded06365f53c (patch) | |
| tree | 0669100cbd79fe8612463900171c98873d8dc454 /drivers/md | |
| parent | 23600969ff137cf4c3bc9098f77e381de334e3f7 (diff) | |
| parent | fa389e220254c69ffae0d403eac4146171062d08 (diff) | |
Merge tag 'v3.14-rc6' into devel
Linux 3.14-rc6
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 10
-rw-r--r-- | drivers/md/bcache/bcache.h | 4
-rw-r--r-- | drivers/md/bcache/bset.c | 7
-rw-r--r-- | drivers/md/bcache/btree.c | 4
-rw-r--r-- | drivers/md/bcache/extents.c | 2
-rw-r--r-- | drivers/md/bcache/request.c | 6
-rw-r--r-- | drivers/md/bcache/sysfs.c | 2
-rw-r--r-- | drivers/md/dm-cache-policy-mq.c | 4
-rw-r--r-- | drivers/md/dm-cache-target.c | 13
-rw-r--r-- | drivers/md/dm-io.c | 23
-rw-r--r-- | drivers/md/dm-mpath.c | 7
-rw-r--r-- | drivers/md/dm-raid1.c | 3
-rw-r--r-- | drivers/md/dm-snap-persistent.c | 3
-rw-r--r-- | drivers/md/dm-thin-metadata.c | 58
-rw-r--r-- | drivers/md/dm-thin-metadata.h | 21
-rw-r--r-- | drivers/md/dm-thin.c | 343
-rw-r--r-- | drivers/md/persistent-data/Kconfig | 10
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-metadata.c | 115
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-metadata.h | 11
-rw-r--r-- | drivers/md/raid1.c | 13
-rw-r--r-- | drivers/md/raid5.c | 90

21 files changed, 547 insertions(+), 202 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 9a06fe883766..95ad936e6048 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -254,16 +254,6 @@ config DM_THIN_PROVISIONING | |||
254 | ---help--- | 254 | ---help--- |
255 | Provides thin provisioning and snapshots that share a data store. | 255 | Provides thin provisioning and snapshots that share a data store. |
256 | 256 | ||
257 | config DM_DEBUG_BLOCK_STACK_TRACING | ||
258 | boolean "Keep stack trace of persistent data block lock holders" | ||
259 | depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA | ||
260 | select STACKTRACE | ||
261 | ---help--- | ||
262 | Enable this for messages that may help debug problems with the | ||
263 | block manager locking used by thin provisioning and caching. | ||
264 | |||
265 | If unsure, say N. | ||
266 | |||
267 | config DM_CACHE | 257 | config DM_CACHE |
268 | tristate "Cache target (EXPERIMENTAL)" | 258 | tristate "Cache target (EXPERIMENTAL)" |
269 | depends on BLK_DEV_DM | 259 | depends on BLK_DEV_DM |
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 0c707e4f4eaf..a4c7306ff43d 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -210,7 +210,9 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); | |||
210 | #define GC_MARK_RECLAIMABLE 0 | 210 | #define GC_MARK_RECLAIMABLE 0 |
211 | #define GC_MARK_DIRTY 1 | 211 | #define GC_MARK_DIRTY 1 |
212 | #define GC_MARK_METADATA 2 | 212 | #define GC_MARK_METADATA 2 |
213 | BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13); | 213 | #define GC_SECTORS_USED_SIZE 13 |
214 | #define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) | ||
215 | BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); | ||
214 | BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); | 216 | BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); |
215 | 217 | ||
216 | #include "journal.h" | 218 | #include "journal.h" |
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 4f6b5940e609..3f74b4b0747b 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -23,7 +23,7 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) | |||
23 | for (k = i->start; k < bset_bkey_last(i); k = next) { | 23 | for (k = i->start; k < bset_bkey_last(i); k = next) { |
24 | next = bkey_next(k); | 24 | next = bkey_next(k); |
25 | 25 | ||
26 | printk(KERN_ERR "block %u key %zi/%u: ", set, | 26 | printk(KERN_ERR "block %u key %li/%u: ", set, |
27 | (uint64_t *) k - i->d, i->keys); | 27 | (uint64_t *) k - i->d, i->keys); |
28 | 28 | ||
29 | if (b->ops->key_dump) | 29 | if (b->ops->key_dump) |
@@ -1185,9 +1185,12 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, | |||
1185 | struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, | 1185 | struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, |
1186 | order); | 1186 | order); |
1187 | if (!out) { | 1187 | if (!out) { |
1188 | struct page *outp; | ||
1189 | |||
1188 | BUG_ON(order > state->page_order); | 1190 | BUG_ON(order > state->page_order); |
1189 | 1191 | ||
1190 | out = page_address(mempool_alloc(state->pool, GFP_NOIO)); | 1192 | outp = mempool_alloc(state->pool, GFP_NOIO); |
1193 | out = page_address(outp); | ||
1191 | used_mempool = true; | 1194 | used_mempool = true; |
1192 | order = state->page_order; | 1195 | order = state->page_order; |
1193 | } | 1196 | } |
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 98cc0a810a36..5f9c2a665ca5 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1167,7 +1167,7 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | |||
1167 | /* guard against overflow */ | 1167 | /* guard against overflow */ |
1168 | SET_GC_SECTORS_USED(g, min_t(unsigned, | 1168 | SET_GC_SECTORS_USED(g, min_t(unsigned, |
1169 | GC_SECTORS_USED(g) + KEY_SIZE(k), | 1169 | GC_SECTORS_USED(g) + KEY_SIZE(k), |
1170 | (1 << 14) - 1)); | 1170 | MAX_GC_SECTORS_USED)); |
1171 | 1171 | ||
1172 | BUG_ON(!GC_SECTORS_USED(g)); | 1172 | BUG_ON(!GC_SECTORS_USED(g)); |
1173 | } | 1173 | } |
@@ -1805,7 +1805,7 @@ static bool btree_insert_key(struct btree *b, struct bkey *k, | |||
1805 | 1805 | ||
1806 | static size_t insert_u64s_remaining(struct btree *b) | 1806 | static size_t insert_u64s_remaining(struct btree *b) |
1807 | { | 1807 | { |
1808 | ssize_t ret = bch_btree_keys_u64s_remaining(&b->keys); | 1808 | long ret = bch_btree_keys_u64s_remaining(&b->keys); |
1809 | 1809 | ||
1810 | /* | 1810 | /* |
1811 | * Might land in the middle of an existing extent and have to split it | 1811 | * Might land in the middle of an existing extent and have to split it |
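For context on the two bcache hunks above: GC_SECTORS_USED is a 13-bit bitfield, so the old hard-coded clamp of (1 << 14) - 1 let through values that wrap when stored, which can leave the field at zero and trip the BUG_ON that follows. A standalone sketch of the arithmetic (plain userspace C, not kernel code):

```c
/* Standalone illustration, not kernel code: GC_SECTORS_USED is a 13-bit
 * bitfield, so any clamp wider than 13 bits lets values wrap on store. */
#include <stdio.h>

#define GC_SECTORS_USED_SIZE 13
#define MAX_GC_SECTORS_USED  (~(~0ULL << GC_SECTORS_USED_SIZE))   /* 8191 */

int main(void)
{
	unsigned long long v = 8192;  /* passes the old clamp of (1 << 14) - 1 = 16383 */

	/* Stored in a 13-bit field, 8192 reads back as 0 -- exactly the state
	 * that the BUG_ON(!GC_SECTORS_USED(g)) in btree.c would catch. */
	printf("old clamp keeps %llu, field stores %llu\n", v, v & MAX_GC_SECTORS_USED);
	printf("new clamp caps at %llu\n", MAX_GC_SECTORS_USED);
	return 0;
}
```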
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index c3ead586dc27..416d1a3e028e 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -194,7 +194,7 @@ err: | |||
194 | mutex_unlock(&b->c->bucket_lock); | 194 | mutex_unlock(&b->c->bucket_lock); |
195 | bch_extent_to_text(buf, sizeof(buf), k); | 195 | bch_extent_to_text(buf, sizeof(buf), k); |
196 | btree_bug(b, | 196 | btree_bug(b, |
197 | "inconsistent btree pointer %s: bucket %li pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | 197 | "inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", |
198 | buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), | 198 | buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), |
199 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | 199 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); |
200 | return true; | 200 | return true; |
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 72cd213f213f..5d5d031cf381 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -353,14 +353,14 @@ static void bch_data_insert_start(struct closure *cl) | |||
353 | struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); | 353 | struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); |
354 | struct bio *bio = op->bio, *n; | 354 | struct bio *bio = op->bio, *n; |
355 | 355 | ||
356 | if (op->bypass) | ||
357 | return bch_data_invalidate(cl); | ||
358 | |||
359 | if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { | 356 | if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { |
360 | set_gc_sectors(op->c); | 357 | set_gc_sectors(op->c); |
361 | wake_up_gc(op->c); | 358 | wake_up_gc(op->c); |
362 | } | 359 | } |
363 | 360 | ||
361 | if (op->bypass) | ||
362 | return bch_data_invalidate(cl); | ||
363 | |||
364 | /* | 364 | /* |
365 | * Journal writes are marked REQ_FLUSH; if the original write was a | 365 | * Journal writes are marked REQ_FLUSH; if the original write was a |
366 | * flush, it'll wait on the journal write. | 366 | * flush, it'll wait on the journal write. |
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index c6ab69333a6d..d8458d477a12 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -416,7 +416,7 @@ static int btree_bset_stats(struct btree_op *b_op, struct btree *b) | |||
416 | return MAP_CONTINUE; | 416 | return MAP_CONTINUE; |
417 | } | 417 | } |
418 | 418 | ||
419 | int bch_bset_print_stats(struct cache_set *c, char *buf) | 419 | static int bch_bset_print_stats(struct cache_set *c, char *buf) |
420 | { | 420 | { |
421 | struct bset_stats_op op; | 421 | struct bset_stats_op op; |
422 | int ret; | 422 | int ret; |
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 1e018e986610..0e385e40909e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -872,7 +872,7 @@ static void mq_destroy(struct dm_cache_policy *p) | |||
872 | { | 872 | { |
873 | struct mq_policy *mq = to_mq_policy(p); | 873 | struct mq_policy *mq = to_mq_policy(p); |
874 | 874 | ||
875 | kfree(mq->table); | 875 | vfree(mq->table); |
876 | epool_exit(&mq->cache_pool); | 876 | epool_exit(&mq->cache_pool); |
877 | epool_exit(&mq->pre_cache_pool); | 877 | epool_exit(&mq->pre_cache_pool); |
878 | kfree(mq); | 878 | kfree(mq); |
@@ -1245,7 +1245,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, | |||
1245 | 1245 | ||
1246 | mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); | 1246 | mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); |
1247 | mq->hash_bits = ffs(mq->nr_buckets) - 1; | 1247 | mq->hash_bits = ffs(mq->nr_buckets) - 1; |
1248 | mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL); | 1248 | mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets); |
1249 | if (!mq->table) | 1249 | if (!mq->table) |
1250 | goto bad_alloc_table; | 1250 | goto bad_alloc_table; |
1251 | 1251 | ||
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index ffd472e015ca..1af70145fab9 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -289,6 +289,7 @@ struct per_bio_data { | |||
289 | bool tick:1; | 289 | bool tick:1; |
290 | unsigned req_nr:2; | 290 | unsigned req_nr:2; |
291 | struct dm_deferred_entry *all_io_entry; | 291 | struct dm_deferred_entry *all_io_entry; |
292 | struct dm_hook_info hook_info; | ||
292 | 293 | ||
293 | /* | 294 | /* |
294 | * writethrough fields. These MUST remain at the end of this | 295 | * writethrough fields. These MUST remain at the end of this |
@@ -297,7 +298,6 @@ struct per_bio_data { | |||
297 | */ | 298 | */ |
298 | struct cache *cache; | 299 | struct cache *cache; |
299 | dm_cblock_t cblock; | 300 | dm_cblock_t cblock; |
300 | struct dm_hook_info hook_info; | ||
301 | struct dm_bio_details bio_details; | 301 | struct dm_bio_details bio_details; |
302 | }; | 302 | }; |
303 | 303 | ||
@@ -671,15 +671,16 @@ static void remap_to_cache(struct cache *cache, struct bio *bio, | |||
671 | dm_cblock_t cblock) | 671 | dm_cblock_t cblock) |
672 | { | 672 | { |
673 | sector_t bi_sector = bio->bi_iter.bi_sector; | 673 | sector_t bi_sector = bio->bi_iter.bi_sector; |
674 | sector_t block = from_cblock(cblock); | ||
674 | 675 | ||
675 | bio->bi_bdev = cache->cache_dev->bdev; | 676 | bio->bi_bdev = cache->cache_dev->bdev; |
676 | if (!block_size_is_power_of_two(cache)) | 677 | if (!block_size_is_power_of_two(cache)) |
677 | bio->bi_iter.bi_sector = | 678 | bio->bi_iter.bi_sector = |
678 | (from_cblock(cblock) * cache->sectors_per_block) + | 679 | (block * cache->sectors_per_block) + |
679 | sector_div(bi_sector, cache->sectors_per_block); | 680 | sector_div(bi_sector, cache->sectors_per_block); |
680 | else | 681 | else |
681 | bio->bi_iter.bi_sector = | 682 | bio->bi_iter.bi_sector = |
682 | (from_cblock(cblock) << cache->sectors_per_block_shift) | | 683 | (block << cache->sectors_per_block_shift) | |
683 | (bi_sector & (cache->sectors_per_block - 1)); | 684 | (bi_sector & (cache->sectors_per_block - 1)); |
684 | } | 685 | } |
685 | 686 | ||
@@ -1010,13 +1011,15 @@ static void overwrite_endio(struct bio *bio, int err) | |||
1010 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); | 1011 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); |
1011 | unsigned long flags; | 1012 | unsigned long flags; |
1012 | 1013 | ||
1014 | dm_unhook_bio(&pb->hook_info, bio); | ||
1015 | |||
1013 | if (err) | 1016 | if (err) |
1014 | mg->err = true; | 1017 | mg->err = true; |
1015 | 1018 | ||
1019 | mg->requeue_holder = false; | ||
1020 | |||
1016 | spin_lock_irqsave(&cache->lock, flags); | 1021 | spin_lock_irqsave(&cache->lock, flags); |
1017 | list_add_tail(&mg->list, &cache->completed_migrations); | 1022 | list_add_tail(&mg->list, &cache->completed_migrations); |
1018 | dm_unhook_bio(&pb->hook_info, bio); | ||
1019 | mg->requeue_holder = false; | ||
1020 | spin_unlock_irqrestore(&cache->lock, flags); | 1023 | spin_unlock_irqrestore(&cache->lock, flags); |
1021 | 1024 | ||
1022 | wake_worker(cache); | 1025 | wake_worker(cache); |
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index b2b8a10e8427..3842ac738f98 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -201,29 +201,28 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse | |||
201 | /* | 201 | /* |
202 | * Functions for getting the pages from a bvec. | 202 | * Functions for getting the pages from a bvec. |
203 | */ | 203 | */ |
204 | static void bio_get_page(struct dpages *dp, | 204 | static void bio_get_page(struct dpages *dp, struct page **p, |
205 | struct page **p, unsigned long *len, unsigned *offset) | 205 | unsigned long *len, unsigned *offset) |
206 | { | 206 | { |
207 | struct bio *bio = dp->context_ptr; | 207 | struct bio_vec *bvec = dp->context_ptr; |
208 | struct bio_vec bvec = bio_iovec(bio); | 208 | *p = bvec->bv_page; |
209 | *p = bvec.bv_page; | 209 | *len = bvec->bv_len - dp->context_u; |
210 | *len = bvec.bv_len; | 210 | *offset = bvec->bv_offset + dp->context_u; |
211 | *offset = bvec.bv_offset; | ||
212 | } | 211 | } |
213 | 212 | ||
214 | static void bio_next_page(struct dpages *dp) | 213 | static void bio_next_page(struct dpages *dp) |
215 | { | 214 | { |
216 | struct bio *bio = dp->context_ptr; | 215 | struct bio_vec *bvec = dp->context_ptr; |
217 | struct bio_vec bvec = bio_iovec(bio); | 216 | dp->context_ptr = bvec + 1; |
218 | 217 | dp->context_u = 0; | |
219 | bio_advance(bio, bvec.bv_len); | ||
220 | } | 218 | } |
221 | 219 | ||
222 | static void bio_dp_init(struct dpages *dp, struct bio *bio) | 220 | static void bio_dp_init(struct dpages *dp, struct bio *bio) |
223 | { | 221 | { |
224 | dp->get_page = bio_get_page; | 222 | dp->get_page = bio_get_page; |
225 | dp->next_page = bio_next_page; | 223 | dp->next_page = bio_next_page; |
226 | dp->context_ptr = bio; | 224 | dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); |
225 | dp->context_u = bio->bi_iter.bi_bvec_done; | ||
227 | } | 226 | } |
228 | 227 | ||
229 | /* | 228 | /* |
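The dm-io.c hunk above switches bio_get_page()/bio_next_page() from advancing the bio to walking its bio_vec array directly, with dp->context_u carrying bi_bvec_done so a partially consumed first segment is skipped correctly. A simplified userspace model of that offset/length bookkeeping (the struct and field names here are stand-ins, not the kernel's):

```c
/* Simplified model of the reworked dm-io bvec walk; illustrative only. */
#include <stdio.h>

struct bvec { void *page; unsigned len; unsigned offset; };

struct dpages {
	struct bvec *context_ptr;  /* current segment */
	unsigned context_u;        /* bytes of it already consumed (bi_bvec_done) */
};

static void get_page(struct dpages *dp, void **p, unsigned *len, unsigned *offset)
{
	struct bvec *bv = dp->context_ptr;

	*p = bv->page;
	*len = bv->len - dp->context_u;        /* remaining bytes in this segment */
	*offset = bv->offset + dp->context_u;  /* skip what was already done */
}

static void next_page(struct dpages *dp)
{
	dp->context_ptr++;                     /* move to the next segment... */
	dp->context_u = 0;                     /* ...which is wholly unconsumed */
}

int main(void)
{
	char page[4096];
	struct bvec vec[2] = { { page, 1024, 0 }, { page, 512, 1024 } };
	struct dpages dp = { vec, 256 };       /* 256 bytes of the first bvec done */
	void *p; unsigned len, off;

	get_page(&dp, &p, &len, &off);
	printf("first segment: len=%u offset=%u\n", len, off);   /* 768, 256 */
	next_page(&dp);
	get_page(&dp, &p, &len, &off);
	printf("second segment: len=%u offset=%u\n", len, off);  /* 512, 1024 */
	return 0;
}
```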
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6eb9dc9ef8f3..422a9fdeb53e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1626,8 +1626,11 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | |||
1626 | /* | 1626 | /* |
1627 | * Only pass ioctls through if the device sizes match exactly. | 1627 | * Only pass ioctls through if the device sizes match exactly. |
1628 | */ | 1628 | */ |
1629 | if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) | 1629 | if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) { |
1630 | r = scsi_verify_blk_ioctl(NULL, cmd); | 1630 | int err = scsi_verify_blk_ioctl(NULL, cmd); |
1631 | if (err) | ||
1632 | r = err; | ||
1633 | } | ||
1631 | 1634 | ||
1632 | if (r == -ENOTCONN && !fatal_signal_pending(current)) | 1635 | if (r == -ENOTCONN && !fatal_signal_pending(current)) |
1633 | queue_work(kmultipathd, &m->process_queued_ios); | 1636 | queue_work(kmultipathd, &m->process_queued_ios); |
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index f284e0bfb25f..7dfdb5c746d6 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1244,6 +1244,9 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) | |||
1244 | 1244 | ||
1245 | dm_bio_restore(bd, bio); | 1245 | dm_bio_restore(bd, bio); |
1246 | bio_record->details.bi_bdev = NULL; | 1246 | bio_record->details.bi_bdev = NULL; |
1247 | |||
1248 | atomic_inc(&bio->bi_remaining); | ||
1249 | |||
1247 | queue_bio(ms, bio, rw); | 1250 | queue_bio(ms, bio, rw); |
1248 | return DM_ENDIO_INCOMPLETE; | 1251 | return DM_ENDIO_INCOMPLETE; |
1249 | } | 1252 | } |
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index afc3d017de4c..d6e88178d22c 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -546,6 +546,9 @@ static int read_exceptions(struct pstore *ps, | |||
546 | r = insert_exceptions(ps, area, callback, callback_context, | 546 | r = insert_exceptions(ps, area, callback, callback_context, |
547 | &full); | 547 | &full); |
548 | 548 | ||
549 | if (!full) | ||
550 | memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT); | ||
551 | |||
549 | dm_bufio_release(bp); | 552 | dm_bufio_release(bp); |
550 | 553 | ||
551 | dm_bufio_forget(client, chunk); | 554 | dm_bufio_forget(client, chunk); |
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 7da347665552..fb9efc829182 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -76,7 +76,7 @@ | |||
76 | 76 | ||
77 | #define THIN_SUPERBLOCK_MAGIC 27022010 | 77 | #define THIN_SUPERBLOCK_MAGIC 27022010 |
78 | #define THIN_SUPERBLOCK_LOCATION 0 | 78 | #define THIN_SUPERBLOCK_LOCATION 0 |
79 | #define THIN_VERSION 1 | 79 | #define THIN_VERSION 2 |
80 | #define THIN_METADATA_CACHE_SIZE 64 | 80 | #define THIN_METADATA_CACHE_SIZE 64 |
81 | #define SECTOR_TO_BLOCK_SHIFT 3 | 81 | #define SECTOR_TO_BLOCK_SHIFT 3 |
82 | 82 | ||
@@ -483,7 +483,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd) | |||
483 | 483 | ||
484 | disk_super->data_mapping_root = cpu_to_le64(pmd->root); | 484 | disk_super->data_mapping_root = cpu_to_le64(pmd->root); |
485 | disk_super->device_details_root = cpu_to_le64(pmd->details_root); | 485 | disk_super->device_details_root = cpu_to_le64(pmd->details_root); |
486 | disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); | 486 | disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE); |
487 | disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); | 487 | disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); |
488 | disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); | 488 | disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); |
489 | 489 | ||
@@ -651,7 +651,7 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f | |||
651 | { | 651 | { |
652 | int r; | 652 | int r; |
653 | 653 | ||
654 | pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE, | 654 | pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, |
655 | THIN_METADATA_CACHE_SIZE, | 655 | THIN_METADATA_CACHE_SIZE, |
656 | THIN_MAX_CONCURRENT_LOCKS); | 656 | THIN_MAX_CONCURRENT_LOCKS); |
657 | if (IS_ERR(pmd->bm)) { | 657 | if (IS_ERR(pmd->bm)) { |
@@ -1489,6 +1489,23 @@ bool dm_thin_changed_this_transaction(struct dm_thin_device *td) | |||
1489 | return r; | 1489 | return r; |
1490 | } | 1490 | } |
1491 | 1491 | ||
1492 | bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd) | ||
1493 | { | ||
1494 | bool r = false; | ||
1495 | struct dm_thin_device *td, *tmp; | ||
1496 | |||
1497 | down_read(&pmd->root_lock); | ||
1498 | list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { | ||
1499 | if (td->changed) { | ||
1500 | r = td->changed; | ||
1501 | break; | ||
1502 | } | ||
1503 | } | ||
1504 | up_read(&pmd->root_lock); | ||
1505 | |||
1506 | return r; | ||
1507 | } | ||
1508 | |||
1492 | bool dm_thin_aborted_changes(struct dm_thin_device *td) | 1509 | bool dm_thin_aborted_changes(struct dm_thin_device *td) |
1493 | { | 1510 | { |
1494 | bool r; | 1511 | bool r; |
@@ -1738,3 +1755,38 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, | |||
1738 | 1755 | ||
1739 | return r; | 1756 | return r; |
1740 | } | 1757 | } |
1758 | |||
1759 | int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) | ||
1760 | { | ||
1761 | int r; | ||
1762 | struct dm_block *sblock; | ||
1763 | struct thin_disk_superblock *disk_super; | ||
1764 | |||
1765 | down_write(&pmd->root_lock); | ||
1766 | pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG; | ||
1767 | |||
1768 | r = superblock_lock(pmd, &sblock); | ||
1769 | if (r) { | ||
1770 | DMERR("couldn't read superblock"); | ||
1771 | goto out; | ||
1772 | } | ||
1773 | |||
1774 | disk_super = dm_block_data(sblock); | ||
1775 | disk_super->flags = cpu_to_le32(pmd->flags); | ||
1776 | |||
1777 | dm_bm_unlock(sblock); | ||
1778 | out: | ||
1779 | up_write(&pmd->root_lock); | ||
1780 | return r; | ||
1781 | } | ||
1782 | |||
1783 | bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd) | ||
1784 | { | ||
1785 | bool needs_check; | ||
1786 | |||
1787 | down_read(&pmd->root_lock); | ||
1788 | needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG; | ||
1789 | up_read(&pmd->root_lock); | ||
1790 | |||
1791 | return needs_check; | ||
1792 | } | ||
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 9a368567632f..e3c857db195a 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -9,16 +9,14 @@ | |||
9 | 9 | ||
10 | #include "persistent-data/dm-block-manager.h" | 10 | #include "persistent-data/dm-block-manager.h" |
11 | #include "persistent-data/dm-space-map.h" | 11 | #include "persistent-data/dm-space-map.h" |
12 | #include "persistent-data/dm-space-map-metadata.h" | ||
12 | 13 | ||
13 | #define THIN_METADATA_BLOCK_SIZE 4096 | 14 | #define THIN_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE |
14 | 15 | ||
15 | /* | 16 | /* |
16 | * The metadata device is currently limited in size. | 17 | * The metadata device is currently limited in size. |
17 | * | ||
18 | * We have one block of index, which can hold 255 index entries. Each | ||
19 | * index entry contains allocation info about 16k metadata blocks. | ||
20 | */ | 18 | */ |
21 | #define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) | 19 | #define THIN_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS |
22 | 20 | ||
23 | /* | 21 | /* |
24 | * A metadata device larger than 16GB triggers a warning. | 22 | * A metadata device larger than 16GB triggers a warning. |
@@ -27,6 +25,11 @@ | |||
27 | 25 | ||
28 | /*----------------------------------------------------------------*/ | 26 | /*----------------------------------------------------------------*/ |
29 | 27 | ||
28 | /* | ||
29 | * Thin metadata superblock flags. | ||
30 | */ | ||
31 | #define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0) | ||
32 | |||
30 | struct dm_pool_metadata; | 33 | struct dm_pool_metadata; |
31 | struct dm_thin_device; | 34 | struct dm_thin_device; |
32 | 35 | ||
@@ -161,6 +164,8 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block); | |||
161 | */ | 164 | */ |
162 | bool dm_thin_changed_this_transaction(struct dm_thin_device *td); | 165 | bool dm_thin_changed_this_transaction(struct dm_thin_device *td); |
163 | 166 | ||
167 | bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd); | ||
168 | |||
164 | bool dm_thin_aborted_changes(struct dm_thin_device *td); | 169 | bool dm_thin_aborted_changes(struct dm_thin_device *td); |
165 | 170 | ||
166 | int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, | 171 | int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, |
@@ -202,6 +207,12 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, | |||
202 | dm_sm_threshold_fn fn, | 207 | dm_sm_threshold_fn fn, |
203 | void *context); | 208 | void *context); |
204 | 209 | ||
210 | /* | ||
211 | * Updates the superblock immediately. | ||
212 | */ | ||
213 | int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd); | ||
214 | bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd); | ||
215 | |||
205 | /*----------------------------------------------------------------*/ | 216 | /*----------------------------------------------------------------*/ |
206 | 217 | ||
207 | #endif | 218 | #endif |
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index faaf944597ab..be70d38745f7 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, | |||
130 | struct dm_thin_new_mapping; | 130 | struct dm_thin_new_mapping; |
131 | 131 | ||
132 | /* | 132 | /* |
133 | * The pool runs in 3 modes. Ordered in degraded order for comparisons. | 133 | * The pool runs in 4 modes. Ordered in degraded order for comparisons. |
134 | */ | 134 | */ |
135 | enum pool_mode { | 135 | enum pool_mode { |
136 | PM_WRITE, /* metadata may be changed */ | 136 | PM_WRITE, /* metadata may be changed */ |
137 | PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */ | ||
137 | PM_READ_ONLY, /* metadata may not be changed */ | 138 | PM_READ_ONLY, /* metadata may not be changed */ |
138 | PM_FAIL, /* all I/O fails */ | 139 | PM_FAIL, /* all I/O fails */ |
139 | }; | 140 | }; |
@@ -198,7 +199,6 @@ struct pool { | |||
198 | }; | 199 | }; |
199 | 200 | ||
200 | static enum pool_mode get_pool_mode(struct pool *pool); | 201 | static enum pool_mode get_pool_mode(struct pool *pool); |
201 | static void out_of_data_space(struct pool *pool); | ||
202 | static void metadata_operation_failed(struct pool *pool, const char *op, int r); | 202 | static void metadata_operation_failed(struct pool *pool, const char *op, int r); |
203 | 203 | ||
204 | /* | 204 | /* |
@@ -226,6 +226,7 @@ struct thin_c { | |||
226 | 226 | ||
227 | struct pool *pool; | 227 | struct pool *pool; |
228 | struct dm_thin_device *td; | 228 | struct dm_thin_device *td; |
229 | bool requeue_mode:1; | ||
229 | }; | 230 | }; |
230 | 231 | ||
231 | /*----------------------------------------------------------------*/ | 232 | /*----------------------------------------------------------------*/ |
@@ -369,14 +370,18 @@ struct dm_thin_endio_hook { | |||
369 | struct dm_thin_new_mapping *overwrite_mapping; | 370 | struct dm_thin_new_mapping *overwrite_mapping; |
370 | }; | 371 | }; |
371 | 372 | ||
372 | static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) | 373 | static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) |
373 | { | 374 | { |
374 | struct bio *bio; | 375 | struct bio *bio; |
375 | struct bio_list bios; | 376 | struct bio_list bios; |
377 | unsigned long flags; | ||
376 | 378 | ||
377 | bio_list_init(&bios); | 379 | bio_list_init(&bios); |
380 | |||
381 | spin_lock_irqsave(&tc->pool->lock, flags); | ||
378 | bio_list_merge(&bios, master); | 382 | bio_list_merge(&bios, master); |
379 | bio_list_init(master); | 383 | bio_list_init(master); |
384 | spin_unlock_irqrestore(&tc->pool->lock, flags); | ||
380 | 385 | ||
381 | while ((bio = bio_list_pop(&bios))) { | 386 | while ((bio = bio_list_pop(&bios))) { |
382 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); | 387 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); |
@@ -391,12 +396,26 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) | |||
391 | static void requeue_io(struct thin_c *tc) | 396 | static void requeue_io(struct thin_c *tc) |
392 | { | 397 | { |
393 | struct pool *pool = tc->pool; | 398 | struct pool *pool = tc->pool; |
399 | |||
400 | requeue_bio_list(tc, &pool->deferred_bios); | ||
401 | requeue_bio_list(tc, &pool->retry_on_resume_list); | ||
402 | } | ||
403 | |||
404 | static void error_retry_list(struct pool *pool) | ||
405 | { | ||
406 | struct bio *bio; | ||
394 | unsigned long flags; | 407 | unsigned long flags; |
408 | struct bio_list bios; | ||
409 | |||
410 | bio_list_init(&bios); | ||
395 | 411 | ||
396 | spin_lock_irqsave(&pool->lock, flags); | 412 | spin_lock_irqsave(&pool->lock, flags); |
397 | __requeue_bio_list(tc, &pool->deferred_bios); | 413 | bio_list_merge(&bios, &pool->retry_on_resume_list); |
398 | __requeue_bio_list(tc, &pool->retry_on_resume_list); | 414 | bio_list_init(&pool->retry_on_resume_list); |
399 | spin_unlock_irqrestore(&pool->lock, flags); | 415 | spin_unlock_irqrestore(&pool->lock, flags); |
416 | |||
417 | while ((bio = bio_list_pop(&bios))) | ||
418 | bio_io_error(bio); | ||
400 | } | 419 | } |
401 | 420 | ||
402 | /* | 421 | /* |
@@ -925,13 +944,15 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks) | |||
925 | } | 944 | } |
926 | } | 945 | } |
927 | 946 | ||
947 | static void set_pool_mode(struct pool *pool, enum pool_mode new_mode); | ||
948 | |||
928 | static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | 949 | static int alloc_data_block(struct thin_c *tc, dm_block_t *result) |
929 | { | 950 | { |
930 | int r; | 951 | int r; |
931 | dm_block_t free_blocks; | 952 | dm_block_t free_blocks; |
932 | struct pool *pool = tc->pool; | 953 | struct pool *pool = tc->pool; |
933 | 954 | ||
934 | if (get_pool_mode(pool) != PM_WRITE) | 955 | if (WARN_ON(get_pool_mode(pool) != PM_WRITE)) |
935 | return -EINVAL; | 956 | return -EINVAL; |
936 | 957 | ||
937 | r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); | 958 | r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); |
@@ -958,7 +979,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | |||
958 | } | 979 | } |
959 | 980 | ||
960 | if (!free_blocks) { | 981 | if (!free_blocks) { |
961 | out_of_data_space(pool); | 982 | set_pool_mode(pool, PM_OUT_OF_DATA_SPACE); |
962 | return -ENOSPC; | 983 | return -ENOSPC; |
963 | } | 984 | } |
964 | } | 985 | } |
@@ -988,15 +1009,32 @@ static void retry_on_resume(struct bio *bio) | |||
988 | spin_unlock_irqrestore(&pool->lock, flags); | 1009 | spin_unlock_irqrestore(&pool->lock, flags); |
989 | } | 1010 | } |
990 | 1011 | ||
991 | static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) | 1012 | static bool should_error_unserviceable_bio(struct pool *pool) |
992 | { | 1013 | { |
993 | /* | 1014 | enum pool_mode m = get_pool_mode(pool); |
994 | * When pool is read-only, no cell locking is needed because | ||
995 | * nothing is changing. | ||
996 | */ | ||
997 | WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY); | ||
998 | 1015 | ||
999 | if (pool->pf.error_if_no_space) | 1016 | switch (m) { |
1017 | case PM_WRITE: | ||
1018 | /* Shouldn't get here */ | ||
1019 | DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode"); | ||
1020 | return true; | ||
1021 | |||
1022 | case PM_OUT_OF_DATA_SPACE: | ||
1023 | return pool->pf.error_if_no_space; | ||
1024 | |||
1025 | case PM_READ_ONLY: | ||
1026 | case PM_FAIL: | ||
1027 | return true; | ||
1028 | default: | ||
1029 | /* Shouldn't get here */ | ||
1030 | DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode"); | ||
1031 | return true; | ||
1032 | } | ||
1033 | } | ||
1034 | |||
1035 | static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) | ||
1036 | { | ||
1037 | if (should_error_unserviceable_bio(pool)) | ||
1000 | bio_io_error(bio); | 1038 | bio_io_error(bio); |
1001 | else | 1039 | else |
1002 | retry_on_resume(bio); | 1040 | retry_on_resume(bio); |
@@ -1007,11 +1045,20 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c | |||
1007 | struct bio *bio; | 1045 | struct bio *bio; |
1008 | struct bio_list bios; | 1046 | struct bio_list bios; |
1009 | 1047 | ||
1048 | if (should_error_unserviceable_bio(pool)) { | ||
1049 | cell_error(pool, cell); | ||
1050 | return; | ||
1051 | } | ||
1052 | |||
1010 | bio_list_init(&bios); | 1053 | bio_list_init(&bios); |
1011 | cell_release(pool, cell, &bios); | 1054 | cell_release(pool, cell, &bios); |
1012 | 1055 | ||
1013 | while ((bio = bio_list_pop(&bios))) | 1056 | if (should_error_unserviceable_bio(pool)) |
1014 | handle_unserviceable_bio(pool, bio); | 1057 | while ((bio = bio_list_pop(&bios))) |
1058 | bio_io_error(bio); | ||
1059 | else | ||
1060 | while ((bio = bio_list_pop(&bios))) | ||
1061 | retry_on_resume(bio); | ||
1015 | } | 1062 | } |
1016 | 1063 | ||
1017 | static void process_discard(struct thin_c *tc, struct bio *bio) | 1064 | static void process_discard(struct thin_c *tc, struct bio *bio) |
@@ -1296,6 +1343,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) | |||
1296 | } | 1343 | } |
1297 | } | 1344 | } |
1298 | 1345 | ||
1346 | static void process_bio_success(struct thin_c *tc, struct bio *bio) | ||
1347 | { | ||
1348 | bio_endio(bio, 0); | ||
1349 | } | ||
1350 | |||
1299 | static void process_bio_fail(struct thin_c *tc, struct bio *bio) | 1351 | static void process_bio_fail(struct thin_c *tc, struct bio *bio) |
1300 | { | 1352 | { |
1301 | bio_io_error(bio); | 1353 | bio_io_error(bio); |
@@ -1328,6 +1380,11 @@ static void process_deferred_bios(struct pool *pool) | |||
1328 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); | 1380 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); |
1329 | struct thin_c *tc = h->tc; | 1381 | struct thin_c *tc = h->tc; |
1330 | 1382 | ||
1383 | if (tc->requeue_mode) { | ||
1384 | bio_endio(bio, DM_ENDIO_REQUEUE); | ||
1385 | continue; | ||
1386 | } | ||
1387 | |||
1331 | /* | 1388 | /* |
1332 | * If we've got no free new_mapping structs, and processing | 1389 | * If we've got no free new_mapping structs, and processing |
1333 | * this bio might require one, we pause until there are some | 1390 | * this bio might require one, we pause until there are some |
@@ -1357,7 +1414,8 @@ static void process_deferred_bios(struct pool *pool) | |||
1357 | bio_list_init(&pool->deferred_flush_bios); | 1414 | bio_list_init(&pool->deferred_flush_bios); |
1358 | spin_unlock_irqrestore(&pool->lock, flags); | 1415 | spin_unlock_irqrestore(&pool->lock, flags); |
1359 | 1416 | ||
1360 | if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) | 1417 | if (bio_list_empty(&bios) && |
1418 | !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool))) | ||
1361 | return; | 1419 | return; |
1362 | 1420 | ||
1363 | if (commit(pool)) { | 1421 | if (commit(pool)) { |
@@ -1393,51 +1451,134 @@ static void do_waker(struct work_struct *ws) | |||
1393 | 1451 | ||
1394 | /*----------------------------------------------------------------*/ | 1452 | /*----------------------------------------------------------------*/ |
1395 | 1453 | ||
1454 | struct noflush_work { | ||
1455 | struct work_struct worker; | ||
1456 | struct thin_c *tc; | ||
1457 | |||
1458 | atomic_t complete; | ||
1459 | wait_queue_head_t wait; | ||
1460 | }; | ||
1461 | |||
1462 | static void complete_noflush_work(struct noflush_work *w) | ||
1463 | { | ||
1464 | atomic_set(&w->complete, 1); | ||
1465 | wake_up(&w->wait); | ||
1466 | } | ||
1467 | |||
1468 | static void do_noflush_start(struct work_struct *ws) | ||
1469 | { | ||
1470 | struct noflush_work *w = container_of(ws, struct noflush_work, worker); | ||
1471 | w->tc->requeue_mode = true; | ||
1472 | requeue_io(w->tc); | ||
1473 | complete_noflush_work(w); | ||
1474 | } | ||
1475 | |||
1476 | static void do_noflush_stop(struct work_struct *ws) | ||
1477 | { | ||
1478 | struct noflush_work *w = container_of(ws, struct noflush_work, worker); | ||
1479 | w->tc->requeue_mode = false; | ||
1480 | complete_noflush_work(w); | ||
1481 | } | ||
1482 | |||
1483 | static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *)) | ||
1484 | { | ||
1485 | struct noflush_work w; | ||
1486 | |||
1487 | INIT_WORK(&w.worker, fn); | ||
1488 | w.tc = tc; | ||
1489 | atomic_set(&w.complete, 0); | ||
1490 | init_waitqueue_head(&w.wait); | ||
1491 | |||
1492 | queue_work(tc->pool->wq, &w.worker); | ||
1493 | |||
1494 | wait_event(w.wait, atomic_read(&w.complete)); | ||
1495 | } | ||
1496 | |||
1497 | /*----------------------------------------------------------------*/ | ||
1498 | |||
1396 | static enum pool_mode get_pool_mode(struct pool *pool) | 1499 | static enum pool_mode get_pool_mode(struct pool *pool) |
1397 | { | 1500 | { |
1398 | return pool->pf.mode; | 1501 | return pool->pf.mode; |
1399 | } | 1502 | } |
1400 | 1503 | ||
1504 | static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode) | ||
1505 | { | ||
1506 | dm_table_event(pool->ti->table); | ||
1507 | DMINFO("%s: switching pool to %s mode", | ||
1508 | dm_device_name(pool->pool_md), new_mode); | ||
1509 | } | ||
1510 | |||
1401 | static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) | 1511 | static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) |
1402 | { | 1512 | { |
1403 | int r; | 1513 | struct pool_c *pt = pool->ti->private; |
1404 | enum pool_mode old_mode = pool->pf.mode; | 1514 | bool needs_check = dm_pool_metadata_needs_check(pool->pmd); |
1515 | enum pool_mode old_mode = get_pool_mode(pool); | ||
1516 | |||
1517 | /* | ||
1518 | * Never allow the pool to transition to PM_WRITE mode if user | ||
1519 | * intervention is required to verify metadata and data consistency. | ||
1520 | */ | ||
1521 | if (new_mode == PM_WRITE && needs_check) { | ||
1522 | DMERR("%s: unable to switch pool to write mode until repaired.", | ||
1523 | dm_device_name(pool->pool_md)); | ||
1524 | if (old_mode != new_mode) | ||
1525 | new_mode = old_mode; | ||
1526 | else | ||
1527 | new_mode = PM_READ_ONLY; | ||
1528 | } | ||
1529 | /* | ||
1530 | * If we were in PM_FAIL mode, rollback of metadata failed. We're | ||
1531 | * not going to recover without a thin_repair. So we never let the | ||
1532 | * pool move out of the old mode. | ||
1533 | */ | ||
1534 | if (old_mode == PM_FAIL) | ||
1535 | new_mode = old_mode; | ||
1405 | 1536 | ||
1406 | switch (new_mode) { | 1537 | switch (new_mode) { |
1407 | case PM_FAIL: | 1538 | case PM_FAIL: |
1408 | if (old_mode != new_mode) | 1539 | if (old_mode != new_mode) |
1409 | DMERR("%s: switching pool to failure mode", | 1540 | notify_of_pool_mode_change(pool, "failure"); |
1410 | dm_device_name(pool->pool_md)); | ||
1411 | dm_pool_metadata_read_only(pool->pmd); | 1541 | dm_pool_metadata_read_only(pool->pmd); |
1412 | pool->process_bio = process_bio_fail; | 1542 | pool->process_bio = process_bio_fail; |
1413 | pool->process_discard = process_bio_fail; | 1543 | pool->process_discard = process_bio_fail; |
1414 | pool->process_prepared_mapping = process_prepared_mapping_fail; | 1544 | pool->process_prepared_mapping = process_prepared_mapping_fail; |
1415 | pool->process_prepared_discard = process_prepared_discard_fail; | 1545 | pool->process_prepared_discard = process_prepared_discard_fail; |
1546 | |||
1547 | error_retry_list(pool); | ||
1416 | break; | 1548 | break; |
1417 | 1549 | ||
1418 | case PM_READ_ONLY: | 1550 | case PM_READ_ONLY: |
1419 | if (old_mode != new_mode) | 1551 | if (old_mode != new_mode) |
1420 | DMERR("%s: switching pool to read-only mode", | 1552 | notify_of_pool_mode_change(pool, "read-only"); |
1421 | dm_device_name(pool->pool_md)); | 1553 | dm_pool_metadata_read_only(pool->pmd); |
1422 | r = dm_pool_abort_metadata(pool->pmd); | 1554 | pool->process_bio = process_bio_read_only; |
1423 | if (r) { | 1555 | pool->process_discard = process_bio_success; |
1424 | DMERR("%s: aborting transaction failed", | 1556 | pool->process_prepared_mapping = process_prepared_mapping_fail; |
1425 | dm_device_name(pool->pool_md)); | 1557 | pool->process_prepared_discard = process_prepared_discard_passdown; |
1426 | new_mode = PM_FAIL; | 1558 | |
1427 | set_pool_mode(pool, new_mode); | 1559 | error_retry_list(pool); |
1428 | } else { | 1560 | break; |
1429 | dm_pool_metadata_read_only(pool->pmd); | 1561 | |
1430 | pool->process_bio = process_bio_read_only; | 1562 | case PM_OUT_OF_DATA_SPACE: |
1431 | pool->process_discard = process_discard; | 1563 | /* |
1432 | pool->process_prepared_mapping = process_prepared_mapping_fail; | 1564 | * Ideally we'd never hit this state; the low water mark |
1433 | pool->process_prepared_discard = process_prepared_discard_passdown; | 1565 | * would trigger userland to extend the pool before we |
1434 | } | 1566 | * completely run out of data space. However, many small |
1567 | * IOs to unprovisioned space can consume data space at an | ||
1568 | * alarming rate. Adjust your low water mark if you're | ||
1569 | * frequently seeing this mode. | ||
1570 | */ | ||
1571 | if (old_mode != new_mode) | ||
1572 | notify_of_pool_mode_change(pool, "out-of-data-space"); | ||
1573 | pool->process_bio = process_bio_read_only; | ||
1574 | pool->process_discard = process_discard; | ||
1575 | pool->process_prepared_mapping = process_prepared_mapping; | ||
1576 | pool->process_prepared_discard = process_prepared_discard_passdown; | ||
1435 | break; | 1577 | break; |
1436 | 1578 | ||
1437 | case PM_WRITE: | 1579 | case PM_WRITE: |
1438 | if (old_mode != new_mode) | 1580 | if (old_mode != new_mode) |
1439 | DMINFO("%s: switching pool to write mode", | 1581 | notify_of_pool_mode_change(pool, "write"); |
1440 | dm_device_name(pool->pool_md)); | ||
1441 | dm_pool_metadata_read_write(pool->pmd); | 1582 | dm_pool_metadata_read_write(pool->pmd); |
1442 | pool->process_bio = process_bio; | 1583 | pool->process_bio = process_bio; |
1443 | pool->process_discard = process_discard; | 1584 | pool->process_discard = process_discard; |
@@ -1447,32 +1588,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) | |||
1447 | } | 1588 | } |
1448 | 1589 | ||
1449 | pool->pf.mode = new_mode; | 1590 | pool->pf.mode = new_mode; |
1591 | /* | ||
1592 | * The pool mode may have changed, sync it so bind_control_target() | ||
1593 | * doesn't cause an unexpected mode transition on resume. | ||
1594 | */ | ||
1595 | pt->adjusted_pf.mode = new_mode; | ||
1450 | } | 1596 | } |
1451 | 1597 | ||
1452 | /* | 1598 | static void abort_transaction(struct pool *pool) |
1453 | * Rather than calling set_pool_mode directly, use these which describe the | ||
1454 | * reason for mode degradation. | ||
1455 | */ | ||
1456 | static void out_of_data_space(struct pool *pool) | ||
1457 | { | 1599 | { |
1458 | DMERR_LIMIT("%s: no free data space available.", | 1600 | const char *dev_name = dm_device_name(pool->pool_md); |
1459 | dm_device_name(pool->pool_md)); | 1601 | |
1460 | set_pool_mode(pool, PM_READ_ONLY); | 1602 | DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); |
1603 | if (dm_pool_abort_metadata(pool->pmd)) { | ||
1604 | DMERR("%s: failed to abort metadata transaction", dev_name); | ||
1605 | set_pool_mode(pool, PM_FAIL); | ||
1606 | } | ||
1607 | |||
1608 | if (dm_pool_metadata_set_needs_check(pool->pmd)) { | ||
1609 | DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); | ||
1610 | set_pool_mode(pool, PM_FAIL); | ||
1611 | } | ||
1461 | } | 1612 | } |
1462 | 1613 | ||
1463 | static void metadata_operation_failed(struct pool *pool, const char *op, int r) | 1614 | static void metadata_operation_failed(struct pool *pool, const char *op, int r) |
1464 | { | 1615 | { |
1465 | dm_block_t free_blocks; | ||
1466 | |||
1467 | DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", | 1616 | DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", |
1468 | dm_device_name(pool->pool_md), op, r); | 1617 | dm_device_name(pool->pool_md), op, r); |
1469 | 1618 | ||
1470 | if (r == -ENOSPC && | 1619 | abort_transaction(pool); |
1471 | !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) && | ||
1472 | !free_blocks) | ||
1473 | DMERR_LIMIT("%s: no free metadata space available.", | ||
1474 | dm_device_name(pool->pool_md)); | ||
1475 | |||
1476 | set_pool_mode(pool, PM_READ_ONLY); | 1620 | set_pool_mode(pool, PM_READ_ONLY); |
1477 | } | 1621 | } |
1478 | 1622 | ||
@@ -1523,6 +1667,11 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) | |||
1523 | 1667 | ||
1524 | thin_hook_bio(tc, bio); | 1668 | thin_hook_bio(tc, bio); |
1525 | 1669 | ||
1670 | if (tc->requeue_mode) { | ||
1671 | bio_endio(bio, DM_ENDIO_REQUEUE); | ||
1672 | return DM_MAPIO_SUBMITTED; | ||
1673 | } | ||
1674 | |||
1526 | if (get_pool_mode(tc->pool) == PM_FAIL) { | 1675 | if (get_pool_mode(tc->pool) == PM_FAIL) { |
1527 | bio_io_error(bio); | 1676 | bio_io_error(bio); |
1528 | return DM_MAPIO_SUBMITTED; | 1677 | return DM_MAPIO_SUBMITTED; |
@@ -1686,7 +1835,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) | |||
1686 | /* | 1835 | /* |
1687 | * We want to make sure that a pool in PM_FAIL mode is never upgraded. | 1836 | * We want to make sure that a pool in PM_FAIL mode is never upgraded. |
1688 | */ | 1837 | */ |
1689 | enum pool_mode old_mode = pool->pf.mode; | 1838 | enum pool_mode old_mode = get_pool_mode(pool); |
1690 | enum pool_mode new_mode = pt->adjusted_pf.mode; | 1839 | enum pool_mode new_mode = pt->adjusted_pf.mode; |
1691 | 1840 | ||
1692 | /* | 1841 | /* |
@@ -1700,16 +1849,6 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) | |||
1700 | pool->pf = pt->adjusted_pf; | 1849 | pool->pf = pt->adjusted_pf; |
1701 | pool->low_water_blocks = pt->low_water_blocks; | 1850 | pool->low_water_blocks = pt->low_water_blocks; |
1702 | 1851 | ||
1703 | /* | ||
1704 | * If we were in PM_FAIL mode, rollback of metadata failed. We're | ||
1705 | * not going to recover without a thin_repair. So we never let the | ||
1706 | * pool move out of the old mode. On the other hand a PM_READ_ONLY | ||
1707 | * may have been due to a lack of metadata or data space, and may | ||
1708 | * now work (ie. if the underlying devices have been resized). | ||
1709 | */ | ||
1710 | if (old_mode == PM_FAIL) | ||
1711 | new_mode = old_mode; | ||
1712 | |||
1713 | set_pool_mode(pool, new_mode); | 1852 | set_pool_mode(pool, new_mode); |
1714 | 1853 | ||
1715 | return 0; | 1854 | return 0; |
@@ -1999,16 +2138,27 @@ static void metadata_low_callback(void *context) | |||
1999 | dm_table_event(pool->ti->table); | 2138 | dm_table_event(pool->ti->table); |
2000 | } | 2139 | } |
2001 | 2140 | ||
2002 | static sector_t get_metadata_dev_size(struct block_device *bdev) | 2141 | static sector_t get_dev_size(struct block_device *bdev) |
2142 | { | ||
2143 | return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; | ||
2144 | } | ||
2145 | |||
2146 | static void warn_if_metadata_device_too_big(struct block_device *bdev) | ||
2003 | { | 2147 | { |
2004 | sector_t metadata_dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; | 2148 | sector_t metadata_dev_size = get_dev_size(bdev); |
2005 | char buffer[BDEVNAME_SIZE]; | 2149 | char buffer[BDEVNAME_SIZE]; |
2006 | 2150 | ||
2007 | if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) { | 2151 | if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) |
2008 | DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", | 2152 | DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", |
2009 | bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS); | 2153 | bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS); |
2010 | metadata_dev_size = THIN_METADATA_MAX_SECTORS_WARNING; | 2154 | } |
2011 | } | 2155 | |
2156 | static sector_t get_metadata_dev_size(struct block_device *bdev) | ||
2157 | { | ||
2158 | sector_t metadata_dev_size = get_dev_size(bdev); | ||
2159 | |||
2160 | if (metadata_dev_size > THIN_METADATA_MAX_SECTORS) | ||
2161 | metadata_dev_size = THIN_METADATA_MAX_SECTORS; | ||
2012 | 2162 | ||
2013 | return metadata_dev_size; | 2163 | return metadata_dev_size; |
2014 | } | 2164 | } |
@@ -2017,7 +2167,7 @@ static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev) | |||
2017 | { | 2167 | { |
2018 | sector_t metadata_dev_size = get_metadata_dev_size(bdev); | 2168 | sector_t metadata_dev_size = get_metadata_dev_size(bdev); |
2019 | 2169 | ||
2020 | sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); | 2170 | sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE); |
2021 | 2171 | ||
2022 | return metadata_dev_size; | 2172 | return metadata_dev_size; |
2023 | } | 2173 | } |
@@ -2095,12 +2245,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2095 | ti->error = "Error opening metadata block device"; | 2245 | ti->error = "Error opening metadata block device"; |
2096 | goto out_unlock; | 2246 | goto out_unlock; |
2097 | } | 2247 | } |
2098 | 2248 | warn_if_metadata_device_too_big(metadata_dev->bdev); | |
2099 | /* | ||
2100 | * Run for the side-effect of possibly issuing a warning if the | ||
2101 | * device is too big. | ||
2102 | */ | ||
2103 | (void) get_metadata_dev_size(metadata_dev->bdev); | ||
2104 | 2249 | ||
2105 | r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); | 2250 | r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); |
2106 | if (r) { | 2251 | if (r) { |
@@ -2246,6 +2391,12 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit) | |||
2246 | return -EINVAL; | 2391 | return -EINVAL; |
2247 | 2392 | ||
2248 | } else if (data_size > sb_data_size) { | 2393 | } else if (data_size > sb_data_size) { |
2394 | if (dm_pool_metadata_needs_check(pool->pmd)) { | ||
2395 | DMERR("%s: unable to grow the data device until repaired.", | ||
2396 | dm_device_name(pool->pool_md)); | ||
2397 | return 0; | ||
2398 | } | ||
2399 | |||
2249 | if (sb_data_size) | 2400 | if (sb_data_size) |
2250 | DMINFO("%s: growing the data device from %llu to %llu blocks", | 2401 | DMINFO("%s: growing the data device from %llu to %llu blocks", |
2251 | dm_device_name(pool->pool_md), | 2402 | dm_device_name(pool->pool_md), |
@@ -2287,6 +2438,13 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) | |||
2287 | return -EINVAL; | 2438 | return -EINVAL; |
2288 | 2439 | ||
2289 | } else if (metadata_dev_size > sb_metadata_dev_size) { | 2440 | } else if (metadata_dev_size > sb_metadata_dev_size) { |
2441 | if (dm_pool_metadata_needs_check(pool->pmd)) { | ||
2442 | DMERR("%s: unable to grow the metadata device until repaired.", | ||
2443 | dm_device_name(pool->pool_md)); | ||
2444 | return 0; | ||
2445 | } | ||
2446 | |||
2447 | warn_if_metadata_device_too_big(pool->md_dev); | ||
2290 | DMINFO("%s: growing the metadata device from %llu to %llu blocks", | 2448 | DMINFO("%s: growing the metadata device from %llu to %llu blocks", |
2291 | dm_device_name(pool->pool_md), | 2449 | dm_device_name(pool->pool_md), |
2292 | sb_metadata_dev_size, metadata_dev_size); | 2450 | sb_metadata_dev_size, metadata_dev_size); |
@@ -2673,7 +2831,9 @@ static void pool_status(struct dm_target *ti, status_type_t type, | |||
2673 | else | 2831 | else |
2674 | DMEMIT("- "); | 2832 | DMEMIT("- "); |
2675 | 2833 | ||
2676 | if (pool->pf.mode == PM_READ_ONLY) | 2834 | if (pool->pf.mode == PM_OUT_OF_DATA_SPACE) |
2835 | DMEMIT("out_of_data_space "); | ||
2836 | else if (pool->pf.mode == PM_READ_ONLY) | ||
2677 | DMEMIT("ro "); | 2837 | DMEMIT("ro "); |
2678 | else | 2838 | else |
2679 | DMEMIT("rw "); | 2839 | DMEMIT("rw "); |
@@ -2787,7 +2947,7 @@ static struct target_type pool_target = { | |||
2787 | .name = "thin-pool", | 2947 | .name = "thin-pool", |
2788 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | | 2948 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | |
2789 | DM_TARGET_IMMUTABLE, | 2949 | DM_TARGET_IMMUTABLE, |
2790 | .version = {1, 10, 0}, | 2950 | .version = {1, 11, 0}, |
2791 | .module = THIS_MODULE, | 2951 | .module = THIS_MODULE, |
2792 | .ctr = pool_ctr, | 2952 | .ctr = pool_ctr, |
2793 | .dtr = pool_dtr, | 2953 | .dtr = pool_dtr, |
@@ -2894,6 +3054,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2894 | 3054 | ||
2895 | if (get_pool_mode(tc->pool) == PM_FAIL) { | 3055 | if (get_pool_mode(tc->pool) == PM_FAIL) { |
2896 | ti->error = "Couldn't open thin device, Pool is in fail mode"; | 3056 | ti->error = "Couldn't open thin device, Pool is in fail mode"; |
3057 | r = -EINVAL; | ||
2897 | goto bad_thin_open; | 3058 | goto bad_thin_open; |
2898 | } | 3059 | } |
2899 | 3060 | ||
@@ -2905,7 +3066,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2905 | 3066 | ||
2906 | r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); | 3067 | r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); |
2907 | if (r) | 3068 | if (r) |
2908 | goto bad_thin_open; | 3069 | goto bad_target_max_io_len; |
2909 | 3070 | ||
2910 | ti->num_flush_bios = 1; | 3071 | ti->num_flush_bios = 1; |
2911 | ti->flush_supported = true; | 3072 | ti->flush_supported = true; |
@@ -2926,6 +3087,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2926 | 3087 | ||
2927 | return 0; | 3088 | return 0; |
2928 | 3089 | ||
3090 | bad_target_max_io_len: | ||
3091 | dm_pool_close_thin_device(tc->td); | ||
2929 | bad_thin_open: | 3092 | bad_thin_open: |
2930 | __pool_dec(tc->pool); | 3093 | __pool_dec(tc->pool); |
2931 | bad_pool_lookup: | 3094 | bad_pool_lookup: |
@@ -2986,10 +3149,23 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err) | |||
2986 | return 0; | 3149 | return 0; |
2987 | } | 3150 | } |
2988 | 3151 | ||
2989 | static void thin_postsuspend(struct dm_target *ti) | 3152 | static void thin_presuspend(struct dm_target *ti) |
2990 | { | 3153 | { |
3154 | struct thin_c *tc = ti->private; | ||
3155 | |||
2991 | if (dm_noflush_suspending(ti)) | 3156 | if (dm_noflush_suspending(ti)) |
2992 | requeue_io((struct thin_c *)ti->private); | 3157 | noflush_work(tc, do_noflush_start); |
3158 | } | ||
3159 | |||
3160 | static void thin_postsuspend(struct dm_target *ti) | ||
3161 | { | ||
3162 | struct thin_c *tc = ti->private; | ||
3163 | |||
3164 | /* | ||
3165 | * The dm_noflush_suspending flag has been cleared by now, so | ||
3166 | * unfortunately we must always run this. | ||
3167 | */ | ||
3168 | noflush_work(tc, do_noflush_stop); | ||
2993 | } | 3169 | } |
2994 | 3170 | ||
2995 | /* | 3171 | /* |
@@ -3074,12 +3250,13 @@ static int thin_iterate_devices(struct dm_target *ti, | |||
3074 | 3250 | ||
3075 | static struct target_type thin_target = { | 3251 | static struct target_type thin_target = { |
3076 | .name = "thin", | 3252 | .name = "thin", |
3077 | .version = {1, 10, 0}, | 3253 | .version = {1, 11, 0}, |
3078 | .module = THIS_MODULE, | 3254 | .module = THIS_MODULE, |
3079 | .ctr = thin_ctr, | 3255 | .ctr = thin_ctr, |
3080 | .dtr = thin_dtr, | 3256 | .dtr = thin_dtr, |
3081 | .map = thin_map, | 3257 | .map = thin_map, |
3082 | .end_io = thin_endio, | 3258 | .end_io = thin_endio, |
3259 | .presuspend = thin_presuspend, | ||
3083 | .postsuspend = thin_postsuspend, | 3260 | .postsuspend = thin_postsuspend, |
3084 | .status = thin_status, | 3261 | .status = thin_status, |
3085 | .iterate_devices = thin_iterate_devices, | 3262 | .iterate_devices = thin_iterate_devices, |
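The noflush_work machinery added to dm-thin.c above queues a small on-stack work item to the pool workqueue and blocks until the worker has flipped tc->requeue_mode, so the flag only ever changes from the worker thread. A rough userspace analogue of that pattern using pthreads (all names here are illustrative, not the kernel API):

```c
/* Rough userspace analogue of the noflush_work pattern: queue an on-stack
 * job to a worker and wait until it has run.  Illustrative only. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct noflush_work {
	void (*fn)(struct noflush_work *w);
	bool *flag;                             /* stands in for tc->requeue_mode */
	bool complete;
	pthread_mutex_t lock;
	pthread_cond_t wait;
};

static void do_noflush_start(struct noflush_work *w) { *w->flag = true; }
static void do_noflush_stop(struct noflush_work *w)  { *w->flag = false; }

static void *worker(void *arg)                  /* stands in for the pool workqueue */
{
	struct noflush_work *w = arg;

	w->fn(w);                               /* run the job on the worker */
	pthread_mutex_lock(&w->lock);
	w->complete = true;                     /* complete_noflush_work() */
	pthread_cond_signal(&w->wait);
	pthread_mutex_unlock(&w->lock);
	return NULL;
}

static void noflush_work(bool *flag, void (*fn)(struct noflush_work *))
{
	struct noflush_work w = { .fn = fn, .flag = flag, .complete = false };
	pthread_t t;

	pthread_mutex_init(&w.lock, NULL);
	pthread_cond_init(&w.wait, NULL);
	pthread_create(&t, NULL, worker, &w);   /* queue_work() */

	pthread_mutex_lock(&w.lock);
	while (!w.complete)                     /* wait_event() */
		pthread_cond_wait(&w.wait, &w.lock);
	pthread_mutex_unlock(&w.lock);

	pthread_join(&t, NULL);
	pthread_cond_destroy(&w.wait);
	pthread_mutex_destroy(&w.lock);
}

int main(void)
{
	bool requeue_mode = false;

	noflush_work(&requeue_mode, do_noflush_start);
	printf("after presuspend: requeue_mode=%d\n", requeue_mode);
	noflush_work(&requeue_mode, do_noflush_stop);
	printf("after postsuspend: requeue_mode=%d\n", requeue_mode);
	return 0;
}
```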
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index 19b268795415..0c2dec7aec20 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -6,3 +6,13 @@ config DM_PERSISTENT_DATA | |||
6 | ---help--- | 6 | ---help--- |
7 | Library providing immutable on-disk data structure support for | 7 | Library providing immutable on-disk data structure support for |
8 | device-mapper targets such as the thin provisioning target. | 8 | device-mapper targets such as the thin provisioning target. |
9 | |||
10 | config DM_DEBUG_BLOCK_STACK_TRACING | ||
11 | boolean "Keep stack trace of persistent data block lock holders" | ||
12 | depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA | ||
13 | select STACKTRACE | ||
14 | ---help--- | ||
15 | Enable this for messages that may help debug problems with the | ||
16 | block manager locking used by thin provisioning and caching. | ||
17 | |||
18 | If unsure, say N. | ||
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 536782e3bcb7..786b689bdfc7 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c | |||
@@ -91,6 +91,69 @@ struct block_op { | |||
91 | dm_block_t block; | 91 | dm_block_t block; |
92 | }; | 92 | }; |
93 | 93 | ||
94 | struct bop_ring_buffer { | ||
95 | unsigned begin; | ||
96 | unsigned end; | ||
97 | struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1]; | ||
98 | }; | ||
99 | |||
100 | static void brb_init(struct bop_ring_buffer *brb) | ||
101 | { | ||
102 | brb->begin = 0; | ||
103 | brb->end = 0; | ||
104 | } | ||
105 | |||
106 | static bool brb_empty(struct bop_ring_buffer *brb) | ||
107 | { | ||
108 | return brb->begin == brb->end; | ||
109 | } | ||
110 | |||
111 | static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old) | ||
112 | { | ||
113 | unsigned r = old + 1; | ||
114 | return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r; | ||
115 | } | ||
116 | |||
117 | static int brb_push(struct bop_ring_buffer *brb, | ||
118 | enum block_op_type type, dm_block_t b) | ||
119 | { | ||
120 | struct block_op *bop; | ||
121 | unsigned next = brb_next(brb, brb->end); | ||
122 | |||
123 | /* | ||
124 | * We don't allow the last bop to be filled, this way we can | ||
125 | * differentiate between full and empty. | ||
126 | */ | ||
127 | if (next == brb->begin) | ||
128 | return -ENOMEM; | ||
129 | |||
130 | bop = brb->bops + brb->end; | ||
131 | bop->type = type; | ||
132 | bop->block = b; | ||
133 | |||
134 | brb->end = next; | ||
135 | |||
136 | return 0; | ||
137 | } | ||
138 | |||
139 | static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result) | ||
140 | { | ||
141 | struct block_op *bop; | ||
142 | |||
143 | if (brb_empty(brb)) | ||
144 | return -ENODATA; | ||
145 | |||
146 | bop = brb->bops + brb->begin; | ||
147 | result->type = bop->type; | ||
148 | result->block = bop->block; | ||
149 | |||
150 | brb->begin = brb_next(brb, brb->begin); | ||
151 | |||
152 | return 0; | ||
153 | } | ||
154 | |||
155 | /*----------------------------------------------------------------*/ | ||
156 | |||
94 | struct sm_metadata { | 157 | struct sm_metadata { |
95 | struct dm_space_map sm; | 158 | struct dm_space_map sm; |
96 | 159 | ||
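The bop_ring_buffer added above reserves one spare slot: it holds MAX_RECURSIVE_ALLOCATIONS + 1 entries but brb_push() refuses to fill the last one, so begin == end always means empty rather than ambiguously empty-or-full. A standalone sketch of that convention, with userspace stand-ins for the kernel types and an assumed capacity of 16 (the real MAX_RECURSIVE_ALLOCATIONS value is not shown in this diff):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t dm_block_t;                 /* userspace stand-in */
enum block_op_type { BOP_INC, BOP_DEC };

#define CAPACITY 16                          /* assumption: stands in for MAX_RECURSIVE_ALLOCATIONS */

struct block_op { enum block_op_type type; dm_block_t block; };

struct bop_ring_buffer {
    unsigned begin, end;
    struct block_op bops[CAPACITY + 1];      /* one slot is always left unused */
};

static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
{
    unsigned r = old + 1;
    return (r >= sizeof(brb->bops) / sizeof(*brb->bops)) ? 0 : r;
}

static int brb_push(struct bop_ring_buffer *brb, enum block_op_type t, dm_block_t b)
{
    unsigned next = brb_next(brb, brb->end);

    if (next == brb->begin)                  /* full: pushing would land on the spare slot */
        return -ENOMEM;
    brb->bops[brb->end] = (struct block_op){ t, b };
    brb->end = next;
    return 0;
}

int main(void)
{
    struct bop_ring_buffer brb = { 0, 0 };
    unsigned pushed = 0;

    while (!brb_push(&brb, BOP_INC, pushed))
        pushed++;

    /* exactly CAPACITY ops fit even though CAPACITY + 1 slots exist */
    printf("pushed %u ops\n", pushed);
    return pushed == CAPACITY ? 0 : 1;
}

The same wrap helper is what lets sm_metadata_get_count() and sm_metadata_count_is_more_than_one() below walk the pending ops with i = brb_next(&smm->uncommitted, i) instead of a plain index increment.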
@@ -101,25 +164,20 @@ struct sm_metadata { | |||
101 | 164 | ||
102 | unsigned recursion_count; | 165 | unsigned recursion_count; |
103 | unsigned allocated_this_transaction; | 166 | unsigned allocated_this_transaction; |
104 | unsigned nr_uncommitted; | 167 | struct bop_ring_buffer uncommitted; |
105 | struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS]; | ||
106 | 168 | ||
107 | struct threshold threshold; | 169 | struct threshold threshold; |
108 | }; | 170 | }; |
109 | 171 | ||
110 | static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) | 172 | static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) |
111 | { | 173 | { |
112 | struct block_op *op; | 174 | int r = brb_push(&smm->uncommitted, type, b); |
113 | 175 | ||
114 | if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) { | 176 | if (r) { |
115 | DMERR("too many recursive allocations"); | 177 | DMERR("too many recursive allocations"); |
116 | return -ENOMEM; | 178 | return -ENOMEM; |
117 | } | 179 | } |
118 | 180 | ||
119 | op = smm->uncommitted + smm->nr_uncommitted++; | ||
120 | op->type = type; | ||
121 | op->block = b; | ||
122 | |||
123 | return 0; | 181 | return 0; |
124 | } | 182 | } |
125 | 183 | ||
@@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm) | |||
158 | return -ENOMEM; | 216 | return -ENOMEM; |
159 | } | 217 | } |
160 | 218 | ||
161 | if (smm->recursion_count == 1 && smm->nr_uncommitted) { | 219 | if (smm->recursion_count == 1) { |
162 | while (smm->nr_uncommitted && !r) { | 220 | while (!brb_empty(&smm->uncommitted)) { |
163 | smm->nr_uncommitted--; | 221 | struct block_op bop; |
164 | r = commit_bop(smm, smm->uncommitted + | 222 | |
165 | smm->nr_uncommitted); | 223 | r = brb_pop(&smm->uncommitted, &bop); |
224 | if (r) { | ||
225 | DMERR("bug in bop ring buffer"); | ||
226 | break; | ||
227 | } | ||
228 | |||
229 | r = commit_bop(smm, &bop); | ||
166 | if (r) | 230 | if (r) |
167 | break; | 231 | break; |
168 | } | 232 | } |
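The reworked out() above drains the ring buffer only when recursion_count == 1, i.e. when the outermost space-map operation is about to finish, so queued block ops are never replayed while a nested operation is still in flight. A toy illustration of that drain-at-depth-one rule (depth, pending and apply_one() are hypothetical stand-ins, not the dm-space-map code):

#include <stdio.h>

static int depth;            /* stands in for smm->recursion_count */
static int pending;          /* stands in for the uncommitted ring buffer */

static void apply_one(void)  /* stands in for commit_bop() */
{
    pending--;
}

static void in(void)  { depth++; }

static void out(void)
{
    /* only the outermost caller flushes the queued ops */
    if (depth == 1)
        while (pending)
            apply_one();
    depth--;
}

int main(void)
{
    in();                    /* outer operation */
    in();                    /* nested allocation */
    pending = 3;
    out();                   /* nested exit: nothing flushed yet */
    printf("after nested out: pending=%d\n", pending);   /* 3 */
    out();                   /* outer exit: flush */
    printf("after outer out: pending=%d\n", pending);    /* 0 */
    return pending;
}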
@@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count) | |||
217 | static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, | 281 | static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, |
218 | uint32_t *result) | 282 | uint32_t *result) |
219 | { | 283 | { |
220 | int r, i; | 284 | int r; |
285 | unsigned i; | ||
221 | struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); | 286 | struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); |
222 | unsigned adjustment = 0; | 287 | unsigned adjustment = 0; |
223 | 288 | ||
@@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, | |||
225 | * We may have some uncommitted adjustments to add. This list | 290 | * We may have some uncommitted adjustments to add. This list |
226 | * should always be really short. | 291 | * should always be really short. |
227 | */ | 292 | */ |
228 | for (i = 0; i < smm->nr_uncommitted; i++) { | 293 | for (i = smm->uncommitted.begin; |
229 | struct block_op *op = smm->uncommitted + i; | 294 | i != smm->uncommitted.end; |
295 | i = brb_next(&smm->uncommitted, i)) { | ||
296 | struct block_op *op = smm->uncommitted.bops + i; | ||
230 | 297 | ||
231 | if (op->block != b) | 298 | if (op->block != b) |
232 | continue; | 299 | continue; |
@@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, | |||
254 | static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, | 321 | static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, |
255 | dm_block_t b, int *result) | 322 | dm_block_t b, int *result) |
256 | { | 323 | { |
257 | int r, i, adjustment = 0; | 324 | int r, adjustment = 0; |
325 | unsigned i; | ||
258 | struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); | 326 | struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); |
259 | uint32_t rc; | 327 | uint32_t rc; |
260 | 328 | ||
@@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, | |||
262 | * We may have some uncommitted adjustments to add. This list | 330 | * We may have some uncommitted adjustments to add. This list |
263 | * should always be really short. | 331 | * should always be really short. |
264 | */ | 332 | */ |
265 | for (i = 0; i < smm->nr_uncommitted; i++) { | 333 | for (i = smm->uncommitted.begin; |
266 | struct block_op *op = smm->uncommitted + i; | 334 | i != smm->uncommitted.end; |
335 | i = brb_next(&smm->uncommitted, i)) { | ||
336 | |||
337 | struct block_op *op = smm->uncommitted.bops + i; | ||
267 | 338 | ||
268 | if (op->block != b) | 339 | if (op->block != b) |
269 | continue; | 340 | continue; |
@@ -671,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm, | |||
671 | smm->begin = superblock + 1; | 742 | smm->begin = superblock + 1; |
672 | smm->recursion_count = 0; | 743 | smm->recursion_count = 0; |
673 | smm->allocated_this_transaction = 0; | 744 | smm->allocated_this_transaction = 0; |
674 | smm->nr_uncommitted = 0; | 745 | brb_init(&smm->uncommitted); |
675 | threshold_init(&smm->threshold); | 746 | threshold_init(&smm->threshold); |
676 | 747 | ||
677 | memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); | 748 | memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); |
@@ -680,6 +751,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm, | |||
680 | if (r) | 751 | if (r) |
681 | return r; | 752 | return r; |
682 | 753 | ||
754 | if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS) | ||
755 | nr_blocks = DM_SM_METADATA_MAX_BLOCKS; | ||
683 | r = sm_ll_extend(&smm->ll, nr_blocks); | 756 | r = sm_ll_extend(&smm->ll, nr_blocks); |
684 | if (r) | 757 | if (r) |
685 | return r; | 758 | return r; |
@@ -713,7 +786,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm, | |||
713 | smm->begin = 0; | 786 | smm->begin = 0; |
714 | smm->recursion_count = 0; | 787 | smm->recursion_count = 0; |
715 | smm->allocated_this_transaction = 0; | 788 | smm->allocated_this_transaction = 0; |
716 | smm->nr_uncommitted = 0; | 789 | brb_init(&smm->uncommitted); |
717 | threshold_init(&smm->threshold); | 790 | threshold_init(&smm->threshold); |
718 | 791 | ||
719 | memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); | 792 | memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); |
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h index 39bba0801cf2..64df923974d8 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.h +++ b/drivers/md/persistent-data/dm-space-map-metadata.h | |||
@@ -9,6 +9,17 @@ | |||
9 | 9 | ||
10 | #include "dm-transaction-manager.h" | 10 | #include "dm-transaction-manager.h" |
11 | 11 | ||
12 | #define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT) | ||
13 | |||
14 | /* | ||
15 | * The metadata device is currently limited in size. | ||
16 | * | ||
17 | * We have one block of index, which can hold 255 index entries. Each | ||
18 | * index entry contains allocation info about ~16k metadata blocks. | ||
19 | */ | ||
20 | #define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64)) | ||
21 | #define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE) | ||
22 | |||
12 | /* | 23 | /* |
13 | * Unfortunately we have to use two-phase construction due to the cycle | 24 | * Unfortunately we have to use two-phase construction due to the cycle |
14 | * between the tm and sm. | 25 | * between the tm and sm. |
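Together with the nr_blocks clamp added to dm_sm_metadata_create() above, these constants pin the metadata space map at 255 * ((1 << 14) - 64) = 4,161,600 blocks of 4 KiB, roughly 15.9 GiB, or 33,292,800 sectors given the kernel's 512-byte sectors (SECTOR_SHIFT of 9, so DM_SM_METADATA_BLOCK_SIZE is 8). A quick userspace check of that arithmetic:

#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9                       /* kernel value: 512-byte sectors */
#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT)
#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64))
#define DM_SM_METADATA_MAX_SECTORS \
    ((uint64_t)DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE)

int main(void)
{
    uint64_t bytes = (uint64_t)DM_SM_METADATA_MAX_BLOCKS * 4096;

    printf("max blocks : %d\n", DM_SM_METADATA_MAX_BLOCKS);              /* 4161600 */
    printf("max sectors: %llu\n",
           (unsigned long long)DM_SM_METADATA_MAX_SECTORS);              /* 33292800 */
    printf("max size   : %.2f GiB\n", bytes / (1024.0 * 1024 * 1024));   /* ~15.87 */
    return 0;
}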
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fd3a2a14b587..4a6ca1cb2e78 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -1953,11 +1953,15 @@ static int process_checks(struct r1bio *r1_bio) | |||
1953 | for (i = 0; i < conf->raid_disks * 2; i++) { | 1953 | for (i = 0; i < conf->raid_disks * 2; i++) { |
1954 | int j; | 1954 | int j; |
1955 | int size; | 1955 | int size; |
1956 | int uptodate; | ||
1956 | struct bio *b = r1_bio->bios[i]; | 1957 | struct bio *b = r1_bio->bios[i]; |
1957 | if (b->bi_end_io != end_sync_read) | 1958 | if (b->bi_end_io != end_sync_read) |
1958 | continue; | 1959 | continue; |
1959 | /* fixup the bio for reuse */ | 1960 | /* fixup the bio for reuse, but preserve BIO_UPTODATE */ |
1961 | uptodate = test_bit(BIO_UPTODATE, &b->bi_flags); | ||
1960 | bio_reset(b); | 1962 | bio_reset(b); |
1963 | if (!uptodate) | ||
1964 | clear_bit(BIO_UPTODATE, &b->bi_flags); | ||
1961 | b->bi_vcnt = vcnt; | 1965 | b->bi_vcnt = vcnt; |
1962 | b->bi_iter.bi_size = r1_bio->sectors << 9; | 1966 | b->bi_iter.bi_size = r1_bio->sectors << 9; |
1963 | b->bi_iter.bi_sector = r1_bio->sector + | 1967 | b->bi_iter.bi_sector = r1_bio->sector + |
@@ -1990,11 +1994,14 @@ static int process_checks(struct r1bio *r1_bio) | |||
1990 | int j; | 1994 | int j; |
1991 | struct bio *pbio = r1_bio->bios[primary]; | 1995 | struct bio *pbio = r1_bio->bios[primary]; |
1992 | struct bio *sbio = r1_bio->bios[i]; | 1996 | struct bio *sbio = r1_bio->bios[i]; |
1997 | int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags); | ||
1993 | 1998 | ||
1994 | if (sbio->bi_end_io != end_sync_read) | 1999 | if (sbio->bi_end_io != end_sync_read) |
1995 | continue; | 2000 | continue; |
2001 | /* Now we can 'fixup' the BIO_UPTODATE flag */ | ||
2002 | set_bit(BIO_UPTODATE, &sbio->bi_flags); | ||
1996 | 2003 | ||
1997 | if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { | 2004 | if (uptodate) { |
1998 | for (j = vcnt; j-- ; ) { | 2005 | for (j = vcnt; j-- ; ) { |
1999 | struct page *p, *s; | 2006 | struct page *p, *s; |
2000 | p = pbio->bi_io_vec[j].bv_page; | 2007 | p = pbio->bi_io_vec[j].bv_page; |
@@ -2009,7 +2016,7 @@ static int process_checks(struct r1bio *r1_bio) | |||
2009 | if (j >= 0) | 2016 | if (j >= 0) |
2010 | atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); | 2017 | atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); |
2011 | if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) | 2018 | if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) |
2012 | && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { | 2019 | && uptodate)) { |
2013 | /* No need to write to this device. */ | 2020 | /* No need to write to this device. */ |
2014 | sbio->bi_end_io = NULL; | 2021 | sbio->bi_end_io = NULL; |
2015 | rdev_dec_pending(conf->mirrors[i].rdev, mddev); | 2022 | rdev_dec_pending(conf->mirrors[i].rdev, mddev); |
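The raid1 change above snapshots BIO_UPTODATE before bio_reset() discards the bio's state, re-applies the failure afterwards, and later compares against the saved value once the flag has been forced back on for reuse. A reduced sketch of that save/restore pattern using a fake bio rather than the block-layer API (the assumption that a freshly reset bio starts out "up to date" mirrors what the patch's first hunk compensates for):

#include <stdbool.h>
#include <stdio.h>

#define FLAG_UPTODATE (1u << 0)

struct fake_bio { unsigned flags; };

static void reset_sketch(struct fake_bio *b)
{
    b->flags = FLAG_UPTODATE;                        /* assumed: reset leaves the bio marked "up to date" */
}

int main(void)
{
    struct fake_bio b = { .flags = 0 };              /* a read that failed */
    bool uptodate = b.flags & FLAG_UPTODATE;         /* remember the outcome first */

    reset_sketch(&b);                                /* would otherwise erase the failure */
    if (!uptodate)
        b.flags &= ~FLAG_UPTODATE;                   /* keep the failure visible for later checks */

    printf("read %s\n", uptodate ? "good" : "failed");
    return 0;
}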
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f1feadeb7bb2..16f5c21963db 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -5514,23 +5514,43 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) | |||
5514 | return sectors * (raid_disks - conf->max_degraded); | 5514 | return sectors * (raid_disks - conf->max_degraded); |
5515 | } | 5515 | } |
5516 | 5516 | ||
5517 | static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) | ||
5518 | { | ||
5519 | safe_put_page(percpu->spare_page); | ||
5520 | kfree(percpu->scribble); | ||
5521 | percpu->spare_page = NULL; | ||
5522 | percpu->scribble = NULL; | ||
5523 | } | ||
5524 | |||
5525 | static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) | ||
5526 | { | ||
5527 | if (conf->level == 6 && !percpu->spare_page) | ||
5528 | percpu->spare_page = alloc_page(GFP_KERNEL); | ||
5529 | if (!percpu->scribble) | ||
5530 | percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); | ||
5531 | |||
5532 | if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { | ||
5533 | free_scratch_buffer(conf, percpu); | ||
5534 | return -ENOMEM; | ||
5535 | } | ||
5536 | |||
5537 | return 0; | ||
5538 | } | ||
5539 | |||
5517 | static void raid5_free_percpu(struct r5conf *conf) | 5540 | static void raid5_free_percpu(struct r5conf *conf) |
5518 | { | 5541 | { |
5519 | struct raid5_percpu *percpu; | ||
5520 | unsigned long cpu; | 5542 | unsigned long cpu; |
5521 | 5543 | ||
5522 | if (!conf->percpu) | 5544 | if (!conf->percpu) |
5523 | return; | 5545 | return; |
5524 | 5546 | ||
5525 | get_online_cpus(); | ||
5526 | for_each_possible_cpu(cpu) { | ||
5527 | percpu = per_cpu_ptr(conf->percpu, cpu); | ||
5528 | safe_put_page(percpu->spare_page); | ||
5529 | kfree(percpu->scribble); | ||
5530 | } | ||
5531 | #ifdef CONFIG_HOTPLUG_CPU | 5547 | #ifdef CONFIG_HOTPLUG_CPU |
5532 | unregister_cpu_notifier(&conf->cpu_notify); | 5548 | unregister_cpu_notifier(&conf->cpu_notify); |
5533 | #endif | 5549 | #endif |
5550 | |||
5551 | get_online_cpus(); | ||
5552 | for_each_possible_cpu(cpu) | ||
5553 | free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); | ||
5534 | put_online_cpus(); | 5554 | put_online_cpus(); |
5535 | 5555 | ||
5536 | free_percpu(conf->percpu); | 5556 | free_percpu(conf->percpu); |
@@ -5557,15 +5577,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, | |||
5557 | switch (action) { | 5577 | switch (action) { |
5558 | case CPU_UP_PREPARE: | 5578 | case CPU_UP_PREPARE: |
5559 | case CPU_UP_PREPARE_FROZEN: | 5579 | case CPU_UP_PREPARE_FROZEN: |
5560 | if (conf->level == 6 && !percpu->spare_page) | 5580 | if (alloc_scratch_buffer(conf, percpu)) { |
5561 | percpu->spare_page = alloc_page(GFP_KERNEL); | ||
5562 | if (!percpu->scribble) | ||
5563 | percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); | ||
5564 | |||
5565 | if (!percpu->scribble || | ||
5566 | (conf->level == 6 && !percpu->spare_page)) { | ||
5567 | safe_put_page(percpu->spare_page); | ||
5568 | kfree(percpu->scribble); | ||
5569 | pr_err("%s: failed memory allocation for cpu%ld\n", | 5581 | pr_err("%s: failed memory allocation for cpu%ld\n", |
5570 | __func__, cpu); | 5582 | __func__, cpu); |
5571 | return notifier_from_errno(-ENOMEM); | 5583 | return notifier_from_errno(-ENOMEM); |
@@ -5573,10 +5585,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, | |||
5573 | break; | 5585 | break; |
5574 | case CPU_DEAD: | 5586 | case CPU_DEAD: |
5575 | case CPU_DEAD_FROZEN: | 5587 | case CPU_DEAD_FROZEN: |
5576 | safe_put_page(percpu->spare_page); | 5588 | free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); |
5577 | kfree(percpu->scribble); | ||
5578 | percpu->spare_page = NULL; | ||
5579 | percpu->scribble = NULL; | ||
5580 | break; | 5589 | break; |
5581 | default: | 5590 | default: |
5582 | break; | 5591 | break; |
@@ -5588,40 +5597,29 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, | |||
5588 | static int raid5_alloc_percpu(struct r5conf *conf) | 5597 | static int raid5_alloc_percpu(struct r5conf *conf) |
5589 | { | 5598 | { |
5590 | unsigned long cpu; | 5599 | unsigned long cpu; |
5591 | struct page *spare_page; | 5600 | int err = 0; |
5592 | struct raid5_percpu __percpu *allcpus; | ||
5593 | void *scribble; | ||
5594 | int err; | ||
5595 | 5601 | ||
5596 | allcpus = alloc_percpu(struct raid5_percpu); | 5602 | conf->percpu = alloc_percpu(struct raid5_percpu); |
5597 | if (!allcpus) | 5603 | if (!conf->percpu) |
5598 | return -ENOMEM; | 5604 | return -ENOMEM; |
5599 | conf->percpu = allcpus; | 5605 | |
5606 | #ifdef CONFIG_HOTPLUG_CPU | ||
5607 | conf->cpu_notify.notifier_call = raid456_cpu_notify; | ||
5608 | conf->cpu_notify.priority = 0; | ||
5609 | err = register_cpu_notifier(&conf->cpu_notify); | ||
5610 | if (err) | ||
5611 | return err; | ||
5612 | #endif | ||
5600 | 5613 | ||
5601 | get_online_cpus(); | 5614 | get_online_cpus(); |
5602 | err = 0; | ||
5603 | for_each_present_cpu(cpu) { | 5615 | for_each_present_cpu(cpu) { |
5604 | if (conf->level == 6) { | 5616 | err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); |
5605 | spare_page = alloc_page(GFP_KERNEL); | 5617 | if (err) { |
5606 | if (!spare_page) { | 5618 | pr_err("%s: failed memory allocation for cpu%ld\n", |
5607 | err = -ENOMEM; | 5619 | __func__, cpu); |
5608 | break; | ||
5609 | } | ||
5610 | per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; | ||
5611 | } | ||
5612 | scribble = kmalloc(conf->scribble_len, GFP_KERNEL); | ||
5613 | if (!scribble) { | ||
5614 | err = -ENOMEM; | ||
5615 | break; | 5620 | break; |
5616 | } | 5621 | } |
5617 | per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; | ||
5618 | } | 5622 | } |
5619 | #ifdef CONFIG_HOTPLUG_CPU | ||
5620 | conf->cpu_notify.notifier_call = raid456_cpu_notify; | ||
5621 | conf->cpu_notify.priority = 0; | ||
5622 | if (err == 0) | ||
5623 | err = register_cpu_notifier(&conf->cpu_notify); | ||
5624 | #endif | ||
5625 | put_online_cpus(); | 5623 | put_online_cpus(); |
5626 | 5624 | ||
5627 | return err; | 5625 | return err; |
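The raid5 refactor above folds the duplicated setup and teardown into alloc_scratch_buffer()/free_scratch_buffer(): allocation is all-or-nothing per CPU, and teardown resets the pointers so it can safely run more than once. A compact userspace sketch of that pair, using malloc/free in place of alloc_page()/kmalloc() and need_spare in place of the conf->level == 6 test:

#include <stdlib.h>

struct percpu_scratch {
    void *spare_page;        /* stands in for percpu->spare_page */
    void *scribble;          /* stands in for percpu->scribble */
};

static void free_scratch(struct percpu_scratch *p)
{
    free(p->spare_page);
    free(p->scribble);
    p->spare_page = NULL;    /* reset so a second call is harmless */
    p->scribble = NULL;
}

static int alloc_scratch(struct percpu_scratch *p, int need_spare, size_t scribble_len)
{
    if (need_spare && !p->spare_page)
        p->spare_page = malloc(4096);
    if (!p->scribble)
        p->scribble = malloc(scribble_len);

    if (!p->scribble || (need_spare && !p->spare_page)) {
        free_scratch(p);     /* never leave a half-allocated pair behind */
        return -1;           /* -ENOMEM in the kernel */
    }
    return 0;
}

int main(void)
{
    struct percpu_scratch p = { NULL, NULL };

    if (alloc_scratch(&p, 1, 256))
        return 1;
    free_scratch(&p);
    return 0;
}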