path: root/drivers/md
author    Linus Walleij <linus.walleij@linaro.org>  2014-03-14 05:26:45 -0400
committer Linus Walleij <linus.walleij@linaro.org>  2014-03-14 05:26:45 -0400
commit    9e294427f6e427dbaf46140303acded06365f53c (patch)
tree      0669100cbd79fe8612463900171c98873d8dc454 /drivers/md
parent    23600969ff137cf4c3bc9098f77e381de334e3f7 (diff)
parent    fa389e220254c69ffae0d403eac4146171062d08 (diff)
Merge tag 'v3.14-rc6' into devel
Linux 3.14-rc6
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig | 10
-rw-r--r--  drivers/md/bcache/bcache.h | 4
-rw-r--r--  drivers/md/bcache/bset.c | 7
-rw-r--r--  drivers/md/bcache/btree.c | 4
-rw-r--r--  drivers/md/bcache/extents.c | 2
-rw-r--r--  drivers/md/bcache/request.c | 6
-rw-r--r--  drivers/md/bcache/sysfs.c | 2
-rw-r--r--  drivers/md/dm-cache-policy-mq.c | 4
-rw-r--r--  drivers/md/dm-cache-target.c | 13
-rw-r--r--  drivers/md/dm-io.c | 23
-rw-r--r--  drivers/md/dm-mpath.c | 7
-rw-r--r--  drivers/md/dm-raid1.c | 3
-rw-r--r--  drivers/md/dm-snap-persistent.c | 3
-rw-r--r--  drivers/md/dm-thin-metadata.c | 58
-rw-r--r--  drivers/md/dm-thin-metadata.h | 21
-rw-r--r--  drivers/md/dm-thin.c | 343
-rw-r--r--  drivers/md/persistent-data/Kconfig | 10
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c | 115
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.h | 11
-rw-r--r--  drivers/md/raid1.c | 13
-rw-r--r--  drivers/md/raid5.c | 90
21 files changed, 547 insertions, 202 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 9a06fe883766..95ad936e6048 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -254,16 +254,6 @@ config DM_THIN_PROVISIONING
 	---help---
 	  Provides thin provisioning and snapshots that share a data store.
 
-config DM_DEBUG_BLOCK_STACK_TRACING
-	boolean "Keep stack trace of persistent data block lock holders"
-	depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
-	select STACKTRACE
-	---help---
-	  Enable this for messages that may help debug problems with the
-	  block manager locking used by thin provisioning and caching.
-
-	  If unsure, say N.
-
 config DM_CACHE
 	tristate "Cache target (EXPERIMENTAL)"
 	depends on BLK_DEV_DM
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 0c707e4f4eaf..a4c7306ff43d 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -210,7 +210,9 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
 #define GC_MARK_RECLAIMABLE	0
 #define GC_MARK_DIRTY		1
 #define GC_MARK_METADATA	2
-BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13);
+#define GC_SECTORS_USED_SIZE	13
+#define MAX_GC_SECTORS_USED	(~(~0ULL << GC_SECTORS_USED_SIZE))
+BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
 BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
 
 #include "journal.h"
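The new MAX_GC_SECTORS_USED mask is what the later btree.c hunk clamps against, replacing the hard-coded (1 << 14) - 1 that did not fit the 13-bit field. A quick standalone check of the arithmetic (not part of the patch):

#include <assert.h>
#include <stdio.h>

#define GC_SECTORS_USED_SIZE	13
#define MAX_GC_SECTORS_USED	(~(~0ULL << GC_SECTORS_USED_SIZE))

int main(void)
{
	/* Lower 13 bits set: the largest value a 13-bit bitfield can hold. */
	assert(MAX_GC_SECTORS_USED == 8191);

	/* The old clamp value (1 << 14) - 1 = 16383 overflows 13 bits, so the min_t() was ineffective. */
	assert(((1 << 14) - 1) > MAX_GC_SECTORS_USED);

	printf("mask = %llu\n", MAX_GC_SECTORS_USED);
	return 0;
}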
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 4f6b5940e609..3f74b4b0747b 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -23,7 +23,7 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
 	for (k = i->start; k < bset_bkey_last(i); k = next) {
 		next = bkey_next(k);
 
-		printk(KERN_ERR "block %u key %zi/%u: ", set,
+		printk(KERN_ERR "block %u key %li/%u: ", set,
 		       (uint64_t *) k - i->d, i->keys);
 
 		if (b->ops->key_dump)
@@ -1185,9 +1185,12 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
 	struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
 						     order);
 	if (!out) {
+		struct page *outp;
+
 		BUG_ON(order > state->page_order);
 
-		out = page_address(mempool_alloc(state->pool, GFP_NOIO));
+		outp = mempool_alloc(state->pool, GFP_NOIO);
+		out = page_address(outp);
 		used_mempool = true;
 		order = state->page_order;
 	}
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 98cc0a810a36..5f9c2a665ca5 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1167,7 +1167,7 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
 		/* guard against overflow */
 		SET_GC_SECTORS_USED(g, min_t(unsigned,
 					     GC_SECTORS_USED(g) + KEY_SIZE(k),
-					     (1 << 14) - 1));
+					     MAX_GC_SECTORS_USED));
 
 		BUG_ON(!GC_SECTORS_USED(g));
 	}
@@ -1805,7 +1805,7 @@ static bool btree_insert_key(struct btree *b, struct bkey *k,
 
 static size_t insert_u64s_remaining(struct btree *b)
 {
-	ssize_t ret = bch_btree_keys_u64s_remaining(&b->keys);
+	long ret = bch_btree_keys_u64s_remaining(&b->keys);
 
 	/*
 	 * Might land in the middle of an existing extent and have to split it
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index c3ead586dc27..416d1a3e028e 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -194,7 +194,7 @@ err:
 	mutex_unlock(&b->c->bucket_lock);
 	bch_extent_to_text(buf, sizeof(buf), k);
 	btree_bug(b,
-"inconsistent btree pointer %s: bucket %li pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
+"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
 		  buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
 		  g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
 	return true;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 72cd213f213f..5d5d031cf381 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -353,14 +353,14 @@ static void bch_data_insert_start(struct closure *cl)
 	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 	struct bio *bio = op->bio, *n;
 
-	if (op->bypass)
-		return bch_data_invalidate(cl);
-
 	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
 		set_gc_sectors(op->c);
 		wake_up_gc(op->c);
 	}
 
+	if (op->bypass)
+		return bch_data_invalidate(cl);
+
 	/*
 	 * Journal writes are marked REQ_FLUSH; if the original write was a
 	 * flush, it'll wait on the journal write.
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index c6ab69333a6d..d8458d477a12 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -416,7 +416,7 @@ static int btree_bset_stats(struct btree_op *b_op, struct btree *b)
 	return MAP_CONTINUE;
 }
 
-int bch_bset_print_stats(struct cache_set *c, char *buf)
+static int bch_bset_print_stats(struct cache_set *c, char *buf)
 {
 	struct bset_stats_op op;
 	int ret;
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 1e018e986610..0e385e40909e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -872,7 +872,7 @@ static void mq_destroy(struct dm_cache_policy *p)
 {
 	struct mq_policy *mq = to_mq_policy(p);
 
-	kfree(mq->table);
+	vfree(mq->table);
 	epool_exit(&mq->cache_pool);
 	epool_exit(&mq->pre_cache_pool);
 	kfree(mq);
@@ -1245,7 +1245,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
 
 	mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
 	mq->hash_bits = ffs(mq->nr_buckets) - 1;
-	mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL);
+	mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
 	if (!mq->table)
 		goto bad_alloc_table;
 
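The kzalloc() to vzalloc() switch matters because the hash table scales with the cache size and a single physically contiguous allocation can fail for large caches. A rough sizing sketch with assumed numbers (the 1 TiB / 256 KiB figures below are illustrative, not from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t cache_bytes = 1ULL << 40;		/* assumed 1 TiB cache device */
	uint64_t block_bytes = 256 * 1024;		/* assumed 256 KiB cache block size */
	uint64_t nr_cblocks  = cache_bytes / block_bytes;	/* 4M cache blocks */

	/* mq_create() uses next_power(nr_cblocks / 2, 16) buckets; 2M is already a power of two. */
	uint64_t nr_buckets  = nr_cblocks / 2;
	uint64_t table_bytes = nr_buckets * sizeof(void *);	/* hash heads assumed pointer-sized */

	/* ~16 MiB on a 64-bit build: too large to rely on one contiguous kzalloc(), fine for vzalloc(). */
	printf("hash table needs %llu MiB\n", (unsigned long long)(table_bytes >> 20));
	return 0;
}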
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index ffd472e015ca..1af70145fab9 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -289,6 +289,7 @@ struct per_bio_data {
 	bool tick:1;
 	unsigned req_nr:2;
 	struct dm_deferred_entry *all_io_entry;
+	struct dm_hook_info hook_info;
 
 	/*
 	 * writethrough fields.  These MUST remain at the end of this
@@ -297,7 +298,6 @@ struct per_bio_data {
 	 */
 	struct cache *cache;
 	dm_cblock_t cblock;
-	struct dm_hook_info hook_info;
 	struct dm_bio_details bio_details;
 };
 
@@ -671,15 +671,16 @@ static void remap_to_cache(struct cache *cache, struct bio *bio,
 			  dm_cblock_t cblock)
 {
 	sector_t bi_sector = bio->bi_iter.bi_sector;
+	sector_t block = from_cblock(cblock);
 
 	bio->bi_bdev = cache->cache_dev->bdev;
 	if (!block_size_is_power_of_two(cache))
 		bio->bi_iter.bi_sector =
-			(from_cblock(cblock) * cache->sectors_per_block) +
+			(block * cache->sectors_per_block) +
 			sector_div(bi_sector, cache->sectors_per_block);
 	else
 		bio->bi_iter.bi_sector =
-			(from_cblock(cblock) << cache->sectors_per_block_shift) |
+			(block << cache->sectors_per_block_shift) |
 			(bi_sector & (cache->sectors_per_block - 1));
 }
 
@@ -1010,13 +1011,15 @@ static void overwrite_endio(struct bio *bio, int err)
 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 	unsigned long flags;
 
+	dm_unhook_bio(&pb->hook_info, bio);
+
 	if (err)
 		mg->err = true;
 
+	mg->requeue_holder = false;
+
 	spin_lock_irqsave(&cache->lock, flags);
 	list_add_tail(&mg->list, &cache->completed_migrations);
-	dm_unhook_bio(&pb->hook_info, bio);
-	mg->requeue_holder = false;
 	spin_unlock_irqrestore(&cache->lock, flags);
 
 	wake_worker(cache);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index b2b8a10e8427..3842ac738f98 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -201,29 +201,28 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse
 /*
  * Functions for getting the pages from a bvec.
  */
-static void bio_get_page(struct dpages *dp,
-			 struct page **p, unsigned long *len, unsigned *offset)
+static void bio_get_page(struct dpages *dp, struct page **p,
+			 unsigned long *len, unsigned *offset)
 {
-	struct bio *bio = dp->context_ptr;
-	struct bio_vec bvec = bio_iovec(bio);
-	*p = bvec.bv_page;
-	*len = bvec.bv_len;
-	*offset = bvec.bv_offset;
+	struct bio_vec *bvec = dp->context_ptr;
+	*p = bvec->bv_page;
+	*len = bvec->bv_len - dp->context_u;
+	*offset = bvec->bv_offset + dp->context_u;
 }
 
 static void bio_next_page(struct dpages *dp)
 {
-	struct bio *bio = dp->context_ptr;
-	struct bio_vec bvec = bio_iovec(bio);
-
-	bio_advance(bio, bvec.bv_len);
+	struct bio_vec *bvec = dp->context_ptr;
+	dp->context_ptr = bvec + 1;
+	dp->context_u = 0;
 }
 
 static void bio_dp_init(struct dpages *dp, struct bio *bio)
 {
 	dp->get_page = bio_get_page;
 	dp->next_page = bio_next_page;
-	dp->context_ptr = bio;
+	dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+	dp->context_u = bio->bi_iter.bi_bvec_done;
 }
 
 /*
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6eb9dc9ef8f3..422a9fdeb53e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1626,8 +1626,11 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 	/*
 	 * Only pass ioctls through if the device sizes match exactly.
 	 */
-	if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
-		r = scsi_verify_blk_ioctl(NULL, cmd);
+	if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) {
+		int err = scsi_verify_blk_ioctl(NULL, cmd);
+		if (err)
+			r = err;
+	}
 
 	if (r == -ENOTCONN && !fatal_signal_pending(current))
 		queue_work(kmultipathd, &m->process_queued_ios);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index f284e0bfb25f..7dfdb5c746d6 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1244,6 +1244,9 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 
 		dm_bio_restore(bd, bio);
 		bio_record->details.bi_bdev = NULL;
+
+		atomic_inc(&bio->bi_remaining);
+
 		queue_bio(ms, bio, rw);
 		return DM_ENDIO_INCOMPLETE;
 	}
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index afc3d017de4c..d6e88178d22c 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -546,6 +546,9 @@ static int read_exceptions(struct pstore *ps,
 		r = insert_exceptions(ps, area, callback, callback_context,
 				      &full);
 
+		if (!full)
+			memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT);
+
 		dm_bufio_release(bp);
 
 		dm_bufio_forget(client, chunk);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 7da347665552..fb9efc829182 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -76,7 +76,7 @@
 
 #define THIN_SUPERBLOCK_MAGIC 27022010
 #define THIN_SUPERBLOCK_LOCATION 0
-#define THIN_VERSION 1
+#define THIN_VERSION 2
 #define THIN_METADATA_CACHE_SIZE 64
 #define SECTOR_TO_BLOCK_SHIFT 3
 
@@ -483,7 +483,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
 
 	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
 	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
-	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
 	disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
 	disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
 
@@ -651,7 +651,7 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f
 {
 	int r;
 
-	pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE,
+	pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
 					  THIN_METADATA_CACHE_SIZE,
 					  THIN_MAX_CONCURRENT_LOCKS);
 	if (IS_ERR(pmd->bm)) {
@@ -1489,6 +1489,23 @@ bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
 	return r;
 }
 
+bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
+{
+	bool r = false;
+	struct dm_thin_device *td, *tmp;
+
+	down_read(&pmd->root_lock);
+	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
+		if (td->changed) {
+			r = td->changed;
+			break;
+		}
+	}
+	up_read(&pmd->root_lock);
+
+	return r;
+}
+
 bool dm_thin_aborted_changes(struct dm_thin_device *td)
 {
 	bool r;
@@ -1738,3 +1755,38 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 
 	return r;
 }
+
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
+{
+	int r;
+	struct dm_block *sblock;
+	struct thin_disk_superblock *disk_super;
+
+	down_write(&pmd->root_lock);
+	pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
+
+	r = superblock_lock(pmd, &sblock);
+	if (r) {
+		DMERR("couldn't read superblock");
+		goto out;
+	}
+
+	disk_super = dm_block_data(sblock);
+	disk_super->flags = cpu_to_le32(pmd->flags);
+
+	dm_bm_unlock(sblock);
+out:
+	up_write(&pmd->root_lock);
+	return r;
+}
+
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
+{
+	bool needs_check;
+
+	down_read(&pmd->root_lock);
+	needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
+	up_read(&pmd->root_lock);
+
+	return needs_check;
+}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 9a368567632f..e3c857db195a 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -9,16 +9,14 @@
 
 #include "persistent-data/dm-block-manager.h"
 #include "persistent-data/dm-space-map.h"
+#include "persistent-data/dm-space-map-metadata.h"
 
-#define THIN_METADATA_BLOCK_SIZE 4096
+#define THIN_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
 
 /*
  * The metadata device is currently limited in size.
- *
- * We have one block of index, which can hold 255 index entries.  Each
- * index entry contains allocation info about 16k metadata blocks.
  */
-#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
+#define THIN_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
 
 /*
  * A metadata device larger than 16GB triggers a warning.
@@ -27,6 +25,11 @@
 
 /*----------------------------------------------------------------*/
 
+/*
+ * Thin metadata superblock flags.
+ */
+#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0)
+
 struct dm_pool_metadata;
 struct dm_thin_device;
 
@@ -161,6 +164,8 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
  */
 bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
 
+bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd);
+
 bool dm_thin_aborted_changes(struct dm_thin_device *td);
 
 int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
@@ -202,6 +207,12 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 					dm_sm_threshold_fn fn,
 					void *context);
 
+/*
+ * Updates the superblock immediately.
+ */
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
+
 /*----------------------------------------------------------------*/
 
 #endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index faaf944597ab..be70d38745f7 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 struct dm_thin_new_mapping;
 
 /*
- * The pool runs in 3 modes.  Ordered in degraded order for comparisons.
+ * The pool runs in 4 modes.  Ordered in degraded order for comparisons.
  */
 enum pool_mode {
 	PM_WRITE,		/* metadata may be changed */
+	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */
 	PM_READ_ONLY,		/* metadata may not be changed */
 	PM_FAIL,		/* all I/O fails */
 };
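Keeping the enum in degraded order means callers can compare modes numerically instead of listing each one. A standalone illustration of that ordering (the helper below is made up for this note, not part of dm-thin.c):

#include <assert.h>
#include <stdbool.h>

/* Mirror of the pool modes, in the same degraded order as dm-thin.c. */
enum pool_mode {
	PM_WRITE,
	PM_OUT_OF_DATA_SPACE,
	PM_READ_ONLY,
	PM_FAIL,
};

/* Numeric comparison works only because the enum is declared in degraded order. */
static bool mode_is_degraded(enum pool_mode m)
{
	return m > PM_WRITE;
}

int main(void)
{
	assert(!mode_is_degraded(PM_WRITE));
	assert(mode_is_degraded(PM_OUT_OF_DATA_SPACE));
	assert(mode_is_degraded(PM_FAIL));
	return 0;
}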
@@ -198,7 +199,6 @@ struct pool {
 };
 
 static enum pool_mode get_pool_mode(struct pool *pool);
-static void out_of_data_space(struct pool *pool);
 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
 
 /*
@@ -226,6 +226,7 @@ struct thin_c {
 
 	struct pool *pool;
 	struct dm_thin_device *td;
+	bool requeue_mode:1;
 };
 
 /*----------------------------------------------------------------*/
@@ -369,14 +370,18 @@ struct dm_thin_endio_hook {
 	struct dm_thin_new_mapping *overwrite_mapping;
 };
 
-static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
+static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 {
 	struct bio *bio;
 	struct bio_list bios;
+	unsigned long flags;
 
 	bio_list_init(&bios);
+
+	spin_lock_irqsave(&tc->pool->lock, flags);
 	bio_list_merge(&bios, master);
 	bio_list_init(master);
+	spin_unlock_irqrestore(&tc->pool->lock, flags);
 
 	while ((bio = bio_list_pop(&bios))) {
 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -391,12 +396,26 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 static void requeue_io(struct thin_c *tc)
 {
 	struct pool *pool = tc->pool;
+
+	requeue_bio_list(tc, &pool->deferred_bios);
+	requeue_bio_list(tc, &pool->retry_on_resume_list);
+}
+
+static void error_retry_list(struct pool *pool)
+{
+	struct bio *bio;
 	unsigned long flags;
+	struct bio_list bios;
+
+	bio_list_init(&bios);
 
 	spin_lock_irqsave(&pool->lock, flags);
-	__requeue_bio_list(tc, &pool->deferred_bios);
-	__requeue_bio_list(tc, &pool->retry_on_resume_list);
+	bio_list_merge(&bios, &pool->retry_on_resume_list);
+	bio_list_init(&pool->retry_on_resume_list);
 	spin_unlock_irqrestore(&pool->lock, flags);
+
+	while ((bio = bio_list_pop(&bios)))
+		bio_io_error(bio);
 }
 
 /*
@@ -925,13 +944,15 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
 	}
 }
 
+static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
+
 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 {
 	int r;
 	dm_block_t free_blocks;
 	struct pool *pool = tc->pool;
 
-	if (get_pool_mode(pool) != PM_WRITE)
+	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
 		return -EINVAL;
 
 	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
@@ -958,7 +979,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		}
 
 		if (!free_blocks) {
-			out_of_data_space(pool);
+			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
 			return -ENOSPC;
 		}
 	}
@@ -988,15 +1009,32 @@ static void retry_on_resume(struct bio *bio)
 	spin_unlock_irqrestore(&pool->lock, flags);
 }
 
-static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+static bool should_error_unserviceable_bio(struct pool *pool)
 {
-	/*
-	 * When pool is read-only, no cell locking is needed because
-	 * nothing is changing.
-	 */
-	WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
+	enum pool_mode m = get_pool_mode(pool);
 
-	if (pool->pf.error_if_no_space)
+	switch (m) {
+	case PM_WRITE:
+		/* Shouldn't get here */
+		DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
+		return true;
+
+	case PM_OUT_OF_DATA_SPACE:
+		return pool->pf.error_if_no_space;
+
+	case PM_READ_ONLY:
+	case PM_FAIL:
+		return true;
+	default:
+		/* Shouldn't get here */
+		DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
+		return true;
+	}
+}
+
+static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+{
+	if (should_error_unserviceable_bio(pool))
 		bio_io_error(bio);
 	else
 		retry_on_resume(bio);
@@ -1007,11 +1045,20 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
 	struct bio *bio;
 	struct bio_list bios;
 
+	if (should_error_unserviceable_bio(pool)) {
+		cell_error(pool, cell);
+		return;
+	}
+
 	bio_list_init(&bios);
 	cell_release(pool, cell, &bios);
 
-	while ((bio = bio_list_pop(&bios)))
-		handle_unserviceable_bio(pool, bio);
+	if (should_error_unserviceable_bio(pool))
+		while ((bio = bio_list_pop(&bios)))
+			bio_io_error(bio);
+	else
+		while ((bio = bio_list_pop(&bios)))
+			retry_on_resume(bio);
 }
 
 static void process_discard(struct thin_c *tc, struct bio *bio)
@@ -1296,6 +1343,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
 	}
 }
 
+static void process_bio_success(struct thin_c *tc, struct bio *bio)
+{
+	bio_endio(bio, 0);
+}
+
 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
 {
 	bio_io_error(bio);
@@ -1328,6 +1380,11 @@ static void process_deferred_bios(struct pool *pool)
 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 		struct thin_c *tc = h->tc;
 
+		if (tc->requeue_mode) {
+			bio_endio(bio, DM_ENDIO_REQUEUE);
+			continue;
+		}
+
 		/*
 		 * If we've got no free new_mapping structs, and processing
 		 * this bio might require one, we pause until there are some
@@ -1357,7 +1414,8 @@ static void process_deferred_bios(struct pool *pool)
 	bio_list_init(&pool->deferred_flush_bios);
 	spin_unlock_irqrestore(&pool->lock, flags);
 
-	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
+	if (bio_list_empty(&bios) &&
+	    !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
 		return;
 
 	if (commit(pool)) {
@@ -1393,51 +1451,134 @@ static void do_waker(struct work_struct *ws)
 
 /*----------------------------------------------------------------*/
 
+struct noflush_work {
+	struct work_struct worker;
+	struct thin_c *tc;
+
+	atomic_t complete;
+	wait_queue_head_t wait;
+};
+
+static void complete_noflush_work(struct noflush_work *w)
+{
+	atomic_set(&w->complete, 1);
+	wake_up(&w->wait);
+}
+
+static void do_noflush_start(struct work_struct *ws)
+{
+	struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+	w->tc->requeue_mode = true;
+	requeue_io(w->tc);
+	complete_noflush_work(w);
+}
+
+static void do_noflush_stop(struct work_struct *ws)
+{
+	struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+	w->tc->requeue_mode = false;
+	complete_noflush_work(w);
+}
+
+static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
+{
+	struct noflush_work w;
+
+	INIT_WORK(&w.worker, fn);
+	w.tc = tc;
+	atomic_set(&w.complete, 0);
+	init_waitqueue_head(&w.wait);
+
+	queue_work(tc->pool->wq, &w.worker);
+
+	wait_event(w.wait, atomic_read(&w.complete));
+}
+
+/*----------------------------------------------------------------*/
+
 static enum pool_mode get_pool_mode(struct pool *pool)
 {
 	return pool->pf.mode;
 }
 
+static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
+{
+	dm_table_event(pool->ti->table);
+	DMINFO("%s: switching pool to %s mode",
+	       dm_device_name(pool->pool_md), new_mode);
+}
+
 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 {
-	int r;
-	enum pool_mode old_mode = pool->pf.mode;
+	struct pool_c *pt = pool->ti->private;
+	bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
+	enum pool_mode old_mode = get_pool_mode(pool);
+
+	/*
+	 * Never allow the pool to transition to PM_WRITE mode if user
+	 * intervention is required to verify metadata and data consistency.
+	 */
+	if (new_mode == PM_WRITE && needs_check) {
+		DMERR("%s: unable to switch pool to write mode until repaired.",
+		      dm_device_name(pool->pool_md));
+		if (old_mode != new_mode)
+			new_mode = old_mode;
+		else
+			new_mode = PM_READ_ONLY;
+	}
+	/*
+	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
+	 * not going to recover without a thin_repair.  So we never let the
+	 * pool move out of the old mode.
+	 */
+	if (old_mode == PM_FAIL)
+		new_mode = old_mode;
 
 	switch (new_mode) {
 	case PM_FAIL:
 		if (old_mode != new_mode)
-			DMERR("%s: switching pool to failure mode",
-			      dm_device_name(pool->pool_md));
+			notify_of_pool_mode_change(pool, "failure");
 		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_fail;
 		pool->process_discard = process_bio_fail;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
 		pool->process_prepared_discard = process_prepared_discard_fail;
+
+		error_retry_list(pool);
 		break;
 
 	case PM_READ_ONLY:
 		if (old_mode != new_mode)
-			DMERR("%s: switching pool to read-only mode",
-			      dm_device_name(pool->pool_md));
-		r = dm_pool_abort_metadata(pool->pmd);
-		if (r) {
-			DMERR("%s: aborting transaction failed",
-			      dm_device_name(pool->pool_md));
-			new_mode = PM_FAIL;
-			set_pool_mode(pool, new_mode);
-		} else {
-			dm_pool_metadata_read_only(pool->pmd);
-			pool->process_bio = process_bio_read_only;
-			pool->process_discard = process_discard;
-			pool->process_prepared_mapping = process_prepared_mapping_fail;
-			pool->process_prepared_discard = process_prepared_discard_passdown;
-		}
+			notify_of_pool_mode_change(pool, "read-only");
+		dm_pool_metadata_read_only(pool->pmd);
+		pool->process_bio = process_bio_read_only;
+		pool->process_discard = process_bio_success;
+		pool->process_prepared_mapping = process_prepared_mapping_fail;
+		pool->process_prepared_discard = process_prepared_discard_passdown;
+
+		error_retry_list(pool);
+		break;
+
+	case PM_OUT_OF_DATA_SPACE:
+		/*
+		 * Ideally we'd never hit this state; the low water mark
+		 * would trigger userland to extend the pool before we
+		 * completely run out of data space.  However, many small
+		 * IOs to unprovisioned space can consume data space at an
+		 * alarming rate.  Adjust your low water mark if you're
+		 * frequently seeing this mode.
+		 */
+		if (old_mode != new_mode)
+			notify_of_pool_mode_change(pool, "out-of-data-space");
+		pool->process_bio = process_bio_read_only;
+		pool->process_discard = process_discard;
+		pool->process_prepared_mapping = process_prepared_mapping;
+		pool->process_prepared_discard = process_prepared_discard_passdown;
 		break;
 
 	case PM_WRITE:
 		if (old_mode != new_mode)
-			DMINFO("%s: switching pool to write mode",
-			       dm_device_name(pool->pool_md));
+			notify_of_pool_mode_change(pool, "write");
 		dm_pool_metadata_read_write(pool->pmd);
 		pool->process_bio = process_bio;
 		pool->process_discard = process_discard;
@@ -1447,32 +1588,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 	}
 
 	pool->pf.mode = new_mode;
+	/*
+	 * The pool mode may have changed, sync it so bind_control_target()
+	 * doesn't cause an unexpected mode transition on resume.
+	 */
+	pt->adjusted_pf.mode = new_mode;
 }
 
-/*
- * Rather than calling set_pool_mode directly, use these which describe the
- * reason for mode degradation.
- */
-static void out_of_data_space(struct pool *pool)
+static void abort_transaction(struct pool *pool)
 {
-	DMERR_LIMIT("%s: no free data space available.",
-		    dm_device_name(pool->pool_md));
-	set_pool_mode(pool, PM_READ_ONLY);
+	const char *dev_name = dm_device_name(pool->pool_md);
+
+	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+	if (dm_pool_abort_metadata(pool->pmd)) {
+		DMERR("%s: failed to abort metadata transaction", dev_name);
+		set_pool_mode(pool, PM_FAIL);
+	}
+
+	if (dm_pool_metadata_set_needs_check(pool->pmd)) {
+		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+		set_pool_mode(pool, PM_FAIL);
+	}
 }
 
 static void metadata_operation_failed(struct pool *pool, const char *op, int r)
 {
-	dm_block_t free_blocks;
-
 	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
 		    dm_device_name(pool->pool_md), op, r);
 
-	if (r == -ENOSPC &&
-	    !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
-	    !free_blocks)
-		DMERR_LIMIT("%s: no free metadata space available.",
-			    dm_device_name(pool->pool_md));
-
+	abort_transaction(pool);
 	set_pool_mode(pool, PM_READ_ONLY);
 }
 
@@ -1523,6 +1667,11 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 
 	thin_hook_bio(tc, bio);
 
+	if (tc->requeue_mode) {
+		bio_endio(bio, DM_ENDIO_REQUEUE);
+		return DM_MAPIO_SUBMITTED;
+	}
+
 	if (get_pool_mode(tc->pool) == PM_FAIL) {
 		bio_io_error(bio);
 		return DM_MAPIO_SUBMITTED;
@@ -1686,7 +1835,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	/*
 	 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
 	 */
-	enum pool_mode old_mode = pool->pf.mode;
+	enum pool_mode old_mode = get_pool_mode(pool);
 	enum pool_mode new_mode = pt->adjusted_pf.mode;
 
 	/*
@@ -1700,16 +1849,6 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	pool->pf = pt->adjusted_pf;
 	pool->low_water_blocks = pt->low_water_blocks;
 
-	/*
-	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
-	 * not going to recover without a thin_repair.  So we never let the
-	 * pool move out of the old mode.  On the other hand a PM_READ_ONLY
-	 * may have been due to a lack of metadata or data space, and may
-	 * now work (ie. if the underlying devices have been resized).
-	 */
-	if (old_mode == PM_FAIL)
-		new_mode = old_mode;
-
 	set_pool_mode(pool, new_mode);
 
 	return 0;
@@ -1999,16 +2138,27 @@ static void metadata_low_callback(void *context)
 	dm_table_event(pool->ti->table);
 }
 
-static sector_t get_metadata_dev_size(struct block_device *bdev)
+static sector_t get_dev_size(struct block_device *bdev)
+{
+	return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+static void warn_if_metadata_device_too_big(struct block_device *bdev)
 {
-	sector_t metadata_dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+	sector_t metadata_dev_size = get_dev_size(bdev);
 	char buffer[BDEVNAME_SIZE];
 
-	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) {
+	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
 		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
 		       bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
-		metadata_dev_size = THIN_METADATA_MAX_SECTORS_WARNING;
-	}
+}
+
+static sector_t get_metadata_dev_size(struct block_device *bdev)
+{
+	sector_t metadata_dev_size = get_dev_size(bdev);
+
+	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
+		metadata_dev_size = THIN_METADATA_MAX_SECTORS;
 
 	return metadata_dev_size;
 }
@@ -2017,7 +2167,7 @@ static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
 {
 	sector_t metadata_dev_size = get_metadata_dev_size(bdev);
 
-	sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+	sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
 
 	return metadata_dev_size;
 }
@@ -2095,12 +2245,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		ti->error = "Error opening metadata block device";
 		goto out_unlock;
 	}
-
-	/*
-	 * Run for the side-effect of possibly issuing a warning if the
-	 * device is too big.
-	 */
-	(void) get_metadata_dev_size(metadata_dev->bdev);
+	warn_if_metadata_device_too_big(metadata_dev->bdev);
 
 	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
 	if (r) {
@@ -2246,6 +2391,12 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
 		return -EINVAL;
 
 	} else if (data_size > sb_data_size) {
+		if (dm_pool_metadata_needs_check(pool->pmd)) {
+			DMERR("%s: unable to grow the data device until repaired.",
+			      dm_device_name(pool->pool_md));
+			return 0;
+		}
+
 		if (sb_data_size)
 			DMINFO("%s: growing the data device from %llu to %llu blocks",
 			       dm_device_name(pool->pool_md),
@@ -2287,6 +2438,13 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
 		return -EINVAL;
 
 	} else if (metadata_dev_size > sb_metadata_dev_size) {
+		if (dm_pool_metadata_needs_check(pool->pmd)) {
+			DMERR("%s: unable to grow the metadata device until repaired.",
+			      dm_device_name(pool->pool_md));
+			return 0;
+		}
+
+		warn_if_metadata_device_too_big(pool->md_dev);
 		DMINFO("%s: growing the metadata device from %llu to %llu blocks",
 		       dm_device_name(pool->pool_md),
 		       sb_metadata_dev_size, metadata_dev_size);
@@ -2673,7 +2831,9 @@ static void pool_status(struct dm_target *ti, status_type_t type,
 	else
 		DMEMIT("- ");
 
-	if (pool->pf.mode == PM_READ_ONLY)
+	if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
+		DMEMIT("out_of_data_space ");
+	else if (pool->pf.mode == PM_READ_ONLY)
 		DMEMIT("ro ");
 	else
 		DMEMIT("rw ");
@@ -2787,7 +2947,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 10, 0},
+	.version = {1, 11, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -2894,6 +3054,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	if (get_pool_mode(tc->pool) == PM_FAIL) {
 		ti->error = "Couldn't open thin device, Pool is in fail mode";
+		r = -EINVAL;
 		goto bad_thin_open;
 	}
 
@@ -2905,7 +3066,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
 	if (r)
-		goto bad_thin_open;
+		goto bad_target_max_io_len;
 
 	ti->num_flush_bios = 1;
 	ti->flush_supported = true;
@@ -2926,6 +3087,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	return 0;
 
+bad_target_max_io_len:
+	dm_pool_close_thin_device(tc->td);
 bad_thin_open:
 	__pool_dec(tc->pool);
 bad_pool_lookup:
@@ -2986,10 +3149,23 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
 	return 0;
 }
 
-static void thin_postsuspend(struct dm_target *ti)
+static void thin_presuspend(struct dm_target *ti)
 {
+	struct thin_c *tc = ti->private;
+
 	if (dm_noflush_suspending(ti))
-		requeue_io((struct thin_c *)ti->private);
+		noflush_work(tc, do_noflush_start);
+}
+
+static void thin_postsuspend(struct dm_target *ti)
+{
+	struct thin_c *tc = ti->private;
+
+	/*
+	 * The dm_noflush_suspending flag has been cleared by now, so
+	 * unfortunately we must always run this.
+	 */
+	noflush_work(tc, do_noflush_stop);
 }
 
 /*
@@ -3074,12 +3250,13 @@ static int thin_iterate_devices(struct dm_target *ti,
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 10, 0},
+	.version = {1, 11, 0},
 	.module = THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
 	.map = thin_map,
 	.end_io = thin_endio,
+	.presuspend = thin_presuspend,
 	.postsuspend = thin_postsuspend,
 	.status = thin_status,
 	.iterate_devices = thin_iterate_devices,
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index 19b268795415..0c2dec7aec20 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -6,3 +6,13 @@ config DM_PERSISTENT_DATA
 	---help---
 	  Library providing immutable on-disk data structure support for
 	  device-mapper targets such as the thin provisioning target.
+
+config DM_DEBUG_BLOCK_STACK_TRACING
+	boolean "Keep stack trace of persistent data block lock holders"
+	depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
+	select STACKTRACE
+	---help---
+	  Enable this for messages that may help debug problems with the
+	  block manager locking used by thin provisioning and caching.
+
+	  If unsure, say N.
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 536782e3bcb7..786b689bdfc7 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -91,6 +91,69 @@ struct block_op {
 	dm_block_t block;
 };
 
+struct bop_ring_buffer {
+	unsigned begin;
+	unsigned end;
+	struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
+};
+
+static void brb_init(struct bop_ring_buffer *brb)
+{
+	brb->begin = 0;
+	brb->end = 0;
+}
+
+static bool brb_empty(struct bop_ring_buffer *brb)
+{
+	return brb->begin == brb->end;
+}
+
+static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
+{
+	unsigned r = old + 1;
+	return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
+}
+
+static int brb_push(struct bop_ring_buffer *brb,
+		    enum block_op_type type, dm_block_t b)
+{
+	struct block_op *bop;
+	unsigned next = brb_next(brb, brb->end);
+
+	/*
+	 * We don't allow the last bop to be filled, this way we can
+	 * differentiate between full and empty.
+	 */
+	if (next == brb->begin)
+		return -ENOMEM;
+
+	bop = brb->bops + brb->end;
+	bop->type = type;
+	bop->block = b;
+
+	brb->end = next;
+
+	return 0;
+}
+
+static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
+{
+	struct block_op *bop;
+
+	if (brb_empty(brb))
+		return -ENODATA;
+
+	bop = brb->bops + brb->begin;
+	result->type = bop->type;
+	result->block = bop->block;
+
+	brb->begin = brb_next(brb, brb->begin);
+
+	return 0;
+}
+
+/*----------------------------------------------------------------*/
+
 struct sm_metadata {
 	struct dm_space_map sm;
 
@@ -101,25 +164,20 @@ struct sm_metadata {
 
 	unsigned recursion_count;
 	unsigned allocated_this_transaction;
-	unsigned nr_uncommitted;
-	struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
+	struct bop_ring_buffer uncommitted;
 
 	struct threshold threshold;
 };
 
 static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
 {
-	struct block_op *op;
+	int r = brb_push(&smm->uncommitted, type, b);
 
-	if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) {
+	if (r) {
 		DMERR("too many recursive allocations");
 		return -ENOMEM;
 	}
 
-	op = smm->uncommitted + smm->nr_uncommitted++;
-	op->type = type;
-	op->block = b;
-
 	return 0;
 }
 
@@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm)
 		return -ENOMEM;
 	}
 
-	if (smm->recursion_count == 1 && smm->nr_uncommitted) {
-		while (smm->nr_uncommitted && !r) {
-			smm->nr_uncommitted--;
-			r = commit_bop(smm, smm->uncommitted +
-				       smm->nr_uncommitted);
+	if (smm->recursion_count == 1) {
+		while (!brb_empty(&smm->uncommitted)) {
+			struct block_op bop;
+
+			r = brb_pop(&smm->uncommitted, &bop);
+			if (r) {
+				DMERR("bug in bop ring buffer");
+				break;
+			}
+
+			r = commit_bop(smm, &bop);
 			if (r)
 				break;
 		}
@@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
 static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 				 uint32_t *result)
 {
-	int r, i;
+	int r;
+	unsigned i;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 	unsigned adjustment = 0;
 
@@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 	 * We may have some uncommitted adjustments to add.  This list
 	 * should always be really short.
 	 */
-	for (i = 0; i < smm->nr_uncommitted; i++) {
-		struct block_op *op = smm->uncommitted + i;
+	for (i = smm->uncommitted.begin;
+	     i != smm->uncommitted.end;
+	     i = brb_next(&smm->uncommitted, i)) {
+		struct block_op *op = smm->uncommitted.bops + i;
 
 		if (op->block != b)
 			continue;
@@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
 					      dm_block_t b, int *result)
 {
-	int r, i, adjustment = 0;
+	int r, adjustment = 0;
+	unsigned i;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 	uint32_t rc;
 
@@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
 	 * We may have some uncommitted adjustments to add.  This list
 	 * should always be really short.
 	 */
-	for (i = 0; i < smm->nr_uncommitted; i++) {
-		struct block_op *op = smm->uncommitted + i;
+	for (i = smm->uncommitted.begin;
+	     i != smm->uncommitted.end;
+	     i = brb_next(&smm->uncommitted, i)) {
+
+		struct block_op *op = smm->uncommitted.bops + i;
 
 		if (op->block != b)
 			continue;
@@ -671,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
 	smm->begin = superblock + 1;
 	smm->recursion_count = 0;
 	smm->allocated_this_transaction = 0;
-	smm->nr_uncommitted = 0;
+	brb_init(&smm->uncommitted);
 	threshold_init(&smm->threshold);
 
 	memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
@@ -680,6 +751,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
 	if (r)
 		return r;
 
+	if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS)
+		nr_blocks = DM_SM_METADATA_MAX_BLOCKS;
 	r = sm_ll_extend(&smm->ll, nr_blocks);
 	if (r)
 		return r;
@@ -713,7 +786,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
 	smm->begin = 0;
 	smm->recursion_count = 0;
 	smm->allocated_this_transaction = 0;
-	smm->nr_uncommitted = 0;
+	brb_init(&smm->uncommitted);
 	threshold_init(&smm->threshold);
 
 	memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
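The ring buffer replaces the fixed array plus counter, and it deliberately leaves one slot unused so that begin == end always means empty. A standalone sketch of the same begin/end discipline (userspace mock-up for this note, not the kernel code):

#include <assert.h>
#include <stdbool.h>

#define SLOTS 5	/* stands in for MAX_RECURSIVE_ALLOCATIONS + 1 */

struct ring {
	unsigned begin, end;
	int items[SLOTS];
};

static unsigned ring_next(unsigned i)
{
	return (i + 1 >= SLOTS) ? 0 : i + 1;
}

static bool ring_push(struct ring *r, int v)
{
	unsigned next = ring_next(r->end);

	if (next == r->begin)		/* one slot is always left empty */
		return false;
	r->items[r->end] = v;
	r->end = next;
	return true;
}

static bool ring_pop(struct ring *r, int *v)
{
	if (r->begin == r->end)
		return false;
	*v = r->items[r->begin];
	r->begin = ring_next(r->begin);
	return true;
}

int main(void)
{
	struct ring r = { 0, 0, { 0 } };
	int i, v;

	/* Usable capacity is SLOTS - 1, exactly as with the kernel's bop_ring_buffer. */
	for (i = 0; i < SLOTS - 1; i++)
		assert(ring_push(&r, i));
	assert(!ring_push(&r, 99));

	assert(ring_pop(&r, &v) && v == 0);
	return 0;
}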
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h
index 39bba0801cf2..64df923974d8 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.h
+++ b/drivers/md/persistent-data/dm-space-map-metadata.h
@@ -9,6 +9,17 @@
9 9
10#include "dm-transaction-manager.h" 10#include "dm-transaction-manager.h"
11 11
12#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT)
13
14/*
15 * The metadata device is currently limited in size.
16 *
17 * We have one block of index, which can hold 255 index entries. Each
18 * index entry contains allocation info about ~16k metadata blocks.
19 */
20#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64))
21#define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE)
22
12/* 23/*
13 * Unfortunately we have to use two-phase construction due to the cycle 24 * Unfortunately we have to use two-phase construction due to the cycle
14 * between the tm and sm. 25 * between the tm and sm.
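The arithmetic behind the two new macros: with 512-byte sectors (SECTOR_SHIFT == 9) a 4096-byte metadata block spans 8 sectors, one index entry covers (1 << 14) - 64 = 16320 blocks, and 255 entries give 4,161,600 blocks, i.e. roughly 15.9 GiB of metadata. dm_sm_metadata_create() in the hunk above now clamps nr_blocks to this ceiling rather than extending past what a single index block can describe. A small user-space sketch of the numbers (purely illustrative, not kernel code):

#include <stdio.h>
#include <stdint.h>

#define SECTOR_SHIFT 9
#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT)		/* 8 sectors per metadata block */
#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64))		/* 255 index entries * 16320 blocks */
#define DM_SM_METADATA_MAX_SECTORS \
	((uint64_t)DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE)

int main(void)
{
	/* 255 * 16320 = 4,161,600 blocks of 4 KiB => ~15.9 GiB of metadata */
	printf("max blocks:  %d\n", DM_SM_METADATA_MAX_BLOCKS);
	printf("max sectors: %llu\n", (unsigned long long)DM_SM_METADATA_MAX_SECTORS);
	printf("max size:    %.1f GiB\n",
	       DM_SM_METADATA_MAX_SECTORS * 512.0 / (1ULL << 30));
	return 0;
}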
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fd3a2a14b587..4a6ca1cb2e78 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1953,11 +1953,15 @@ static int process_checks(struct r1bio *r1_bio)
1953 for (i = 0; i < conf->raid_disks * 2; i++) { 1953 for (i = 0; i < conf->raid_disks * 2; i++) {
1954 int j; 1954 int j;
1955 int size; 1955 int size;
1956 int uptodate;
1956 struct bio *b = r1_bio->bios[i]; 1957 struct bio *b = r1_bio->bios[i];
1957 if (b->bi_end_io != end_sync_read) 1958 if (b->bi_end_io != end_sync_read)
1958 continue; 1959 continue;
1959 /* fixup the bio for reuse */ 1960 /* fixup the bio for reuse, but preserve BIO_UPTODATE */
1961 uptodate = test_bit(BIO_UPTODATE, &b->bi_flags);
1960 bio_reset(b); 1962 bio_reset(b);
1963 if (!uptodate)
1964 clear_bit(BIO_UPTODATE, &b->bi_flags);
1961 b->bi_vcnt = vcnt; 1965 b->bi_vcnt = vcnt;
1962 b->bi_iter.bi_size = r1_bio->sectors << 9; 1966 b->bi_iter.bi_size = r1_bio->sectors << 9;
1963 b->bi_iter.bi_sector = r1_bio->sector + 1967 b->bi_iter.bi_sector = r1_bio->sector +
@@ -1990,11 +1994,14 @@ static int process_checks(struct r1bio *r1_bio)
1990 int j; 1994 int j;
1991 struct bio *pbio = r1_bio->bios[primary]; 1995 struct bio *pbio = r1_bio->bios[primary];
1992 struct bio *sbio = r1_bio->bios[i]; 1996 struct bio *sbio = r1_bio->bios[i];
1997 int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags);
1993 1998
1994 if (sbio->bi_end_io != end_sync_read) 1999 if (sbio->bi_end_io != end_sync_read)
1995 continue; 2000 continue;
2001 /* Now we can 'fixup' the BIO_UPTODATE flag */
2002 set_bit(BIO_UPTODATE, &sbio->bi_flags);
1996 2003
1997 if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { 2004 if (uptodate) {
1998 for (j = vcnt; j-- ; ) { 2005 for (j = vcnt; j-- ; ) {
1999 struct page *p, *s; 2006 struct page *p, *s;
2000 p = pbio->bi_io_vec[j].bv_page; 2007 p = pbio->bi_io_vec[j].bv_page;
@@ -2009,7 +2016,7 @@ static int process_checks(struct r1bio *r1_bio)
2009 if (j >= 0) 2016 if (j >= 0)
2010 atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); 2017 atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
2011 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) 2018 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
2012 && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { 2019 && uptodate)) {
2013 /* No need to write to this device. */ 2020 /* No need to write to this device. */
2014 sbio->bi_end_io = NULL; 2021 sbio->bi_end_io = NULL;
2015 rdev_dec_pending(conf->mirrors[i].rdev, mddev); 2022 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
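The raid1 change exists because bio_reset() reinitialises bi_flags and leaves the bio marked up to date, so an error recorded in BIO_UPTODATE by end_sync_read would be lost before the comparison loop that decides which legs need rewriting. The patch saves the flag before the reset, restores a failure afterwards, and then consults the saved value rather than the live flag. A minimal sketch of that save-and-restore pattern, with the rest of the resync bookkeeping omitted (the helper name here is invented for illustration):

#include <linux/bio.h>

/* Sketch: keep a sync read's result across bio_reset(), as process_checks()
 * now does for each member bio of the r1_bio. */
static void fixup_sync_read_bio(struct bio *b, int vcnt, sector_t nr_sectors,
				sector_t first_sector)
{
	/* remember whether end_sync_read saw an error on this leg */
	int uptodate = test_bit(BIO_UPTODATE, &b->bi_flags);

	bio_reset(b);				/* re-marks the bio up to date */
	if (!uptodate)
		clear_bit(BIO_UPTODATE, &b->bi_flags);	/* restore the recorded failure */

	b->bi_vcnt = vcnt;
	b->bi_iter.bi_size = nr_sectors << 9;
	b->bi_iter.bi_sector = first_sector;
}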
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f1feadeb7bb2..16f5c21963db 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5514,23 +5514,43 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
5514 return sectors * (raid_disks - conf->max_degraded); 5514 return sectors * (raid_disks - conf->max_degraded);
5515} 5515}
5516 5516
5517static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
5518{
5519 safe_put_page(percpu->spare_page);
5520 kfree(percpu->scribble);
5521 percpu->spare_page = NULL;
5522 percpu->scribble = NULL;
5523}
5524
5525static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
5526{
5527 if (conf->level == 6 && !percpu->spare_page)
5528 percpu->spare_page = alloc_page(GFP_KERNEL);
5529 if (!percpu->scribble)
5530 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
5531
5532 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
5533 free_scratch_buffer(conf, percpu);
5534 return -ENOMEM;
5535 }
5536
5537 return 0;
5538}
5539
5517static void raid5_free_percpu(struct r5conf *conf) 5540static void raid5_free_percpu(struct r5conf *conf)
5518{ 5541{
5519 struct raid5_percpu *percpu;
5520 unsigned long cpu; 5542 unsigned long cpu;
5521 5543
5522 if (!conf->percpu) 5544 if (!conf->percpu)
5523 return; 5545 return;
5524 5546
5525 get_online_cpus();
5526 for_each_possible_cpu(cpu) {
5527 percpu = per_cpu_ptr(conf->percpu, cpu);
5528 safe_put_page(percpu->spare_page);
5529 kfree(percpu->scribble);
5530 }
5531#ifdef CONFIG_HOTPLUG_CPU 5547#ifdef CONFIG_HOTPLUG_CPU
5532 unregister_cpu_notifier(&conf->cpu_notify); 5548 unregister_cpu_notifier(&conf->cpu_notify);
5533#endif 5549#endif
5550
5551 get_online_cpus();
5552 for_each_possible_cpu(cpu)
5553 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
5534 put_online_cpus(); 5554 put_online_cpus();
5535 5555
5536 free_percpu(conf->percpu); 5556 free_percpu(conf->percpu);
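free_scratch_buffer() and alloc_scratch_buffer() replace three hand-rolled copies of the same allocate/clean-up logic (initial setup, CPU_UP_PREPARE, CPU_DEAD). The free side tolerates NULL pointers and NULLs them afterwards, so the same helper serves as the allocator's error path and as the hotplug teardown; raid5_free_percpu() also now unregisters the notifier before freeing, so a racing CPU_UP_PREPARE cannot repopulate buffers that are being torn down. The two helpers, reproduced from the hunk above with explanatory comments added:

static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
{
	safe_put_page(percpu->spare_page);	/* safe_put_page() ignores NULL */
	kfree(percpu->scribble);		/* kfree() ignores NULL */
	percpu->spare_page = NULL;		/* make repeated calls harmless */
	percpu->scribble = NULL;
}

static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
{
	/* the spare page is only needed for RAID6 (two-failure) recovery */
	if (conf->level == 6 && !percpu->spare_page)
		percpu->spare_page = alloc_page(GFP_KERNEL);
	if (!percpu->scribble)
		percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);

	/* all-or-nothing: undo any partial allocation before reporting -ENOMEM */
	if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
		free_scratch_buffer(conf, percpu);
		return -ENOMEM;
	}
	return 0;
}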
@@ -5557,15 +5577,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
5557 switch (action) { 5577 switch (action) {
5558 case CPU_UP_PREPARE: 5578 case CPU_UP_PREPARE:
5559 case CPU_UP_PREPARE_FROZEN: 5579 case CPU_UP_PREPARE_FROZEN:
5560 if (conf->level == 6 && !percpu->spare_page) 5580 if (alloc_scratch_buffer(conf, percpu)) {
5561 percpu->spare_page = alloc_page(GFP_KERNEL);
5562 if (!percpu->scribble)
5563 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
5564
5565 if (!percpu->scribble ||
5566 (conf->level == 6 && !percpu->spare_page)) {
5567 safe_put_page(percpu->spare_page);
5568 kfree(percpu->scribble);
5569 pr_err("%s: failed memory allocation for cpu%ld\n", 5581 pr_err("%s: failed memory allocation for cpu%ld\n",
5570 __func__, cpu); 5582 __func__, cpu);
5571 return notifier_from_errno(-ENOMEM); 5583 return notifier_from_errno(-ENOMEM);
@@ -5573,10 +5585,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
5573 break; 5585 break;
5574 case CPU_DEAD: 5586 case CPU_DEAD:
5575 case CPU_DEAD_FROZEN: 5587 case CPU_DEAD_FROZEN:
5576 safe_put_page(percpu->spare_page); 5588 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
5577 kfree(percpu->scribble);
5578 percpu->spare_page = NULL;
5579 percpu->scribble = NULL;
5580 break; 5589 break;
5581 default: 5590 default:
5582 break; 5591 break;
@@ -5588,40 +5597,29 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
5588static int raid5_alloc_percpu(struct r5conf *conf) 5597static int raid5_alloc_percpu(struct r5conf *conf)
5589{ 5598{
5590 unsigned long cpu; 5599 unsigned long cpu;
5591 struct page *spare_page; 5600 int err = 0;
5592 struct raid5_percpu __percpu *allcpus;
5593 void *scribble;
5594 int err;
5595 5601
5596 allcpus = alloc_percpu(struct raid5_percpu); 5602 conf->percpu = alloc_percpu(struct raid5_percpu);
5597 if (!allcpus) 5603 if (!conf->percpu)
5598 return -ENOMEM; 5604 return -ENOMEM;
5599 conf->percpu = allcpus; 5605
5606#ifdef CONFIG_HOTPLUG_CPU
5607 conf->cpu_notify.notifier_call = raid456_cpu_notify;
5608 conf->cpu_notify.priority = 0;
5609 err = register_cpu_notifier(&conf->cpu_notify);
5610 if (err)
5611 return err;
5612#endif
5600 5613
5601 get_online_cpus(); 5614 get_online_cpus();
5602 err = 0;
5603 for_each_present_cpu(cpu) { 5615 for_each_present_cpu(cpu) {
5604 if (conf->level == 6) { 5616 err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
5605 spare_page = alloc_page(GFP_KERNEL); 5617 if (err) {
5606 if (!spare_page) { 5618 pr_err("%s: failed memory allocation for cpu%ld\n",
5607 err = -ENOMEM; 5619 __func__, cpu);
5608 break;
5609 }
5610 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
5611 }
5612 scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
5613 if (!scribble) {
5614 err = -ENOMEM;
5615 break; 5620 break;
5616 } 5621 }
5617 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
5618 } 5622 }
5619#ifdef CONFIG_HOTPLUG_CPU
5620 conf->cpu_notify.notifier_call = raid456_cpu_notify;
5621 conf->cpu_notify.priority = 0;
5622 if (err == 0)
5623 err = register_cpu_notifier(&conf->cpu_notify);
5624#endif
5625 put_online_cpus(); 5623 put_online_cpus();
5626 5624
5627 return err; 5625 return err;
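raid5_alloc_percpu() also changes ordering: the hotplug notifier is registered before the per-present-CPU allocations, instead of only after they all succeed. Once registered, a CPU that comes online later gets its scratch buffers from CPU_UP_PREPARE, and on allocation failure the function simply returns the error, leaving partial allocations for raid5_free_percpu() to reclaim on the caller's error path. A condensed, comment-annotated sketch of the resulting flow (it mirrors the hunk above; only the comments are new):

static int raid5_alloc_percpu(struct r5conf *conf)
{
	unsigned long cpu;
	int err = 0;

	conf->percpu = alloc_percpu(struct raid5_percpu);
	if (!conf->percpu)
		return -ENOMEM;

#ifdef CONFIG_HOTPLUG_CPU
	/* Register first: CPUs hotplugged from here on allocate their own
	 * scratch buffers via CPU_UP_PREPARE in raid456_cpu_notify(). */
	conf->cpu_notify.notifier_call = raid456_cpu_notify;
	conf->cpu_notify.priority = 0;
	err = register_cpu_notifier(&conf->cpu_notify);
	if (err)
		return err;
#endif

	/* Populate the CPUs already present; hold the hotplug lock so the
	 * set cannot change underneath us. */
	get_online_cpus();
	for_each_present_cpu(cpu) {
		err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
		if (err) {
			pr_err("%s: failed memory allocation for cpu%ld\n",
			       __func__, cpu);
			break;	/* raid5_free_percpu() reclaims what was done */
		}
	}
	put_online_cpus();

	return err;
}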