Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                                  |  14
-rw-r--r--  drivers/md/bitmap.c                                 |   2
-rw-r--r--  drivers/md/dm-crypt.c                               | 219
-rw-r--r--  drivers/md/dm-delay.c                               |   2
-rw-r--r--  drivers/md/dm-exception-store.c                     |  13
-rw-r--r--  drivers/md/dm-flakey.c                              |   2
-rw-r--r--  drivers/md/dm-ioctl.c                               |   5
-rw-r--r--  drivers/md/dm-linear.c                              |   2
-rw-r--r--  drivers/md/dm-log.c                                 |  13
-rw-r--r--  drivers/md/dm-mpath.c                               |  49
-rw-r--r--  drivers/md/dm-raid.c                                | 147
-rw-r--r--  drivers/md/dm-raid1.c                               |  11
-rw-r--r--  drivers/md/dm-region-hash.c                         |   5
-rw-r--r--  drivers/md/dm-snap.c                                |  34
-rw-r--r--  drivers/md/dm-stripe.c                              |  87
-rw-r--r--  drivers/md/dm-table.c                               |   3
-rw-r--r--  drivers/md/dm-thin-metadata.c                       | 769
-rw-r--r--  drivers/md/dm-thin-metadata.h                       |  25
-rw-r--r--  drivers/md/dm-thin.c                                | 541
-rw-r--r--  drivers/md/dm-verity.c                              |   2
-rw-r--r--  drivers/md/dm.c                                     |  40
-rw-r--r--  drivers/md/dm.h                                     |   5
-rw-r--r--  drivers/md/md.c                                     | 112
-rw-r--r--  drivers/md/md.h                                     |  11
-rw-r--r--  drivers/md/multipath.c                              |   3
-rw-r--r--  drivers/md/persistent-data/Makefile                 |   1
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c       | 105
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.h       |  21
-rw-r--r--  drivers/md/persistent-data/dm-space-map-checker.c   | 438
-rw-r--r--  drivers/md/persistent-data/dm-space-map-checker.h   |  26
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c    |  12
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.h    |   1
-rw-r--r--  drivers/md/persistent-data/dm-space-map-disk.c      |  25
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.c |  90
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.h |  11
-rw-r--r--  drivers/md/raid1.c                                  | 244
-rw-r--r--  drivers/md/raid1.h                                  |  30
-rw-r--r--  drivers/md/raid10.c                                 | 115
-rw-r--r--  drivers/md/raid10.h                                 |  23
-rw-r--r--  drivers/md/raid5.c                                  | 376
-rw-r--r--  drivers/md/raid5.h                                  |   3
41 files changed, 1999 insertions(+), 1638 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 10f122a3a856..d949b781f6f8 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -260,15 +260,6 @@ config DM_DEBUG_BLOCK_STACK_TRACING
 
 	  If unsure, say N.
 
-config DM_DEBUG_SPACE_MAPS
-	boolean "Extra validation for thin provisioning space maps"
-	depends on DM_THIN_PROVISIONING
-	---help---
-	  Enable this for messages that may help debug problems with the
-	  space maps used by thin provisioning.
-
-	  If unsure, say N.
-
 config DM_MIRROR
 	tristate "Mirror target"
 	depends on BLK_DEV_DM
@@ -277,13 +268,14 @@ config DM_MIRROR
 	  needed for live data migration tools such as 'pvmove'.
 
 config DM_RAID
-	tristate "RAID 1/4/5/6 target"
+	tristate "RAID 1/4/5/6/10 target"
 	depends on BLK_DEV_DM
 	select MD_RAID1
+	select MD_RAID10
 	select MD_RAID456
 	select BLK_DEV_MD
 	---help---
-	  A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings
+	  A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings
 
 	  A RAID-5 set of N drives with a capacity of C MB per drive provides
 	  the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 15dbe03117e4..94e7f6ba2e11 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1305,7 +1305,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
 			prepare_to_wait(&bitmap->overflow_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
 			spin_unlock_irq(&bitmap->counts.lock);
-			io_schedule();
+			schedule();
 			finish_wait(&bitmap->overflow_wait, &__wait);
 			continue;
 		}
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 3f06df59fd82..664743d6a6cd 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -42,21 +42,21 @@ struct convert_context {
 	unsigned int offset_out;
 	unsigned int idx_in;
 	unsigned int idx_out;
-	sector_t sector;
-	atomic_t pending;
+	sector_t cc_sector;
+	atomic_t cc_pending;
 };
 
 /*
  * per bio private data
  */
 struct dm_crypt_io {
-	struct dm_target *target;
+	struct crypt_config *cc;
 	struct bio *base_bio;
 	struct work_struct work;
 
 	struct convert_context ctx;
 
-	atomic_t pending;
+	atomic_t io_pending;
 	int error;
 	sector_t sector;
 	struct dm_crypt_io *base_io;
@@ -109,9 +109,6 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
  */
 struct crypt_cpu {
 	struct ablkcipher_request *req;
-	/* ESSIV: struct crypto_cipher *essiv_tfm */
-	void *iv_private;
-	struct crypto_ablkcipher *tfms[0];
 };
 
 /*
@@ -151,6 +148,10 @@ struct crypt_config {
 	 * per_cpu_ptr() only.
 	 */
 	struct crypt_cpu __percpu *cpu;
+
+	/* ESSIV: struct crypto_cipher *essiv_tfm */
+	void *iv_private;
+	struct crypto_ablkcipher **tfms;
 	unsigned tfms_count;
 
 	/*
@@ -193,7 +194,7 @@ static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
  */
 static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
 {
-	return __this_cpu_ptr(cc->cpu)->tfms[0];
+	return cc->tfms[0];
 }
 
 /*
@@ -258,7 +259,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
 	struct hash_desc desc;
 	struct scatterlist sg;
 	struct crypto_cipher *essiv_tfm;
-	int err, cpu;
+	int err;
 
 	sg_init_one(&sg, cc->key, cc->key_size);
 	desc.tfm = essiv->hash_tfm;
@@ -268,14 +269,12 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
 	if (err)
 		return err;
 
-	for_each_possible_cpu(cpu) {
-		essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private,
+	essiv_tfm = cc->iv_private;
 
 	err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
 				   crypto_hash_digestsize(essiv->hash_tfm));
 	if (err)
 		return err;
-	}
 
 	return 0;
 }
@@ -286,16 +285,14 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 	unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
 	struct crypto_cipher *essiv_tfm;
-	int cpu, r, err = 0;
+	int r, err = 0;
 
 	memset(essiv->salt, 0, salt_size);
 
-	for_each_possible_cpu(cpu) {
-		essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private;
-		r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
-		if (r)
-			err = r;
-	}
+	essiv_tfm = cc->iv_private;
+	r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
+	if (r)
+		err = r;
 
 	return err;
 }
@@ -335,8 +332,6 @@ static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
 
 static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 {
-	int cpu;
-	struct crypt_cpu *cpu_cc;
 	struct crypto_cipher *essiv_tfm;
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 
@@ -346,15 +341,12 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 	kzfree(essiv->salt);
 	essiv->salt = NULL;
 
-	for_each_possible_cpu(cpu) {
-		cpu_cc = per_cpu_ptr(cc->cpu, cpu);
-		essiv_tfm = cpu_cc->iv_private;
+	essiv_tfm = cc->iv_private;
 
 	if (essiv_tfm)
 		crypto_free_cipher(essiv_tfm);
 
-		cpu_cc->iv_private = NULL;
-	}
+	cc->iv_private = NULL;
 }
 
 static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -363,7 +355,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	struct crypto_cipher *essiv_tfm = NULL;
 	struct crypto_hash *hash_tfm = NULL;
 	u8 *salt = NULL;
-	int err, cpu;
+	int err;
 
 	if (!opts) {
 		ti->error = "Digest algorithm missing for ESSIV mode";
@@ -388,15 +380,13 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	cc->iv_gen_private.essiv.salt = salt;
 	cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
 
-	for_each_possible_cpu(cpu) {
-		essiv_tfm = setup_essiv_cpu(cc, ti, salt,
-					crypto_hash_digestsize(hash_tfm));
-		if (IS_ERR(essiv_tfm)) {
-			crypt_iv_essiv_dtr(cc);
-			return PTR_ERR(essiv_tfm);
-		}
-		per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm;
+	essiv_tfm = setup_essiv_cpu(cc, ti, salt,
+				    crypto_hash_digestsize(hash_tfm));
+	if (IS_ERR(essiv_tfm)) {
+		crypt_iv_essiv_dtr(cc);
+		return PTR_ERR(essiv_tfm);
 	}
+	cc->iv_private = essiv_tfm;
 
 	return 0;
 
@@ -410,7 +400,7 @@ bad:
 static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
 			      struct dm_crypt_request *dmreq)
 {
-	struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
+	struct crypto_cipher *essiv_tfm = cc->iv_private;
 
 	memset(iv, 0, cc->iv_size);
 	*(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
@@ -664,7 +654,7 @@ static void crypt_convert_init(struct crypt_config *cc,
 	ctx->offset_out = 0;
 	ctx->idx_in = bio_in ? bio_in->bi_idx : 0;
 	ctx->idx_out = bio_out ? bio_out->bi_idx : 0;
-	ctx->sector = sector + cc->iv_offset;
+	ctx->cc_sector = sector + cc->iv_offset;
 	init_completion(&ctx->restart);
 }
 
@@ -695,12 +685,12 @@ static int crypt_convert_block(struct crypt_config *cc,
 	struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
 	struct dm_crypt_request *dmreq;
 	u8 *iv;
-	int r = 0;
+	int r;
 
 	dmreq = dmreq_of_req(cc, req);
 	iv = iv_of_dmreq(cc, dmreq);
 
-	dmreq->iv_sector = ctx->sector;
+	dmreq->iv_sector = ctx->cc_sector;
 	dmreq->ctx = ctx;
 	sg_init_table(&dmreq->sg_in, 1);
 	sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -749,12 +739,12 @@ static void crypt_alloc_req(struct crypt_config *cc,
 			    struct convert_context *ctx)
 {
 	struct crypt_cpu *this_cc = this_crypt_config(cc);
-	unsigned key_index = ctx->sector & (cc->tfms_count - 1);
+	unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
 
 	if (!this_cc->req)
 		this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
 
-	ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]);
+	ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]);
 	ablkcipher_request_set_callback(this_cc->req,
 	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
 	    kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
@@ -769,14 +759,14 @@ static int crypt_convert(struct crypt_config *cc,
 	struct crypt_cpu *this_cc = this_crypt_config(cc);
 	int r;
 
-	atomic_set(&ctx->pending, 1);
+	atomic_set(&ctx->cc_pending, 1);
 
 	while(ctx->idx_in < ctx->bio_in->bi_vcnt &&
 	      ctx->idx_out < ctx->bio_out->bi_vcnt) {
 
 		crypt_alloc_req(cc, ctx);
 
-		atomic_inc(&ctx->pending);
+		atomic_inc(&ctx->cc_pending);
 
 		r = crypt_convert_block(cc, ctx, this_cc->req);
 
@@ -788,19 +778,19 @@ static int crypt_convert(struct crypt_config *cc,
 		/* fall through*/
 		case -EINPROGRESS:
 			this_cc->req = NULL;
-			ctx->sector++;
+			ctx->cc_sector++;
 			continue;
 
 		/* sync */
 		case 0:
-			atomic_dec(&ctx->pending);
-			ctx->sector++;
+			atomic_dec(&ctx->cc_pending);
+			ctx->cc_sector++;
 			cond_resched();
 			continue;
 
 		/* error */
 		default:
-			atomic_dec(&ctx->pending);
+			atomic_dec(&ctx->cc_pending);
 			return r;
 		}
 	}
@@ -811,7 +801,7 @@ static int crypt_convert(struct crypt_config *cc,
 static void dm_crypt_bio_destructor(struct bio *bio)
 {
 	struct dm_crypt_io *io = bio->bi_private;
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 
 	bio_free(bio, cc->bs);
 }
@@ -825,7 +815,7 @@ static void dm_crypt_bio_destructor(struct bio *bio)
 static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
 				      unsigned *out_of_pages)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 	struct bio *clone;
 	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
@@ -884,26 +874,25 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
 	}
 }
 
-static struct dm_crypt_io *crypt_io_alloc(struct dm_target *ti,
+static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
 					  struct bio *bio, sector_t sector)
 {
-	struct crypt_config *cc = ti->private;
 	struct dm_crypt_io *io;
 
 	io = mempool_alloc(cc->io_pool, GFP_NOIO);
-	io->target = ti;
+	io->cc = cc;
 	io->base_bio = bio;
 	io->sector = sector;
 	io->error = 0;
 	io->base_io = NULL;
-	atomic_set(&io->pending, 0);
+	atomic_set(&io->io_pending, 0);
 
 	return io;
 }
 
 static void crypt_inc_pending(struct dm_crypt_io *io)
 {
-	atomic_inc(&io->pending);
+	atomic_inc(&io->io_pending);
 }
 
 /*
@@ -913,12 +902,12 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
  */
 static void crypt_dec_pending(struct dm_crypt_io *io)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 	struct bio *base_bio = io->base_bio;
 	struct dm_crypt_io *base_io = io->base_io;
 	int error = io->error;
 
-	if (!atomic_dec_and_test(&io->pending))
+	if (!atomic_dec_and_test(&io->io_pending))
 		return;
 
 	mempool_free(io, cc->io_pool);
@@ -952,7 +941,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 static void crypt_endio(struct bio *clone, int error)
 {
 	struct dm_crypt_io *io = clone->bi_private;
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 	unsigned rw = bio_data_dir(clone);
 
 	if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error))
@@ -979,7 +968,7 @@ static void crypt_endio(struct bio *clone, int error)
 
 static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 
 	clone->bi_private = io;
 	clone->bi_end_io  = crypt_endio;
@@ -990,7 +979,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 
 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 	struct bio *base_bio = io->base_bio;
 	struct bio *clone;
 
@@ -1038,7 +1027,7 @@ static void kcryptd_io(struct work_struct *work)
 
 static void kcryptd_queue_io(struct dm_crypt_io *io)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 
 	INIT_WORK(&io->work, kcryptd_io);
 	queue_work(cc->io_queue, &io->work);
@@ -1047,7 +1036,7 @@ static void kcryptd_queue_io(struct dm_crypt_io *io)
 static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 {
 	struct bio *clone = io->ctx.bio_out;
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 
 	if (unlikely(io->error < 0)) {
 		crypt_free_buffer_pages(cc, clone);
@@ -1069,7 +1058,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 
 static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 	struct bio *clone;
 	struct dm_crypt_io *new_io;
 	int crypt_finished;
@@ -1107,7 +1096,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		if (r < 0)
 			io->error = -EIO;
 
-		crypt_finished = atomic_dec_and_test(&io->ctx.pending);
+		crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
 
 		/* Encryption was already finished, submit io now */
 		if (crypt_finished) {
@@ -1135,7 +1124,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		 * between fragments, so switch to a new dm_crypt_io structure.
 		 */
 		if (unlikely(!crypt_finished && remaining)) {
-			new_io = crypt_io_alloc(io->target, io->base_bio,
+			new_io = crypt_io_alloc(io->cc, io->base_bio,
 						sector);
 			crypt_inc_pending(new_io);
 			crypt_convert_init(cc, &new_io->ctx, NULL,
@@ -1169,7 +1158,7 @@ static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
 
 static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 	int r = 0;
 
 	crypt_inc_pending(io);
@@ -1181,7 +1170,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 	if (r < 0)
 		io->error = -EIO;
 
-	if (atomic_dec_and_test(&io->ctx.pending))
+	if (atomic_dec_and_test(&io->ctx.cc_pending))
 		kcryptd_crypt_read_done(io);
 
 	crypt_dec_pending(io);
@@ -1193,7 +1182,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	struct dm_crypt_request *dmreq = async_req->data;
 	struct convert_context *ctx = dmreq->ctx;
 	struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 
 	if (error == -EINPROGRESS) {
 		complete(&ctx->restart);
@@ -1208,7 +1197,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 
 	mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
 
-	if (!atomic_dec_and_test(&ctx->pending))
+	if (!atomic_dec_and_test(&ctx->cc_pending))
 		return;
 
 	if (bio_data_dir(io->base_bio) == READ)
@@ -1229,7 +1218,7 @@ static void kcryptd_crypt(struct work_struct *work)
 
 static void kcryptd_queue_crypt(struct dm_crypt_io *io)
 {
-	struct crypt_config *cc = io->target->private;
+	struct crypt_config *cc = io->cc;
 
 	INIT_WORK(&io->work, kcryptd_crypt);
 	queue_work(cc->crypt_queue, &io->work);
@@ -1241,7 +1230,6 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io)
 static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
 {
 	char buffer[3];
-	char *endp;
 	unsigned int i;
 
 	buffer[2] = '\0';
@@ -1250,9 +1238,7 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
 		buffer[0] = *hex++;
 		buffer[1] = *hex++;
 
-		key[i] = (u8)simple_strtoul(buffer, &endp, 16);
-
-		if (endp != &buffer[2])
+		if (kstrtou8(buffer, 16, &key[i]))
 			return -EINVAL;
 	}
 
@@ -1276,29 +1262,38 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
 	}
 }
 
-static void crypt_free_tfms(struct crypt_config *cc, int cpu)
+static void crypt_free_tfms(struct crypt_config *cc)
 {
-	struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
 	unsigned i;
 
+	if (!cc->tfms)
+		return;
+
 	for (i = 0; i < cc->tfms_count; i++)
-		if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) {
-			crypto_free_ablkcipher(cpu_cc->tfms[i]);
-			cpu_cc->tfms[i] = NULL;
+		if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) {
+			crypto_free_ablkcipher(cc->tfms[i]);
+			cc->tfms[i] = NULL;
 		}
+
+	kfree(cc->tfms);
+	cc->tfms = NULL;
 }
 
-static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
+static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
 {
-	struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
 	unsigned i;
 	int err;
 
+	cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_ablkcipher *),
+			   GFP_KERNEL);
+	if (!cc->tfms)
+		return -ENOMEM;
+
 	for (i = 0; i < cc->tfms_count; i++) {
-		cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
-		if (IS_ERR(cpu_cc->tfms[i])) {
-			err = PTR_ERR(cpu_cc->tfms[i]);
-			crypt_free_tfms(cc, cpu);
+		cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
+		if (IS_ERR(cc->tfms[i])) {
+			err = PTR_ERR(cc->tfms[i]);
+			crypt_free_tfms(cc);
 			return err;
 		}
 	}
@@ -1309,15 +1304,14 @@ static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
 static int crypt_setkey_allcpus(struct crypt_config *cc)
 {
 	unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
-	int cpu, err = 0, i, r;
+	int err = 0, i, r;
 
-	for_each_possible_cpu(cpu) {
-		for (i = 0; i < cc->tfms_count; i++) {
-			r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i],
-						     cc->key + (i * subkey_size), subkey_size);
+	for (i = 0; i < cc->tfms_count; i++) {
+		r = crypto_ablkcipher_setkey(cc->tfms[i],
+					     cc->key + (i * subkey_size),
+					     subkey_size);
 		if (r)
 			err = r;
-		}
 	}
 
 	return err;
@@ -1379,9 +1373,10 @@ static void crypt_dtr(struct dm_target *ti)
 		cpu_cc = per_cpu_ptr(cc->cpu, cpu);
 		if (cpu_cc->req)
 			mempool_free(cpu_cc->req, cc->req_pool);
-		crypt_free_tfms(cc, cpu);
 	}
 
+	crypt_free_tfms(cc);
+
 	if (cc->bs)
 		bioset_free(cc->bs);
 
@@ -1414,7 +1409,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	struct crypt_config *cc = ti->private;
 	char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
 	char *cipher_api = NULL;
-	int cpu, ret = -EINVAL;
+	int ret = -EINVAL;
 	char dummy;
 
 	/* Convert to crypto api definition? */
@@ -1455,8 +1450,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	if (tmp)
 		DMWARN("Ignoring unexpected additional cipher options");
 
-	cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) +
-				 cc->tfms_count * sizeof(*(cc->cpu->tfms)),
+	cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)),
 				 __alignof__(struct crypt_cpu));
 	if (!cc->cpu) {
 		ti->error = "Cannot allocate per cpu state";
@@ -1489,12 +1483,10 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	}
 
 	/* Allocate cipher */
-	for_each_possible_cpu(cpu) {
-		ret = crypt_alloc_tfms(cc, cpu, cipher_api);
-		if (ret < 0) {
-			ti->error = "Error allocating crypto tfm";
-			goto bad;
-		}
+	ret = crypt_alloc_tfms(cc, cipher_api);
+	if (ret < 0) {
+		ti->error = "Error allocating crypto tfm";
+		goto bad;
 	}
 
 	/* Initialize and set key */
@@ -1702,7 +1694,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ti->num_flush_requests = 1;
-	ti->discard_zeroes_data_unsupported = 1;
+	ti->discard_zeroes_data_unsupported = true;
 
 	return 0;
 
@@ -1715,7 +1707,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 			 union map_info *map_context)
 {
 	struct dm_crypt_io *io;
-	struct crypt_config *cc;
+	struct crypt_config *cc = ti->private;
 
 	/*
 	 * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues.
@@ -1723,14 +1715,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 	 * - for REQ_DISCARD caller must use flush if IO ordering matters
 	 */
 	if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
-		cc = ti->private;
 		bio->bi_bdev = cc->dev->bdev;
 		if (bio_sectors(bio))
 			bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector);
 		return DM_MAPIO_REMAPPED;
 	}
 
-	io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
+	io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_sector));
 
 	if (bio_data_dir(io->base_bio) == READ) {
 		if (kcryptd_io_read(io, GFP_NOWAIT))
@@ -1742,7 +1733,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 }
 
 static int crypt_status(struct dm_target *ti, status_type_t type,
-			char *result, unsigned int maxlen)
+			unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct crypt_config *cc = ti->private;
 	unsigned int sz = 0;
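For readers unfamiliar with dm-crypt's multi-key (keycount) mode that several of the hunks above touch: the key given on the table line is split into tfms_count sub-keys, and each sector selects one of the cipher handles. The standalone sketch below illustrates only that arithmetic, mirroring what crypt_setkey_allcpus() and crypt_alloc_req() do; the key_size, tfms_count and sector values are assumed example figures, not anything mandated by the driver, and tfms_count must be a power of two for the mask to work, as in the real code.

/* Illustrative sketch only, not kernel code. */
#include <stdio.h>

int main(void)
{
	unsigned key_size = 64;     /* assumed: total key bytes from the table line */
	unsigned tfms_count = 4;    /* assumed: number of cipher handles (keycount) */
	unsigned subkey_size = key_size / tfms_count;
	unsigned long long sector;

	for (sector = 0; sector < 8; sector++) {
		/* same selection as: key_index = ctx->cc_sector & (cc->tfms_count - 1) */
		unsigned key_index = (unsigned)(sector & (tfms_count - 1));

		/* sub-key i starts at cc->key + i * subkey_size, as in crypt_setkey_allcpus() */
		printf("sector %llu -> tfm[%u], key bytes [%u..%u)\n",
		       sector, key_index,
		       key_index * subkey_size, (key_index + 1) * subkey_size);
	}
	return 0;
}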
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 2dc22dddb2ae..f53846f9ab50 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -295,7 +295,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio,
 }
 
 static int delay_status(struct dm_target *ti, status_type_t type,
-			char *result, unsigned maxlen)
+			unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct delay_c *dc = ti->private;
 	int sz = 0;
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index aa70f7d43a1a..ebaa4f803eec 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -142,24 +142,19 @@ EXPORT_SYMBOL(dm_exception_store_type_unregister);
 static int set_chunk_size(struct dm_exception_store *store,
 			  const char *chunk_size_arg, char **error)
 {
-	unsigned long chunk_size_ulong;
-	char *value;
+	unsigned chunk_size;
 
-	chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10);
-	if (*chunk_size_arg == '\0' || *value != '\0' ||
-	    chunk_size_ulong > UINT_MAX) {
+	if (kstrtouint(chunk_size_arg, 10, &chunk_size)) {
 		*error = "Invalid chunk size";
 		return -EINVAL;
 	}
 
-	if (!chunk_size_ulong) {
+	if (!chunk_size) {
 		store->chunk_size = store->chunk_mask = store->chunk_shift = 0;
 		return 0;
 	}
 
-	return dm_exception_store_set_chunk_size(store,
-						 (unsigned) chunk_size_ulong,
-						 error);
+	return dm_exception_store_set_chunk_size(store, chunk_size, error);
 }
 
 int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index ac49c01f1a44..cc15543a6ad7 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -333,7 +333,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio,
 }
 
 static int flakey_status(struct dm_target *ti, status_type_t type,
-			 char *result, unsigned int maxlen)
+			 unsigned status_flags, char *result, unsigned maxlen)
 {
 	unsigned sz = 0;
 	struct flakey_c *fc = ti->private;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index a1a3e6df17b8..afd95986d099 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1054,6 +1054,7 @@ static void retrieve_status(struct dm_table *table,
 	char *outbuf, *outptr;
 	status_type_t type;
 	size_t remaining, len, used = 0;
+	unsigned status_flags = 0;
 
 	outptr = outbuf = get_result_buffer(param, param_size, &len);
 
@@ -1090,7 +1091,9 @@ static void retrieve_status(struct dm_table *table,
 
 	/* Get the status/table string from the target driver */
 	if (ti->type->status) {
-		if (ti->type->status(ti, type, outptr, remaining)) {
+		if (param->flags & DM_NOFLUSH_FLAG)
+			status_flags |= DM_STATUS_NOFLUSH_FLAG;
+		if (ti->type->status(ti, type, status_flags, outptr, remaining)) {
 			param->flags |= DM_BUFFER_FULL_FLAG;
 			break;
 		}
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 3639eeab6042..1bf19a93eef0 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -96,7 +96,7 @@ static int linear_map(struct dm_target *ti, struct bio *bio,
 }
 
 static int linear_status(struct dm_target *ti, status_type_t type,
-			 char *result, unsigned int maxlen)
+			 unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct linear_c *lc = (struct linear_c *) ti->private;
 
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 65ebaebf502b..627d19186d5a 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -571,16 +571,6 @@ static void disk_dtr(struct dm_dirty_log *log)
 	destroy_log_context(lc);
 }
 
-static int count_bits32(uint32_t *addr, unsigned size)
-{
-	int count = 0, i;
-
-	for (i = 0; i < size; i++) {
-		count += hweight32(*(addr+i));
-	}
-	return count;
-}
-
 static void fail_log_device(struct log_c *lc)
 {
 	if (lc->log_dev_failed)
@@ -629,7 +619,8 @@ static int disk_resume(struct dm_dirty_log *log)
 
 	/* copy clean across to sync */
 	memcpy(lc->sync_bits, lc->clean_bits, size);
-	lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);
+	lc->sync_count = memweight(lc->clean_bits,
+				lc->bitset_uint32_count * sizeof(uint32_t));
 	lc->sync_search = 0;
 
 	/* set the correct number of regions in the header */
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 638dae048b4f..d8abb90a6c2f 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -85,6 +85,7 @@ struct multipath {
 	unsigned queue_io:1;		/* Must we queue all I/O? */
 	unsigned queue_if_no_path:1;	/* Queue I/O if last path fails? */
 	unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
+	unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
 
 	unsigned pg_init_retries;	/* Number of times to retry pg_init */
 	unsigned pg_init_count;		/* Number of times pg_init called */
@@ -568,6 +569,8 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
 	int r;
 	struct pgpath *p;
 	struct multipath *m = ti->private;
+	struct request_queue *q = NULL;
+	const char *attached_handler_name;
 
 	/* we need at least a path arg */
 	if (as->argc < 1) {
@@ -586,13 +589,37 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
 		goto bad;
 	}
 
-	if (m->hw_handler_name) {
-		struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
+	if (m->retain_attached_hw_handler || m->hw_handler_name)
+		q = bdev_get_queue(p->path.dev->bdev);
+
+	if (m->retain_attached_hw_handler) {
+		attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
+		if (attached_handler_name) {
+			/*
+			 * Reset hw_handler_name to match the attached handler
+			 * and clear any hw_handler_params associated with the
+			 * ignored handler.
+			 *
+			 * NB. This modifies the table line to show the actual
+			 * handler instead of the original table passed in.
+			 */
+			kfree(m->hw_handler_name);
+			m->hw_handler_name = attached_handler_name;
+
+			kfree(m->hw_handler_params);
+			m->hw_handler_params = NULL;
+		}
+	}
 
+	if (m->hw_handler_name) {
+		/*
+		 * Increments scsi_dh reference, even when using an
+		 * already-attached handler.
+		 */
 		r = scsi_dh_attach(q, m->hw_handler_name);
 		if (r == -EBUSY) {
 			/*
-			 * Already attached to different hw_handler,
+			 * Already attached to different hw_handler:
 			 * try to reattach with correct one.
 			 */
 			scsi_dh_detach(q);
@@ -760,7 +787,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
 	const char *arg_name;
 
 	static struct dm_arg _args[] = {
-		{0, 5, "invalid number of feature args"},
+		{0, 6, "invalid number of feature args"},
 		{1, 50, "pg_init_retries must be between 1 and 50"},
 		{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
 	};
@@ -781,6 +808,11 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
 			continue;
 		}
 
+		if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
+			m->retain_attached_hw_handler = 1;
+			continue;
+		}
+
 		if (!strcasecmp(arg_name, "pg_init_retries") &&
 		    (argc >= 1)) {
 			r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
@@ -1346,7 +1378,7 @@ static void multipath_resume(struct dm_target *ti)
  * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
  */
 static int multipath_status(struct dm_target *ti, status_type_t type,
-			    char *result, unsigned int maxlen)
+			    unsigned status_flags, char *result, unsigned maxlen)
 {
 	int sz = 0;
 	unsigned long flags;
@@ -1364,13 +1396,16 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
 	else {
 		DMEMIT("%u ", m->queue_if_no_path +
 			      (m->pg_init_retries > 0) * 2 +
-			      (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2);
+			      (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
+			      m->retain_attached_hw_handler);
 		if (m->queue_if_no_path)
 			DMEMIT("queue_if_no_path ");
 		if (m->pg_init_retries)
 			DMEMIT("pg_init_retries %u ", m->pg_init_retries);
 		if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
 			DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
+		if (m->retain_attached_hw_handler)
+			DMEMIT("retain_attached_hw_handler ");
 	}
 
 	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1656,7 +1691,7 @@ out:
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 4, 0},
+	.version = {1, 5, 0},
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 017c34d78d61..982e3e390c45 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -11,6 +11,7 @@
11#include "md.h" 11#include "md.h"
12#include "raid1.h" 12#include "raid1.h"
13#include "raid5.h" 13#include "raid5.h"
14#include "raid10.h"
14#include "bitmap.h" 15#include "bitmap.h"
15 16
16#include <linux/device-mapper.h> 17#include <linux/device-mapper.h>
@@ -52,7 +53,10 @@ struct raid_dev {
 #define DMPF_MAX_RECOVERY_RATE 0x20
 #define DMPF_MAX_WRITE_BEHIND  0x40
 #define DMPF_STRIPE_CACHE      0x80
-#define DMPF_REGION_SIZE       0X100
+#define DMPF_REGION_SIZE       0x100
+#define DMPF_RAID10_COPIES     0x200
+#define DMPF_RAID10_FORMAT     0x400
+
 struct raid_set {
 	struct dm_target *ti;
 
@@ -76,6 +80,7 @@ static struct raid_type {
 	const unsigned algorithm;	/* RAID algorithm. */
 } raid_types[] = {
 	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
+	{"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, UINT_MAX /* Varies */},
 	{"raid4",    "RAID4 (dedicated parity disk)",   1, 2, 5, ALGORITHM_PARITY_0},
 	{"raid5_la", "RAID5 (left asymmetric)",         1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
 	{"raid5_ra", "RAID5 (right asymmetric)",        1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -86,6 +91,17 @@ static struct raid_type {
86 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} 91 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
87}; 92};
88 93
94static unsigned raid10_md_layout_to_copies(int layout)
95{
96 return layout & 0xFF;
97}
98
99static int raid10_format_to_md_layout(char *format, unsigned copies)
100{
101 /* 1 "far" copy, and 'copies' "near" copies */
102 return (1 << 8) | (copies & 0xFF);
103}
104
89static struct raid_type *get_raid_type(char *name) 105static struct raid_type *get_raid_type(char *name)
90{ 106{
91 int i; 107 int i;
@@ -101,20 +117,12 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
 {
 	unsigned i;
 	struct raid_set *rs;
-	sector_t sectors_per_dev;
 
 	if (raid_devs <= raid_type->parity_devs) {
 		ti->error = "Insufficient number of devices";
 		return ERR_PTR(-EINVAL);
 	}
 
-	sectors_per_dev = ti->len;
-	if ((raid_type->level > 1) &&
-	    sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
-		ti->error = "Target length not divisible by number of data devices";
-		return ERR_PTR(-EINVAL);
-	}
-
 	rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
 	if (!rs) {
 		ti->error = "Cannot allocate raid context";
@@ -128,7 +136,6 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
 	rs->md.raid_disks = raid_devs;
 	rs->md.level = raid_type->level;
 	rs->md.new_level = rs->md.level;
-	rs->md.dev_sectors = sectors_per_dev;
 	rs->md.layout = raid_type->algorithm;
 	rs->md.new_layout = rs->md.layout;
 	rs->md.delta_disks = 0;
@@ -143,6 +150,7 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
 	 * rs->md.external
 	 * rs->md.chunk_sectors
 	 * rs->md.new_chunk_sectors
+	 * rs->md.dev_sectors
 	 */
 
 	return rs;
@@ -347,12 +355,20 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
  *    [region_size <sectors>]           Defines granularity of bitmap
+ *
+ * RAID10-only options:
+ *    [raid10_copies <# copies>]        Number of copies.  (Default: 2)
+ *    [raid10_format <near>]            Layout algorithm.  (Default: near)
  */
 static int parse_raid_params(struct raid_set *rs, char **argv,
 			     unsigned num_raid_params)
 {
+	char *raid10_format = "near";
+	unsigned raid10_copies = 2;
 	unsigned i, rebuild_cnt = 0;
 	unsigned long value, region_size = 0;
+	sector_t sectors_per_dev = rs->ti->len;
+	sector_t max_io_len;
 	char *key;
 
 	/*
@@ -422,20 +438,53 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 		}
 
 		key = argv[i++];
+
+		/* Parameters that take a string value are checked here. */
+		if (!strcasecmp(key, "raid10_format")) {
+			if (rs->raid_type->level != 10) {
+				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
+				return -EINVAL;
+			}
+			if (strcmp("near", argv[i])) {
+				rs->ti->error = "Invalid 'raid10_format' value given";
+				return -EINVAL;
+			}
+			raid10_format = argv[i];
+			rs->print_flags |= DMPF_RAID10_FORMAT;
+			continue;
+		}
+
 		if (strict_strtoul(argv[i], 10, &value) < 0) {
 			rs->ti->error = "Bad numerical argument given in raid params";
 			return -EINVAL;
 		}
 
+		/* Parameters that take a numeric value are checked here */
 		if (!strcasecmp(key, "rebuild")) {
 			rebuild_cnt++;
-			if (((rs->raid_type->level != 1) &&
-			     (rebuild_cnt > rs->raid_type->parity_devs)) ||
-			    ((rs->raid_type->level == 1) &&
-			     (rebuild_cnt > (rs->md.raid_disks - 1)))) {
-				rs->ti->error = "Too many rebuild devices specified for given RAID type";
+
+			switch (rs->raid_type->level) {
+			case 1:
+				if (rebuild_cnt >= rs->md.raid_disks) {
+					rs->ti->error = "Too many rebuild devices specified";
+					return -EINVAL;
+				}
+				break;
+			case 4:
+			case 5:
+			case 6:
+				if (rebuild_cnt > rs->raid_type->parity_devs) {
+					rs->ti->error = "Too many rebuild devices specified for given RAID type";
+					return -EINVAL;
+				}
+				break;
+			case 10:
+			default:
+				DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
+				rs->ti->error = "Rebuild not supported for this RAID type";
 				return -EINVAL;
 			}
+
 			if (value > rs->md.raid_disks) {
 				rs->ti->error = "Invalid rebuild index given";
 				return -EINVAL;
@@ -486,7 +535,8 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			 */
 			value /= 2;
 
-			if (rs->raid_type->level < 5) {
+			if ((rs->raid_type->level != 5) &&
+			    (rs->raid_type->level != 6)) {
 				rs->ti->error = "Inappropriate argument: stripe_cache";
 				return -EINVAL;
 			}
@@ -511,6 +561,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 		} else if (!strcasecmp(key, "region_size")) {
 			rs->print_flags |= DMPF_REGION_SIZE;
 			region_size = value;
+		} else if (!strcasecmp(key, "raid10_copies") &&
+			   (rs->raid_type->level == 10)) {
+			if ((value < 2) || (value > 0xFF)) {
+				rs->ti->error = "Bad value for 'raid10_copies'";
+				return -EINVAL;
+			}
+			rs->print_flags |= DMPF_RAID10_COPIES;
+			raid10_copies = value;
 		} else {
 			DMERR("Unable to parse RAID parameter: %s", key);
 			rs->ti->error = "Unable to parse RAID parameters";
@@ -522,14 +580,33 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 		return -EINVAL;
 
 	if (rs->md.chunk_sectors)
-		rs->ti->split_io = rs->md.chunk_sectors;
+		max_io_len = rs->md.chunk_sectors;
 	else
-		rs->ti->split_io = region_size;
+		max_io_len = region_size;
 
-	if (rs->md.chunk_sectors)
-		rs->ti->split_io = rs->md.chunk_sectors;
-	else
-		rs->ti->split_io = region_size;
+	if (dm_set_target_max_io_len(rs->ti, max_io_len))
+		return -EINVAL;
+
+	if (rs->raid_type->level == 10) {
+		if (raid10_copies > rs->md.raid_disks) {
+			rs->ti->error = "Not enough devices to satisfy specification";
+			return -EINVAL;
+		}
+
+		/* (Len * #mirrors) / #devices */
+		sectors_per_dev = rs->ti->len * raid10_copies;
+		sector_div(sectors_per_dev, rs->md.raid_disks);
+
+		rs->md.layout = raid10_format_to_md_layout(raid10_format,
+							   raid10_copies);
+		rs->md.new_layout = rs->md.layout;
+	} else if ((rs->raid_type->level > 1) &&
+		   sector_div(sectors_per_dev,
+			      (rs->md.raid_disks - rs->raid_type->parity_devs))) {
+		rs->ti->error = "Target length not divisible by number of data devices";
+		return -EINVAL;
+	}
+	rs->md.dev_sectors = sectors_per_dev;
 
 	/* Assume there are no metadata devices until the drives are parsed */
 	rs->md.persistent = 0;
@@ -552,6 +629,9 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
 	if (rs->raid_type->level == 1)
 		return md_raid1_congested(&rs->md, bits);
 
+	if (rs->raid_type->level == 10)
+		return md_raid10_congested(&rs->md, bits);
+
 	return md_raid5_congested(&rs->md, bits);
 }
 
@@ -870,6 +950,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	case 6:
 		redundancy = rs->raid_type->parity_devs;
 		break;
+	case 10:
+		redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
+		break;
 	default:
 		ti->error = "Unknown RAID type";
 		return -EINVAL;
@@ -1035,12 +1118,19 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 	}
 
+	if (ti->len != rs->md.array_sectors) {
+		ti->error = "Array size does not match requested target length";
+		ret = -EINVAL;
+		goto size_mismatch;
+	}
 	rs->callbacks.congested_fn = raid_is_congested;
 	dm_table_add_target_callbacks(ti->table, &rs->callbacks);
 
 	mddev_suspend(&rs->md);
 	return 0;
 
+size_mismatch:
+	md_stop(&rs->md);
 bad:
 	context_free(rs);
 
@@ -1067,7 +1157,7 @@ static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_c
 }
 
 static int raid_status(struct dm_target *ti, status_type_t type,
-		       char *result, unsigned maxlen)
+		       unsigned status_flags, char *result, unsigned maxlen)
 {
 	struct raid_set *rs = ti->private;
 	unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
@@ -1189,6 +1279,13 @@ static int raid_status(struct dm_target *ti, status_type_t type,
1189 DMEMIT(" region_size %lu", 1279 DMEMIT(" region_size %lu",
1190 rs->md.bitmap_info.chunksize >> 9); 1280 rs->md.bitmap_info.chunksize >> 9);
1191 1281
1282 if (rs->print_flags & DMPF_RAID10_COPIES)
1283 DMEMIT(" raid10_copies %u",
1284 raid10_md_layout_to_copies(rs->md.layout));
1285
1286 if (rs->print_flags & DMPF_RAID10_FORMAT)
1287 DMEMIT(" raid10_format near");
1288
1192 DMEMIT(" %d", rs->md.raid_disks); 1289 DMEMIT(" %d", rs->md.raid_disks);
1193 for (i = 0; i < rs->md.raid_disks; i++) { 1290 for (i = 0; i < rs->md.raid_disks; i++) {
1194 if (rs->dev[i].meta_dev) 1291 if (rs->dev[i].meta_dev)
@@ -1263,7 +1360,7 @@ static void raid_resume(struct dm_target *ti)
1263 1360
1264static struct target_type raid_target = { 1361static struct target_type raid_target = {
1265 .name = "raid", 1362 .name = "raid",
1266 .version = {1, 2, 0}, 1363 .version = {1, 3, 0},
1267 .module = THIS_MODULE, 1364 .module = THIS_MODULE,
1268 .ctr = raid_ctr, 1365 .ctr = raid_ctr,
1269 .dtr = raid_dtr, 1366 .dtr = raid_dtr,
@@ -1290,6 +1387,8 @@ module_init(dm_raid_init);
1290module_exit(dm_raid_exit); 1387module_exit(dm_raid_exit);
1291 1388
1292MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); 1389MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
1390MODULE_ALIAS("dm-raid1");
1391MODULE_ALIAS("dm-raid10");
1293MODULE_ALIAS("dm-raid4"); 1392MODULE_ALIAS("dm-raid4");
1294MODULE_ALIAS("dm-raid5"); 1393MODULE_ALIAS("dm-raid5");
1295MODULE_ALIAS("dm-raid6"); 1394MODULE_ALIAS("dm-raid6");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d039de8322f0..bc5ddba8045b 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1081,9 +1081,14 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1081 } 1081 }
1082 1082
1083 ti->private = ms; 1083 ti->private = ms;
1084 ti->split_io = dm_rh_get_region_size(ms->rh); 1084
1085 r = dm_set_target_max_io_len(ti, dm_rh_get_region_size(ms->rh));
1086 if (r)
1087 goto err_free_context;
1088
1085 ti->num_flush_requests = 1; 1089 ti->num_flush_requests = 1;
1086 ti->num_discard_requests = 1; 1090 ti->num_discard_requests = 1;
1091 ti->discard_zeroes_data_unsupported = true;
1087 1092
1088 ms->kmirrord_wq = alloc_workqueue("kmirrord", 1093 ms->kmirrord_wq = alloc_workqueue("kmirrord",
1089 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); 1094 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
@@ -1214,7 +1219,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1214 * We need to dec pending if this was a write. 1219 * We need to dec pending if this was a write.
1215 */ 1220 */
1216 if (rw == WRITE) { 1221 if (rw == WRITE) {
1217 if (!(bio->bi_rw & REQ_FLUSH)) 1222 if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)))
1218 dm_rh_dec(ms->rh, map_context->ll); 1223 dm_rh_dec(ms->rh, map_context->ll);
1219 return error; 1224 return error;
1220 } 1225 }
@@ -1362,7 +1367,7 @@ static char device_status_char(struct mirror *m)
1362 1367
1363 1368
1364static int mirror_status(struct dm_target *ti, status_type_t type, 1369static int mirror_status(struct dm_target *ti, status_type_t type,
1365 char *result, unsigned int maxlen) 1370 unsigned status_flags, char *result, unsigned maxlen)
1366{ 1371{
1367 unsigned int m, sz = 0; 1372 unsigned int m, sz = 0;
1368 struct mirror_set *ms = (struct mirror_set *) ti->private; 1373 struct mirror_set *ms = (struct mirror_set *) ti->private;
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 7771ed212182..69732e03eb34 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -404,6 +404,9 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
404 return; 404 return;
405 } 405 }
406 406
407 if (bio->bi_rw & REQ_DISCARD)
408 return;
409
407 /* We must inform the log that the sync count has changed. */ 410 /* We must inform the log that the sync count has changed. */
408 log->type->set_region_sync(log, region, 0); 411 log->type->set_region_sync(log, region, 0);
409 412
@@ -524,7 +527,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
524 struct bio *bio; 527 struct bio *bio;
525 528
526 for (bio = bios->head; bio; bio = bio->bi_next) { 529 for (bio = bios->head; bio; bio = bio->bi_next) {
527 if (bio->bi_rw & REQ_FLUSH) 530 if (bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))
528 continue; 531 continue;
529 rh_inc(rh, dm_rh_bio_to_region(rh, bio)); 532 rh_inc(rh, dm_rh_bio_to_region(rh, bio));
530 } 533 }
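Both dm-raid1.c and dm-region-hash.c above now treat discards like flushes for region bookkeeping: such bios are skipped when pending counts are incremented and when a region is marked out of sync, since a discard is not tracked per region the way an ordinary write is. A minimal flag-test sketch of that filtering; the flag values are invented for illustration, the real REQ_* flags come from the block layer.

    #include <stdio.h>

    #define REQ_FLUSH   (1u << 0)   /* invented values, illustration only */
    #define REQ_DISCARD (1u << 1)
    #define REQ_WRITE   (1u << 2)

    static int counts_for_region(unsigned bi_rw)
    {
        return !(bi_rw & (REQ_FLUSH | REQ_DISCARD));
    }

    int main(void)
    {
        unsigned bios[] = { REQ_WRITE, REQ_WRITE | REQ_FLUSH,
                            REQ_WRITE | REQ_DISCARD };
        unsigned i, pending = 0;

        for (i = 0; i < 3; i++)
            if (counts_for_region(bios[i]))
                pending++;          /* rh_inc() in the real code */

        printf("pending regions incremented: %u\n", pending);
        return 0;
    }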
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 6f758870fc19..a143921feaf6 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -691,7 +691,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
691 * Return a minimum chunk size of all snapshots that have the specified origin. 691 * Return a minimum chunk size of all snapshots that have the specified origin.
692 * Return zero if the origin has no snapshots. 692 * Return zero if the origin has no snapshots.
693 */ 693 */
694static sector_t __minimum_chunk_size(struct origin *o) 694static uint32_t __minimum_chunk_size(struct origin *o)
695{ 695{
696 struct dm_snapshot *snap; 696 struct dm_snapshot *snap;
697 unsigned chunk_size = 0; 697 unsigned chunk_size = 0;
@@ -701,7 +701,7 @@ static sector_t __minimum_chunk_size(struct origin *o)
701 chunk_size = min_not_zero(chunk_size, 701 chunk_size = min_not_zero(chunk_size,
702 snap->store->chunk_size); 702 snap->store->chunk_size);
703 703
704 return chunk_size; 704 return (uint32_t) chunk_size;
705} 705}
706 706
707/* 707/*
@@ -1172,7 +1172,10 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1172 ti->error = "Chunk size not set"; 1172 ti->error = "Chunk size not set";
1173 goto bad_read_metadata; 1173 goto bad_read_metadata;
1174 } 1174 }
1175 ti->split_io = s->store->chunk_size; 1175
1176 r = dm_set_target_max_io_len(ti, s->store->chunk_size);
1177 if (r)
1178 goto bad_read_metadata;
1176 1179
1177 return 0; 1180 return 0;
1178 1181
@@ -1239,7 +1242,7 @@ static void __handover_exceptions(struct dm_snapshot *snap_src,
1239 snap_dest->store->snap = snap_dest; 1242 snap_dest->store->snap = snap_dest;
1240 snap_src->store->snap = snap_src; 1243 snap_src->store->snap = snap_src;
1241 1244
1242 snap_dest->ti->split_io = snap_dest->store->chunk_size; 1245 snap_dest->ti->max_io_len = snap_dest->store->chunk_size;
1243 snap_dest->valid = snap_src->valid; 1246 snap_dest->valid = snap_src->valid;
1244 1247
1245 /* 1248 /*
@@ -1817,9 +1820,9 @@ static void snapshot_resume(struct dm_target *ti)
1817 up_write(&s->lock); 1820 up_write(&s->lock);
1818} 1821}
1819 1822
1820static sector_t get_origin_minimum_chunksize(struct block_device *bdev) 1823static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
1821{ 1824{
1822 sector_t min_chunksize; 1825 uint32_t min_chunksize;
1823 1826
1824 down_read(&_origins_lock); 1827 down_read(&_origins_lock);
1825 min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); 1828 min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
@@ -1838,15 +1841,15 @@ static void snapshot_merge_resume(struct dm_target *ti)
1838 snapshot_resume(ti); 1841 snapshot_resume(ti);
1839 1842
1840 /* 1843 /*
1841 * snapshot-merge acts as an origin, so set ti->split_io 1844 * snapshot-merge acts as an origin, so set ti->max_io_len
1842 */ 1845 */
1843 ti->split_io = get_origin_minimum_chunksize(s->origin->bdev); 1846 ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev);
1844 1847
1845 start_merge(s); 1848 start_merge(s);
1846} 1849}
1847 1850
1848static int snapshot_status(struct dm_target *ti, status_type_t type, 1851static int snapshot_status(struct dm_target *ti, status_type_t type,
1849 char *result, unsigned int maxlen) 1852 unsigned status_flags, char *result, unsigned maxlen)
1850{ 1853{
1851 unsigned sz = 0; 1854 unsigned sz = 0;
1852 struct dm_snapshot *snap = ti->private; 1855 struct dm_snapshot *snap = ti->private;
@@ -2073,12 +2076,12 @@ static int origin_write_extent(struct dm_snapshot *merging_snap,
2073 struct origin *o; 2076 struct origin *o;
2074 2077
2075 /* 2078 /*
2076 * The origin's __minimum_chunk_size() got stored in split_io 2079 * The origin's __minimum_chunk_size() got stored in max_io_len
2077 * by snapshot_merge_resume(). 2080 * by snapshot_merge_resume().
2078 */ 2081 */
2079 down_read(&_origins_lock); 2082 down_read(&_origins_lock);
2080 o = __lookup_origin(merging_snap->origin->bdev); 2083 o = __lookup_origin(merging_snap->origin->bdev);
2081 for (n = 0; n < size; n += merging_snap->ti->split_io) 2084 for (n = 0; n < size; n += merging_snap->ti->max_io_len)
2082 if (__origin_write(&o->snapshots, sector + n, NULL) == 2085 if (__origin_write(&o->snapshots, sector + n, NULL) ==
2083 DM_MAPIO_SUBMITTED) 2086 DM_MAPIO_SUBMITTED)
2084 must_wait = 1; 2087 must_wait = 1;
@@ -2138,18 +2141,18 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
2138} 2141}
2139 2142
2140/* 2143/*
2141 * Set the target "split_io" field to the minimum of all the snapshots' 2144 * Set the target "max_io_len" field to the minimum of all the snapshots'
2142 * chunk sizes. 2145 * chunk sizes.
2143 */ 2146 */
2144static void origin_resume(struct dm_target *ti) 2147static void origin_resume(struct dm_target *ti)
2145{ 2148{
2146 struct dm_dev *dev = ti->private; 2149 struct dm_dev *dev = ti->private;
2147 2150
2148 ti->split_io = get_origin_minimum_chunksize(dev->bdev); 2151 ti->max_io_len = get_origin_minimum_chunksize(dev->bdev);
2149} 2152}
2150 2153
2151static int origin_status(struct dm_target *ti, status_type_t type, char *result, 2154static int origin_status(struct dm_target *ti, status_type_t type,
2152 unsigned int maxlen) 2155 unsigned status_flags, char *result, unsigned maxlen)
2153{ 2156{
2154 struct dm_dev *dev = ti->private; 2157 struct dm_dev *dev = ti->private;
2155 2158
@@ -2176,7 +2179,6 @@ static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2176 return max_size; 2179 return max_size;
2177 2180
2178 bvm->bi_bdev = dev->bdev; 2181 bvm->bi_bdev = dev->bdev;
2179 bvm->bi_sector = bvm->bi_sector;
2180 2182
2181 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2183 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2182} 2184}
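Throughout dm-snap.c the old ti->split_io field becomes ti->max_io_len, set through dm_set_target_max_io_len() wherever a failure can still be reported, and the origin paths keep using it as the stride when walking a write extent chunk by chunk, as origin_write_extent() does above. A toy illustration of that stride loop and of the min_not_zero-style minimum over several snapshot chunk sizes; all values are invented for the example.

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t min_not_zero(uint32_t a, uint32_t b)
    {
        if (!a)
            return b;
        if (!b)
            return a;
        return a < b ? a : b;
    }

    int main(void)
    {
        uint32_t chunk_sizes[] = { 16, 0, 8, 32 };  /* sectors, invented */
        uint32_t max_io_len = 0;
        uint64_t sector = 1024, size = 40;          /* extent to check */
        unsigned i;
        uint64_t n;

        for (i = 0; i < 4; i++)
            max_io_len = min_not_zero(max_io_len, chunk_sizes[i]);

        printf("minimum chunk size: %u sectors\n", max_io_len);

        /* walk the extent one max_io_len-sized step at a time */
        for (n = 0; n < size; n += max_io_len)
            printf("check origin write at sector %llu\n",
                   (unsigned long long)(sector + n));
        return 0;
    }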
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 35c94ff24ad5..a087bf2a8d66 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -26,14 +26,12 @@ struct stripe {
26struct stripe_c { 26struct stripe_c {
27 uint32_t stripes; 27 uint32_t stripes;
28 int stripes_shift; 28 int stripes_shift;
29 sector_t stripes_mask;
30 29
31 /* The size of this target / num. stripes */ 30 /* The size of this target / num. stripes */
32 sector_t stripe_width; 31 sector_t stripe_width;
33 32
34 /* stripe chunk size */ 33 uint32_t chunk_size;
35 uint32_t chunk_shift; 34 int chunk_size_shift;
36 sector_t chunk_mask;
37 35
38 /* Needed for handling events */ 36 /* Needed for handling events */
39 struct dm_target *ti; 37 struct dm_target *ti;
@@ -91,7 +89,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
91 89
92/* 90/*
93 * Construct a striped mapping. 91 * Construct a striped mapping.
94 * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+ 92 * <number of stripes> <chunk size> [<dev_path> <offset>]+
95 */ 93 */
96static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) 94static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
97{ 95{
@@ -99,7 +97,6 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
99 sector_t width; 97 sector_t width;
100 uint32_t stripes; 98 uint32_t stripes;
101 uint32_t chunk_size; 99 uint32_t chunk_size;
102 char *end;
103 int r; 100 int r;
104 unsigned int i; 101 unsigned int i;
105 102
@@ -108,34 +105,23 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
108 return -EINVAL; 105 return -EINVAL;
109 } 106 }
110 107
111 stripes = simple_strtoul(argv[0], &end, 10); 108 if (kstrtouint(argv[0], 10, &stripes) || !stripes) {
112 if (!stripes || *end) {
113 ti->error = "Invalid stripe count"; 109 ti->error = "Invalid stripe count";
114 return -EINVAL; 110 return -EINVAL;
115 } 111 }
116 112
117 chunk_size = simple_strtoul(argv[1], &end, 10); 113 if (kstrtouint(argv[1], 10, &chunk_size) || !chunk_size) {
118 if (*end) {
119 ti->error = "Invalid chunk_size"; 114 ti->error = "Invalid chunk_size";
120 return -EINVAL; 115 return -EINVAL;
121 } 116 }
122 117
123 /* 118 width = ti->len;
124 * chunk_size is a power of two 119 if (sector_div(width, chunk_size)) {
125 */
126 if (!is_power_of_2(chunk_size) ||
127 (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) {
128 ti->error = "Invalid chunk size";
129 return -EINVAL;
130 }
131
132 if (ti->len & (chunk_size - 1)) {
133 ti->error = "Target length not divisible by " 120 ti->error = "Target length not divisible by "
134 "chunk size"; 121 "chunk size";
135 return -EINVAL; 122 return -EINVAL;
136 } 123 }
137 124
138 width = ti->len;
139 if (sector_div(width, stripes)) { 125 if (sector_div(width, stripes)) {
140 ti->error = "Target length not divisible by " 126 ti->error = "Target length not divisible by "
141 "number of stripes"; 127 "number of stripes";
@@ -167,17 +153,21 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
167 153
168 if (stripes & (stripes - 1)) 154 if (stripes & (stripes - 1))
169 sc->stripes_shift = -1; 155 sc->stripes_shift = -1;
170 else { 156 else
171 sc->stripes_shift = ffs(stripes) - 1; 157 sc->stripes_shift = __ffs(stripes);
172 sc->stripes_mask = ((sector_t) stripes) - 1; 158
173 } 159 r = dm_set_target_max_io_len(ti, chunk_size);
160 if (r)
161 return r;
174 162
175 ti->split_io = chunk_size;
176 ti->num_flush_requests = stripes; 163 ti->num_flush_requests = stripes;
177 ti->num_discard_requests = stripes; 164 ti->num_discard_requests = stripes;
178 165
179 sc->chunk_shift = ffs(chunk_size) - 1; 166 sc->chunk_size = chunk_size;
180 sc->chunk_mask = ((sector_t) chunk_size) - 1; 167 if (chunk_size & (chunk_size - 1))
168 sc->chunk_size_shift = -1;
169 else
170 sc->chunk_size_shift = __ffs(chunk_size);
181 171
182 /* 172 /*
183 * Get the stripe destinations. 173 * Get the stripe destinations.
@@ -216,17 +206,29 @@ static void stripe_dtr(struct dm_target *ti)
216static void stripe_map_sector(struct stripe_c *sc, sector_t sector, 206static void stripe_map_sector(struct stripe_c *sc, sector_t sector,
217 uint32_t *stripe, sector_t *result) 207 uint32_t *stripe, sector_t *result)
218{ 208{
219 sector_t offset = dm_target_offset(sc->ti, sector); 209 sector_t chunk = dm_target_offset(sc->ti, sector);
220 sector_t chunk = offset >> sc->chunk_shift; 210 sector_t chunk_offset;
211
212 if (sc->chunk_size_shift < 0)
213 chunk_offset = sector_div(chunk, sc->chunk_size);
214 else {
215 chunk_offset = chunk & (sc->chunk_size - 1);
216 chunk >>= sc->chunk_size_shift;
217 }
221 218
222 if (sc->stripes_shift < 0) 219 if (sc->stripes_shift < 0)
223 *stripe = sector_div(chunk, sc->stripes); 220 *stripe = sector_div(chunk, sc->stripes);
224 else { 221 else {
225 *stripe = chunk & sc->stripes_mask; 222 *stripe = chunk & (sc->stripes - 1);
226 chunk >>= sc->stripes_shift; 223 chunk >>= sc->stripes_shift;
227 } 224 }
228 225
229 *result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask); 226 if (sc->chunk_size_shift < 0)
227 chunk *= sc->chunk_size;
228 else
229 chunk <<= sc->chunk_size_shift;
230
231 *result = chunk + chunk_offset;
230} 232}
231 233
232static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, 234static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector,
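stripe_map_sector() above now handles chunk sizes that are not powers of two: when chunk_size_shift is negative it falls back to sector_div(), otherwise it keeps the old shift-and-mask fast path. The user-space sketch below mirrors the generic path with plain division and modulo (sector values invented); for a power-of-two chunk size it gives the same answer the old shift/mask code did, and it also copes with a chunk size such as 96 sectors that the previous constructor rejected outright.

    #include <stdio.h>
    #include <stdint.h>

    /* Map a target-relative sector to (stripe, sector within that stripe device). */
    static void map_sector(uint64_t offset, unsigned chunk_size, unsigned stripes,
                           unsigned *stripe, uint64_t *result)
    {
        uint64_t chunk = offset / chunk_size;
        uint64_t chunk_offset = offset % chunk_size;

        *stripe = chunk % stripes;
        chunk /= stripes;

        *result = chunk * chunk_size + chunk_offset;
    }

    int main(void)
    {
        unsigned stripe;
        uint64_t result;

        map_sector(1000, 128, 4, &stripe, &result);   /* power-of-two chunk */
        printf("chunk 128: stripe=%u sector=%llu\n", stripe,
               (unsigned long long)result);

        map_sector(1000, 96, 4, &stripe, &result);    /* non power-of-two */
        printf("chunk  96: stripe=%u sector=%llu\n", stripe,
               (unsigned long long)result);
        return 0;
    }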
@@ -237,9 +239,16 @@ static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector,
237 stripe_map_sector(sc, sector, &stripe, result); 239 stripe_map_sector(sc, sector, &stripe, result);
238 if (stripe == target_stripe) 240 if (stripe == target_stripe)
239 return; 241 return;
240 *result &= ~sc->chunk_mask; /* round down */ 242
243 /* round down */
244 sector = *result;
245 if (sc->chunk_size_shift < 0)
246 *result -= sector_div(sector, sc->chunk_size);
247 else
248 *result = sector & ~(sector_t)(sc->chunk_size - 1);
249
241 if (target_stripe < stripe) 250 if (target_stripe < stripe)
242 *result += sc->chunk_mask + 1; /* next chunk */ 251 *result += sc->chunk_size; /* next chunk */
243} 252}
244 253
245static int stripe_map_discard(struct stripe_c *sc, struct bio *bio, 254static int stripe_map_discard(struct stripe_c *sc, struct bio *bio,
@@ -302,8 +311,8 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
302 * 311 *
303 */ 312 */
304 313
305static int stripe_status(struct dm_target *ti, 314static int stripe_status(struct dm_target *ti, status_type_t type,
306 status_type_t type, char *result, unsigned int maxlen) 315 unsigned status_flags, char *result, unsigned maxlen)
307{ 316{
308 struct stripe_c *sc = (struct stripe_c *) ti->private; 317 struct stripe_c *sc = (struct stripe_c *) ti->private;
309 char buffer[sc->stripes + 1]; 318 char buffer[sc->stripes + 1];
@@ -324,7 +333,7 @@ static int stripe_status(struct dm_target *ti,
324 333
325 case STATUSTYPE_TABLE: 334 case STATUSTYPE_TABLE:
326 DMEMIT("%d %llu", sc->stripes, 335 DMEMIT("%d %llu", sc->stripes,
327 (unsigned long long)sc->chunk_mask + 1); 336 (unsigned long long)sc->chunk_size);
328 for (i = 0; i < sc->stripes; i++) 337 for (i = 0; i < sc->stripes; i++)
329 DMEMIT(" %s %llu", sc->stripe[i].dev->name, 338 DMEMIT(" %s %llu", sc->stripe[i].dev->name,
330 (unsigned long long)sc->stripe[i].physical_start); 339 (unsigned long long)sc->stripe[i].physical_start);
@@ -391,7 +400,7 @@ static void stripe_io_hints(struct dm_target *ti,
391 struct queue_limits *limits) 400 struct queue_limits *limits)
392{ 401{
393 struct stripe_c *sc = ti->private; 402 struct stripe_c *sc = ti->private;
394 unsigned chunk_size = (sc->chunk_mask + 1) << 9; 403 unsigned chunk_size = sc->chunk_size << SECTOR_SHIFT;
395 404
396 blk_limits_io_min(limits, chunk_size); 405 blk_limits_io_min(limits, chunk_size);
397 blk_limits_io_opt(limits, chunk_size * sc->stripes); 406 blk_limits_io_opt(limits, chunk_size * sc->stripes);
@@ -419,7 +428,7 @@ static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
419 428
420static struct target_type stripe_target = { 429static struct target_type stripe_target = {
421 .name = "striped", 430 .name = "striped",
422 .version = {1, 4, 0}, 431 .version = {1, 5, 0},
423 .module = THIS_MODULE, 432 .module = THIS_MODULE,
424 .ctr = stripe_ctr, 433 .ctr = stripe_ctr,
425 .dtr = stripe_dtr, 434 .dtr = stripe_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2e227fbf1622..f90069029aae 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1319,6 +1319,9 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
1319 if (!ti->num_flush_requests) 1319 if (!ti->num_flush_requests)
1320 continue; 1320 continue;
1321 1321
1322 if (ti->flush_supported)
1323 return 1;
1324
1322 if (ti->type->iterate_devices && 1325 if (ti->type->iterate_devices &&
1323 ti->type->iterate_devices(ti, device_flush_capable, &flush)) 1326 ti->type->iterate_devices(ti, device_flush_capable, &flush))
1324 return 1; 1327 return 1;
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 3e2907f0bc46..693e149e9727 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2011 Red Hat, Inc. 2 * Copyright (C) 2011-2012 Red Hat, Inc.
3 * 3 *
4 * This file is released under the GPL. 4 * This file is released under the GPL.
5 */ 5 */
@@ -80,6 +80,12 @@
80#define THIN_METADATA_CACHE_SIZE 64 80#define THIN_METADATA_CACHE_SIZE 64
81#define SECTOR_TO_BLOCK_SHIFT 3 81#define SECTOR_TO_BLOCK_SHIFT 3
82 82
83/*
84 * 3 for btree insert +
85 * 2 for btree lookup used within space map
86 */
87#define THIN_MAX_CONCURRENT_LOCKS 5
88
83/* This should be plenty */ 89/* This should be plenty */
84#define SPACE_MAP_ROOT_SIZE 128 90#define SPACE_MAP_ROOT_SIZE 128
85 91
@@ -172,13 +178,20 @@ struct dm_pool_metadata {
172 178
173 struct rw_semaphore root_lock; 179 struct rw_semaphore root_lock;
174 uint32_t time; 180 uint32_t time;
175 int need_commit;
176 dm_block_t root; 181 dm_block_t root;
177 dm_block_t details_root; 182 dm_block_t details_root;
178 struct list_head thin_devices; 183 struct list_head thin_devices;
179 uint64_t trans_id; 184 uint64_t trans_id;
180 unsigned long flags; 185 unsigned long flags;
181 sector_t data_block_size; 186 sector_t data_block_size;
187 bool read_only:1;
188
189 /*
190 * Set if a transaction has to be aborted but the attempt to roll back
191 * to the previous (good) transaction failed. The only pool metadata
192 * operation possible in this state is the closing of the device.
193 */
194 bool fail_io:1;
182}; 195};
183 196
184struct dm_thin_device { 197struct dm_thin_device {
@@ -187,7 +200,8 @@ struct dm_thin_device {
187 dm_thin_id id; 200 dm_thin_id id;
188 201
189 int open_count; 202 int open_count;
190 int changed; 203 bool changed:1;
204 bool aborted_with_changes:1;
191 uint64_t mapped_blocks; 205 uint64_t mapped_blocks;
192 uint64_t transaction_id; 206 uint64_t transaction_id;
193 uint32_t creation_time; 207 uint32_t creation_time;
@@ -338,7 +352,21 @@ static int subtree_equal(void *context, void *value1_le, void *value2_le)
338 352
339/*----------------------------------------------------------------*/ 353/*----------------------------------------------------------------*/
340 354
341static int superblock_all_zeroes(struct dm_block_manager *bm, int *result) 355static int superblock_lock_zero(struct dm_pool_metadata *pmd,
356 struct dm_block **sblock)
357{
358 return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
359 &sb_validator, sblock);
360}
361
362static int superblock_lock(struct dm_pool_metadata *pmd,
363 struct dm_block **sblock)
364{
365 return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
366 &sb_validator, sblock);
367}
368
369static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
342{ 370{
343 int r; 371 int r;
344 unsigned i; 372 unsigned i;
@@ -365,72 +393,9 @@ static int superblock_all_zeroes(struct dm_block_manager *bm, int *result)
365 return dm_bm_unlock(b); 393 return dm_bm_unlock(b);
366} 394}
367 395
368static int init_pmd(struct dm_pool_metadata *pmd, 396static void __setup_btree_details(struct dm_pool_metadata *pmd)
369 struct dm_block_manager *bm,
370 dm_block_t nr_blocks, int create)
371{ 397{
372 int r; 398 pmd->info.tm = pmd->tm;
373 struct dm_space_map *sm, *data_sm;
374 struct dm_transaction_manager *tm;
375 struct dm_block *sblock;
376
377 if (create) {
378 r = dm_tm_create_with_sm(bm, THIN_SUPERBLOCK_LOCATION,
379 &sb_validator, &tm, &sm, &sblock);
380 if (r < 0) {
381 DMERR("tm_create_with_sm failed");
382 return r;
383 }
384
385 data_sm = dm_sm_disk_create(tm, nr_blocks);
386 if (IS_ERR(data_sm)) {
387 DMERR("sm_disk_create failed");
388 dm_tm_unlock(tm, sblock);
389 r = PTR_ERR(data_sm);
390 goto bad;
391 }
392 } else {
393 struct thin_disk_superblock *disk_super = NULL;
394 size_t space_map_root_offset =
395 offsetof(struct thin_disk_superblock, metadata_space_map_root);
396
397 r = dm_tm_open_with_sm(bm, THIN_SUPERBLOCK_LOCATION,
398 &sb_validator, space_map_root_offset,
399 SPACE_MAP_ROOT_SIZE, &tm, &sm, &sblock);
400 if (r < 0) {
401 DMERR("tm_open_with_sm failed");
402 return r;
403 }
404
405 disk_super = dm_block_data(sblock);
406 data_sm = dm_sm_disk_open(tm, disk_super->data_space_map_root,
407 sizeof(disk_super->data_space_map_root));
408 if (IS_ERR(data_sm)) {
409 DMERR("sm_disk_open failed");
410 r = PTR_ERR(data_sm);
411 goto bad;
412 }
413 }
414
415
416 r = dm_tm_unlock(tm, sblock);
417 if (r < 0) {
418 DMERR("couldn't unlock superblock");
419 goto bad_data_sm;
420 }
421
422 pmd->bm = bm;
423 pmd->metadata_sm = sm;
424 pmd->data_sm = data_sm;
425 pmd->tm = tm;
426 pmd->nb_tm = dm_tm_create_non_blocking_clone(tm);
427 if (!pmd->nb_tm) {
428 DMERR("could not create clone tm");
429 r = -ENOMEM;
430 goto bad_data_sm;
431 }
432
433 pmd->info.tm = tm;
434 pmd->info.levels = 2; 399 pmd->info.levels = 2;
435 pmd->info.value_type.context = pmd->data_sm; 400 pmd->info.value_type.context = pmd->data_sm;
436 pmd->info.value_type.size = sizeof(__le64); 401 pmd->info.value_type.size = sizeof(__le64);
@@ -441,7 +406,7 @@ static int init_pmd(struct dm_pool_metadata *pmd,
441 memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info)); 406 memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
442 pmd->nb_info.tm = pmd->nb_tm; 407 pmd->nb_info.tm = pmd->nb_tm;
443 408
444 pmd->tl_info.tm = tm; 409 pmd->tl_info.tm = pmd->tm;
445 pmd->tl_info.levels = 1; 410 pmd->tl_info.levels = 1;
446 pmd->tl_info.value_type.context = &pmd->info; 411 pmd->tl_info.value_type.context = &pmd->info;
447 pmd->tl_info.value_type.size = sizeof(__le64); 412 pmd->tl_info.value_type.size = sizeof(__le64);
@@ -449,7 +414,7 @@ static int init_pmd(struct dm_pool_metadata *pmd,
449 pmd->tl_info.value_type.dec = subtree_dec; 414 pmd->tl_info.value_type.dec = subtree_dec;
450 pmd->tl_info.value_type.equal = subtree_equal; 415 pmd->tl_info.value_type.equal = subtree_equal;
451 416
452 pmd->bl_info.tm = tm; 417 pmd->bl_info.tm = pmd->tm;
453 pmd->bl_info.levels = 1; 418 pmd->bl_info.levels = 1;
454 pmd->bl_info.value_type.context = pmd->data_sm; 419 pmd->bl_info.value_type.context = pmd->data_sm;
455 pmd->bl_info.value_type.size = sizeof(__le64); 420 pmd->bl_info.value_type.size = sizeof(__le64);
@@ -457,48 +422,266 @@ static int init_pmd(struct dm_pool_metadata *pmd,
457 pmd->bl_info.value_type.dec = data_block_dec; 422 pmd->bl_info.value_type.dec = data_block_dec;
458 pmd->bl_info.value_type.equal = data_block_equal; 423 pmd->bl_info.value_type.equal = data_block_equal;
459 424
460 pmd->details_info.tm = tm; 425 pmd->details_info.tm = pmd->tm;
461 pmd->details_info.levels = 1; 426 pmd->details_info.levels = 1;
462 pmd->details_info.value_type.context = NULL; 427 pmd->details_info.value_type.context = NULL;
463 pmd->details_info.value_type.size = sizeof(struct disk_device_details); 428 pmd->details_info.value_type.size = sizeof(struct disk_device_details);
464 pmd->details_info.value_type.inc = NULL; 429 pmd->details_info.value_type.inc = NULL;
465 pmd->details_info.value_type.dec = NULL; 430 pmd->details_info.value_type.dec = NULL;
466 pmd->details_info.value_type.equal = NULL; 431 pmd->details_info.value_type.equal = NULL;
432}
467 433
468 pmd->root = 0; 434static int __write_initial_superblock(struct dm_pool_metadata *pmd)
435{
436 int r;
437 struct dm_block *sblock;
438 size_t metadata_len, data_len;
439 struct thin_disk_superblock *disk_super;
440 sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
469 441
470 init_rwsem(&pmd->root_lock); 442 if (bdev_size > THIN_METADATA_MAX_SECTORS)
471 pmd->time = 0; 443 bdev_size = THIN_METADATA_MAX_SECTORS;
472 pmd->need_commit = 0; 444
473 pmd->details_root = 0; 445 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
474 pmd->trans_id = 0; 446 if (r < 0)
475 pmd->flags = 0; 447 return r;
476 INIT_LIST_HEAD(&pmd->thin_devices); 448
449 r = dm_sm_root_size(pmd->data_sm, &data_len);
450 if (r < 0)
451 return r;
452
453 r = dm_sm_commit(pmd->data_sm);
454 if (r < 0)
455 return r;
456
457 r = dm_tm_pre_commit(pmd->tm);
458 if (r < 0)
459 return r;
460
461 r = superblock_lock_zero(pmd, &sblock);
462 if (r)
463 return r;
464
465 disk_super = dm_block_data(sblock);
466 disk_super->flags = 0;
467 memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
468 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
469 disk_super->version = cpu_to_le32(THIN_VERSION);
470 disk_super->time = 0;
471 disk_super->trans_id = 0;
472 disk_super->held_root = 0;
473
474 r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
475 metadata_len);
476 if (r < 0)
477 goto bad_locked;
478
479 r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
480 data_len);
481 if (r < 0)
482 goto bad_locked;
483
484 disk_super->data_mapping_root = cpu_to_le64(pmd->root);
485 disk_super->device_details_root = cpu_to_le64(pmd->details_root);
486 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
487 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
488 disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
489
490 return dm_tm_commit(pmd->tm, sblock);
491
492bad_locked:
493 dm_bm_unlock(sblock);
494 return r;
495}
496
497static int __format_metadata(struct dm_pool_metadata *pmd)
498{
499 int r;
500
501 r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
502 &pmd->tm, &pmd->metadata_sm);
503 if (r < 0) {
504 DMERR("tm_create_with_sm failed");
505 return r;
506 }
507
508 pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
509 if (IS_ERR(pmd->data_sm)) {
510 DMERR("sm_disk_create failed");
511 r = PTR_ERR(pmd->data_sm);
512 goto bad_cleanup_tm;
513 }
514
515 pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
516 if (!pmd->nb_tm) {
517 DMERR("could not create non-blocking clone tm");
518 r = -ENOMEM;
519 goto bad_cleanup_data_sm;
520 }
521
522 __setup_btree_details(pmd);
523
524 r = dm_btree_empty(&pmd->info, &pmd->root);
525 if (r < 0)
526 goto bad_cleanup_nb_tm;
527
528 r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
529 if (r < 0) {
530 DMERR("couldn't create devices root");
531 goto bad_cleanup_nb_tm;
532 }
533
534 r = __write_initial_superblock(pmd);
535 if (r)
536 goto bad_cleanup_nb_tm;
477 537
478 return 0; 538 return 0;
479 539
480bad_data_sm: 540bad_cleanup_nb_tm:
481 dm_sm_destroy(data_sm); 541 dm_tm_destroy(pmd->nb_tm);
482bad: 542bad_cleanup_data_sm:
483 dm_tm_destroy(tm); 543 dm_sm_destroy(pmd->data_sm);
484 dm_sm_destroy(sm); 544bad_cleanup_tm:
545 dm_tm_destroy(pmd->tm);
546 dm_sm_destroy(pmd->metadata_sm);
547
548 return r;
549}
550
551static int __check_incompat_features(struct thin_disk_superblock *disk_super,
552 struct dm_pool_metadata *pmd)
553{
554 uint32_t features;
555
556 features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
557 if (features) {
558 DMERR("could not access metadata due to unsupported optional features (%lx).",
559 (unsigned long)features);
560 return -EINVAL;
561 }
562
563 /*
564 * Check for read-only metadata to skip the following RDWR checks.
565 */
566 if (get_disk_ro(pmd->bdev->bd_disk))
567 return 0;
568
569 features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
570 if (features) {
571 DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
572 (unsigned long)features);
573 return -EINVAL;
574 }
575
576 return 0;
577}
578
579static int __open_metadata(struct dm_pool_metadata *pmd)
580{
581 int r;
582 struct dm_block *sblock;
583 struct thin_disk_superblock *disk_super;
584
585 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
586 &sb_validator, &sblock);
587 if (r < 0) {
588 DMERR("couldn't read superblock");
589 return r;
590 }
591
592 disk_super = dm_block_data(sblock);
593
594 r = __check_incompat_features(disk_super, pmd);
595 if (r < 0)
596 goto bad_unlock_sblock;
597
598 r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
599 disk_super->metadata_space_map_root,
600 sizeof(disk_super->metadata_space_map_root),
601 &pmd->tm, &pmd->metadata_sm);
602 if (r < 0) {
603 DMERR("tm_open_with_sm failed");
604 goto bad_unlock_sblock;
605 }
606
607 pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
608 sizeof(disk_super->data_space_map_root));
609 if (IS_ERR(pmd->data_sm)) {
610 DMERR("sm_disk_open failed");
611 r = PTR_ERR(pmd->data_sm);
612 goto bad_cleanup_tm;
613 }
614
615 pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
616 if (!pmd->nb_tm) {
617 DMERR("could not create non-blocking clone tm");
618 r = -ENOMEM;
619 goto bad_cleanup_data_sm;
620 }
621
622 __setup_btree_details(pmd);
623 return dm_bm_unlock(sblock);
624
625bad_cleanup_data_sm:
626 dm_sm_destroy(pmd->data_sm);
627bad_cleanup_tm:
628 dm_tm_destroy(pmd->tm);
629 dm_sm_destroy(pmd->metadata_sm);
630bad_unlock_sblock:
631 dm_bm_unlock(sblock);
632
633 return r;
634}
635
636static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
637{
638 int r, unformatted;
639
640 r = __superblock_all_zeroes(pmd->bm, &unformatted);
641 if (r)
642 return r;
643
644 if (unformatted)
645 return format_device ? __format_metadata(pmd) : -EPERM;
646
647 return __open_metadata(pmd);
648}
649
650static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
651{
652 int r;
653
654 pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE,
655 THIN_METADATA_CACHE_SIZE,
656 THIN_MAX_CONCURRENT_LOCKS);
657 if (IS_ERR(pmd->bm)) {
658 DMERR("could not create block manager");
659 return PTR_ERR(pmd->bm);
660 }
661
662 r = __open_or_format_metadata(pmd, format_device);
663 if (r)
664 dm_block_manager_destroy(pmd->bm);
485 665
486 return r; 666 return r;
487} 667}
488 668
669static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
670{
671 dm_sm_destroy(pmd->data_sm);
672 dm_sm_destroy(pmd->metadata_sm);
673 dm_tm_destroy(pmd->nb_tm);
674 dm_tm_destroy(pmd->tm);
675 dm_block_manager_destroy(pmd->bm);
676}
677
489static int __begin_transaction(struct dm_pool_metadata *pmd) 678static int __begin_transaction(struct dm_pool_metadata *pmd)
490{ 679{
491 int r; 680 int r;
492 u32 features;
493 struct thin_disk_superblock *disk_super; 681 struct thin_disk_superblock *disk_super;
494 struct dm_block *sblock; 682 struct dm_block *sblock;
495 683
496 /* 684 /*
497 * __maybe_commit_transaction() resets these
498 */
499 WARN_ON(pmd->need_commit);
500
501 /*
502 * We re-read the superblock every time. Shouldn't need to do this 685 * We re-read the superblock every time. Shouldn't need to do this
503 * really. 686 * really.
504 */ 687 */
@@ -515,32 +698,8 @@ static int __begin_transaction(struct dm_pool_metadata *pmd)
515 pmd->flags = le32_to_cpu(disk_super->flags); 698 pmd->flags = le32_to_cpu(disk_super->flags);
516 pmd->data_block_size = le32_to_cpu(disk_super->data_block_size); 699 pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
517 700
518 features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
519 if (features) {
520 DMERR("could not access metadata due to "
521 "unsupported optional features (%lx).",
522 (unsigned long)features);
523 r = -EINVAL;
524 goto out;
525 }
526
527 /*
528 * Check for read-only metadata to skip the following RDWR checks.
529 */
530 if (get_disk_ro(pmd->bdev->bd_disk))
531 goto out;
532
533 features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
534 if (features) {
535 DMERR("could not access metadata RDWR due to "
536 "unsupported optional features (%lx).",
537 (unsigned long)features);
538 r = -EINVAL;
539 }
540
541out:
542 dm_bm_unlock(sblock); 701 dm_bm_unlock(sblock);
543 return r; 702 return 0;
544} 703}
545 704
546static int __write_changed_details(struct dm_pool_metadata *pmd) 705static int __write_changed_details(struct dm_pool_metadata *pmd)
@@ -573,8 +732,6 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
573 list_del(&td->list); 732 list_del(&td->list);
574 kfree(td); 733 kfree(td);
575 } 734 }
576
577 pmd->need_commit = 1;
578 } 735 }
579 736
580 return 0; 737 return 0;
@@ -582,9 +739,6 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
582 739
583static int __commit_transaction(struct dm_pool_metadata *pmd) 740static int __commit_transaction(struct dm_pool_metadata *pmd)
584{ 741{
585 /*
586 * FIXME: Associated pool should be made read-only on failure.
587 */
588 int r; 742 int r;
589 size_t metadata_len, data_len; 743 size_t metadata_len, data_len;
590 struct thin_disk_superblock *disk_super; 744 struct thin_disk_superblock *disk_super;
@@ -597,31 +751,27 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
597 751
598 r = __write_changed_details(pmd); 752 r = __write_changed_details(pmd);
599 if (r < 0) 753 if (r < 0)
600 goto out; 754 return r;
601
602 if (!pmd->need_commit)
603 goto out;
604 755
605 r = dm_sm_commit(pmd->data_sm); 756 r = dm_sm_commit(pmd->data_sm);
606 if (r < 0) 757 if (r < 0)
607 goto out; 758 return r;
608 759
609 r = dm_tm_pre_commit(pmd->tm); 760 r = dm_tm_pre_commit(pmd->tm);
610 if (r < 0) 761 if (r < 0)
611 goto out; 762 return r;
612 763
613 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); 764 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
614 if (r < 0) 765 if (r < 0)
615 goto out; 766 return r;
616 767
617 r = dm_sm_root_size(pmd->data_sm, &data_len); 768 r = dm_sm_root_size(pmd->data_sm, &data_len);
618 if (r < 0) 769 if (r < 0)
619 goto out; 770 return r;
620 771
621 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 772 r = superblock_lock(pmd, &sblock);
622 &sb_validator, &sblock);
623 if (r) 773 if (r)
624 goto out; 774 return r;
625 775
626 disk_super = dm_block_data(sblock); 776 disk_super = dm_block_data(sblock);
627 disk_super->time = cpu_to_le32(pmd->time); 777 disk_super->time = cpu_to_le32(pmd->time);
@@ -640,12 +790,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
640 if (r < 0) 790 if (r < 0)
641 goto out_locked; 791 goto out_locked;
642 792
643 r = dm_tm_commit(pmd->tm, sblock); 793 return dm_tm_commit(pmd->tm, sblock);
644 if (!r)
645 pmd->need_commit = 0;
646
647out:
648 return r;
649 794
650out_locked: 795out_locked:
651 dm_bm_unlock(sblock); 796 dm_bm_unlock(sblock);
@@ -653,15 +798,11 @@ out_locked:
653} 798}
654 799
655struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, 800struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
656 sector_t data_block_size) 801 sector_t data_block_size,
802 bool format_device)
657{ 803{
658 int r; 804 int r;
659 struct thin_disk_superblock *disk_super;
660 struct dm_pool_metadata *pmd; 805 struct dm_pool_metadata *pmd;
661 sector_t bdev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
662 struct dm_block_manager *bm;
663 int create;
664 struct dm_block *sblock;
665 806
666 pmd = kmalloc(sizeof(*pmd), GFP_KERNEL); 807 pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
667 if (!pmd) { 808 if (!pmd) {
@@ -669,90 +810,28 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
669 return ERR_PTR(-ENOMEM); 810 return ERR_PTR(-ENOMEM);
670 } 811 }
671 812
672 /* 813 init_rwsem(&pmd->root_lock);
673 * Max hex locks: 814 pmd->time = 0;
674 * 3 for btree insert + 815 INIT_LIST_HEAD(&pmd->thin_devices);
675 * 2 for btree lookup used within space map 816 pmd->read_only = false;
676 */ 817 pmd->fail_io = false;
677 bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE, 818 pmd->bdev = bdev;
678 THIN_METADATA_CACHE_SIZE, 5); 819 pmd->data_block_size = data_block_size;
679 if (!bm) {
680 DMERR("could not create block manager");
681 kfree(pmd);
682 return ERR_PTR(-ENOMEM);
683 }
684
685 r = superblock_all_zeroes(bm, &create);
686 if (r) {
687 dm_block_manager_destroy(bm);
688 kfree(pmd);
689 return ERR_PTR(r);
690 }
691
692 820
693 r = init_pmd(pmd, bm, 0, create); 821 r = __create_persistent_data_objects(pmd, format_device);
694 if (r) { 822 if (r) {
695 dm_block_manager_destroy(bm);
696 kfree(pmd); 823 kfree(pmd);
697 return ERR_PTR(r); 824 return ERR_PTR(r);
698 } 825 }
699 pmd->bdev = bdev;
700
701 if (!create) {
702 r = __begin_transaction(pmd);
703 if (r < 0)
704 goto bad;
705 return pmd;
706 }
707
708 /*
709 * Create.
710 */
711 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
712 &sb_validator, &sblock);
713 if (r)
714 goto bad;
715
716 if (bdev_size > THIN_METADATA_MAX_SECTORS)
717 bdev_size = THIN_METADATA_MAX_SECTORS;
718
719 disk_super = dm_block_data(sblock);
720 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
721 disk_super->version = cpu_to_le32(THIN_VERSION);
722 disk_super->time = 0;
723 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
724 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
725 disk_super->data_block_size = cpu_to_le32(data_block_size);
726
727 r = dm_bm_unlock(sblock);
728 if (r < 0)
729 goto bad;
730
731 r = dm_btree_empty(&pmd->info, &pmd->root);
732 if (r < 0)
733 goto bad;
734
735 r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
736 if (r < 0) {
737 DMERR("couldn't create devices root");
738 goto bad;
739 }
740 826
741 pmd->flags = 0; 827 r = __begin_transaction(pmd);
742 pmd->need_commit = 1;
743 r = dm_pool_commit_metadata(pmd);
744 if (r < 0) { 828 if (r < 0) {
745 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 829 if (dm_pool_metadata_close(pmd) < 0)
746 __func__, r); 830 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
747 goto bad; 831 return ERR_PTR(r);
748 } 832 }
749 833
750 return pmd; 834 return pmd;
751
752bad:
753 if (dm_pool_metadata_close(pmd) < 0)
754 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
755 return ERR_PTR(r);
756} 835}
757 836
758int dm_pool_metadata_close(struct dm_pool_metadata *pmd) 837int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
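dm_pool_metadata_open() now delegates the whole open-versus-format decision to __create_persistent_data_objects() and reports failure through the usual ERR_PTR()/IS_ERR()/PTR_ERR() convention instead of a local bad: label. For readers less familiar with that convention, here is a simplified, stand-alone user-space re-implementation of the idea; the real macros live in include/linux/err.h and differ in detail.

    #include <stdio.h>
    #include <stdlib.h>
    #include <errno.h>

    /* Simplified user-space versions of the kernel's ERR_PTR helpers:
     * a small negative errno is folded into an (invalid) pointer value. */
    #define MAX_ERRNO 4095

    static void *err_ptr(long error)      { return (void *)error; }
    static long  ptr_err(const void *ptr) { return (long)ptr; }
    static int   is_err(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    static void *metadata_open(int fail)
    {
        if (fail)
            return err_ptr(-ENOMEM);  /* e.g. allocation failure */
        return malloc(16);            /* stands in for the pmd */
    }

    int main(void)
    {
        void *pmd = metadata_open(1);

        if (is_err(pmd))
            printf("open failed: %ld\n", ptr_err(pmd));
        else
            free(pmd);
        return 0;
    }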
@@ -778,18 +857,17 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
778 return -EBUSY; 857 return -EBUSY;
779 } 858 }
780 859
781 r = __commit_transaction(pmd); 860 if (!pmd->read_only && !pmd->fail_io) {
782 if (r < 0) 861 r = __commit_transaction(pmd);
783 DMWARN("%s: __commit_transaction() failed, error = %d", 862 if (r < 0)
784 __func__, r); 863 DMWARN("%s: __commit_transaction() failed, error = %d",
864 __func__, r);
865 }
785 866
786 dm_tm_destroy(pmd->tm); 867 if (!pmd->fail_io)
787 dm_tm_destroy(pmd->nb_tm); 868 __destroy_persistent_data_objects(pmd);
788 dm_block_manager_destroy(pmd->bm);
789 dm_sm_destroy(pmd->metadata_sm);
790 dm_sm_destroy(pmd->data_sm);
791 kfree(pmd);
792 869
870 kfree(pmd);
793 return 0; 871 return 0;
794} 872}
795 873
@@ -850,6 +928,7 @@ static int __open_device(struct dm_pool_metadata *pmd,
850 (*td)->id = dev; 928 (*td)->id = dev;
851 (*td)->open_count = 1; 929 (*td)->open_count = 1;
852 (*td)->changed = changed; 930 (*td)->changed = changed;
931 (*td)->aborted_with_changes = false;
853 (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks); 932 (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
854 (*td)->transaction_id = le64_to_cpu(details_le.transaction_id); 933 (*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
855 (*td)->creation_time = le32_to_cpu(details_le.creation_time); 934 (*td)->creation_time = le32_to_cpu(details_le.creation_time);
@@ -911,10 +990,11 @@ static int __create_thin(struct dm_pool_metadata *pmd,
911 990
912int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) 991int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
913{ 992{
914 int r; 993 int r = -EINVAL;
915 994
916 down_write(&pmd->root_lock); 995 down_write(&pmd->root_lock);
917 r = __create_thin(pmd, dev); 996 if (!pmd->fail_io)
997 r = __create_thin(pmd, dev);
918 up_write(&pmd->root_lock); 998 up_write(&pmd->root_lock);
919 999
920 return r; 1000 return r;
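From here on, nearly every exported dm_pool_*/dm_thin_* entry point gains the same guard: initialise the return code to -EINVAL, take the metadata rwsem, and only call the internal helper when pmd->fail_io is clear, so a pool whose metadata could not be rolled back degrades into a device that refuses everything except close. A compact user-space analogue of that wrapper shape, using a pthread rwlock in place of root_lock; names and values are invented for the example.

    #include <stdio.h>
    #include <errno.h>
    #include <stdbool.h>
    #include <pthread.h>

    struct pool_md {
        pthread_rwlock_t lock;
        bool fail_io;
        unsigned long next_id;
    };

    static int __create_thin(struct pool_md *pmd)  /* stand-in helper */
    {
        pmd->next_id++;
        return 0;
    }

    static int pool_create_thin(struct pool_md *pmd)
    {
        int r = -EINVAL;  /* reported whenever fail_io is set */

        pthread_rwlock_wrlock(&pmd->lock);
        if (!pmd->fail_io)
            r = __create_thin(pmd);
        pthread_rwlock_unlock(&pmd->lock);

        return r;
    }

    int main(void)
    {
        struct pool_md pmd = { .fail_io = false, .next_id = 0 };

        pthread_rwlock_init(&pmd.lock, NULL);
        printf("healthy pool: %d\n", pool_create_thin(&pmd));
        pmd.fail_io = true;
        printf("failed  pool: %d\n", pool_create_thin(&pmd));
        return 0;
    }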
@@ -1001,10 +1081,11 @@ int dm_pool_create_snap(struct dm_pool_metadata *pmd,
1001 dm_thin_id dev, 1081 dm_thin_id dev,
1002 dm_thin_id origin) 1082 dm_thin_id origin)
1003{ 1083{
1004 int r; 1084 int r = -EINVAL;
1005 1085
1006 down_write(&pmd->root_lock); 1086 down_write(&pmd->root_lock);
1007 r = __create_snap(pmd, dev, origin); 1087 if (!pmd->fail_io)
1088 r = __create_snap(pmd, dev, origin);
1008 up_write(&pmd->root_lock); 1089 up_write(&pmd->root_lock);
1009 1090
1010 return r; 1091 return r;
@@ -1037,18 +1118,17 @@ static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
1037 if (r) 1118 if (r)
1038 return r; 1119 return r;
1039 1120
1040 pmd->need_commit = 1;
1041
1042 return 0; 1121 return 0;
1043} 1122}
1044 1123
1045int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, 1124int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
1046 dm_thin_id dev) 1125 dm_thin_id dev)
1047{ 1126{
1048 int r; 1127 int r = -EINVAL;
1049 1128
1050 down_write(&pmd->root_lock); 1129 down_write(&pmd->root_lock);
1051 r = __delete_device(pmd, dev); 1130 if (!pmd->fail_io)
1131 r = __delete_device(pmd, dev);
1052 up_write(&pmd->root_lock); 1132 up_write(&pmd->root_lock);
1053 1133
1054 return r; 1134 return r;
@@ -1058,28 +1138,40 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
1058 uint64_t current_id, 1138 uint64_t current_id,
1059 uint64_t new_id) 1139 uint64_t new_id)
1060{ 1140{
1141 int r = -EINVAL;
1142
1061 down_write(&pmd->root_lock); 1143 down_write(&pmd->root_lock);
1144
1145 if (pmd->fail_io)
1146 goto out;
1147
1062 if (pmd->trans_id != current_id) { 1148 if (pmd->trans_id != current_id) {
1063 up_write(&pmd->root_lock);
1064 DMERR("mismatched transaction id"); 1149 DMERR("mismatched transaction id");
1065 return -EINVAL; 1150 goto out;
1066 } 1151 }
1067 1152
1068 pmd->trans_id = new_id; 1153 pmd->trans_id = new_id;
1069 pmd->need_commit = 1; 1154 r = 0;
1155
1156out:
1070 up_write(&pmd->root_lock); 1157 up_write(&pmd->root_lock);
1071 1158
1072 return 0; 1159 return r;
1073} 1160}
1074 1161
1075int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, 1162int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
1076 uint64_t *result) 1163 uint64_t *result)
1077{ 1164{
1165 int r = -EINVAL;
1166
1078 down_read(&pmd->root_lock); 1167 down_read(&pmd->root_lock);
1079 *result = pmd->trans_id; 1168 if (!pmd->fail_io) {
1169 *result = pmd->trans_id;
1170 r = 0;
1171 }
1080 up_read(&pmd->root_lock); 1172 up_read(&pmd->root_lock);
1081 1173
1082 return 0; 1174 return r;
1083} 1175}
1084 1176
1085static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) 1177static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
@@ -1108,8 +1200,6 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
1108 1200
1109 dm_tm_dec(pmd->tm, held_root); 1201 dm_tm_dec(pmd->tm, held_root);
1110 dm_tm_unlock(pmd->tm, copy); 1202 dm_tm_unlock(pmd->tm, copy);
1111 pmd->need_commit = 1;
1112
1113 return -EBUSY; 1203 return -EBUSY;
1114 } 1204 }
1115 1205
@@ -1131,29 +1221,25 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
1131 /* 1221 /*
1132 * Write the held root into the superblock. 1222 * Write the held root into the superblock.
1133 */ 1223 */
1134 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 1224 r = superblock_lock(pmd, &sblock);
1135 &sb_validator, &sblock);
1136 if (r) { 1225 if (r) {
1137 dm_tm_dec(pmd->tm, held_root); 1226 dm_tm_dec(pmd->tm, held_root);
1138 pmd->need_commit = 1;
1139 return r; 1227 return r;
1140 } 1228 }
1141 1229
1142 disk_super = dm_block_data(sblock); 1230 disk_super = dm_block_data(sblock);
1143 disk_super->held_root = cpu_to_le64(held_root); 1231 disk_super->held_root = cpu_to_le64(held_root);
1144 dm_bm_unlock(sblock); 1232 dm_bm_unlock(sblock);
1145
1146 pmd->need_commit = 1;
1147
1148 return 0; 1233 return 0;
1149} 1234}
1150 1235
1151int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd) 1236int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
1152{ 1237{
1153 int r; 1238 int r = -EINVAL;
1154 1239
1155 down_write(&pmd->root_lock); 1240 down_write(&pmd->root_lock);
1156 r = __reserve_metadata_snap(pmd); 1241 if (!pmd->fail_io)
1242 r = __reserve_metadata_snap(pmd);
1157 up_write(&pmd->root_lock); 1243 up_write(&pmd->root_lock);
1158 1244
1159 return r; 1245 return r;
@@ -1166,15 +1252,13 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd)
1166 struct dm_block *sblock, *copy; 1252 struct dm_block *sblock, *copy;
1167 dm_block_t held_root; 1253 dm_block_t held_root;
1168 1254
1169 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 1255 r = superblock_lock(pmd, &sblock);
1170 &sb_validator, &sblock);
1171 if (r) 1256 if (r)
1172 return r; 1257 return r;
1173 1258
1174 disk_super = dm_block_data(sblock); 1259 disk_super = dm_block_data(sblock);
1175 held_root = le64_to_cpu(disk_super->held_root); 1260 held_root = le64_to_cpu(disk_super->held_root);
1176 disk_super->held_root = cpu_to_le64(0); 1261 disk_super->held_root = cpu_to_le64(0);
1177 pmd->need_commit = 1;
1178 1262
1179 dm_bm_unlock(sblock); 1263 dm_bm_unlock(sblock);
1180 1264
@@ -1197,10 +1281,11 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd)
1197 1281
1198int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd) 1282int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
1199{ 1283{
1200 int r; 1284 int r = -EINVAL;
1201 1285
1202 down_write(&pmd->root_lock); 1286 down_write(&pmd->root_lock);
1203 r = __release_metadata_snap(pmd); 1287 if (!pmd->fail_io)
1288 r = __release_metadata_snap(pmd);
1204 up_write(&pmd->root_lock); 1289 up_write(&pmd->root_lock);
1205 1290
1206 return r; 1291 return r;
@@ -1227,10 +1312,11 @@ static int __get_metadata_snap(struct dm_pool_metadata *pmd,
1227int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, 1312int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
1228 dm_block_t *result) 1313 dm_block_t *result)
1229{ 1314{
1230 int r; 1315 int r = -EINVAL;
1231 1316
1232 down_read(&pmd->root_lock); 1317 down_read(&pmd->root_lock);
1233 r = __get_metadata_snap(pmd, result); 1318 if (!pmd->fail_io)
1319 r = __get_metadata_snap(pmd, result);
1234 up_read(&pmd->root_lock); 1320 up_read(&pmd->root_lock);
1235 1321
1236 return r; 1322 return r;
@@ -1239,10 +1325,11 @@ int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
1239int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, 1325int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
1240 struct dm_thin_device **td) 1326 struct dm_thin_device **td)
1241{ 1327{
1242 int r; 1328 int r = -EINVAL;
1243 1329
1244 down_write(&pmd->root_lock); 1330 down_write(&pmd->root_lock);
1245 r = __open_device(pmd, dev, 0, td); 1331 if (!pmd->fail_io)
1332 r = __open_device(pmd, dev, 0, td);
1246 up_write(&pmd->root_lock); 1333 up_write(&pmd->root_lock);
1247 1334
1248 return r; 1335 return r;
@@ -1262,7 +1349,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1262 return td->id; 1349 return td->id;
1263} 1350}
1264 1351
1265static int __snapshotted_since(struct dm_thin_device *td, uint32_t time) 1352static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1266{ 1353{
1267 return td->snapshotted_time > time; 1354 return td->snapshotted_time > time;
1268} 1355}
@@ -1270,28 +1357,31 @@ static int __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1270int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, 1357int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
1271 int can_block, struct dm_thin_lookup_result *result) 1358 int can_block, struct dm_thin_lookup_result *result)
1272{ 1359{
1273 int r; 1360 int r = -EINVAL;
1274 uint64_t block_time = 0; 1361 uint64_t block_time = 0;
1275 __le64 value; 1362 __le64 value;
1276 struct dm_pool_metadata *pmd = td->pmd; 1363 struct dm_pool_metadata *pmd = td->pmd;
1277 dm_block_t keys[2] = { td->id, block }; 1364 dm_block_t keys[2] = { td->id, block };
1365 struct dm_btree_info *info;
1278 1366
1279 if (can_block) { 1367 if (can_block) {
1280 down_read(&pmd->root_lock); 1368 down_read(&pmd->root_lock);
1281 r = dm_btree_lookup(&pmd->info, pmd->root, keys, &value); 1369 info = &pmd->info;
1282 if (!r) 1370 } else if (down_read_trylock(&pmd->root_lock))
1283 block_time = le64_to_cpu(value); 1371 info = &pmd->nb_info;
1284 up_read(&pmd->root_lock); 1372 else
1285
1286 } else if (down_read_trylock(&pmd->root_lock)) {
1287 r = dm_btree_lookup(&pmd->nb_info, pmd->root, keys, &value);
1288 if (!r)
1289 block_time = le64_to_cpu(value);
1290 up_read(&pmd->root_lock);
1291
1292 } else
1293 return -EWOULDBLOCK; 1373 return -EWOULDBLOCK;
1294 1374
1375 if (pmd->fail_io)
1376 goto out;
1377
1378 r = dm_btree_lookup(info, pmd->root, keys, &value);
1379 if (!r)
1380 block_time = le64_to_cpu(value);
1381
1382out:
1383 up_read(&pmd->root_lock);
1384
1295 if (!r) { 1385 if (!r) {
1296 dm_block_t exception_block; 1386 dm_block_t exception_block;
1297 uint32_t exception_time; 1387 uint32_t exception_time;
@@ -1312,7 +1402,6 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
1312 struct dm_pool_metadata *pmd = td->pmd; 1402 struct dm_pool_metadata *pmd = td->pmd;
1313 dm_block_t keys[2] = { td->id, block }; 1403 dm_block_t keys[2] = { td->id, block };
1314 1404
1315 pmd->need_commit = 1;
1316 value = cpu_to_le64(pack_block_time(data_block, pmd->time)); 1405 value = cpu_to_le64(pack_block_time(data_block, pmd->time));
1317 __dm_bless_for_disk(&value); 1406 __dm_bless_for_disk(&value);
1318 1407
@@ -1321,10 +1410,9 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
1321 if (r) 1410 if (r)
1322 return r; 1411 return r;
1323 1412
1324 if (inserted) { 1413 td->changed = 1;
1414 if (inserted)
1325 td->mapped_blocks++; 1415 td->mapped_blocks++;
1326 td->changed = 1;
1327 }
1328 1416
1329 return 0; 1417 return 0;
1330} 1418}
@@ -1332,10 +1420,11 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
1332int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, 1420int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
1333 dm_block_t data_block) 1421 dm_block_t data_block)
1334{ 1422{
1335 int r; 1423 int r = -EINVAL;
1336 1424
1337 down_write(&td->pmd->root_lock); 1425 down_write(&td->pmd->root_lock);
1338 r = __insert(td, block, data_block); 1426 if (!td->pmd->fail_io)
1427 r = __insert(td, block, data_block);
1339 up_write(&td->pmd->root_lock); 1428 up_write(&td->pmd->root_lock);
1340 1429
1341 return r; 1430 return r;
@@ -1353,31 +1442,51 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
1353 1442
1354 td->mapped_blocks--; 1443 td->mapped_blocks--;
1355 td->changed = 1; 1444 td->changed = 1;
1356 pmd->need_commit = 1;
1357 1445
1358 return 0; 1446 return 0;
1359} 1447}
1360 1448
1361int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) 1449int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
1362{ 1450{
1363 int r; 1451 int r = -EINVAL;
1364 1452
1365 down_write(&td->pmd->root_lock); 1453 down_write(&td->pmd->root_lock);
1366 r = __remove(td, block); 1454 if (!td->pmd->fail_io)
1455 r = __remove(td, block);
1367 up_write(&td->pmd->root_lock); 1456 up_write(&td->pmd->root_lock);
1368 1457
1369 return r; 1458 return r;
1370} 1459}
1371 1460
1372int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) 1461bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1373{ 1462{
1374 int r; 1463 int r;
1375 1464
1376 down_write(&pmd->root_lock); 1465 down_read(&td->pmd->root_lock);
1466 r = td->changed;
1467 up_read(&td->pmd->root_lock);
1377 1468
1378 r = dm_sm_new_block(pmd->data_sm, result); 1469 return r;
1379 pmd->need_commit = 1; 1470}
1471
1472bool dm_thin_aborted_changes(struct dm_thin_device *td)
1473{
1474 bool r;
1380 1475
1476 down_read(&td->pmd->root_lock);
1477 r = td->aborted_with_changes;
1478 up_read(&td->pmd->root_lock);
1479
1480 return r;
1481}
1482
1483int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
1484{
1485 int r = -EINVAL;
1486
1487 down_write(&pmd->root_lock);
1488 if (!pmd->fail_io)
1489 r = dm_sm_new_block(pmd->data_sm, result);
1381 up_write(&pmd->root_lock); 1490 up_write(&pmd->root_lock);
1382 1491
1383 return r; 1492 return r;
@@ -1385,9 +1494,11 @@ int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
1385 1494
1386int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) 1495int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
1387{ 1496{
1388 int r; 1497 int r = -EINVAL;
1389 1498
1390 down_write(&pmd->root_lock); 1499 down_write(&pmd->root_lock);
1500 if (pmd->fail_io)
1501 goto out;
1391 1502
1392 r = __commit_transaction(pmd); 1503 r = __commit_transaction(pmd);
1393 if (r <= 0) 1504 if (r <= 0)
@@ -1402,12 +1513,41 @@ out:
1402 return r; 1513 return r;
1403} 1514}
1404 1515
1516static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
1517{
1518 struct dm_thin_device *td;
1519
1520 list_for_each_entry(td, &pmd->thin_devices, list)
1521 td->aborted_with_changes = td->changed;
1522}
1523
1524int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
1525{
1526 int r = -EINVAL;
1527
1528 down_write(&pmd->root_lock);
1529 if (pmd->fail_io)
1530 goto out;
1531
1532 __set_abort_with_changes_flags(pmd);
1533 __destroy_persistent_data_objects(pmd);
1534 r = __create_persistent_data_objects(pmd, false);
1535 if (r)
1536 pmd->fail_io = true;
1537
1538out:
1539 up_write(&pmd->root_lock);
1540
1541 return r;
1542}
1543
1405int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result) 1544int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
1406{ 1545{
1407 int r; 1546 int r = -EINVAL;
1408 1547
1409 down_read(&pmd->root_lock); 1548 down_read(&pmd->root_lock);
1410 r = dm_sm_get_nr_free(pmd->data_sm, result); 1549 if (!pmd->fail_io)
1550 r = dm_sm_get_nr_free(pmd->data_sm, result);
1411 up_read(&pmd->root_lock); 1551 up_read(&pmd->root_lock);
1412 1552
1413 return r; 1553 return r;
@@ -1416,10 +1556,11 @@ int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *resul
1416int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, 1556int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
1417 dm_block_t *result) 1557 dm_block_t *result)
1418{ 1558{
1419 int r; 1559 int r = -EINVAL;
1420 1560
1421 down_read(&pmd->root_lock); 1561 down_read(&pmd->root_lock);
1422 r = dm_sm_get_nr_free(pmd->metadata_sm, result); 1562 if (!pmd->fail_io)
1563 r = dm_sm_get_nr_free(pmd->metadata_sm, result);
1423 up_read(&pmd->root_lock); 1564 up_read(&pmd->root_lock);
1424 1565
1425 return r; 1566 return r;
@@ -1428,10 +1569,11 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
1428int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, 1569int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
1429 dm_block_t *result) 1570 dm_block_t *result)
1430{ 1571{
1431 int r; 1572 int r = -EINVAL;
1432 1573
1433 down_read(&pmd->root_lock); 1574 down_read(&pmd->root_lock);
1434 r = dm_sm_get_nr_blocks(pmd->metadata_sm, result); 1575 if (!pmd->fail_io)
1576 r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
1435 up_read(&pmd->root_lock); 1577 up_read(&pmd->root_lock);
1436 1578
1437 return r; 1579 return r;
@@ -1448,10 +1590,11 @@ int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result)
1448 1590
1449int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) 1591int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
1450{ 1592{
1451 int r; 1593 int r = -EINVAL;
1452 1594
1453 down_read(&pmd->root_lock); 1595 down_read(&pmd->root_lock);
1454 r = dm_sm_get_nr_blocks(pmd->data_sm, result); 1596 if (!pmd->fail_io)
1597 r = dm_sm_get_nr_blocks(pmd->data_sm, result);
1455 up_read(&pmd->root_lock); 1598 up_read(&pmd->root_lock);
1456 1599
1457 return r; 1600 return r;
@@ -1459,13 +1602,17 @@ int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
1459 1602
1460int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result) 1603int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
1461{ 1604{
1605 int r = -EINVAL;
1462 struct dm_pool_metadata *pmd = td->pmd; 1606 struct dm_pool_metadata *pmd = td->pmd;
1463 1607
1464 down_read(&pmd->root_lock); 1608 down_read(&pmd->root_lock);
1465 *result = td->mapped_blocks; 1609 if (!pmd->fail_io) {
1610 *result = td->mapped_blocks;
1611 r = 0;
1612 }
1466 up_read(&pmd->root_lock); 1613 up_read(&pmd->root_lock);
1467 1614
1468 return 0; 1615 return r;
1469} 1616}
1470 1617
1471static int __highest_block(struct dm_thin_device *td, dm_block_t *result) 1618static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
@@ -1487,11 +1634,12 @@ static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
1487int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, 1634int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
1488 dm_block_t *result) 1635 dm_block_t *result)
1489{ 1636{
1490 int r; 1637 int r = -EINVAL;
1491 struct dm_pool_metadata *pmd = td->pmd; 1638 struct dm_pool_metadata *pmd = td->pmd;
1492 1639
1493 down_read(&pmd->root_lock); 1640 down_read(&pmd->root_lock);
1494 r = __highest_block(td, result); 1641 if (!pmd->fail_io)
1642 r = __highest_block(td, result);
1495 up_read(&pmd->root_lock); 1643 up_read(&pmd->root_lock);
1496 1644
1497 return r; 1645 return r;
@@ -1514,20 +1662,25 @@ static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1514 return -EINVAL; 1662 return -EINVAL;
1515 } 1663 }
1516 1664
1517 r = dm_sm_extend(pmd->data_sm, new_count - old_count); 1665 return dm_sm_extend(pmd->data_sm, new_count - old_count);
1518 if (!r)
1519 pmd->need_commit = 1;
1520
1521 return r;
1522} 1666}
1523 1667
1524int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) 1668int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1525{ 1669{
1526 int r; 1670 int r = -EINVAL;
1527 1671
1528 down_write(&pmd->root_lock); 1672 down_write(&pmd->root_lock);
1529 r = __resize_data_dev(pmd, new_count); 1673 if (!pmd->fail_io)
1674 r = __resize_data_dev(pmd, new_count);
1530 up_write(&pmd->root_lock); 1675 up_write(&pmd->root_lock);
1531 1676
1532 return r; 1677 return r;
1533} 1678}
1679
1680void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
1681{
1682 down_write(&pmd->root_lock);
1683 pmd->read_only = true;
1684 dm_bm_set_read_only(pmd->bm);
1685 up_write(&pmd->root_lock);
1686}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index b88918ccdaf6..0cecc3702885 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -38,7 +38,8 @@ typedef uint64_t dm_thin_id;
38 * Reopens or creates a new, empty metadata volume. 38 * Reopens or creates a new, empty metadata volume.
39 */ 39 */
40struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, 40struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
41 sector_t data_block_size); 41 sector_t data_block_size,
42 bool format_device);
42 43
43int dm_pool_metadata_close(struct dm_pool_metadata *pmd); 44int dm_pool_metadata_close(struct dm_pool_metadata *pmd);
44 45
@@ -79,6 +80,16 @@ int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
79int dm_pool_commit_metadata(struct dm_pool_metadata *pmd); 80int dm_pool_commit_metadata(struct dm_pool_metadata *pmd);
80 81
81/* 82/*
83 * Discards all uncommitted changes. Rereads the superblock, rolling back
84 * to the last good transaction. Thin devices remain open.
85 * dm_thin_aborted_changes() tells you if they had uncommitted changes.
86 *
87 * If this call fails it's only useful to call dm_pool_metadata_close().
88 * All other methods will fail with -EINVAL.
89 */
90int dm_pool_abort_metadata(struct dm_pool_metadata *pmd);
91
92/*
82 * Set/get userspace transaction id. 93 * Set/get userspace transaction id.
83 */ 94 */
84int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, 95int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
@@ -119,7 +130,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td);
119 130
120struct dm_thin_lookup_result { 131struct dm_thin_lookup_result {
121 dm_block_t block; 132 dm_block_t block;
122 int shared; 133 unsigned shared:1;
123}; 134};
124 135
125/* 136/*
@@ -147,6 +158,10 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
147/* 158/*
148 * Queries. 159 * Queries.
149 */ 160 */
161bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
162
163bool dm_thin_aborted_changes(struct dm_thin_device *td);
164
150int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, 165int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
151 dm_block_t *highest_mapped); 166 dm_block_t *highest_mapped);
152 167
@@ -171,6 +186,12 @@ int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
171 */ 186 */
172int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); 187int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size);
173 188
189/*
190 * Flicks the underlying block manager into read only mode, so you know
191 * that nothing is changing.
192 */
193void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
194
174/*----------------------------------------------------------------*/ 195/*----------------------------------------------------------------*/
175 196
176#endif 197#endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 37fdaf81bd1f..af1fc3b2c2ad 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1,10 +1,11 @@
1/* 1/*
2 * Copyright (C) 2011 Red Hat UK. 2 * Copyright (C) 2011-2012 Red Hat UK.
3 * 3 *
4 * This file is released under the GPL. 4 * This file is released under the GPL.
5 */ 5 */
6 6
7#include "dm-thin-metadata.h" 7#include "dm-thin-metadata.h"
8#include "dm.h"
8 9
9#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
10#include <linux/dm-io.h> 11#include <linux/dm-io.h>
@@ -19,7 +20,7 @@
19/* 20/*
20 * Tunable constants 21 * Tunable constants
21 */ 22 */
22#define ENDIO_HOOK_POOL_SIZE 10240 23#define ENDIO_HOOK_POOL_SIZE 1024
23#define DEFERRED_SET_SIZE 64 24#define DEFERRED_SET_SIZE 64
24#define MAPPING_POOL_SIZE 1024 25#define MAPPING_POOL_SIZE 1024
25#define PRISON_CELLS 1024 26#define PRISON_CELLS 1024
@@ -496,12 +497,27 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
496 */ 497 */
497struct dm_thin_new_mapping; 498struct dm_thin_new_mapping;
498 499
500/*
501 * The pool runs in 3 modes. Ordered in degraded order for comparisons.
502 */
503enum pool_mode {
504 PM_WRITE, /* metadata may be changed */
505 PM_READ_ONLY, /* metadata may not be changed */
506 PM_FAIL, /* all I/O fails */
507};
508
499struct pool_features { 509struct pool_features {
510 enum pool_mode mode;
511
500 unsigned zero_new_blocks:1; 512 unsigned zero_new_blocks:1;
501 unsigned discard_enabled:1; 513 unsigned discard_enabled:1;
502 unsigned discard_passdown:1; 514 unsigned discard_passdown:1;
503}; 515};
504 516
517struct thin_c;
518typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
519typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
520
505struct pool { 521struct pool {
506 struct list_head list; 522 struct list_head list;
507 struct dm_target *ti; /* Only set if a pool target is bound */ 523 struct dm_target *ti; /* Only set if a pool target is bound */
@@ -510,10 +526,9 @@ struct pool {
510 struct block_device *md_dev; 526 struct block_device *md_dev;
511 struct dm_pool_metadata *pmd; 527 struct dm_pool_metadata *pmd;
512 528
513 uint32_t sectors_per_block;
514 unsigned block_shift;
515 dm_block_t offset_mask;
516 dm_block_t low_water_blocks; 529 dm_block_t low_water_blocks;
530 uint32_t sectors_per_block;
531 int sectors_per_block_shift;
517 532
518 struct pool_features pf; 533 struct pool_features pf;
519 unsigned low_water_triggered:1; /* A dm event has been sent */ 534 unsigned low_water_triggered:1; /* A dm event has been sent */
@@ -526,8 +541,8 @@ struct pool {
526 struct work_struct worker; 541 struct work_struct worker;
527 struct delayed_work waker; 542 struct delayed_work waker;
528 543
529 unsigned ref_count;
530 unsigned long last_commit_jiffies; 544 unsigned long last_commit_jiffies;
545 unsigned ref_count;
531 546
532 spinlock_t lock; 547 spinlock_t lock;
533 struct bio_list deferred_bios; 548 struct bio_list deferred_bios;
@@ -543,8 +558,17 @@ struct pool {
543 struct dm_thin_new_mapping *next_mapping; 558 struct dm_thin_new_mapping *next_mapping;
544 mempool_t *mapping_pool; 559 mempool_t *mapping_pool;
545 mempool_t *endio_hook_pool; 560 mempool_t *endio_hook_pool;
561
562 process_bio_fn process_bio;
563 process_bio_fn process_discard;
564
565 process_mapping_fn process_prepared_mapping;
566 process_mapping_fn process_prepared_discard;
546}; 567};
547 568
569static enum pool_mode get_pool_mode(struct pool *pool);
570static void set_pool_mode(struct pool *pool, enum pool_mode mode);
571
548/* 572/*
549 * Target context for a pool. 573 * Target context for a pool.
550 */ 574 */
@@ -679,16 +703,28 @@ static void requeue_io(struct thin_c *tc)
679 703
680static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) 704static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
681{ 705{
682 return bio->bi_sector >> tc->pool->block_shift; 706 sector_t block_nr = bio->bi_sector;
707
708 if (tc->pool->sectors_per_block_shift < 0)
709 (void) sector_div(block_nr, tc->pool->sectors_per_block);
710 else
711 block_nr >>= tc->pool->sectors_per_block_shift;
712
713 return block_nr;
683} 714}
684 715
685static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) 716static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
686{ 717{
687 struct pool *pool = tc->pool; 718 struct pool *pool = tc->pool;
719 sector_t bi_sector = bio->bi_sector;
688 720
689 bio->bi_bdev = tc->pool_dev->bdev; 721 bio->bi_bdev = tc->pool_dev->bdev;
690 bio->bi_sector = (block << pool->block_shift) + 722 if (tc->pool->sectors_per_block_shift < 0)
691 (bio->bi_sector & pool->offset_mask); 723 bio->bi_sector = (block * pool->sectors_per_block) +
724 sector_div(bi_sector, pool->sectors_per_block);
725 else
726 bio->bi_sector = (block << pool->sectors_per_block_shift) |
727 (bi_sector & (pool->sectors_per_block - 1));
692} 728}
693 729
694static void remap_to_origin(struct thin_c *tc, struct bio *bio) 730static void remap_to_origin(struct thin_c *tc, struct bio *bio)
@@ -696,21 +732,39 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio)
696 bio->bi_bdev = tc->origin_dev->bdev; 732 bio->bi_bdev = tc->origin_dev->bdev;
697} 733}
698 734
735static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
736{
737 return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
738 dm_thin_changed_this_transaction(tc->td);
739}
740
699static void issue(struct thin_c *tc, struct bio *bio) 741static void issue(struct thin_c *tc, struct bio *bio)
700{ 742{
701 struct pool *pool = tc->pool; 743 struct pool *pool = tc->pool;
702 unsigned long flags; 744 unsigned long flags;
703 745
746 if (!bio_triggers_commit(tc, bio)) {
747 generic_make_request(bio);
748 return;
749 }
750
704 /* 751 /*
705 * Batch together any FUA/FLUSH bios we find and then issue 752 * Complete bio with an error if earlier I/O caused changes to
706 * a single commit for them in process_deferred_bios(). 753 * the metadata that can't be committed e.g, due to I/O errors
754 * on the metadata device.
707 */ 755 */
708 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { 756 if (dm_thin_aborted_changes(tc->td)) {
709 spin_lock_irqsave(&pool->lock, flags); 757 bio_io_error(bio);
710 bio_list_add(&pool->deferred_flush_bios, bio); 758 return;
711 spin_unlock_irqrestore(&pool->lock, flags); 759 }
712 } else 760
713 generic_make_request(bio); 761 /*
762 * Batch together any bios that trigger commits and then issue a
763 * single commit for them in process_deferred_bios().
764 */
765 spin_lock_irqsave(&pool->lock, flags);
766 bio_list_add(&pool->deferred_flush_bios, bio);
767 spin_unlock_irqrestore(&pool->lock, flags);
714} 768}
715 769
716static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) 770static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
@@ -847,6 +901,14 @@ static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell
847 wake_worker(pool); 901 wake_worker(pool);
848} 902}
849 903
904static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
905{
906 if (m->bio)
907 m->bio->bi_end_io = m->saved_bi_end_io;
908 cell_error(m->cell);
909 list_del(&m->list);
910 mempool_free(m, m->tc->pool->mapping_pool);
911}
850static void process_prepared_mapping(struct dm_thin_new_mapping *m) 912static void process_prepared_mapping(struct dm_thin_new_mapping *m)
851{ 913{
852 struct thin_c *tc = m->tc; 914 struct thin_c *tc = m->tc;
@@ -859,7 +921,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
859 921
860 if (m->err) { 922 if (m->err) {
861 cell_error(m->cell); 923 cell_error(m->cell);
862 return; 924 goto out;
863 } 925 }
864 926
865 /* 927 /*
@@ -871,7 +933,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
871 if (r) { 933 if (r) {
872 DMERR("dm_thin_insert_block() failed"); 934 DMERR("dm_thin_insert_block() failed");
873 cell_error(m->cell); 935 cell_error(m->cell);
874 return; 936 goto out;
875 } 937 }
876 938
877 /* 939 /*
@@ -886,22 +948,25 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
886 } else 948 } else
887 cell_defer(tc, m->cell, m->data_block); 949 cell_defer(tc, m->cell, m->data_block);
888 950
951out:
889 list_del(&m->list); 952 list_del(&m->list);
890 mempool_free(m, tc->pool->mapping_pool); 953 mempool_free(m, tc->pool->mapping_pool);
891} 954}
892 955
893static void process_prepared_discard(struct dm_thin_new_mapping *m) 956static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
894{ 957{
895 int r;
896 struct thin_c *tc = m->tc; 958 struct thin_c *tc = m->tc;
897 959
898 r = dm_thin_remove_block(tc->td, m->virt_block); 960 bio_io_error(m->bio);
899 if (r) 961 cell_defer_except(tc, m->cell);
900 DMERR("dm_thin_remove_block() failed"); 962 cell_defer_except(tc, m->cell2);
963 mempool_free(m, tc->pool->mapping_pool);
964}
965
966static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
967{
968 struct thin_c *tc = m->tc;
901 969
902 /*
903 * Pass the discard down to the underlying device?
904 */
905 if (m->pass_discard) 970 if (m->pass_discard)
906 remap_and_issue(tc, m->bio, m->data_block); 971 remap_and_issue(tc, m->bio, m->data_block);
907 else 972 else
@@ -912,8 +977,20 @@ static void process_prepared_discard(struct dm_thin_new_mapping *m)
912 mempool_free(m, tc->pool->mapping_pool); 977 mempool_free(m, tc->pool->mapping_pool);
913} 978}
914 979
980static void process_prepared_discard(struct dm_thin_new_mapping *m)
981{
982 int r;
983 struct thin_c *tc = m->tc;
984
985 r = dm_thin_remove_block(tc->td, m->virt_block);
986 if (r)
987 DMERR("dm_thin_remove_block() failed");
988
989 process_prepared_discard_passdown(m);
990}
991
915static void process_prepared(struct pool *pool, struct list_head *head, 992static void process_prepared(struct pool *pool, struct list_head *head,
916 void (*fn)(struct dm_thin_new_mapping *)) 993 process_mapping_fn *fn)
917{ 994{
918 unsigned long flags; 995 unsigned long flags;
919 struct list_head maps; 996 struct list_head maps;
@@ -925,7 +1002,7 @@ static void process_prepared(struct pool *pool, struct list_head *head,
925 spin_unlock_irqrestore(&pool->lock, flags); 1002 spin_unlock_irqrestore(&pool->lock, flags);
926 1003
927 list_for_each_entry_safe(m, tmp, &maps, list) 1004 list_for_each_entry_safe(m, tmp, &maps, list)
928 fn(m); 1005 (*fn)(m);
929} 1006}
930 1007
931/* 1008/*
@@ -933,9 +1010,7 @@ static void process_prepared(struct pool *pool, struct list_head *head,
933 */ 1010 */
934static int io_overlaps_block(struct pool *pool, struct bio *bio) 1011static int io_overlaps_block(struct pool *pool, struct bio *bio)
935{ 1012{
936 return !(bio->bi_sector & pool->offset_mask) && 1013 return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
937 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
938
939} 1014}
940 1015
941static int io_overwrites_block(struct pool *pool, struct bio *bio) 1016static int io_overwrites_block(struct pool *pool, struct bio *bio)
@@ -1093,6 +1168,35 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
1093 } 1168 }
1094} 1169}
1095 1170
1171static int commit(struct pool *pool)
1172{
1173 int r;
1174
1175 r = dm_pool_commit_metadata(pool->pmd);
1176 if (r)
1177 DMERR("commit failed, error = %d", r);
1178
1179 return r;
1180}
1181
1182/*
1183 * A non-zero return indicates read_only or fail_io mode.
1184 * Many callers don't care about the return value.
1185 */
1186static int commit_or_fallback(struct pool *pool)
1187{
1188 int r;
1189
1190 if (get_pool_mode(pool) != PM_WRITE)
1191 return -EINVAL;
1192
1193 r = commit(pool);
1194 if (r)
1195 set_pool_mode(pool, PM_READ_ONLY);
1196
1197 return r;
1198}
1199
1096static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 1200static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1097{ 1201{
1098 int r; 1202 int r;
@@ -1121,12 +1225,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1121 * Try to commit to see if that will free up some 1225 * Try to commit to see if that will free up some
1122 * more space. 1226 * more space.
1123 */ 1227 */
1124 r = dm_pool_commit_metadata(pool->pmd); 1228 (void) commit_or_fallback(pool);
1125 if (r) {
1126 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1127 __func__, r);
1128 return r;
1129 }
1130 1229
1131 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1230 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1132 if (r) 1231 if (r)
@@ -1218,7 +1317,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
1218 */ 1317 */
1219 m = get_next_mapping(pool); 1318 m = get_next_mapping(pool);
1220 m->tc = tc; 1319 m->tc = tc;
1221 m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; 1320 m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
1222 m->virt_block = block; 1321 m->virt_block = block;
1223 m->data_block = lookup_result.block; 1322 m->data_block = lookup_result.block;
1224 m->cell = cell; 1323 m->cell = cell;
@@ -1234,18 +1333,16 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
1234 } 1333 }
1235 } else { 1334 } else {
1236 /* 1335 /*
1237 * This path is hit if people are ignoring 1336 * The DM core makes sure that the discard doesn't span
1238 * limits->discard_granularity. It ignores any 1337 * a block boundary. So we submit the discard of a
1239 * part of the discard that is in a subsequent 1338 * partial block appropriately.
1240 * block.
1241 */ 1339 */
1242 sector_t offset = bio->bi_sector - (block << pool->block_shift);
1243 unsigned remaining = (pool->sectors_per_block - offset) << 9;
1244 bio->bi_size = min(bio->bi_size, remaining);
1245
1246 cell_release_singleton(cell, bio); 1340 cell_release_singleton(cell, bio);
1247 cell_release_singleton(cell2, bio); 1341 cell_release_singleton(cell2, bio);
1248 remap_and_issue(tc, bio, lookup_result.block); 1342 if ((!lookup_result.shared) && pool->pf.discard_passdown)
1343 remap_and_issue(tc, bio, lookup_result.block);
1344 else
1345 bio_endio(bio, 0);
1249 } 1346 }
1250 break; 1347 break;
1251 1348
@@ -1307,7 +1404,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1307 if (bio_detain(pool->prison, &key, bio, &cell)) 1404 if (bio_detain(pool->prison, &key, bio, &cell))
1308 return; 1405 return;
1309 1406
1310 if (bio_data_dir(bio) == WRITE) 1407 if (bio_data_dir(bio) == WRITE && bio->bi_size)
1311 break_sharing(tc, bio, block, &key, lookup_result, cell); 1408 break_sharing(tc, bio, block, &key, lookup_result, cell);
1312 else { 1409 else {
1313 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1410 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
@@ -1359,6 +1456,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1359 1456
1360 default: 1457 default:
1361 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1458 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1459 set_pool_mode(tc->pool, PM_READ_ONLY);
1362 cell_error(cell); 1460 cell_error(cell);
1363 break; 1461 break;
1364 } 1462 }
@@ -1416,6 +1514,49 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
1416 } 1514 }
1417} 1515}
1418 1516
1517static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1518{
1519 int r;
1520 int rw = bio_data_dir(bio);
1521 dm_block_t block = get_bio_block(tc, bio);
1522 struct dm_thin_lookup_result lookup_result;
1523
1524 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1525 switch (r) {
1526 case 0:
1527 if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
1528 bio_io_error(bio);
1529 else
1530 remap_and_issue(tc, bio, lookup_result.block);
1531 break;
1532
1533 case -ENODATA:
1534 if (rw != READ) {
1535 bio_io_error(bio);
1536 break;
1537 }
1538
1539 if (tc->origin_dev) {
1540 remap_to_origin_and_issue(tc, bio);
1541 break;
1542 }
1543
1544 zero_fill_bio(bio);
1545 bio_endio(bio, 0);
1546 break;
1547
1548 default:
1549 DMERR("dm_thin_find_block() failed, error = %d", r);
1550 bio_io_error(bio);
1551 break;
1552 }
1553}
1554
1555static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1556{
1557 bio_io_error(bio);
1558}
1559
1419static int need_commit_due_to_time(struct pool *pool) 1560static int need_commit_due_to_time(struct pool *pool)
1420{ 1561{
1421 return jiffies < pool->last_commit_jiffies || 1562 return jiffies < pool->last_commit_jiffies ||
@@ -1427,7 +1568,6 @@ static void process_deferred_bios(struct pool *pool)
1427 unsigned long flags; 1568 unsigned long flags;
1428 struct bio *bio; 1569 struct bio *bio;
1429 struct bio_list bios; 1570 struct bio_list bios;
1430 int r;
1431 1571
1432 bio_list_init(&bios); 1572 bio_list_init(&bios);
1433 1573
@@ -1454,9 +1594,9 @@ static void process_deferred_bios(struct pool *pool)
1454 } 1594 }
1455 1595
1456 if (bio->bi_rw & REQ_DISCARD) 1596 if (bio->bi_rw & REQ_DISCARD)
1457 process_discard(tc, bio); 1597 pool->process_discard(tc, bio);
1458 else 1598 else
1459 process_bio(tc, bio); 1599 pool->process_bio(tc, bio);
1460 } 1600 }
1461 1601
1462 /* 1602 /*
@@ -1472,10 +1612,7 @@ static void process_deferred_bios(struct pool *pool)
1472 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) 1612 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1473 return; 1613 return;
1474 1614
1475 r = dm_pool_commit_metadata(pool->pmd); 1615 if (commit_or_fallback(pool)) {
1476 if (r) {
1477 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1478 __func__, r);
1479 while ((bio = bio_list_pop(&bios))) 1616 while ((bio = bio_list_pop(&bios)))
1480 bio_io_error(bio); 1617 bio_io_error(bio);
1481 return; 1618 return;
@@ -1490,8 +1627,8 @@ static void do_worker(struct work_struct *ws)
1490{ 1627{
1491 struct pool *pool = container_of(ws, struct pool, worker); 1628 struct pool *pool = container_of(ws, struct pool, worker);
1492 1629
1493 process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); 1630 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1494 process_prepared(pool, &pool->prepared_discards, process_prepared_discard); 1631 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1495 process_deferred_bios(pool); 1632 process_deferred_bios(pool);
1496} 1633}
1497 1634
@@ -1508,6 +1645,52 @@ static void do_waker(struct work_struct *ws)
1508 1645
1509/*----------------------------------------------------------------*/ 1646/*----------------------------------------------------------------*/
1510 1647
1648static enum pool_mode get_pool_mode(struct pool *pool)
1649{
1650 return pool->pf.mode;
1651}
1652
1653static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1654{
1655 int r;
1656
1657 pool->pf.mode = mode;
1658
1659 switch (mode) {
1660 case PM_FAIL:
1661 DMERR("switching pool to failure mode");
1662 pool->process_bio = process_bio_fail;
1663 pool->process_discard = process_bio_fail;
1664 pool->process_prepared_mapping = process_prepared_mapping_fail;
1665 pool->process_prepared_discard = process_prepared_discard_fail;
1666 break;
1667
1668 case PM_READ_ONLY:
1669 DMERR("switching pool to read-only mode");
1670 r = dm_pool_abort_metadata(pool->pmd);
1671 if (r) {
1672 DMERR("aborting transaction failed");
1673 set_pool_mode(pool, PM_FAIL);
1674 } else {
1675 dm_pool_metadata_read_only(pool->pmd);
1676 pool->process_bio = process_bio_read_only;
1677 pool->process_discard = process_discard;
1678 pool->process_prepared_mapping = process_prepared_mapping_fail;
1679 pool->process_prepared_discard = process_prepared_discard_passdown;
1680 }
1681 break;
1682
1683 case PM_WRITE:
1684 pool->process_bio = process_bio;
1685 pool->process_discard = process_discard;
1686 pool->process_prepared_mapping = process_prepared_mapping;
1687 pool->process_prepared_discard = process_prepared_discard;
1688 break;
1689 }
1690}
1691
1692/*----------------------------------------------------------------*/
1693
1511/* 1694/*
1512 * Mapping functions. 1695 * Mapping functions.
1513 */ 1696 */
@@ -1553,6 +1736,12 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1553 struct dm_thin_lookup_result result; 1736 struct dm_thin_lookup_result result;
1554 1737
1555 map_context->ptr = thin_hook_bio(tc, bio); 1738 map_context->ptr = thin_hook_bio(tc, bio);
1739
1740 if (get_pool_mode(tc->pool) == PM_FAIL) {
1741 bio_io_error(bio);
1742 return DM_MAPIO_SUBMITTED;
1743 }
1744
1556 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { 1745 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1557 thin_defer_bio(tc, bio); 1746 thin_defer_bio(tc, bio);
1558 return DM_MAPIO_SUBMITTED; 1747 return DM_MAPIO_SUBMITTED;
@@ -1589,14 +1778,35 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1589 break; 1778 break;
1590 1779
1591 case -ENODATA: 1780 case -ENODATA:
1781 if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1782 /*
1783 * This block isn't provisioned, and we have no way
1784 * of doing so. Just error it.
1785 */
1786 bio_io_error(bio);
1787 r = DM_MAPIO_SUBMITTED;
1788 break;
1789 }
1790 /* fall through */
1791
1792 case -EWOULDBLOCK:
1592 /* 1793 /*
1593 * In future, the failed dm_thin_find_block above could 1794 * In future, the failed dm_thin_find_block above could
1594 * provide the hint to load the metadata into cache. 1795 * provide the hint to load the metadata into cache.
1595 */ 1796 */
1596 case -EWOULDBLOCK:
1597 thin_defer_bio(tc, bio); 1797 thin_defer_bio(tc, bio);
1598 r = DM_MAPIO_SUBMITTED; 1798 r = DM_MAPIO_SUBMITTED;
1599 break; 1799 break;
1800
1801 default:
1802 /*
1803 * Must always call bio_io_error on failure.
1804 * dm_thin_find_block can fail with -EINVAL if the
1805 * pool is switched to fail-io mode.
1806 */
1807 bio_io_error(bio);
1808 r = DM_MAPIO_SUBMITTED;
1809 break;
1600 } 1810 }
1601 1811
1602 return r; 1812 return r;
@@ -1633,15 +1843,26 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1633{ 1843{
1634 struct pool_c *pt = ti->private; 1844 struct pool_c *pt = ti->private;
1635 1845
1846 /*
1847 * We want to make sure that degraded pools are never upgraded.
1848 */
1849 enum pool_mode old_mode = pool->pf.mode;
1850 enum pool_mode new_mode = pt->pf.mode;
1851
1852 if (old_mode > new_mode)
1853 new_mode = old_mode;
1854
1636 pool->ti = ti; 1855 pool->ti = ti;
1637 pool->low_water_blocks = pt->low_water_blocks; 1856 pool->low_water_blocks = pt->low_water_blocks;
1638 pool->pf = pt->pf; 1857 pool->pf = pt->pf;
1858 set_pool_mode(pool, new_mode);
1639 1859
1640 /* 1860 /*
1641 * If discard_passdown was enabled verify that the data device 1861 * If discard_passdown was enabled verify that the data device
1642 * supports discards. Disable discard_passdown if not; otherwise 1862 * supports discards. Disable discard_passdown if not; otherwise
1643 * -EOPNOTSUPP will be returned. 1863 * -EOPNOTSUPP will be returned.
1644 */ 1864 */
1865 /* FIXME: pull this out into a sep fn. */
1645 if (pt->pf.discard_passdown) { 1866 if (pt->pf.discard_passdown) {
1646 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1867 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1647 if (!q || !blk_queue_discard(q)) { 1868 if (!q || !blk_queue_discard(q)) {
@@ -1667,6 +1888,7 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1667/* Initialize pool features. */ 1888/* Initialize pool features. */
1668static void pool_features_init(struct pool_features *pf) 1889static void pool_features_init(struct pool_features *pf)
1669{ 1890{
1891 pf->mode = PM_WRITE;
1670 pf->zero_new_blocks = 1; 1892 pf->zero_new_blocks = 1;
1671 pf->discard_enabled = 1; 1893 pf->discard_enabled = 1;
1672 pf->discard_passdown = 1; 1894 pf->discard_passdown = 1;
@@ -1697,14 +1919,16 @@ static struct kmem_cache *_endio_hook_cache;
1697 1919
1698static struct pool *pool_create(struct mapped_device *pool_md, 1920static struct pool *pool_create(struct mapped_device *pool_md,
1699 struct block_device *metadata_dev, 1921 struct block_device *metadata_dev,
1700 unsigned long block_size, char **error) 1922 unsigned long block_size,
1923 int read_only, char **error)
1701{ 1924{
1702 int r; 1925 int r;
1703 void *err_p; 1926 void *err_p;
1704 struct pool *pool; 1927 struct pool *pool;
1705 struct dm_pool_metadata *pmd; 1928 struct dm_pool_metadata *pmd;
1929 bool format_device = read_only ? false : true;
1706 1930
1707 pmd = dm_pool_metadata_open(metadata_dev, block_size); 1931 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
1708 if (IS_ERR(pmd)) { 1932 if (IS_ERR(pmd)) {
1709 *error = "Error creating metadata object"; 1933 *error = "Error creating metadata object";
1710 return (struct pool *)pmd; 1934 return (struct pool *)pmd;
@@ -1719,8 +1943,10 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1719 1943
1720 pool->pmd = pmd; 1944 pool->pmd = pmd;
1721 pool->sectors_per_block = block_size; 1945 pool->sectors_per_block = block_size;
1722 pool->block_shift = ffs(block_size) - 1; 1946 if (block_size & (block_size - 1))
1723 pool->offset_mask = block_size - 1; 1947 pool->sectors_per_block_shift = -1;
1948 else
1949 pool->sectors_per_block_shift = __ffs(block_size);
1724 pool->low_water_blocks = 0; 1950 pool->low_water_blocks = 0;
1725 pool_features_init(&pool->pf); 1951 pool_features_init(&pool->pf);
1726 pool->prison = prison_create(PRISON_CELLS); 1952 pool->prison = prison_create(PRISON_CELLS);
@@ -1819,25 +2045,29 @@ static void __pool_dec(struct pool *pool)
1819 2045
1820static struct pool *__pool_find(struct mapped_device *pool_md, 2046static struct pool *__pool_find(struct mapped_device *pool_md,
1821 struct block_device *metadata_dev, 2047 struct block_device *metadata_dev,
1822 unsigned long block_size, char **error, 2048 unsigned long block_size, int read_only,
1823 int *created) 2049 char **error, int *created)
1824{ 2050{
1825 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 2051 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1826 2052
1827 if (pool) { 2053 if (pool) {
1828 if (pool->pool_md != pool_md) 2054 if (pool->pool_md != pool_md) {
2055 *error = "metadata device already in use by a pool";
1829 return ERR_PTR(-EBUSY); 2056 return ERR_PTR(-EBUSY);
2057 }
1830 __pool_inc(pool); 2058 __pool_inc(pool);
1831 2059
1832 } else { 2060 } else {
1833 pool = __pool_table_lookup(pool_md); 2061 pool = __pool_table_lookup(pool_md);
1834 if (pool) { 2062 if (pool) {
1835 if (pool->md_dev != metadata_dev) 2063 if (pool->md_dev != metadata_dev) {
2064 *error = "different pool cannot replace a pool";
1836 return ERR_PTR(-EINVAL); 2065 return ERR_PTR(-EINVAL);
2066 }
1837 __pool_inc(pool); 2067 __pool_inc(pool);
1838 2068
1839 } else { 2069 } else {
1840 pool = pool_create(pool_md, metadata_dev, block_size, error); 2070 pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
1841 *created = 1; 2071 *created = 1;
1842 } 2072 }
1843 } 2073 }
@@ -1888,19 +2118,23 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1888 arg_name = dm_shift_arg(as); 2118 arg_name = dm_shift_arg(as);
1889 argc--; 2119 argc--;
1890 2120
1891 if (!strcasecmp(arg_name, "skip_block_zeroing")) { 2121 if (!strcasecmp(arg_name, "skip_block_zeroing"))
1892 pf->zero_new_blocks = 0; 2122 pf->zero_new_blocks = 0;
1893 continue; 2123
1894 } else if (!strcasecmp(arg_name, "ignore_discard")) { 2124 else if (!strcasecmp(arg_name, "ignore_discard"))
1895 pf->discard_enabled = 0; 2125 pf->discard_enabled = 0;
1896 continue; 2126
1897 } else if (!strcasecmp(arg_name, "no_discard_passdown")) { 2127 else if (!strcasecmp(arg_name, "no_discard_passdown"))
1898 pf->discard_passdown = 0; 2128 pf->discard_passdown = 0;
1899 continue;
1900 }
1901 2129
1902 ti->error = "Unrecognised pool feature requested"; 2130 else if (!strcasecmp(arg_name, "read_only"))
1903 r = -EINVAL; 2131 pf->mode = PM_READ_ONLY;
2132
2133 else {
2134 ti->error = "Unrecognised pool feature requested";
2135 r = -EINVAL;
2136 break;
2137 }
1904 } 2138 }
1905 2139
1906 return r; 2140 return r;
@@ -1964,7 +2198,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1964 if (kstrtoul(argv[2], 10, &block_size) || !block_size || 2198 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1965 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2199 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1966 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2200 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1967 !is_power_of_2(block_size)) { 2201 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1968 ti->error = "Invalid block size"; 2202 ti->error = "Invalid block size";
1969 r = -EINVAL; 2203 r = -EINVAL;
1970 goto out; 2204 goto out;
@@ -1993,7 +2227,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1993 } 2227 }
1994 2228
1995 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 2229 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1996 block_size, &ti->error, &pool_created); 2230 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
1997 if (IS_ERR(pool)) { 2231 if (IS_ERR(pool)) {
1998 r = PTR_ERR(pool); 2232 r = PTR_ERR(pool);
1999 goto out_free_pt; 2233 goto out_free_pt;
@@ -2011,6 +2245,15 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2011 goto out_flags_changed; 2245 goto out_flags_changed;
2012 } 2246 }
2013 2247
2248 /*
2249 * The block layer requires discard_granularity to be a power of 2.
2250 */
2251 if (pf.discard_enabled && !is_power_of_2(block_size)) {
2252 ti->error = "Discard support must be disabled when the block size is not a power of 2";
2253 r = -EINVAL;
2254 goto out_flags_changed;
2255 }
2256
2014 pt->pool = pool; 2257 pt->pool = pool;
2015 pt->ti = ti; 2258 pt->ti = ti;
2016 pt->metadata_dev = metadata_dev; 2259 pt->metadata_dev = metadata_dev;
@@ -2030,7 +2273,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2030 * stacking of discard limits (this keeps the pool and 2273 * stacking of discard limits (this keeps the pool and
2031 * thin devices' discard limits consistent). 2274 * thin devices' discard limits consistent).
2032 */ 2275 */
2033 ti->discards_supported = 1; 2276 ti->discards_supported = true;
2034 } 2277 }
2035 ti->private = pt; 2278 ti->private = pt;
2036 2279
@@ -2090,7 +2333,8 @@ static int pool_preresume(struct dm_target *ti)
2090 int r; 2333 int r;
2091 struct pool_c *pt = ti->private; 2334 struct pool_c *pt = ti->private;
2092 struct pool *pool = pt->pool; 2335 struct pool *pool = pt->pool;
2093 dm_block_t data_size, sb_data_size; 2336 sector_t data_size = ti->len;
2337 dm_block_t sb_data_size;
2094 2338
2095 /* 2339 /*
2096 * Take control of the pool object. 2340 * Take control of the pool object.
@@ -2099,7 +2343,8 @@ static int pool_preresume(struct dm_target *ti)
2099 if (r) 2343 if (r)
2100 return r; 2344 return r;
2101 2345
2102 data_size = ti->len >> pool->block_shift; 2346 (void) sector_div(data_size, pool->sectors_per_block);
2347
2103 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); 2348 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2104 if (r) { 2349 if (r) {
2105 DMERR("failed to retrieve data device size"); 2350 DMERR("failed to retrieve data device size");
@@ -2108,22 +2353,19 @@ static int pool_preresume(struct dm_target *ti)
2108 2353
2109 if (data_size < sb_data_size) { 2354 if (data_size < sb_data_size) {
2110 DMERR("pool target too small, is %llu blocks (expected %llu)", 2355 DMERR("pool target too small, is %llu blocks (expected %llu)",
2111 data_size, sb_data_size); 2356 (unsigned long long)data_size, sb_data_size);
2112 return -EINVAL; 2357 return -EINVAL;
2113 2358
2114 } else if (data_size > sb_data_size) { 2359 } else if (data_size > sb_data_size) {
2115 r = dm_pool_resize_data_dev(pool->pmd, data_size); 2360 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2116 if (r) { 2361 if (r) {
2117 DMERR("failed to resize data device"); 2362 DMERR("failed to resize data device");
2363 /* FIXME Stricter than necessary: Rollback transaction instead here */
2364 set_pool_mode(pool, PM_READ_ONLY);
2118 return r; 2365 return r;
2119 } 2366 }
2120 2367
2121 r = dm_pool_commit_metadata(pool->pmd); 2368 (void) commit_or_fallback(pool);
2122 if (r) {
2123 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
2124 __func__, r);
2125 return r;
2126 }
2127 } 2369 }
2128 2370
2129 return 0; 2371 return 0;
@@ -2146,19 +2388,12 @@ static void pool_resume(struct dm_target *ti)
2146 2388
2147static void pool_postsuspend(struct dm_target *ti) 2389static void pool_postsuspend(struct dm_target *ti)
2148{ 2390{
2149 int r;
2150 struct pool_c *pt = ti->private; 2391 struct pool_c *pt = ti->private;
2151 struct pool *pool = pt->pool; 2392 struct pool *pool = pt->pool;
2152 2393
2153 cancel_delayed_work(&pool->waker); 2394 cancel_delayed_work(&pool->waker);
2154 flush_workqueue(pool->wq); 2395 flush_workqueue(pool->wq);
2155 2396 (void) commit_or_fallback(pool);
2156 r = dm_pool_commit_metadata(pool->pmd);
2157 if (r < 0) {
2158 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
2159 __func__, r);
2160 /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
2161 }
2162} 2397}
2163 2398
2164static int check_arg_count(unsigned argc, unsigned args_required) 2399static int check_arg_count(unsigned argc, unsigned args_required)
@@ -2292,6 +2527,8 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct
2292 if (r) 2527 if (r)
2293 return r; 2528 return r;
2294 2529
2530 (void) commit_or_fallback(pool);
2531
2295 r = dm_pool_reserve_metadata_snap(pool->pmd); 2532 r = dm_pool_reserve_metadata_snap(pool->pmd);
2296 if (r) 2533 if (r)
2297 DMWARN("reserve_metadata_snap message failed."); 2534 DMWARN("reserve_metadata_snap message failed.");
@@ -2351,25 +2588,41 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2351 else 2588 else
2352 DMWARN("Unrecognised thin pool target message received: %s", argv[0]); 2589 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2353 2590
2354 if (!r) { 2591 if (!r)
2355 r = dm_pool_commit_metadata(pool->pmd); 2592 (void) commit_or_fallback(pool);
2356 if (r)
2357 DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
2358 argv[0], r);
2359 }
2360 2593
2361 return r; 2594 return r;
2362} 2595}
2363 2596
2597static void emit_flags(struct pool_features *pf, char *result,
2598 unsigned sz, unsigned maxlen)
2599{
2600 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2601 !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
2602 DMEMIT("%u ", count);
2603
2604 if (!pf->zero_new_blocks)
2605 DMEMIT("skip_block_zeroing ");
2606
2607 if (!pf->discard_enabled)
2608 DMEMIT("ignore_discard ");
2609
2610 if (!pf->discard_passdown)
2611 DMEMIT("no_discard_passdown ");
2612
2613 if (pf->mode == PM_READ_ONLY)
2614 DMEMIT("read_only ");
2615}
2616
2364/* 2617/*
2365 * Status line is: 2618 * Status line is:
2366 * <transaction id> <used metadata sectors>/<total metadata sectors> 2619 * <transaction id> <used metadata sectors>/<total metadata sectors>
2367 * <used data sectors>/<total data sectors> <held metadata root> 2620 * <used data sectors>/<total data sectors> <held metadata root>
2368 */ 2621 */
2369static int pool_status(struct dm_target *ti, status_type_t type, 2622static int pool_status(struct dm_target *ti, status_type_t type,
2370 char *result, unsigned maxlen) 2623 unsigned status_flags, char *result, unsigned maxlen)
2371{ 2624{
2372 int r, count; 2625 int r;
2373 unsigned sz = 0; 2626 unsigned sz = 0;
2374 uint64_t transaction_id; 2627 uint64_t transaction_id;
2375 dm_block_t nr_free_blocks_data; 2628 dm_block_t nr_free_blocks_data;
@@ -2384,6 +2637,15 @@ static int pool_status(struct dm_target *ti, status_type_t type,
2384 2637
2385 switch (type) { 2638 switch (type) {
2386 case STATUSTYPE_INFO: 2639 case STATUSTYPE_INFO:
2640 if (get_pool_mode(pool) == PM_FAIL) {
2641 DMEMIT("Fail");
2642 break;
2643 }
2644
2645 /* Commit to ensure statistics aren't out-of-date */
2646 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2647 (void) commit_or_fallback(pool);
2648
2387 r = dm_pool_get_metadata_transaction_id(pool->pmd, 2649 r = dm_pool_get_metadata_transaction_id(pool->pmd,
2388 &transaction_id); 2650 &transaction_id);
2389 if (r) 2651 if (r)
@@ -2419,9 +2681,19 @@ static int pool_status(struct dm_target *ti, status_type_t type,
2419 (unsigned long long)nr_blocks_data); 2681 (unsigned long long)nr_blocks_data);
2420 2682
2421 if (held_root) 2683 if (held_root)
2422 DMEMIT("%llu", held_root); 2684 DMEMIT("%llu ", held_root);
2685 else
2686 DMEMIT("- ");
2687
2688 if (pool->pf.mode == PM_READ_ONLY)
2689 DMEMIT("ro ");
2690 else
2691 DMEMIT("rw ");
2692
2693 if (pool->pf.discard_enabled && pool->pf.discard_passdown)
2694 DMEMIT("discard_passdown");
2423 else 2695 else
2424 DMEMIT("-"); 2696 DMEMIT("no_discard_passdown");
2425 2697
2426 break; 2698 break;
2427 2699
@@ -2431,20 +2703,7 @@ static int pool_status(struct dm_target *ti, status_type_t type,
2431 format_dev_t(buf2, pt->data_dev->bdev->bd_dev), 2703 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2432 (unsigned long)pool->sectors_per_block, 2704 (unsigned long)pool->sectors_per_block,
2433 (unsigned long long)pt->low_water_blocks); 2705 (unsigned long long)pt->low_water_blocks);
2434 2706 emit_flags(&pt->pf, result, sz, maxlen);
2435 count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
2436 !pt->pf.discard_passdown;
2437 DMEMIT("%u ", count);
2438
2439 if (!pool->pf.zero_new_blocks)
2440 DMEMIT("skip_block_zeroing ");
2441
2442 if (!pool->pf.discard_enabled)
2443 DMEMIT("ignore_discard ");
2444
2445 if (!pt->pf.discard_passdown)
2446 DMEMIT("no_discard_passdown ");
2447
2448 break; 2707 break;
2449 } 2708 }
2450 2709
@@ -2482,7 +2741,8 @@ static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
2482 2741
2483 /* 2742 /*
2484 * This is just a hint, and not enforced. We have to cope with 2743 * This is just a hint, and not enforced. We have to cope with
2485 * bios that overlap 2 blocks. 2744 * bios that cover a block partially. A discard that spans a block
2745 * boundary is not sent to this target.
2486 */ 2746 */
2487 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; 2747 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2488 limits->discard_zeroes_data = pool->pf.zero_new_blocks; 2748 limits->discard_zeroes_data = pool->pf.zero_new_blocks;
@@ -2503,7 +2763,7 @@ static struct target_type pool_target = {
2503 .name = "thin-pool", 2763 .name = "thin-pool",
2504 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2764 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2505 DM_TARGET_IMMUTABLE, 2765 DM_TARGET_IMMUTABLE,
2506 .version = {1, 2, 0}, 2766 .version = {1, 3, 0},
2507 .module = THIS_MODULE, 2767 .module = THIS_MODULE,
2508 .ctr = pool_ctr, 2768 .ctr = pool_ctr,
2509 .dtr = pool_dtr, 2769 .dtr = pool_dtr,
@@ -2608,19 +2868,31 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2608 } 2868 }
2609 __pool_inc(tc->pool); 2869 __pool_inc(tc->pool);
2610 2870
2871 if (get_pool_mode(tc->pool) == PM_FAIL) {
2872 ti->error = "Couldn't open thin device, Pool is in fail mode";
2873 goto bad_thin_open;
2874 }
2875
2611 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); 2876 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2612 if (r) { 2877 if (r) {
2613 ti->error = "Couldn't open thin internal device"; 2878 ti->error = "Couldn't open thin internal device";
2614 goto bad_thin_open; 2879 goto bad_thin_open;
2615 } 2880 }
2616 2881
2617 ti->split_io = tc->pool->sectors_per_block; 2882 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2883 if (r)
2884 goto bad_thin_open;
2885
2618 ti->num_flush_requests = 1; 2886 ti->num_flush_requests = 1;
2887 ti->flush_supported = true;
2619 2888
2620 /* In case the pool supports discards, pass them on. */ 2889 /* In case the pool supports discards, pass them on. */
2621 if (tc->pool->pf.discard_enabled) { 2890 if (tc->pool->pf.discard_enabled) {
2622 ti->discards_supported = 1; 2891 ti->discards_supported = true;
2623 ti->num_discard_requests = 1; 2892 ti->num_discard_requests = 1;
2893 ti->discard_zeroes_data_unsupported = true;
2894 /* Discard requests must be split on a block boundary */
2895 ti->split_discard_requests = true;
2624 } 2896 }
2625 2897
2626 dm_put(pool_md); 2898 dm_put(pool_md);
@@ -2701,7 +2973,7 @@ static void thin_postsuspend(struct dm_target *ti)
2701 * <nr mapped sectors> <highest mapped sector> 2973 * <nr mapped sectors> <highest mapped sector>
2702 */ 2974 */
2703static int thin_status(struct dm_target *ti, status_type_t type, 2975static int thin_status(struct dm_target *ti, status_type_t type,
2704 char *result, unsigned maxlen) 2976 unsigned status_flags, char *result, unsigned maxlen)
2705{ 2977{
2706 int r; 2978 int r;
2707 ssize_t sz = 0; 2979 ssize_t sz = 0;
@@ -2709,6 +2981,11 @@ static int thin_status(struct dm_target *ti, status_type_t type,
2709 char buf[BDEVNAME_SIZE]; 2981 char buf[BDEVNAME_SIZE];
2710 struct thin_c *tc = ti->private; 2982 struct thin_c *tc = ti->private;
2711 2983
2984 if (get_pool_mode(tc->pool) == PM_FAIL) {
2985 DMEMIT("Fail");
2986 return 0;
2987 }
2988
2712 if (!tc->td) 2989 if (!tc->td)
2713 DMEMIT("-"); 2990 DMEMIT("-");
2714 else { 2991 else {
@@ -2746,19 +3023,21 @@ static int thin_status(struct dm_target *ti, status_type_t type,
2746static int thin_iterate_devices(struct dm_target *ti, 3023static int thin_iterate_devices(struct dm_target *ti,
2747 iterate_devices_callout_fn fn, void *data) 3024 iterate_devices_callout_fn fn, void *data)
2748{ 3025{
2749 dm_block_t blocks; 3026 sector_t blocks;
2750 struct thin_c *tc = ti->private; 3027 struct thin_c *tc = ti->private;
3028 struct pool *pool = tc->pool;
2751 3029
2752 /* 3030 /*
2753 * We can't call dm_pool_get_data_dev_size() since that blocks. So 3031 * We can't call dm_pool_get_data_dev_size() since that blocks. So
2754 * we follow a more convoluted path through to the pool's target. 3032 * we follow a more convoluted path through to the pool's target.
2755 */ 3033 */
2756 if (!tc->pool->ti) 3034 if (!pool->ti)
2757 return 0; /* nothing is bound */ 3035 return 0; /* nothing is bound */
2758 3036
2759 blocks = tc->pool->ti->len >> tc->pool->block_shift; 3037 blocks = pool->ti->len;
3038 (void) sector_div(blocks, pool->sectors_per_block);
2760 if (blocks) 3039 if (blocks)
2761 return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data); 3040 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
2762 3041
2763 return 0; 3042 return 0;
2764} 3043}
@@ -2775,7 +3054,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2775 3054
2776static struct target_type thin_target = { 3055static struct target_type thin_target = {
2777 .name = "thin", 3056 .name = "thin",
2778 .version = {1, 1, 0}, 3057 .version = {1, 3, 0},
2779 .module = THIS_MODULE, 3058 .module = THIS_MODULE,
2780 .ctr = thin_ctr, 3059 .ctr = thin_ctr,
2781 .dtr = thin_dtr, 3060 .dtr = thin_dtr,
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index fa365d39b612..254d19268ad2 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -515,7 +515,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio,
515 * Status: V (valid) or C (corruption found) 515 * Status: V (valid) or C (corruption found)
516 */ 516 */
517static int verity_status(struct dm_target *ti, status_type_t type, 517static int verity_status(struct dm_target *ti, status_type_t type,
518 char *result, unsigned maxlen) 518 unsigned status_flags, char *result, unsigned maxlen)
519{ 519{
520 struct dm_verity *v = ti->private; 520 struct dm_verity *v = ti->private;
521 unsigned sz = 0; 521 unsigned sz = 0;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e24143cc2040..4e09b6ff5b49 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -968,22 +968,41 @@ static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti
968static sector_t max_io_len(sector_t sector, struct dm_target *ti) 968static sector_t max_io_len(sector_t sector, struct dm_target *ti)
969{ 969{
970 sector_t len = max_io_len_target_boundary(sector, ti); 970 sector_t len = max_io_len_target_boundary(sector, ti);
971 sector_t offset, max_len;
971 972
972 /* 973 /*
973 * Does the target need to split even further ? 974 * Does the target need to split even further?
974 */ 975 */
975 if (ti->split_io) { 976 if (ti->max_io_len) {
976 sector_t boundary; 977 offset = dm_target_offset(ti, sector);
977 sector_t offset = dm_target_offset(ti, sector); 978 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
978 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) 979 max_len = sector_div(offset, ti->max_io_len);
979 - offset; 980 else
980 if (len > boundary) 981 max_len = offset & (ti->max_io_len - 1);
981 len = boundary; 982 max_len = ti->max_io_len - max_len;
983
984 if (len > max_len)
985 len = max_len;
982 } 986 }
983 987
984 return len; 988 return len;
985} 989}
986 990
991int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
992{
993 if (len > UINT_MAX) {
994 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
995 (unsigned long long)len, UINT_MAX);
996 ti->error = "Maximum size of target IO is too large";
997 return -EINVAL;
998 }
999
1000 ti->max_io_len = (uint32_t) len;
1001
1002 return 0;
1003}
1004EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1005
987static void __map_bio(struct dm_target *ti, struct bio *clone, 1006static void __map_bio(struct dm_target *ti, struct bio *clone,
988 struct dm_target_io *tio) 1007 struct dm_target_io *tio)
989{ 1008{
@@ -1196,7 +1215,10 @@ static int __clone_and_map_discard(struct clone_info *ci)
1196 if (!ti->num_discard_requests) 1215 if (!ti->num_discard_requests)
1197 return -EOPNOTSUPP; 1216 return -EOPNOTSUPP;
1198 1217
1199 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1218 if (!ti->split_discard_requests)
1219 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1220 else
1221 len = min(ci->sector_count, max_io_len(ci->sector, ti));
1200 1222
1201 __issue_target_requests(ci, ti, ti->num_discard_requests, len); 1223 __issue_target_requests(ci, ti, ti->num_discard_requests, len);
1202 1224
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index b7dacd59d8d7..52eef493d266 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -23,6 +23,11 @@
23#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) 23#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
24 24
25/* 25/*
26 * Status feature flags
27 */
28#define DM_STATUS_NOFLUSH_FLAG (1 << 0)
29
30/*
26 * Type of table and mapped_device's mempool 31 * Type of table and mapped_device's mempool
27 */ 32 */
28#define DM_TYPE_NONE 0 33#define DM_TYPE_NONE 0
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1c2f9048e1ae..fcd098794d37 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -498,61 +498,13 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
498} 498}
499EXPORT_SYMBOL(md_flush_request); 499EXPORT_SYMBOL(md_flush_request);
500 500
501/* Support for plugging. 501void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
502 * This mirrors the plugging support in request_queue, but does not
503 * require having a whole queue or request structures.
504 * We allocate an md_plug_cb for each md device and each thread it gets
505 * plugged on. This links tot the private plug_handle structure in the
506 * personality data where we keep a count of the number of outstanding
507 * plugs so other code can see if a plug is active.
508 */
509struct md_plug_cb {
510 struct blk_plug_cb cb;
511 struct mddev *mddev;
512};
513
514static void plugger_unplug(struct blk_plug_cb *cb)
515{
516 struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb);
517 if (atomic_dec_and_test(&mdcb->mddev->plug_cnt))
518 md_wakeup_thread(mdcb->mddev->thread);
519 kfree(mdcb);
520}
521
522/* Check that an unplug wakeup will come shortly.
523 * If not, wakeup the md thread immediately
524 */
525int mddev_check_plugged(struct mddev *mddev)
526{ 502{
527 struct blk_plug *plug = current->plug; 503 struct mddev *mddev = cb->data;
528 struct md_plug_cb *mdcb; 504 md_wakeup_thread(mddev->thread);
529 505 kfree(cb);
530 if (!plug)
531 return 0;
532
533 list_for_each_entry(mdcb, &plug->cb_list, cb.list) {
534 if (mdcb->cb.callback == plugger_unplug &&
535 mdcb->mddev == mddev) {
536 /* Already on the list, move to top */
537 if (mdcb != list_first_entry(&plug->cb_list,
538 struct md_plug_cb,
539 cb.list))
540 list_move(&mdcb->cb.list, &plug->cb_list);
541 return 1;
542 }
543 }
544 /* Not currently on the callback list */
545 mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC);
546 if (!mdcb)
547 return 0;
548
549 mdcb->mddev = mddev;
550 mdcb->cb.callback = plugger_unplug;
551 atomic_inc(&mddev->plug_cnt);
552 list_add(&mdcb->cb.list, &plug->cb_list);
553 return 1;
554} 506}
555EXPORT_SYMBOL_GPL(mddev_check_plugged); 507EXPORT_SYMBOL(md_unplug);
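The hand-rolled md_plug_cb machinery removed above is replaced by the block layer's generic per-plug callback registration, so md_unplug() only has to wake the md thread and free the callback. A rough userspace sketch of that register-once, flush-later pattern, with toy_* types standing in for the real plug structures (this is not the block layer API):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_plug_cb {
        struct toy_plug_cb *next;
        void (*callback)(struct toy_plug_cb *cb, bool from_schedule);
        void *data;
};

struct toy_plug {
        struct toy_plug_cb *cb_list;
};

/* Register callback+data once per plug; return the (possibly existing) entry. */
static struct toy_plug_cb *toy_check_plugged(struct toy_plug *plug,
                void (*cb)(struct toy_plug_cb *, bool), void *data)
{
        struct toy_plug_cb *p;

        for (p = plug->cb_list; p; p = p->next)
                if (p->callback == cb && p->data == data)
                        return p;               /* already registered */

        p = malloc(sizeof(*p));
        if (!p)
                return NULL;
        p->callback = cb;
        p->data = data;
        p->next = plug->cb_list;
        plug->cb_list = p;
        return p;
}

/* Flush the plug: fire each registered callback exactly once. */
static void toy_flush_plug(struct toy_plug *plug, bool from_schedule)
{
        struct toy_plug_cb *p = plug->cb_list, *next;

        plug->cb_list = NULL;
        for (; p; p = next) {
                next = p->next;
                p->callback(p, from_schedule);  /* callback frees its entry */
        }
}

static void wake_cb(struct toy_plug_cb *cb, bool from_schedule)
{
        printf("waking worker for %s (from_schedule=%d)\n",
               (const char *)cb->data, (int)from_schedule);
        free(cb);
}

int main(void)
{
        struct toy_plug plug = { .cb_list = NULL };
        char name[] = "md0";

        toy_check_plugged(&plug, wake_cb, name);
        toy_check_plugged(&plug, wake_cb, name);        /* deduplicated */
        toy_flush_plug(&plug, false);                   /* wake_cb fires once */
        return 0;
}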
556 508
557static inline struct mddev *mddev_get(struct mddev *mddev) 509static inline struct mddev *mddev_get(struct mddev *mddev)
558{ 510{
@@ -602,7 +554,6 @@ void mddev_init(struct mddev *mddev)
602 atomic_set(&mddev->active, 1); 554 atomic_set(&mddev->active, 1);
603 atomic_set(&mddev->openers, 0); 555 atomic_set(&mddev->openers, 0);
604 atomic_set(&mddev->active_io, 0); 556 atomic_set(&mddev->active_io, 0);
605 atomic_set(&mddev->plug_cnt, 0);
606 spin_lock_init(&mddev->write_lock); 557 spin_lock_init(&mddev->write_lock);
607 atomic_set(&mddev->flush_pending, 0); 558 atomic_set(&mddev->flush_pending, 0);
608 init_waitqueue_head(&mddev->sb_wait); 559 init_waitqueue_head(&mddev->sb_wait);
@@ -2931,6 +2882,7 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len)
2931 * can be sane */ 2882 * can be sane */
2932 return -EBUSY; 2883 return -EBUSY;
2933 rdev->data_offset = offset; 2884 rdev->data_offset = offset;
2885 rdev->new_data_offset = offset;
2934 return len; 2886 return len;
2935} 2887}
2936 2888
@@ -3926,8 +3878,8 @@ array_state_show(struct mddev *mddev, char *page)
3926 return sprintf(page, "%s\n", array_states[st]); 3878 return sprintf(page, "%s\n", array_states[st]);
3927} 3879}
3928 3880
3929static int do_md_stop(struct mddev * mddev, int ro, int is_open); 3881static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev);
3930static int md_set_readonly(struct mddev * mddev, int is_open); 3882static int md_set_readonly(struct mddev * mddev, struct block_device *bdev);
3931static int do_md_run(struct mddev * mddev); 3883static int do_md_run(struct mddev * mddev);
3932static int restart_array(struct mddev *mddev); 3884static int restart_array(struct mddev *mddev);
3933 3885
@@ -3941,24 +3893,20 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3941 break; 3893 break;
3942 case clear: 3894 case clear:
3943 /* stopping an active array */ 3895 /* stopping an active array */
3944 if (atomic_read(&mddev->openers) > 0) 3896 err = do_md_stop(mddev, 0, NULL);
3945 return -EBUSY;
3946 err = do_md_stop(mddev, 0, 0);
3947 break; 3897 break;
3948 case inactive: 3898 case inactive:
3949 /* stopping an active array */ 3899 /* stopping an active array */
3950 if (mddev->pers) { 3900 if (mddev->pers)
3951 if (atomic_read(&mddev->openers) > 0) 3901 err = do_md_stop(mddev, 2, NULL);
3952 return -EBUSY; 3902 else
3953 err = do_md_stop(mddev, 2, 0);
3954 } else
3955 err = 0; /* already inactive */ 3903 err = 0; /* already inactive */
3956 break; 3904 break;
3957 case suspended: 3905 case suspended:
3958 break; /* not supported yet */ 3906 break; /* not supported yet */
3959 case readonly: 3907 case readonly:
3960 if (mddev->pers) 3908 if (mddev->pers)
3961 err = md_set_readonly(mddev, 0); 3909 err = md_set_readonly(mddev, NULL);
3962 else { 3910 else {
3963 mddev->ro = 1; 3911 mddev->ro = 1;
3964 set_disk_ro(mddev->gendisk, 1); 3912 set_disk_ro(mddev->gendisk, 1);
@@ -3968,7 +3916,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3968 case read_auto: 3916 case read_auto:
3969 if (mddev->pers) { 3917 if (mddev->pers) {
3970 if (mddev->ro == 0) 3918 if (mddev->ro == 0)
3971 err = md_set_readonly(mddev, 0); 3919 err = md_set_readonly(mddev, NULL);
3972 else if (mddev->ro == 1) 3920 else if (mddev->ro == 1)
3973 err = restart_array(mddev); 3921 err = restart_array(mddev);
3974 if (err == 0) { 3922 if (err == 0) {
@@ -5351,15 +5299,17 @@ void md_stop(struct mddev *mddev)
5351} 5299}
5352EXPORT_SYMBOL_GPL(md_stop); 5300EXPORT_SYMBOL_GPL(md_stop);
5353 5301
5354static int md_set_readonly(struct mddev *mddev, int is_open) 5302static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5355{ 5303{
5356 int err = 0; 5304 int err = 0;
5357 mutex_lock(&mddev->open_mutex); 5305 mutex_lock(&mddev->open_mutex);
5358 if (atomic_read(&mddev->openers) > is_open) { 5306 if (atomic_read(&mddev->openers) > !!bdev) {
5359 printk("md: %s still in use.\n",mdname(mddev)); 5307 printk("md: %s still in use.\n",mdname(mddev));
5360 err = -EBUSY; 5308 err = -EBUSY;
5361 goto out; 5309 goto out;
5362 } 5310 }
5311 if (bdev)
5312 sync_blockdev(bdev);
5363 if (mddev->pers) { 5313 if (mddev->pers) {
5364 __md_stop_writes(mddev); 5314 __md_stop_writes(mddev);
5365 5315
@@ -5381,18 +5331,26 @@ out:
5381 * 0 - completely stop and dis-assemble array 5331 * 0 - completely stop and dis-assemble array
5382 * 2 - stop but do not disassemble array 5332 * 2 - stop but do not disassemble array
5383 */ 5333 */
5384static int do_md_stop(struct mddev * mddev, int mode, int is_open) 5334static int do_md_stop(struct mddev * mddev, int mode,
5335 struct block_device *bdev)
5385{ 5336{
5386 struct gendisk *disk = mddev->gendisk; 5337 struct gendisk *disk = mddev->gendisk;
5387 struct md_rdev *rdev; 5338 struct md_rdev *rdev;
5388 5339
5389 mutex_lock(&mddev->open_mutex); 5340 mutex_lock(&mddev->open_mutex);
5390 if (atomic_read(&mddev->openers) > is_open || 5341 if (atomic_read(&mddev->openers) > !!bdev ||
5391 mddev->sysfs_active) { 5342 mddev->sysfs_active) {
5392 printk("md: %s still in use.\n",mdname(mddev)); 5343 printk("md: %s still in use.\n",mdname(mddev));
5393 mutex_unlock(&mddev->open_mutex); 5344 mutex_unlock(&mddev->open_mutex);
5394 return -EBUSY; 5345 return -EBUSY;
5395 } 5346 }
5347 if (bdev)
5348 /* It is possible IO was issued on some other
5349 * open file which was closed before we took ->open_mutex.
5350 * As that was not the last close __blkdev_put will not
5351 * have called sync_blockdev, so we must.
5352 */
5353 sync_blockdev(bdev);
5396 5354
5397 if (mddev->pers) { 5355 if (mddev->pers) {
5398 if (mddev->ro) 5356 if (mddev->ro)
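Both md_set_readonly() and do_md_stop() now take the opening block device instead of an is_open count and compare the opener count against !!bdev: one opener is tolerated when the request arrived through an open device node, none otherwise. A tiny standalone illustration of that test (still_in_use() is a made-up helper, not md code):

#include <assert.h>
#include <stddef.h>

/* !!ptr collapses a pointer to 0 or 1, so the busy test allows exactly
 * one opener when the caller itself holds the device open. */
static int still_in_use(int openers, const void *bdev)
{
        return openers > !!bdev;
}

int main(void)
{
        int dummy;

        assert(still_in_use(1, NULL) == 1);     /* someone else has it open */
        assert(still_in_use(1, &dummy) == 0);   /* only the ioctl caller's fd */
        assert(still_in_use(2, &dummy) == 1);   /* a second opener: refuse */
        return 0;
}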
@@ -5466,7 +5424,7 @@ static void autorun_array(struct mddev *mddev)
5466 err = do_md_run(mddev); 5424 err = do_md_run(mddev);
5467 if (err) { 5425 if (err) {
5468 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 5426 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
5469 do_md_stop(mddev, 0, 0); 5427 do_md_stop(mddev, 0, NULL);
5470 } 5428 }
5471} 5429}
5472 5430
@@ -5784,8 +5742,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5784 super_types[mddev->major_version]. 5742 super_types[mddev->major_version].
5785 validate_super(mddev, rdev); 5743 validate_super(mddev, rdev);
5786 if ((info->state & (1<<MD_DISK_SYNC)) && 5744 if ((info->state & (1<<MD_DISK_SYNC)) &&
5787 (!test_bit(In_sync, &rdev->flags) || 5745 rdev->raid_disk != info->raid_disk) {
5788 rdev->raid_disk != info->raid_disk)) {
5789 /* This was a hot-add request, but events doesn't 5746 /* This was a hot-add request, but events doesn't
5790 * match, so reject it. 5747 * match, so reject it.
5791 */ 5748 */
@@ -6482,11 +6439,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6482 goto done_unlock; 6439 goto done_unlock;
6483 6440
6484 case STOP_ARRAY: 6441 case STOP_ARRAY:
6485 err = do_md_stop(mddev, 0, 1); 6442 err = do_md_stop(mddev, 0, bdev);
6486 goto done_unlock; 6443 goto done_unlock;
6487 6444
6488 case STOP_ARRAY_RO: 6445 case STOP_ARRAY_RO:
6489 err = md_set_readonly(mddev, 1); 6446 err = md_set_readonly(mddev, bdev);
6490 goto done_unlock; 6447 goto done_unlock;
6491 6448
6492 case BLKROSET: 6449 case BLKROSET:
@@ -6751,7 +6708,7 @@ struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev
6751 thread->tsk = kthread_run(md_thread, thread, 6708 thread->tsk = kthread_run(md_thread, thread,
6752 "%s_%s", 6709 "%s_%s",
6753 mdname(thread->mddev), 6710 mdname(thread->mddev),
6754 name ?: mddev->pers->name); 6711 name);
6755 if (IS_ERR(thread->tsk)) { 6712 if (IS_ERR(thread->tsk)) {
6756 kfree(thread); 6713 kfree(thread);
6757 return NULL; 6714 return NULL;
@@ -7298,6 +7255,7 @@ void md_do_sync(struct mddev *mddev)
7298 int skipped = 0; 7255 int skipped = 0;
7299 struct md_rdev *rdev; 7256 struct md_rdev *rdev;
7300 char *desc; 7257 char *desc;
7258 struct blk_plug plug;
7301 7259
7302 /* just incase thread restarts... */ 7260 /* just incase thread restarts... */
7303 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 7261 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -7447,6 +7405,7 @@ void md_do_sync(struct mddev *mddev)
7447 } 7405 }
7448 mddev->curr_resync_completed = j; 7406 mddev->curr_resync_completed = j;
7449 7407
7408 blk_start_plug(&plug);
7450 while (j < max_sectors) { 7409 while (j < max_sectors) {
7451 sector_t sectors; 7410 sector_t sectors;
7452 7411
@@ -7552,6 +7511,7 @@ void md_do_sync(struct mddev *mddev)
7552 * this also signals 'finished resyncing' to md_stop 7511 * this also signals 'finished resyncing' to md_stop
7553 */ 7512 */
7554 out: 7513 out:
7514 blk_finish_plug(&plug);
7555 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 7515 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
7556 7516
7557 /* tell personality that we are finished */ 7517 /* tell personality that we are finished */
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7b4a3c318cae..f385b038589d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -266,9 +266,6 @@ struct mddev {
266 int new_chunk_sectors; 266 int new_chunk_sectors;
267 int reshape_backwards; 267 int reshape_backwards;
268 268
269 atomic_t plug_cnt; /* If device is expecting
270 * more bios soon.
271 */
272 struct md_thread *thread; /* management thread */ 269 struct md_thread *thread; /* management thread */
273 struct md_thread *sync_thread; /* doing resync or reconstruct */ 270 struct md_thread *sync_thread; /* doing resync or reconstruct */
274 sector_t curr_resync; /* last block scheduled */ 271 sector_t curr_resync; /* last block scheduled */
@@ -630,6 +627,12 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
630 struct mddev *mddev); 627 struct mddev *mddev);
631extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, 628extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
632 struct mddev *mddev); 629 struct mddev *mddev);
633extern int mddev_check_plugged(struct mddev *mddev);
634extern void md_trim_bio(struct bio *bio, int offset, int size); 630extern void md_trim_bio(struct bio *bio, int offset, int size);
631
632extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
633static inline int mddev_check_plugged(struct mddev *mddev)
634{
635 return !!blk_check_plugged(md_unplug, mddev,
636 sizeof(struct blk_plug_cb));
637}
635#endif /* _MD_MD_H */ 638#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 9339e67fcc79..61a1833ebaf3 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -474,7 +474,8 @@ static int multipath_run (struct mddev *mddev)
474 } 474 }
475 475
476 { 476 {
477 mddev->thread = md_register_thread(multipathd, mddev, NULL); 477 mddev->thread = md_register_thread(multipathd, mddev,
478 "multipath");
478 if (!mddev->thread) { 479 if (!mddev->thread) {
479 printk(KERN_ERR "multipath: couldn't allocate thread" 480 printk(KERN_ERR "multipath: couldn't allocate thread"
480 " for %s\n", mdname(mddev)); 481 " for %s\n", mdname(mddev));
diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile
index cfa95f662230..d8e7cb767c1e 100644
--- a/drivers/md/persistent-data/Makefile
+++ b/drivers/md/persistent-data/Makefile
@@ -1,7 +1,6 @@
1obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o 1obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
2dm-persistent-data-objs := \ 2dm-persistent-data-objs := \
3 dm-block-manager.o \ 3 dm-block-manager.o \
4 dm-space-map-checker.o \
5 dm-space-map-common.o \ 4 dm-space-map-common.o \
6 dm-space-map-disk.o \ 5 dm-space-map-disk.o \
7 dm-space-map-metadata.o \ 6 dm-space-map-metadata.o \
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 0317ecdc6e53..5ba277768d99 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -325,11 +325,6 @@ static struct dm_buffer *to_buffer(struct dm_block *b)
325 return (struct dm_buffer *) b; 325 return (struct dm_buffer *) b;
326} 326}
327 327
328static struct dm_bufio_client *to_bufio(struct dm_block_manager *bm)
329{
330 return (struct dm_bufio_client *) bm;
331}
332
333dm_block_t dm_block_location(struct dm_block *b) 328dm_block_t dm_block_location(struct dm_block *b)
334{ 329{
335 return dm_bufio_get_block_number(to_buffer(b)); 330 return dm_bufio_get_block_number(to_buffer(b));
@@ -367,34 +362,60 @@ static void dm_block_manager_write_callback(struct dm_buffer *buf)
367/*---------------------------------------------------------------- 362/*----------------------------------------------------------------
368 * Public interface 363 * Public interface
369 *--------------------------------------------------------------*/ 364 *--------------------------------------------------------------*/
365struct dm_block_manager {
366 struct dm_bufio_client *bufio;
367 bool read_only:1;
368};
369
370struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, 370struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
371 unsigned block_size, 371 unsigned block_size,
372 unsigned cache_size, 372 unsigned cache_size,
373 unsigned max_held_per_thread) 373 unsigned max_held_per_thread)
374{ 374{
375 return (struct dm_block_manager *) 375 int r;
376 dm_bufio_client_create(bdev, block_size, max_held_per_thread, 376 struct dm_block_manager *bm;
377 sizeof(struct buffer_aux), 377
378 dm_block_manager_alloc_callback, 378 bm = kmalloc(sizeof(*bm), GFP_KERNEL);
379 dm_block_manager_write_callback); 379 if (!bm) {
380 r = -ENOMEM;
381 goto bad;
382 }
383
384 bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread,
385 sizeof(struct buffer_aux),
386 dm_block_manager_alloc_callback,
387 dm_block_manager_write_callback);
388 if (IS_ERR(bm->bufio)) {
389 r = PTR_ERR(bm->bufio);
390 kfree(bm);
391 goto bad;
392 }
393
394 bm->read_only = false;
395
396 return bm;
397
398bad:
399 return ERR_PTR(r);
380} 400}
381EXPORT_SYMBOL_GPL(dm_block_manager_create); 401EXPORT_SYMBOL_GPL(dm_block_manager_create);
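dm_block_manager is now a real structure wrapping the bufio client plus a read_only flag, and creation failures are reported through the usual error-in-pointer convention. A userspace sketch of that idiom with simplified re-implementations (toy_err_ptr() and friends are illustrative, not the kernel's ERR_PTR/IS_ERR/PTR_ERR macros):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static void *toy_err_ptr(long err)      { return (void *)err; }
static long  toy_ptr_err(const void *p) { return (long)p; }
static int   toy_is_err(const void *p)
{
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

struct toy_bm { int block_size; };

static struct toy_bm *toy_bm_create(int block_size)
{
        struct toy_bm *bm;

        if (block_size <= 0)
                return toy_err_ptr(-EINVAL);
        bm = malloc(sizeof(*bm));
        if (!bm)
                return toy_err_ptr(-ENOMEM);
        bm->block_size = block_size;
        return bm;
}

int main(void)
{
        struct toy_bm *bm = toy_bm_create(0);

        if (toy_is_err(bm))
                printf("create failed: %ld\n", toy_ptr_err(bm));

        bm = toy_bm_create(4096);
        if (!toy_is_err(bm)) {
                printf("block size %d\n", bm->block_size);
                free(bm);
        }
        return 0;
}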
382 402
383void dm_block_manager_destroy(struct dm_block_manager *bm) 403void dm_block_manager_destroy(struct dm_block_manager *bm)
384{ 404{
385 return dm_bufio_client_destroy(to_bufio(bm)); 405 dm_bufio_client_destroy(bm->bufio);
406 kfree(bm);
386} 407}
387EXPORT_SYMBOL_GPL(dm_block_manager_destroy); 408EXPORT_SYMBOL_GPL(dm_block_manager_destroy);
388 409
389unsigned dm_bm_block_size(struct dm_block_manager *bm) 410unsigned dm_bm_block_size(struct dm_block_manager *bm)
390{ 411{
391 return dm_bufio_get_block_size(to_bufio(bm)); 412 return dm_bufio_get_block_size(bm->bufio);
392} 413}
393EXPORT_SYMBOL_GPL(dm_bm_block_size); 414EXPORT_SYMBOL_GPL(dm_bm_block_size);
394 415
395dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm) 416dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
396{ 417{
397 return dm_bufio_get_device_size(to_bufio(bm)); 418 return dm_bufio_get_device_size(bm->bufio);
398} 419}
399 420
400static int dm_bm_validate_buffer(struct dm_block_manager *bm, 421static int dm_bm_validate_buffer(struct dm_block_manager *bm,
@@ -406,7 +427,7 @@ static int dm_bm_validate_buffer(struct dm_block_manager *bm,
406 int r; 427 int r;
407 if (!v) 428 if (!v)
408 return 0; 429 return 0;
409 r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(to_bufio(bm))); 430 r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio));
410 if (unlikely(r)) 431 if (unlikely(r))
411 return r; 432 return r;
412 aux->validator = v; 433 aux->validator = v;
@@ -430,7 +451,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
430 void *p; 451 void *p;
431 int r; 452 int r;
432 453
433 p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result); 454 p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
434 if (unlikely(IS_ERR(p))) 455 if (unlikely(IS_ERR(p)))
435 return PTR_ERR(p); 456 return PTR_ERR(p);
436 457
@@ -463,7 +484,10 @@ int dm_bm_write_lock(struct dm_block_manager *bm,
463 void *p; 484 void *p;
464 int r; 485 int r;
465 486
466 p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result); 487 if (bm->read_only)
488 return -EPERM;
489
490 p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
467 if (unlikely(IS_ERR(p))) 491 if (unlikely(IS_ERR(p)))
468 return PTR_ERR(p); 492 return PTR_ERR(p);
469 493
@@ -496,7 +520,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm,
496 void *p; 520 void *p;
497 int r; 521 int r;
498 522
499 p = dm_bufio_get(to_bufio(bm), b, (struct dm_buffer **) result); 523 p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
500 if (unlikely(IS_ERR(p))) 524 if (unlikely(IS_ERR(p)))
501 return PTR_ERR(p); 525 return PTR_ERR(p);
502 if (unlikely(!p)) 526 if (unlikely(!p))
@@ -529,7 +553,10 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
529 struct buffer_aux *aux; 553 struct buffer_aux *aux;
530 void *p; 554 void *p;
531 555
532 p = dm_bufio_new(to_bufio(bm), b, (struct dm_buffer **) result); 556 if (bm->read_only)
557 return -EPERM;
558
559 p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
533 if (unlikely(IS_ERR(p))) 560 if (unlikely(IS_ERR(p)))
534 return PTR_ERR(p); 561 return PTR_ERR(p);
535 562
@@ -547,6 +574,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
547 574
548 return 0; 575 return 0;
549} 576}
577EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero);
550 578
551int dm_bm_unlock(struct dm_block *b) 579int dm_bm_unlock(struct dm_block *b)
552{ 580{
@@ -565,45 +593,30 @@ int dm_bm_unlock(struct dm_block *b)
565} 593}
566EXPORT_SYMBOL_GPL(dm_bm_unlock); 594EXPORT_SYMBOL_GPL(dm_bm_unlock);
567 595
568int dm_bm_unlock_move(struct dm_block *b, dm_block_t n)
569{
570 struct buffer_aux *aux;
571
572 aux = dm_bufio_get_aux_data(to_buffer(b));
573
574 if (aux->write_locked) {
575 dm_bufio_mark_buffer_dirty(to_buffer(b));
576 bl_up_write(&aux->lock);
577 } else
578 bl_up_read(&aux->lock);
579
580 dm_bufio_release_move(to_buffer(b), n);
581 return 0;
582}
583
584int dm_bm_flush_and_unlock(struct dm_block_manager *bm, 596int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
585 struct dm_block *superblock) 597 struct dm_block *superblock)
586{ 598{
587 int r; 599 int r;
588 600
589 r = dm_bufio_write_dirty_buffers(to_bufio(bm)); 601 if (bm->read_only)
590 if (unlikely(r)) 602 return -EPERM;
591 return r; 603
592 r = dm_bufio_issue_flush(to_bufio(bm)); 604 r = dm_bufio_write_dirty_buffers(bm->bufio);
593 if (unlikely(r)) 605 if (unlikely(r)) {
606 dm_bm_unlock(superblock);
594 return r; 607 return r;
608 }
595 609
596 dm_bm_unlock(superblock); 610 dm_bm_unlock(superblock);
597 611
598 r = dm_bufio_write_dirty_buffers(to_bufio(bm)); 612 return dm_bufio_write_dirty_buffers(bm->bufio);
599 if (unlikely(r)) 613}
600 return r;
601 r = dm_bufio_issue_flush(to_bufio(bm));
602 if (unlikely(r))
603 return r;
604 614
605 return 0; 615void dm_bm_set_read_only(struct dm_block_manager *bm)
616{
617 bm->read_only = true;
606} 618}
619EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
607 620
608u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) 621u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
609{ 622{
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 924833d2dfa6..be5bff61be28 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -97,14 +97,6 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b,
97int dm_bm_unlock(struct dm_block *b); 97int dm_bm_unlock(struct dm_block *b);
98 98
99/* 99/*
100 * An optimisation; we often want to copy a block's contents to a new
101 * block. eg, as part of the shadowing operation. It's far better for
102 * bufio to do this move behind the scenes than hold 2 locks and memcpy the
103 * data.
104 */
105int dm_bm_unlock_move(struct dm_block *b, dm_block_t n);
106
107/*
108 * It's a common idiom to have a superblock that should be committed last. 100 * It's a common idiom to have a superblock that should be committed last.
109 * 101 *
110 * @superblock should be write-locked on entry. It will be unlocked during 102 * @superblock should be write-locked on entry. It will be unlocked during
@@ -116,6 +108,19 @@ int dm_bm_unlock_move(struct dm_block *b, dm_block_t n);
116int dm_bm_flush_and_unlock(struct dm_block_manager *bm, 108int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
117 struct dm_block *superblock); 109 struct dm_block *superblock);
118 110
111/*
112 * Switches the bm to a read only mode. Once read-only mode
113 * has been entered the following functions will return -EPERM.
114 *
115 * dm_bm_write_lock
116 * dm_bm_write_lock_zero
117 * dm_bm_flush_and_unlock
118 *
119 * Additionally, you should not use dm_bm_unlock_move; however, no error
120 * will be returned if you do.
121 */
122void dm_bm_set_read_only(struct dm_block_manager *bm);
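A userspace sketch of the read-only gate documented above: once the flag is set, every mutating entry point fails fast with -EPERM. The toy_bm type and helper are stand-ins, not the dm-block-manager interface.

#include <assert.h>
#include <errno.h>
#include <stdbool.h>

struct toy_bm { bool read_only; };

static int toy_bm_write_lock(struct toy_bm *bm)
{
        if (bm->read_only)
                return -EPERM;
        /* ... take the write lock and hand out the block ... */
        return 0;
}

int main(void)
{
        struct toy_bm bm = { .read_only = false };

        assert(toy_bm_write_lock(&bm) == 0);
        bm.read_only = true;                    /* dm_bm_set_read_only() analogue */
        assert(toy_bm_write_lock(&bm) == -EPERM);
        return 0;
}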
123
119u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); 124u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
120 125
121/*----------------------------------------------------------------*/ 126/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-space-map-checker.c b/drivers/md/persistent-data/dm-space-map-checker.c
deleted file mode 100644
index 50ed53bf4aa2..000000000000
--- a/drivers/md/persistent-data/dm-space-map-checker.c
+++ /dev/null
@@ -1,438 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map-checker.h"
8
9#include <linux/device-mapper.h>
10#include <linux/export.h>
11
12#ifdef CONFIG_DM_DEBUG_SPACE_MAPS
13
14#define DM_MSG_PREFIX "space map checker"
15
16/*----------------------------------------------------------------*/
17
18struct count_array {
19 dm_block_t nr;
20 dm_block_t nr_free;
21
22 uint32_t *counts;
23};
24
25static int ca_get_count(struct count_array *ca, dm_block_t b, uint32_t *count)
26{
27 if (b >= ca->nr)
28 return -EINVAL;
29
30 *count = ca->counts[b];
31 return 0;
32}
33
34static int ca_count_more_than_one(struct count_array *ca, dm_block_t b, int *r)
35{
36 if (b >= ca->nr)
37 return -EINVAL;
38
39 *r = ca->counts[b] > 1;
40 return 0;
41}
42
43static int ca_set_count(struct count_array *ca, dm_block_t b, uint32_t count)
44{
45 uint32_t old_count;
46
47 if (b >= ca->nr)
48 return -EINVAL;
49
50 old_count = ca->counts[b];
51
52 if (!count && old_count)
53 ca->nr_free++;
54
55 else if (count && !old_count)
56 ca->nr_free--;
57
58 ca->counts[b] = count;
59 return 0;
60}
61
62static int ca_inc_block(struct count_array *ca, dm_block_t b)
63{
64 if (b >= ca->nr)
65 return -EINVAL;
66
67 ca_set_count(ca, b, ca->counts[b] + 1);
68 return 0;
69}
70
71static int ca_dec_block(struct count_array *ca, dm_block_t b)
72{
73 if (b >= ca->nr)
74 return -EINVAL;
75
76 BUG_ON(ca->counts[b] == 0);
77 ca_set_count(ca, b, ca->counts[b] - 1);
78 return 0;
79}
80
81static int ca_create(struct count_array *ca, struct dm_space_map *sm)
82{
83 int r;
84 dm_block_t nr_blocks;
85
86 r = dm_sm_get_nr_blocks(sm, &nr_blocks);
87 if (r)
88 return r;
89
90 ca->nr = nr_blocks;
91 ca->nr_free = nr_blocks;
92 ca->counts = kzalloc(sizeof(*ca->counts) * nr_blocks, GFP_KERNEL);
93 if (!ca->counts)
94 return -ENOMEM;
95
96 return 0;
97}
98
99static int ca_load(struct count_array *ca, struct dm_space_map *sm)
100{
101 int r;
102 uint32_t count;
103 dm_block_t nr_blocks, i;
104
105 r = dm_sm_get_nr_blocks(sm, &nr_blocks);
106 if (r)
107 return r;
108
109 BUG_ON(ca->nr != nr_blocks);
110
111 DMWARN("Loading debug space map from disk. This may take some time");
112 for (i = 0; i < nr_blocks; i++) {
113 r = dm_sm_get_count(sm, i, &count);
114 if (r) {
115 DMERR("load failed");
116 return r;
117 }
118
119 ca_set_count(ca, i, count);
120 }
121 DMWARN("Load complete");
122
123 return 0;
124}
125
126static int ca_extend(struct count_array *ca, dm_block_t extra_blocks)
127{
128 dm_block_t nr_blocks = ca->nr + extra_blocks;
129 uint32_t *counts = kzalloc(sizeof(*counts) * nr_blocks, GFP_KERNEL);
130 if (!counts)
131 return -ENOMEM;
132
133 memcpy(counts, ca->counts, sizeof(*counts) * ca->nr);
134 kfree(ca->counts);
135 ca->nr = nr_blocks;
136 ca->nr_free += extra_blocks;
137 ca->counts = counts;
138 return 0;
139}
140
141static int ca_commit(struct count_array *old, struct count_array *new)
142{
143 if (old->nr != new->nr) {
144 BUG_ON(old->nr > new->nr);
145 ca_extend(old, new->nr - old->nr);
146 }
147
148 BUG_ON(old->nr != new->nr);
149 old->nr_free = new->nr_free;
150 memcpy(old->counts, new->counts, sizeof(*old->counts) * old->nr);
151 return 0;
152}
153
154static void ca_destroy(struct count_array *ca)
155{
156 kfree(ca->counts);
157}
158
159/*----------------------------------------------------------------*/
160
161struct sm_checker {
162 struct dm_space_map sm;
163
164 struct count_array old_counts;
165 struct count_array counts;
166
167 struct dm_space_map *real_sm;
168};
169
170static void sm_checker_destroy(struct dm_space_map *sm)
171{
172 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
173
174 dm_sm_destroy(smc->real_sm);
175 ca_destroy(&smc->old_counts);
176 ca_destroy(&smc->counts);
177 kfree(smc);
178}
179
180static int sm_checker_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
181{
182 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
183 int r = dm_sm_get_nr_blocks(smc->real_sm, count);
184 if (!r)
185 BUG_ON(smc->old_counts.nr != *count);
186 return r;
187}
188
189static int sm_checker_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
190{
191 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
192 int r = dm_sm_get_nr_free(smc->real_sm, count);
193 if (!r) {
194 /*
195 * Slow, but we know it's correct.
196 */
197 dm_block_t b, n = 0;
198 for (b = 0; b < smc->old_counts.nr; b++)
199 if (smc->old_counts.counts[b] == 0 &&
200 smc->counts.counts[b] == 0)
201 n++;
202
203 if (n != *count)
204 DMERR("free block counts differ, checker %u, sm-disk:%u",
205 (unsigned) n, (unsigned) *count);
206 }
207 return r;
208}
209
210static int sm_checker_new_block(struct dm_space_map *sm, dm_block_t *b)
211{
212 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
213 int r = dm_sm_new_block(smc->real_sm, b);
214
215 if (!r) {
216 BUG_ON(*b >= smc->old_counts.nr);
217 BUG_ON(smc->old_counts.counts[*b] != 0);
218 BUG_ON(*b >= smc->counts.nr);
219 BUG_ON(smc->counts.counts[*b] != 0);
220 ca_set_count(&smc->counts, *b, 1);
221 }
222
223 return r;
224}
225
226static int sm_checker_inc_block(struct dm_space_map *sm, dm_block_t b)
227{
228 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
229 int r = dm_sm_inc_block(smc->real_sm, b);
230 int r2 = ca_inc_block(&smc->counts, b);
231 BUG_ON(r != r2);
232 return r;
233}
234
235static int sm_checker_dec_block(struct dm_space_map *sm, dm_block_t b)
236{
237 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
238 int r = dm_sm_dec_block(smc->real_sm, b);
239 int r2 = ca_dec_block(&smc->counts, b);
240 BUG_ON(r != r2);
241 return r;
242}
243
244static int sm_checker_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result)
245{
246 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
247 uint32_t result2 = 0;
248 int r = dm_sm_get_count(smc->real_sm, b, result);
249 int r2 = ca_get_count(&smc->counts, b, &result2);
250
251 BUG_ON(r != r2);
252 if (!r)
253 BUG_ON(*result != result2);
254 return r;
255}
256
257static int sm_checker_count_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result)
258{
259 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
260 int result2 = 0;
261 int r = dm_sm_count_is_more_than_one(smc->real_sm, b, result);
262 int r2 = ca_count_more_than_one(&smc->counts, b, &result2);
263
264 BUG_ON(r != r2);
265 if (!r)
266 BUG_ON(!(*result) && result2);
267 return r;
268}
269
270static int sm_checker_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count)
271{
272 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
273 uint32_t old_rc;
274 int r = dm_sm_set_count(smc->real_sm, b, count);
275 int r2;
276
277 BUG_ON(b >= smc->counts.nr);
278 old_rc = smc->counts.counts[b];
279 r2 = ca_set_count(&smc->counts, b, count);
280 BUG_ON(r != r2);
281
282 return r;
283}
284
285static int sm_checker_commit(struct dm_space_map *sm)
286{
287 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
288 int r;
289
290 r = dm_sm_commit(smc->real_sm);
291 if (r)
292 return r;
293
294 r = ca_commit(&smc->old_counts, &smc->counts);
295 if (r)
296 return r;
297
298 return 0;
299}
300
301static int sm_checker_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
302{
303 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
304 int r = dm_sm_extend(smc->real_sm, extra_blocks);
305 if (r)
306 return r;
307
308 return ca_extend(&smc->counts, extra_blocks);
309}
310
311static int sm_checker_root_size(struct dm_space_map *sm, size_t *result)
312{
313 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
314 return dm_sm_root_size(smc->real_sm, result);
315}
316
317static int sm_checker_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len)
318{
319 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
320 return dm_sm_copy_root(smc->real_sm, copy_to_here_le, len);
321}
322
323/*----------------------------------------------------------------*/
324
325static struct dm_space_map ops_ = {
326 .destroy = sm_checker_destroy,
327 .get_nr_blocks = sm_checker_get_nr_blocks,
328 .get_nr_free = sm_checker_get_nr_free,
329 .inc_block = sm_checker_inc_block,
330 .dec_block = sm_checker_dec_block,
331 .new_block = sm_checker_new_block,
332 .get_count = sm_checker_get_count,
333 .count_is_more_than_one = sm_checker_count_more_than_one,
334 .set_count = sm_checker_set_count,
335 .commit = sm_checker_commit,
336 .extend = sm_checker_extend,
337 .root_size = sm_checker_root_size,
338 .copy_root = sm_checker_copy_root
339};
340
341struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
342{
343 int r;
344 struct sm_checker *smc;
345
346 if (!sm)
347 return NULL;
348
349 smc = kmalloc(sizeof(*smc), GFP_KERNEL);
350 if (!smc)
351 return NULL;
352
353 memcpy(&smc->sm, &ops_, sizeof(smc->sm));
354 r = ca_create(&smc->old_counts, sm);
355 if (r) {
356 kfree(smc);
357 return NULL;
358 }
359
360 r = ca_create(&smc->counts, sm);
361 if (r) {
362 ca_destroy(&smc->old_counts);
363 kfree(smc);
364 return NULL;
365 }
366
367 smc->real_sm = sm;
368
369 r = ca_load(&smc->counts, sm);
370 if (r) {
371 ca_destroy(&smc->counts);
372 ca_destroy(&smc->old_counts);
373 kfree(smc);
374 return NULL;
375 }
376
377 r = ca_commit(&smc->old_counts, &smc->counts);
378 if (r) {
379 ca_destroy(&smc->counts);
380 ca_destroy(&smc->old_counts);
381 kfree(smc);
382 return NULL;
383 }
384
385 return &smc->sm;
386}
387EXPORT_SYMBOL_GPL(dm_sm_checker_create);
388
389struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
390{
391 int r;
392 struct sm_checker *smc;
393
394 if (!sm)
395 return NULL;
396
397 smc = kmalloc(sizeof(*smc), GFP_KERNEL);
398 if (!smc)
399 return NULL;
400
401 memcpy(&smc->sm, &ops_, sizeof(smc->sm));
402 r = ca_create(&smc->old_counts, sm);
403 if (r) {
404 kfree(smc);
405 return NULL;
406 }
407
408 r = ca_create(&smc->counts, sm);
409 if (r) {
410 ca_destroy(&smc->old_counts);
411 kfree(smc);
412 return NULL;
413 }
414
415 smc->real_sm = sm;
416 return &smc->sm;
417}
418EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
419
420/*----------------------------------------------------------------*/
421
422#else
423
424struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
425{
426 return sm;
427}
428EXPORT_SYMBOL_GPL(dm_sm_checker_create);
429
430struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
431{
432 return sm;
433}
434EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
435
436/*----------------------------------------------------------------*/
437
438#endif
diff --git a/drivers/md/persistent-data/dm-space-map-checker.h b/drivers/md/persistent-data/dm-space-map-checker.h
deleted file mode 100644
index 444dccf6688c..000000000000
--- a/drivers/md/persistent-data/dm-space-map-checker.h
+++ /dev/null
@@ -1,26 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef SNAPSHOTS_SPACE_MAP_CHECKER_H
8#define SNAPSHOTS_SPACE_MAP_CHECKER_H
9
10#include "dm-space-map.h"
11
12/*----------------------------------------------------------------*/
13
14/*
15 * This space map wraps a real on-disk space map, and verifies all of its
16 * operations. It uses a lot of memory, so only use if you have a specific
17 * problem that you're debugging.
18 *
19 * Ownership of @sm passes.
20 */
21struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm);
22struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm);
23
24/*----------------------------------------------------------------*/
25
26#endif
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index ff3beed6ad2d..d77602d63c83 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -224,6 +224,7 @@ static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
224 ll->nr_blocks = 0; 224 ll->nr_blocks = 0;
225 ll->bitmap_root = 0; 225 ll->bitmap_root = 0;
226 ll->ref_count_root = 0; 226 ll->ref_count_root = 0;
227 ll->bitmap_index_changed = false;
227 228
228 return 0; 229 return 0;
229} 230}
@@ -476,7 +477,15 @@ int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
476 477
477int sm_ll_commit(struct ll_disk *ll) 478int sm_ll_commit(struct ll_disk *ll)
478{ 479{
479 return ll->commit(ll); 480 int r = 0;
481
482 if (ll->bitmap_index_changed) {
483 r = ll->commit(ll);
484 if (!r)
485 ll->bitmap_index_changed = false;
486 }
487
488 return r;
480} 489}
481 490
482/*----------------------------------------------------------------*/ 491/*----------------------------------------------------------------*/
@@ -491,6 +500,7 @@ static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index,
491static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index, 500static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index,
492 struct disk_index_entry *ie) 501 struct disk_index_entry *ie)
493{ 502{
503 ll->bitmap_index_changed = true;
494 memcpy(ll->mi_le.index + index, ie, sizeof(*ie)); 504 memcpy(ll->mi_le.index + index, ie, sizeof(*ie));
495 return 0; 505 return 0;
496} 506}
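The bitmap_index_changed flag added above lets sm_ll_commit() skip the commit callback when no index entry has been saved since the last commit. A small userspace sketch of that dirty-flag pattern (all names are illustrative, not persistent-data code):

#include <assert.h>
#include <stdbool.h>

struct toy_ll {
        bool index_changed;
        int  commits;           /* counts real commits for the demo */
};

static void toy_save_ie(struct toy_ll *ll)
{
        /* ... write the index entry ... */
        ll->index_changed = true;
}

static int toy_commit(struct toy_ll *ll)
{
        if (!ll->index_changed)
                return 0;       /* nothing to flush */
        ll->commits++;          /* stand-in for the real commit callback */
        ll->index_changed = false;
        return 0;
}

int main(void)
{
        struct toy_ll ll = { false, 0 };

        toy_commit(&ll);        /* no-op */
        toy_save_ie(&ll);
        toy_commit(&ll);        /* real commit */
        toy_commit(&ll);        /* no-op again */
        assert(ll.commits == 1);
        return 0;
}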
diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h
index 8f220821a9a9..b3078d5eda0c 100644
--- a/drivers/md/persistent-data/dm-space-map-common.h
+++ b/drivers/md/persistent-data/dm-space-map-common.h
@@ -78,6 +78,7 @@ struct ll_disk {
78 open_index_fn open_index; 78 open_index_fn open_index;
79 max_index_entries_fn max_entries; 79 max_index_entries_fn max_entries;
80 commit_fn commit; 80 commit_fn commit;
81 bool bitmap_index_changed:1;
81}; 82};
82 83
83struct disk_sm_root { 84struct disk_sm_root {
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index fc469ba9f627..f6d29e614ab7 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -4,7 +4,6 @@
4 * This file is released under the GPL. 4 * This file is released under the GPL.
5 */ 5 */
6 6
7#include "dm-space-map-checker.h"
8#include "dm-space-map-common.h" 7#include "dm-space-map-common.h"
9#include "dm-space-map-disk.h" 8#include "dm-space-map-disk.h"
10#include "dm-space-map.h" 9#include "dm-space-map.h"
@@ -252,9 +251,8 @@ static struct dm_space_map ops = {
252 .copy_root = sm_disk_copy_root 251 .copy_root = sm_disk_copy_root
253}; 252};
254 253
255static struct dm_space_map *dm_sm_disk_create_real( 254struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
256 struct dm_transaction_manager *tm, 255 dm_block_t nr_blocks)
257 dm_block_t nr_blocks)
258{ 256{
259 int r; 257 int r;
260 struct sm_disk *smd; 258 struct sm_disk *smd;
@@ -285,18 +283,10 @@ bad:
285 kfree(smd); 283 kfree(smd);
286 return ERR_PTR(r); 284 return ERR_PTR(r);
287} 285}
288
289struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
290 dm_block_t nr_blocks)
291{
292 struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks);
293 return dm_sm_checker_create_fresh(sm);
294}
295EXPORT_SYMBOL_GPL(dm_sm_disk_create); 286EXPORT_SYMBOL_GPL(dm_sm_disk_create);
296 287
297static struct dm_space_map *dm_sm_disk_open_real( 288struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
298 struct dm_transaction_manager *tm, 289 void *root_le, size_t len)
299 void *root_le, size_t len)
300{ 290{
301 int r; 291 int r;
302 struct sm_disk *smd; 292 struct sm_disk *smd;
@@ -323,13 +313,6 @@ bad:
323 kfree(smd); 313 kfree(smd);
324 return ERR_PTR(r); 314 return ERR_PTR(r);
325} 315}
326
327struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
328 void *root_le, size_t len)
329{
330 return dm_sm_checker_create(
331 dm_sm_disk_open_real(tm, root_le, len));
332}
333EXPORT_SYMBOL_GPL(dm_sm_disk_open); 316EXPORT_SYMBOL_GPL(dm_sm_disk_open);
334 317
335/*----------------------------------------------------------------*/ 318/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index 400fe144c0cd..d247a35da3c6 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -5,7 +5,6 @@
5 */ 5 */
6#include "dm-transaction-manager.h" 6#include "dm-transaction-manager.h"
7#include "dm-space-map.h" 7#include "dm-space-map.h"
8#include "dm-space-map-checker.h"
9#include "dm-space-map-disk.h" 8#include "dm-space-map-disk.h"
10#include "dm-space-map-metadata.h" 9#include "dm-space-map-metadata.h"
11#include "dm-persistent-data-internal.h" 10#include "dm-persistent-data-internal.h"
@@ -138,6 +137,9 @@ EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone);
138 137
139void dm_tm_destroy(struct dm_transaction_manager *tm) 138void dm_tm_destroy(struct dm_transaction_manager *tm)
140{ 139{
140 if (!tm->is_clone)
141 wipe_shadow_table(tm);
142
141 kfree(tm); 143 kfree(tm);
142} 144}
143EXPORT_SYMBOL_GPL(dm_tm_destroy); 145EXPORT_SYMBOL_GPL(dm_tm_destroy);
@@ -217,13 +219,24 @@ static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
217 if (r < 0) 219 if (r < 0)
218 return r; 220 return r;
219 221
220 r = dm_bm_unlock_move(orig_block, new); 222 /*
221 if (r < 0) { 223 * It would be tempting to use dm_bm_unlock_move here, but some
224 * code, such as the space maps, keeps using the old data structures
225 * secure in the knowledge they won't be changed until the next
226 * transaction. Using unlock_move would force a synchronous read
227 * since the old block would no longer be in the cache.
228 */
229 r = dm_bm_write_lock_zero(tm->bm, new, v, result);
230 if (r) {
222 dm_bm_unlock(orig_block); 231 dm_bm_unlock(orig_block);
223 return r; 232 return r;
224 } 233 }
225 234
226 return dm_bm_write_lock(tm->bm, new, v, result); 235 memcpy(dm_block_data(*result), dm_block_data(orig_block),
236 dm_bm_block_size(tm->bm));
237
238 dm_bm_unlock(orig_block);
239 return r;
227} 240}
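A userspace sketch of the shadowing step above: the original block stays readable (and cached) while its contents are copied into a freshly allocated block that the current transaction may modify. Block storage is modelled as a flat array and every name is a stand-in, not persistent-data code.

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define BLOCK_SIZE 4096
#define NR_BLOCKS  16

static uint8_t blocks[NR_BLOCKS][BLOCK_SIZE];

/* Copy block 'orig' into the unused block 'fresh' and return 'fresh'. */
static unsigned int shadow_block(unsigned int orig, unsigned int fresh)
{
        memcpy(blocks[fresh], blocks[orig], BLOCK_SIZE);
        return fresh;
}

int main(void)
{
        unsigned int shadow;

        blocks[3][0] = 0xaa;
        shadow = shadow_block(3, 7);
        blocks[shadow][0] = 0xbb;       /* mutate the shadow only */

        assert(blocks[3][0] == 0xaa);   /* original still intact and readable */
        assert(blocks[7][0] == 0xbb);
        return 0;
}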
228 241
229int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, 242int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
@@ -308,94 +321,61 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
308 321
309static int dm_tm_create_internal(struct dm_block_manager *bm, 322static int dm_tm_create_internal(struct dm_block_manager *bm,
310 dm_block_t sb_location, 323 dm_block_t sb_location,
311 struct dm_block_validator *sb_validator,
312 size_t root_offset, size_t root_max_len,
313 struct dm_transaction_manager **tm, 324 struct dm_transaction_manager **tm,
314 struct dm_space_map **sm, 325 struct dm_space_map **sm,
315 struct dm_block **sblock, 326 int create,
316 int create) 327 void *sm_root, size_t sm_len)
317{ 328{
318 int r; 329 int r;
319 struct dm_space_map *inner;
320 330
321 inner = dm_sm_metadata_init(); 331 *sm = dm_sm_metadata_init();
322 if (IS_ERR(inner)) 332 if (IS_ERR(*sm))
323 return PTR_ERR(inner); 333 return PTR_ERR(*sm);
324 334
325 *tm = dm_tm_create(bm, inner); 335 *tm = dm_tm_create(bm, *sm);
326 if (IS_ERR(*tm)) { 336 if (IS_ERR(*tm)) {
327 dm_sm_destroy(inner); 337 dm_sm_destroy(*sm);
328 return PTR_ERR(*tm); 338 return PTR_ERR(*tm);
329 } 339 }
330 340
331 if (create) { 341 if (create) {
332 r = dm_bm_write_lock_zero(dm_tm_get_bm(*tm), sb_location, 342 r = dm_sm_metadata_create(*sm, *tm, dm_bm_nr_blocks(bm),
333 sb_validator, sblock);
334 if (r < 0) {
335 DMERR("couldn't lock superblock");
336 goto bad1;
337 }
338
339 r = dm_sm_metadata_create(inner, *tm, dm_bm_nr_blocks(bm),
340 sb_location); 343 sb_location);
341 if (r) { 344 if (r) {
342 DMERR("couldn't create metadata space map"); 345 DMERR("couldn't create metadata space map");
343 goto bad2; 346 goto bad;
344 } 347 }
345 348
346 *sm = dm_sm_checker_create(inner);
347 if (!*sm)
348 goto bad2;
349
350 } else { 349 } else {
351 r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location, 350 r = dm_sm_metadata_open(*sm, *tm, sm_root, sm_len);
352 sb_validator, sblock);
353 if (r < 0) {
354 DMERR("couldn't lock superblock");
355 goto bad1;
356 }
357
358 r = dm_sm_metadata_open(inner, *tm,
359 dm_block_data(*sblock) + root_offset,
360 root_max_len);
361 if (r) { 351 if (r) {
362 DMERR("couldn't open metadata space map"); 352 DMERR("couldn't open metadata space map");
363 goto bad2; 353 goto bad;
364 } 354 }
365
366 *sm = dm_sm_checker_create(inner);
367 if (!*sm)
368 goto bad2;
369 } 355 }
370 356
371 return 0; 357 return 0;
372 358
373bad2: 359bad:
374 dm_tm_unlock(*tm, *sblock);
375bad1:
376 dm_tm_destroy(*tm); 360 dm_tm_destroy(*tm);
377 dm_sm_destroy(inner); 361 dm_sm_destroy(*sm);
378 return r; 362 return r;
379} 363}
380 364
381int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, 365int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
382 struct dm_block_validator *sb_validator,
383 struct dm_transaction_manager **tm, 366 struct dm_transaction_manager **tm,
384 struct dm_space_map **sm, struct dm_block **sblock) 367 struct dm_space_map **sm)
385{ 368{
386 return dm_tm_create_internal(bm, sb_location, sb_validator, 369 return dm_tm_create_internal(bm, sb_location, tm, sm, 1, NULL, 0);
387 0, 0, tm, sm, sblock, 1);
388} 370}
389EXPORT_SYMBOL_GPL(dm_tm_create_with_sm); 371EXPORT_SYMBOL_GPL(dm_tm_create_with_sm);
390 372
391int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, 373int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
392 struct dm_block_validator *sb_validator, 374 void *sm_root, size_t root_len,
393 size_t root_offset, size_t root_max_len,
394 struct dm_transaction_manager **tm, 375 struct dm_transaction_manager **tm,
395 struct dm_space_map **sm, struct dm_block **sblock) 376 struct dm_space_map **sm)
396{ 377{
397 return dm_tm_create_internal(bm, sb_location, sb_validator, root_offset, 378 return dm_tm_create_internal(bm, sb_location, tm, sm, 0, sm_root, root_len);
398 root_max_len, tm, sm, sblock, 0);
399} 379}
400EXPORT_SYMBOL_GPL(dm_tm_open_with_sm); 380EXPORT_SYMBOL_GPL(dm_tm_open_with_sm);
401 381
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index 6da784871db4..b5b139076ca5 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -115,16 +115,17 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
115 * 115 *
116 * Returns a tm that has an open transaction to write the new disk sm. 116 * Returns a tm that has an open transaction to write the new disk sm.
117 * Caller should store the new sm root and commit. 117 * Caller should store the new sm root and commit.
118 *
119 * The superblock location is passed so the metadata space map knows it
120 * shouldn't be used.
118 */ 121 */
119int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, 122int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
120 struct dm_block_validator *sb_validator,
121 struct dm_transaction_manager **tm, 123 struct dm_transaction_manager **tm,
122 struct dm_space_map **sm, struct dm_block **sblock); 124 struct dm_space_map **sm);
123 125
124int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, 126int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
125 struct dm_block_validator *sb_validator, 127 void *sm_root, size_t root_len,
126 size_t root_offset, size_t root_max_len,
127 struct dm_transaction_manager **tm, 128 struct dm_transaction_manager **tm,
128 struct dm_space_map **sm, struct dm_block **sblock); 129 struct dm_space_map **sm);
129 130
130#endif /* _LINUX_DM_TRANSACTION_MANAGER_H */ 131#endif /* _LINUX_DM_TRANSACTION_MANAGER_H */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a9c7981ddd24..611b5f797618 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -46,6 +46,20 @@
46 */ 46 */
47#define NR_RAID1_BIOS 256 47#define NR_RAID1_BIOS 256
48 48
49/* when we get a read error on a read-only array, we redirect to another
50 * device without failing the first device, or trying to over-write to
51 * correct the read error. To keep track of bad blocks on a per-bio
52 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
53 */
54#define IO_BLOCKED ((struct bio *)1)
55/* When we successfully write to a known bad-block, we need to remove the
56 * bad-block marking which must be done from process context. So we record
57 * the success by setting devs[n].bio to IO_MADE_GOOD
58 */
59#define IO_MADE_GOOD ((struct bio *)2)
60
61#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
62
49/* When there are this many requests queue to be written by 63/* When there are this many requests queue to be written by
50 * the raid1 thread, we become 'congested' to provide back-pressure 64 * the raid1 thread, we become 'congested' to provide back-pressure
51 * for writeback. 65 * for writeback.
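A standalone illustration of the sentinel-pointer idiom introduced by IO_BLOCKED and IO_MADE_GOOD above: small integer constants cast to the pointer type mark per-device slots, and a BIO_SPECIAL-style test separates them from real pointers, which are never that small. The fake_bio type and TOY_* macros are stand-ins, not raid1 code.

#include <assert.h>
#include <stdlib.h>

struct fake_bio { int dummy; };

#define TOY_BLOCKED    ((struct fake_bio *)1)
#define TOY_MADE_GOOD  ((struct fake_bio *)2)
#define TOY_SPECIAL(b) ((unsigned long)(b) <= 2)

int main(void)
{
        struct fake_bio *slots[3];
        struct fake_bio *real = malloc(sizeof(*real));

        if (!real)
                return 1;

        slots[0] = NULL;                /* unused slot */
        slots[1] = TOY_BLOCKED;         /* known-bad region: skip this device */
        slots[2] = real;                /* an actual in-flight request */

        assert(TOY_SPECIAL(slots[0]) && TOY_SPECIAL(slots[1]));
        assert(!TOY_SPECIAL(slots[2]));
        free(real);
        return 0;
}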
@@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
483 const sector_t this_sector = r1_bio->sector; 497 const sector_t this_sector = r1_bio->sector;
484 int sectors; 498 int sectors;
485 int best_good_sectors; 499 int best_good_sectors;
486 int start_disk; 500 int best_disk, best_dist_disk, best_pending_disk;
487 int best_disk; 501 int has_nonrot_disk;
488 int i; 502 int disk;
489 sector_t best_dist; 503 sector_t best_dist;
504 unsigned int min_pending;
490 struct md_rdev *rdev; 505 struct md_rdev *rdev;
491 int choose_first; 506 int choose_first;
507 int choose_next_idle;
492 508
493 rcu_read_lock(); 509 rcu_read_lock();
494 /* 510 /*
@@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
499 retry: 515 retry:
500 sectors = r1_bio->sectors; 516 sectors = r1_bio->sectors;
501 best_disk = -1; 517 best_disk = -1;
518 best_dist_disk = -1;
502 best_dist = MaxSector; 519 best_dist = MaxSector;
520 best_pending_disk = -1;
521 min_pending = UINT_MAX;
503 best_good_sectors = 0; 522 best_good_sectors = 0;
523 has_nonrot_disk = 0;
524 choose_next_idle = 0;
504 525
505 if (conf->mddev->recovery_cp < MaxSector && 526 if (conf->mddev->recovery_cp < MaxSector &&
506 (this_sector + sectors >= conf->next_resync)) { 527 (this_sector + sectors >= conf->next_resync))
507 choose_first = 1; 528 choose_first = 1;
508 start_disk = 0; 529 else
509 } else {
510 choose_first = 0; 530 choose_first = 0;
511 start_disk = conf->last_used;
512 }
513 531
514 for (i = 0 ; i < conf->raid_disks * 2 ; i++) { 532 for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
515 sector_t dist; 533 sector_t dist;
516 sector_t first_bad; 534 sector_t first_bad;
517 int bad_sectors; 535 int bad_sectors;
518 536 unsigned int pending;
519 int disk = start_disk + i; 537 bool nonrot;
520 if (disk >= conf->raid_disks)
521 disk -= conf->raid_disks;
522 538
523 rdev = rcu_dereference(conf->mirrors[disk].rdev); 539 rdev = rcu_dereference(conf->mirrors[disk].rdev);
524 if (r1_bio->bios[disk] == IO_BLOCKED 540 if (r1_bio->bios[disk] == IO_BLOCKED
@@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
577 } else 593 } else
578 best_good_sectors = sectors; 594 best_good_sectors = sectors;
579 595
596 nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
597 has_nonrot_disk |= nonrot;
598 pending = atomic_read(&rdev->nr_pending);
580 dist = abs(this_sector - conf->mirrors[disk].head_position); 599 dist = abs(this_sector - conf->mirrors[disk].head_position);
581 if (choose_first 600 if (choose_first) {
582 /* Don't change to another disk for sequential reads */
583 || conf->next_seq_sect == this_sector
584 || dist == 0
585 /* If device is idle, use it */
586 || atomic_read(&rdev->nr_pending) == 0) {
587 best_disk = disk; 601 best_disk = disk;
588 break; 602 break;
589 } 603 }
604 /* Don't change to another disk for sequential reads */
605 if (conf->mirrors[disk].next_seq_sect == this_sector
606 || dist == 0) {
607 int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
608 struct raid1_info *mirror = &conf->mirrors[disk];
609
610 best_disk = disk;
611 /*
612 * If the buffered sequential IO size exceeds the optimal
613 * iosize, check whether there is an idle disk and, if so,
614 * choose it. read_balance may already have chosen an
615 * idle disk before noticing that this is sequential IO on
616 * the current disk. That doesn't matter: the current disk
617 * will go idle and be used again once the first disk's IO
618 * size exceeds the optimal iosize, so the first disk's IO
619 * size ends up at least the optimal iosize. The second
620 * disk's IO size might be small, but that is not a big
621 * deal since the first disk is likely still busy when the
622 * second disk starts IO.
623 */
624 if (nonrot && opt_iosize > 0 &&
625 mirror->seq_start != MaxSector &&
626 mirror->next_seq_sect > opt_iosize &&
627 mirror->next_seq_sect - opt_iosize >=
628 mirror->seq_start) {
629 choose_next_idle = 1;
630 continue;
631 }
632 break;
633 }
634 /* If device is idle, use it */
635 if (pending == 0) {
636 best_disk = disk;
637 break;
638 }
639
640 if (choose_next_idle)
641 continue;
642
643 if (min_pending > pending) {
644 min_pending = pending;
645 best_pending_disk = disk;
646 }
647
590 if (dist < best_dist) { 648 if (dist < best_dist) {
591 best_dist = dist; 649 best_dist = dist;
592 best_disk = disk; 650 best_dist_disk = disk;
593 } 651 }
594 } 652 }
595 653
654 /*
655 * If all disks are rotational, choose the closest disk. If any disk is
656 * non-rotational, choose the disk with the fewest pending requests, even
657 * if that disk is rotational, which may or may not be optimal for arrays
658 * with mixed rotational/non-rotational disks depending on the workload.
659 */
660 if (best_disk == -1) {
661 if (has_nonrot_disk)
662 best_disk = best_pending_disk;
663 else
664 best_disk = best_dist_disk;
665 }
666
596 if (best_disk >= 0) { 667 if (best_disk >= 0) {
597 rdev = rcu_dereference(conf->mirrors[best_disk].rdev); 668 rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
598 if (!rdev) 669 if (!rdev)
@@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
606 goto retry; 677 goto retry;
607 } 678 }
608 sectors = best_good_sectors; 679 sectors = best_good_sectors;
609 conf->next_seq_sect = this_sector + sectors; 680
610 conf->last_used = best_disk; 681 if (conf->mirrors[best_disk].next_seq_sect != this_sector)
682 conf->mirrors[best_disk].seq_start = this_sector;
683
684 conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
611 } 685 }
612 rcu_read_unlock(); 686 rcu_read_unlock();
613 *max_sectors = sectors; 687 *max_sectors = sectors;
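A userspace sketch of the fallback selection policy read_balance() now uses: an idle disk wins outright; otherwise, if any member is non-rotational, the disk with the fewest pending requests is preferred, else the one with the shortest head distance. The toy_disk array and pick_disk() helper are stand-ins for the real r1conf state, not raid1 code.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct toy_disk {
        bool         nonrot;
        unsigned int pending;
        uint64_t     head_pos;
};

static int pick_disk(const struct toy_disk *d, int n, uint64_t sector)
{
        int best_dist_disk = -1, best_pending_disk = -1;
        uint64_t best_dist = UINT64_MAX;
        unsigned int min_pending = UINT32_MAX;
        bool has_nonrot = false;

        for (int i = 0; i < n; i++) {
                uint64_t dist = d[i].head_pos > sector ?
                                d[i].head_pos - sector : sector - d[i].head_pos;

                has_nonrot |= d[i].nonrot;
                if (d[i].pending == 0)
                        return i;               /* an idle disk wins outright */
                if (d[i].pending < min_pending) {
                        min_pending = d[i].pending;
                        best_pending_disk = i;
                }
                if (dist < best_dist) {
                        best_dist = dist;
                        best_dist_disk = i;
                }
        }
        return has_nonrot ? best_pending_disk : best_dist_disk;
}

int main(void)
{
        struct toy_disk mixed[2] = {
                { .nonrot = true,  .pending = 1, .head_pos = 1000000 },
                { .nonrot = false, .pending = 4, .head_pos = 100 },
        };

        /* An SSD is present, so the least-loaded disk (0) is chosen even
         * though the rotational disk's head is much closer to the sector. */
        assert(pick_disk(mixed, 2, 101) == 0);
        return 0;
}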
@@ -870,10 +944,48 @@ do_sync_io:
870 pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 944 pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
871} 945}
872 946
947struct raid1_plug_cb {
948 struct blk_plug_cb cb;
949 struct bio_list pending;
950 int pending_cnt;
951};
952
953static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
954{
955 struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
956 cb);
957 struct mddev *mddev = plug->cb.data;
958 struct r1conf *conf = mddev->private;
959 struct bio *bio;
960
961 if (from_schedule) {
962 spin_lock_irq(&conf->device_lock);
963 bio_list_merge(&conf->pending_bio_list, &plug->pending);
964 conf->pending_count += plug->pending_cnt;
965 spin_unlock_irq(&conf->device_lock);
966 md_wakeup_thread(mddev->thread);
967 kfree(plug);
968 return;
969 }
970
971 /* we aren't scheduling, so we can do the write-out directly. */
972 bio = bio_list_get(&plug->pending);
973 bitmap_unplug(mddev->bitmap);
974 wake_up(&conf->wait_barrier);
975
976 while (bio) { /* submit pending writes */
977 struct bio *next = bio->bi_next;
978 bio->bi_next = NULL;
979 generic_make_request(bio);
980 bio = next;
981 }
982 kfree(plug);
983}
984
873static void make_request(struct mddev *mddev, struct bio * bio) 985static void make_request(struct mddev *mddev, struct bio * bio)
874{ 986{
875 struct r1conf *conf = mddev->private; 987 struct r1conf *conf = mddev->private;
876 struct mirror_info *mirror; 988 struct raid1_info *mirror;
877 struct r1bio *r1_bio; 989 struct r1bio *r1_bio;
878 struct bio *read_bio; 990 struct bio *read_bio;
879 int i, disks; 991 int i, disks;
@@ -883,7 +995,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
883 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 995 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
884 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); 996 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
885 struct md_rdev *blocked_rdev; 997 struct md_rdev *blocked_rdev;
886 int plugged; 998 struct blk_plug_cb *cb;
999 struct raid1_plug_cb *plug = NULL;
887 int first_clone; 1000 int first_clone;
888 int sectors_handled; 1001 int sectors_handled;
889 int max_sectors; 1002 int max_sectors;
@@ -1034,7 +1147,6 @@ read_again:
1034 * the bad blocks. Each set of writes gets it's own r1bio 1147 * the bad blocks. Each set of writes gets it's own r1bio
1035 * with a set of bios attached. 1148 * with a set of bios attached.
1036 */ 1149 */
1037 plugged = mddev_check_plugged(mddev);
1038 1150
1039 disks = conf->raid_disks * 2; 1151 disks = conf->raid_disks * 2;
1040 retry_write: 1152 retry_write:
@@ -1187,10 +1299,23 @@ read_again:
1187 mbio->bi_private = r1_bio; 1299 mbio->bi_private = r1_bio;
1188 1300
1189 atomic_inc(&r1_bio->remaining); 1301 atomic_inc(&r1_bio->remaining);
1302
1303 cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
1304 if (cb)
1305 plug = container_of(cb, struct raid1_plug_cb, cb);
1306 else
1307 plug = NULL;
1190 spin_lock_irqsave(&conf->device_lock, flags); 1308 spin_lock_irqsave(&conf->device_lock, flags);
1191 bio_list_add(&conf->pending_bio_list, mbio); 1309 if (plug) {
1192 conf->pending_count++; 1310 bio_list_add(&plug->pending, mbio);
1311 plug->pending_cnt++;
1312 } else {
1313 bio_list_add(&conf->pending_bio_list, mbio);
1314 conf->pending_count++;
1315 }
1193 spin_unlock_irqrestore(&conf->device_lock, flags); 1316 spin_unlock_irqrestore(&conf->device_lock, flags);
1317 if (!plug)
1318 md_wakeup_thread(mddev->thread);
1194 } 1319 }
1195 /* Mustn't call r1_bio_write_done before this next test, 1320 /* Mustn't call r1_bio_write_done before this next test,
1196 * as it could result in the bio being freed. 1321 * as it could result in the bio being freed.
@@ -1213,9 +1338,6 @@ read_again:
1213 1338
1214 /* In case raid1d snuck in to freeze_array */ 1339 /* In case raid1d snuck in to freeze_array */
1215 wake_up(&conf->wait_barrier); 1340 wake_up(&conf->wait_barrier);
1216
1217 if (do_sync || !bitmap || !plugged)
1218 md_wakeup_thread(mddev->thread);
1219} 1341}
1220 1342
1221static void status(struct seq_file *seq, struct mddev *mddev) 1343static void status(struct seq_file *seq, struct mddev *mddev)
@@ -1367,7 +1489,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1367 struct r1conf *conf = mddev->private; 1489 struct r1conf *conf = mddev->private;
1368 int err = -EEXIST; 1490 int err = -EEXIST;
1369 int mirror = 0; 1491 int mirror = 0;
1370 struct mirror_info *p; 1492 struct raid1_info *p;
1371 int first = 0; 1493 int first = 0;
1372 int last = conf->raid_disks - 1; 1494 int last = conf->raid_disks - 1;
1373 struct request_queue *q = bdev_get_queue(rdev->bdev); 1495 struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1436,7 +1558,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1436 struct r1conf *conf = mddev->private; 1558 struct r1conf *conf = mddev->private;
1437 int err = 0; 1559 int err = 0;
1438 int number = rdev->raid_disk; 1560 int number = rdev->raid_disk;
1439 struct mirror_info *p = conf->mirrors+ number; 1561 struct raid1_info *p = conf->mirrors + number;
1440 1562
1441 if (rdev != p->rdev) 1563 if (rdev != p->rdev)
1442 p = conf->mirrors + conf->raid_disks + number; 1564 p = conf->mirrors + conf->raid_disks + number;
@@ -1821,8 +1943,14 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
1821 1943
1822 if (atomic_dec_and_test(&r1_bio->remaining)) { 1944 if (atomic_dec_and_test(&r1_bio->remaining)) {
1823 /* if we're here, all write(s) have completed, so clean up */ 1945 /* if we're here, all write(s) have completed, so clean up */
1824 md_done_sync(mddev, r1_bio->sectors, 1); 1946 int s = r1_bio->sectors;
1825 put_buf(r1_bio); 1947 if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
1948 test_bit(R1BIO_WriteError, &r1_bio->state))
1949 reschedule_retry(r1_bio);
1950 else {
1951 put_buf(r1_bio);
1952 md_done_sync(mddev, s, 1);
1953 }
1826 } 1954 }
1827} 1955}
1828 1956
@@ -2170,8 +2298,7 @@ static void raid1d(struct mddev *mddev)
2170 blk_start_plug(&plug); 2298 blk_start_plug(&plug);
2171 for (;;) { 2299 for (;;) {
2172 2300
2173 if (atomic_read(&mddev->plug_cnt) == 0) 2301 flush_pending_writes(conf);
2174 flush_pending_writes(conf);
2175 2302
2176 spin_lock_irqsave(&conf->device_lock, flags); 2303 spin_lock_irqsave(&conf->device_lock, flags);
2177 if (list_empty(head)) { 2304 if (list_empty(head)) {
@@ -2368,6 +2495,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2368 bio->bi_rw = READ; 2495 bio->bi_rw = READ;
2369 bio->bi_end_io = end_sync_read; 2496 bio->bi_end_io = end_sync_read;
2370 read_targets++; 2497 read_targets++;
2498 } else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
2499 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2500 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
2501 /*
2502 * The device is suitable for reading (InSync),
2503 * but has bad block(s) here. Let's try to correct them,
2504 * if we are doing resync or repair. Otherwise, leave
2505 * this device alone for this sync request.
2506 */
2507 bio->bi_rw = WRITE;
2508 bio->bi_end_io = end_sync_write;
2509 write_targets++;
2371 } 2510 }
2372 } 2511 }
2373 if (bio->bi_end_io) { 2512 if (bio->bi_end_io) {
@@ -2425,7 +2564,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2425 /* There is nowhere to write, so all non-sync 2564 /* There is nowhere to write, so all non-sync
2426 * drives must be failed - so we are finished 2565 * drives must be failed - so we are finished
2427 */ 2566 */
2428 sector_t rv = max_sector - sector_nr; 2567 sector_t rv;
2568 if (min_bad > 0)
2569 max_sector = sector_nr + min_bad;
2570 rv = max_sector - sector_nr;
2429 *skipped = 1; 2571 *skipped = 1;
2430 put_buf(r1_bio); 2572 put_buf(r1_bio);
2431 return rv; 2573 return rv;
@@ -2488,9 +2630,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2488 */ 2630 */
2489 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 2631 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
2490 atomic_set(&r1_bio->remaining, read_targets); 2632 atomic_set(&r1_bio->remaining, read_targets);
2491 for (i = 0; i < conf->raid_disks * 2; i++) { 2633 for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
2492 bio = r1_bio->bios[i]; 2634 bio = r1_bio->bios[i];
2493 if (bio->bi_end_io == end_sync_read) { 2635 if (bio->bi_end_io == end_sync_read) {
2636 read_targets--;
2494 md_sync_acct(bio->bi_bdev, nr_sectors); 2637 md_sync_acct(bio->bi_bdev, nr_sectors);
2495 generic_make_request(bio); 2638 generic_make_request(bio);
2496 } 2639 }
@@ -2517,7 +2660,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2517{ 2660{
2518 struct r1conf *conf; 2661 struct r1conf *conf;
2519 int i; 2662 int i;
2520 struct mirror_info *disk; 2663 struct raid1_info *disk;
2521 struct md_rdev *rdev; 2664 struct md_rdev *rdev;
2522 int err = -ENOMEM; 2665 int err = -ENOMEM;
2523 2666
@@ -2525,7 +2668,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2525 if (!conf) 2668 if (!conf)
2526 goto abort; 2669 goto abort;
2527 2670
2528 conf->mirrors = kzalloc(sizeof(struct mirror_info) 2671 conf->mirrors = kzalloc(sizeof(struct raid1_info)
2529 * mddev->raid_disks * 2, 2672 * mddev->raid_disks * 2,
2530 GFP_KERNEL); 2673 GFP_KERNEL);
2531 if (!conf->mirrors) 2674 if (!conf->mirrors)
@@ -2568,6 +2711,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2568 mddev->merge_check_needed = 1; 2711 mddev->merge_check_needed = 1;
2569 2712
2570 disk->head_position = 0; 2713 disk->head_position = 0;
2714 disk->seq_start = MaxSector;
2571 } 2715 }
2572 conf->raid_disks = mddev->raid_disks; 2716 conf->raid_disks = mddev->raid_disks;
2573 conf->mddev = mddev; 2717 conf->mddev = mddev;
@@ -2581,7 +2725,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2581 conf->recovery_disabled = mddev->recovery_disabled - 1; 2725 conf->recovery_disabled = mddev->recovery_disabled - 1;
2582 2726
2583 err = -EIO; 2727 err = -EIO;
2584 conf->last_used = -1;
2585 for (i = 0; i < conf->raid_disks * 2; i++) { 2728 for (i = 0; i < conf->raid_disks * 2; i++) {
2586 2729
2587 disk = conf->mirrors + i; 2730 disk = conf->mirrors + i;
@@ -2607,21 +2750,11 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2607 if (disk->rdev && 2750 if (disk->rdev &&
2608 (disk->rdev->saved_raid_disk < 0)) 2751 (disk->rdev->saved_raid_disk < 0))
2609 conf->fullsync = 1; 2752 conf->fullsync = 1;
2610 } else if (conf->last_used < 0) 2753 }
2611 /*
2612 * The first working device is used as a
2613 * starting point to read balancing.
2614 */
2615 conf->last_used = i;
2616 } 2754 }
2617 2755
2618 if (conf->last_used < 0) {
2619 printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
2620 mdname(mddev));
2621 goto abort;
2622 }
2623 err = -ENOMEM; 2756 err = -ENOMEM;
2624 conf->thread = md_register_thread(raid1d, mddev, NULL); 2757 conf->thread = md_register_thread(raid1d, mddev, "raid1");
2625 if (!conf->thread) { 2758 if (!conf->thread) {
2626 printk(KERN_ERR 2759 printk(KERN_ERR
2627 "md/raid1:%s: couldn't allocate thread\n", 2760 "md/raid1:%s: couldn't allocate thread\n",
@@ -2794,7 +2927,7 @@ static int raid1_reshape(struct mddev *mddev)
2794 */ 2927 */
2795 mempool_t *newpool, *oldpool; 2928 mempool_t *newpool, *oldpool;
2796 struct pool_info *newpoolinfo; 2929 struct pool_info *newpoolinfo;
2797 struct mirror_info *newmirrors; 2930 struct raid1_info *newmirrors;
2798 struct r1conf *conf = mddev->private; 2931 struct r1conf *conf = mddev->private;
2799 int cnt, raid_disks; 2932 int cnt, raid_disks;
2800 unsigned long flags; 2933 unsigned long flags;
@@ -2837,7 +2970,7 @@ static int raid1_reshape(struct mddev *mddev)
2837 kfree(newpoolinfo); 2970 kfree(newpoolinfo);
2838 return -ENOMEM; 2971 return -ENOMEM;
2839 } 2972 }
2840 newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2, 2973 newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
2841 GFP_KERNEL); 2974 GFP_KERNEL);
2842 if (!newmirrors) { 2975 if (!newmirrors) {
2843 kfree(newpoolinfo); 2976 kfree(newpoolinfo);
@@ -2876,7 +3009,6 @@ static int raid1_reshape(struct mddev *mddev)
2876 conf->raid_disks = mddev->raid_disks = raid_disks; 3009 conf->raid_disks = mddev->raid_disks = raid_disks;
2877 mddev->delta_disks = 0; 3010 mddev->delta_disks = 0;
2878 3011
2879 conf->last_used = 0; /* just make sure it is in-range */
2880 lower_barrier(conf); 3012 lower_barrier(conf);
2881 3013
2882 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3014 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 80ded139314c..0ff3715fb7eb 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -1,9 +1,15 @@
1#ifndef _RAID1_H 1#ifndef _RAID1_H
2#define _RAID1_H 2#define _RAID1_H
3 3
4struct mirror_info { 4struct raid1_info {
5 struct md_rdev *rdev; 5 struct md_rdev *rdev;
6 sector_t head_position; 6 sector_t head_position;
7
8 /* When choosing the best device for a read (read_balance())
9 * we try to keep sequential reads on the same device
10 */
11 sector_t next_seq_sect;
12 sector_t seq_start;
7}; 13};
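The two fields added above replace the old array-wide last_used/next_seq_sect pair with per-mirror state, so each disk can track its own read stream. A rough sketch of how such fields might be consulted and updated in a read-balancing path follows; the helper names are hypothetical, and only struct raid1_info and its fields come from the header above.

#include <linux/types.h>

/* Hypothetical helpers, assuming the raid1_info layout shown above. */
static inline bool continues_stream(struct raid1_info *mirror, sector_t sector)
{
        /* A read is 'sequential' if it starts where the previous read on
         * this mirror ended. */
        return sector == mirror->next_seq_sect;
}

static inline void account_read(struct raid1_info *mirror,
                                sector_t sector, int sectors)
{
        if (!continues_stream(mirror, sector))
                mirror->seq_start = sector;     /* a new stream begins here */
        mirror->next_seq_sect = sector + sectors;
}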
8 14
9/* 15/*
@@ -24,17 +30,11 @@ struct pool_info {
24 30
25struct r1conf { 31struct r1conf {
26 struct mddev *mddev; 32 struct mddev *mddev;
27 struct mirror_info *mirrors; /* twice 'raid_disks' to 33 struct raid1_info *mirrors; /* twice 'raid_disks' to
28 * allow for replacements. 34 * allow for replacements.
29 */ 35 */
30 int raid_disks; 36 int raid_disks;
31 37
32 /* When choose the best device for a read (read_balance())
33 * we try to keep sequential reads one the same device
34 * using 'last_used' and 'next_seq_sect'
35 */
36 int last_used;
37 sector_t next_seq_sect;
38 /* During resync, read_balancing is only allowed on the part 38 /* During resync, read_balancing is only allowed on the part
39 * of the array that has been resynced. 'next_resync' tells us 39 * of the array that has been resynced. 'next_resync' tells us
40 * where that is. 40 * where that is.
@@ -135,20 +135,6 @@ struct r1bio {
135 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ 135 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
136}; 136};
137 137
138/* when we get a read error on a read-only array, we redirect to another
139 * device without failing the first device, or trying to over-write to
140 * correct the read error. To keep track of bad blocks on a per-bio
141 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
142 */
143#define IO_BLOCKED ((struct bio *)1)
144/* When we successfully write to a known bad-block, we need to remove the
145 * bad-block marking which must be done from process context. So we record
146 * the success by setting bios[n] to IO_MADE_GOOD
147 */
148#define IO_MADE_GOOD ((struct bio *)2)
149
150#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
151
152/* bits for r1bio.state */ 138/* bits for r1bio.state */
153#define R1BIO_Uptodate 0 139#define R1BIO_Uptodate 0
154#define R1BIO_IsSync 1 140#define R1BIO_IsSync 1
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 99ae6068e456..de5ed6fd8806 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -60,7 +60,21 @@
60 */ 60 */
61#define NR_RAID10_BIOS 256 61#define NR_RAID10_BIOS 256
62 62
63/* When there are this many requests queue to be written by 63/* when we get a read error on a read-only array, we redirect to another
64 * device without failing the first device, or trying to over-write to
65 * correct the read error. To keep track of bad blocks on a per-bio
66 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
67 */
68#define IO_BLOCKED ((struct bio *)1)
69/* When we successfully write to a known bad-block, we need to remove the
70 * bad-block marking which must be done from process context. So we record
71 * the success by setting devs[n].bio to IO_MADE_GOOD
72 */
73#define IO_MADE_GOOD ((struct bio *)2)
74
75#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
76
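IO_BLOCKED and IO_MADE_GOOD above are sentinel values stored in a struct bio * slot: since no real kernel pointer can equal 1 or 2, a slot can carry either a real bio or a tiny status code, and BIO_SPECIAL() distinguishes the two. A generic, hypothetical rendering of the idiom (the names below are not from the code above):

#include <linux/bio.h>

#define SLOT_FAILED     ((struct bio *)1)       /* don't touch this slot again */
#define SLOT_FIXED      ((struct bio *)2)       /* a bad block was repaired here */
#define SLOT_SPECIAL(bio) ((unsigned long)(bio) <= 2)

static void finish_slot(struct bio **slot)
{
        struct bio *bio = *slot;

        if (SLOT_SPECIAL(bio)) {
                /* Status marker only: nothing to complete here, the marker
                 * is interpreted later from process context. */
                return;
        }
        bio_endio(bio, 0);              /* a real bio: complete it normally */
        *slot = NULL;
}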
77/* When there are this many requests queued to be written by
64 * the raid10 thread, we become 'congested' to provide back-pressure 78 * the raid10 thread, we become 'congested' to provide back-pressure
65 * for writeback. 79 * for writeback.
66 */ 80 */
@@ -717,7 +731,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
717 int sectors = r10_bio->sectors; 731 int sectors = r10_bio->sectors;
718 int best_good_sectors; 732 int best_good_sectors;
719 sector_t new_distance, best_dist; 733 sector_t new_distance, best_dist;
720 struct md_rdev *rdev, *best_rdev; 734 struct md_rdev *best_rdev, *rdev = NULL;
721 int do_balance; 735 int do_balance;
722 int best_slot; 736 int best_slot;
723 struct geom *geo = &conf->geo; 737 struct geom *geo = &conf->geo;
@@ -839,9 +853,8 @@ retry:
839 return rdev; 853 return rdev;
840} 854}
841 855
842static int raid10_congested(void *data, int bits) 856int md_raid10_congested(struct mddev *mddev, int bits)
843{ 857{
844 struct mddev *mddev = data;
845 struct r10conf *conf = mddev->private; 858 struct r10conf *conf = mddev->private;
846 int i, ret = 0; 859 int i, ret = 0;
847 860
@@ -849,8 +862,6 @@ static int raid10_congested(void *data, int bits)
849 conf->pending_count >= max_queued_requests) 862 conf->pending_count >= max_queued_requests)
850 return 1; 863 return 1;
851 864
852 if (mddev_congested(mddev, bits))
853 return 1;
854 rcu_read_lock(); 865 rcu_read_lock();
855 for (i = 0; 866 for (i = 0;
856 (i < conf->geo.raid_disks || i < conf->prev.raid_disks) 867 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
@@ -866,6 +877,15 @@ static int raid10_congested(void *data, int bits)
866 rcu_read_unlock(); 877 rcu_read_unlock();
867 return ret; 878 return ret;
868} 879}
880EXPORT_SYMBOL_GPL(md_raid10_congested);
881
882static int raid10_congested(void *data, int bits)
883{
884 struct mddev *mddev = data;
885
886 return mddev_congested(mddev, bits) ||
887 md_raid10_congested(mddev, bits);
888}
869 889
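Exporting md_raid10_congested() and keeping raid10_congested() as a thin wrapper separates the personality-specific congestion test from the mddev-level one, so another caller (dm-raid, which registers no backing_dev_info hook of its own) can reuse the former. The wrapper shape, sketched with hypothetical names and assuming md.h's mddev_congested():

/* Hypothetical composition of a congested_fn for a request queue. */
static int my_core_congested(struct mddev *mddev, int bits)
{
        /* per-device checks for this personality would go here */
        return 0;
}

static int my_congested(void *data, int bits)
{
        struct mddev *mddev = data;     /* congested_data set at queue setup */

        return mddev_congested(mddev, bits) ||
               my_core_congested(mddev, bits);
}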
870static void flush_pending_writes(struct r10conf *conf) 890static void flush_pending_writes(struct r10conf *conf)
871{ 891{
@@ -1039,7 +1059,6 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1039 const unsigned long do_fua = (bio->bi_rw & REQ_FUA); 1059 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1040 unsigned long flags; 1060 unsigned long flags;
1041 struct md_rdev *blocked_rdev; 1061 struct md_rdev *blocked_rdev;
1042 int plugged;
1043 int sectors_handled; 1062 int sectors_handled;
1044 int max_sectors; 1063 int max_sectors;
1045 int sectors; 1064 int sectors;
@@ -1239,7 +1258,6 @@ read_again:
1239 * of r10_bios is recorded in bio->bi_phys_segments just as with 1258
1240 * the read case. 1259 * the read case.
1241 */ 1260 */
1242 plugged = mddev_check_plugged(mddev);
1243 1261
1244 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ 1262 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
1245 raid10_find_phys(conf, r10_bio); 1263 raid10_find_phys(conf, r10_bio);
@@ -1396,6 +1414,8 @@ retry_write:
1396 bio_list_add(&conf->pending_bio_list, mbio); 1414 bio_list_add(&conf->pending_bio_list, mbio);
1397 conf->pending_count++; 1415 conf->pending_count++;
1398 spin_unlock_irqrestore(&conf->device_lock, flags); 1416 spin_unlock_irqrestore(&conf->device_lock, flags);
1417 if (!mddev_check_plugged(mddev))
1418 md_wakeup_thread(mddev->thread);
1399 1419
1400 if (!r10_bio->devs[i].repl_bio) 1420 if (!r10_bio->devs[i].repl_bio)
1401 continue; 1421 continue;
@@ -1423,6 +1443,8 @@ retry_write:
1423 bio_list_add(&conf->pending_bio_list, mbio); 1443 bio_list_add(&conf->pending_bio_list, mbio);
1424 conf->pending_count++; 1444 conf->pending_count++;
1425 spin_unlock_irqrestore(&conf->device_lock, flags); 1445 spin_unlock_irqrestore(&conf->device_lock, flags);
1446 if (!mddev_check_plugged(mddev))
1447 md_wakeup_thread(mddev->thread);
1426 } 1448 }
1427 1449
1428 /* Don't remove the bias on 'remaining' (one_write_done) until 1450 /* Don't remove the bias on 'remaining' (one_write_done) until
@@ -1448,9 +1470,6 @@ retry_write:
1448 1470
1449 /* In case raid10d snuck in to freeze_array */ 1471 /* In case raid10d snuck in to freeze_array */
1450 wake_up(&conf->wait_barrier); 1472 wake_up(&conf->wait_barrier);
1451
1452 if (do_sync || !mddev->bitmap || !plugged)
1453 md_wakeup_thread(mddev->thread);
1454} 1473}
1455 1474
1456static void status(struct seq_file *seq, struct mddev *mddev) 1475static void status(struct seq_file *seq, struct mddev *mddev)
@@ -1547,7 +1566,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1547static void print_conf(struct r10conf *conf) 1566static void print_conf(struct r10conf *conf)
1548{ 1567{
1549 int i; 1568 int i;
1550 struct mirror_info *tmp; 1569 struct raid10_info *tmp;
1551 1570
1552 printk(KERN_DEBUG "RAID10 conf printout:\n"); 1571 printk(KERN_DEBUG "RAID10 conf printout:\n");
1553 if (!conf) { 1572 if (!conf) {
@@ -1581,7 +1600,7 @@ static int raid10_spare_active(struct mddev *mddev)
1581{ 1600{
1582 int i; 1601 int i;
1583 struct r10conf *conf = mddev->private; 1602 struct r10conf *conf = mddev->private;
1584 struct mirror_info *tmp; 1603 struct raid10_info *tmp;
1585 int count = 0; 1604 int count = 0;
1586 unsigned long flags; 1605 unsigned long flags;
1587 1606
@@ -1656,7 +1675,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1656 else 1675 else
1657 mirror = first; 1676 mirror = first;
1658 for ( ; mirror <= last ; mirror++) { 1677 for ( ; mirror <= last ; mirror++) {
1659 struct mirror_info *p = &conf->mirrors[mirror]; 1678 struct raid10_info *p = &conf->mirrors[mirror];
1660 if (p->recovery_disabled == mddev->recovery_disabled) 1679 if (p->recovery_disabled == mddev->recovery_disabled)
1661 continue; 1680 continue;
1662 if (p->rdev) { 1681 if (p->rdev) {
@@ -1710,7 +1729,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1710 int err = 0; 1729 int err = 0;
1711 int number = rdev->raid_disk; 1730 int number = rdev->raid_disk;
1712 struct md_rdev **rdevp; 1731 struct md_rdev **rdevp;
1713 struct mirror_info *p = conf->mirrors + number; 1732 struct raid10_info *p = conf->mirrors + number;
1714 1733
1715 print_conf(conf); 1734 print_conf(conf);
1716 if (rdev == p->rdev) 1735 if (rdev == p->rdev)
@@ -2310,7 +2329,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2310 if (r10_sync_page_io(rdev, 2329 if (r10_sync_page_io(rdev,
2311 r10_bio->devs[sl].addr + 2330 r10_bio->devs[sl].addr +
2312 sect, 2331 sect,
2313 s<<9, conf->tmppage, WRITE) 2332 s, conf->tmppage, WRITE)
2314 == 0) { 2333 == 0) {
2315 /* Well, this device is dead */ 2334 /* Well, this device is dead */
2316 printk(KERN_NOTICE 2335 printk(KERN_NOTICE
@@ -2349,7 +2368,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2349 switch (r10_sync_page_io(rdev, 2368 switch (r10_sync_page_io(rdev,
2350 r10_bio->devs[sl].addr + 2369 r10_bio->devs[sl].addr +
2351 sect, 2370 sect,
2352 s<<9, conf->tmppage, 2371 s, conf->tmppage,
2353 READ)) { 2372 READ)) {
2354 case 0: 2373 case 0:
2355 /* Well, this device is dead */ 2374 /* Well, this device is dead */
@@ -2512,7 +2531,7 @@ read_more:
2512 slot = r10_bio->read_slot; 2531 slot = r10_bio->read_slot;
2513 printk_ratelimited( 2532 printk_ratelimited(
2514 KERN_ERR 2533 KERN_ERR
2515 "md/raid10:%s: %s: redirecting" 2534 "md/raid10:%s: %s: redirecting "
2516 "sector %llu to another mirror\n", 2535 "sector %llu to another mirror\n",
2517 mdname(mddev), 2536 mdname(mddev),
2518 bdevname(rdev->bdev, b), 2537 bdevname(rdev->bdev, b),
@@ -2876,7 +2895,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2876 sector_t sect; 2895 sector_t sect;
2877 int must_sync; 2896 int must_sync;
2878 int any_working; 2897 int any_working;
2879 struct mirror_info *mirror = &conf->mirrors[i]; 2898 struct raid10_info *mirror = &conf->mirrors[i];
2880 2899
2881 if ((mirror->rdev == NULL || 2900 if ((mirror->rdev == NULL ||
2882 test_bit(In_sync, &mirror->rdev->flags)) 2901 test_bit(In_sync, &mirror->rdev->flags))
@@ -2890,6 +2909,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2890 /* want to reconstruct this device */ 2909 /* want to reconstruct this device */
2891 rb2 = r10_bio; 2910 rb2 = r10_bio;
2892 sect = raid10_find_virt(conf, sector_nr, i); 2911 sect = raid10_find_virt(conf, sector_nr, i);
2912 if (sect >= mddev->resync_max_sectors) {
2913 /* last stripe is not complete - don't
2914 * try to recover this sector.
2915 */
2916 continue;
2917 }
2893 /* Unless we are doing a full sync, or a replacement 2918 /* Unless we are doing a full sync, or a replacement
2894 * we only need to recover the block if it is set in 2919 * we only need to recover the block if it is set in
2895 * the bitmap 2920 * the bitmap
@@ -3382,7 +3407,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3382 goto out; 3407 goto out;
3383 3408
3384 /* FIXME calc properly */ 3409 /* FIXME calc properly */
3385 conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks + 3410 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3386 max(0,mddev->delta_disks)), 3411 max(0,mddev->delta_disks)),
3387 GFP_KERNEL); 3412 GFP_KERNEL);
3388 if (!conf->mirrors) 3413 if (!conf->mirrors)
@@ -3421,7 +3446,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3421 spin_lock_init(&conf->resync_lock); 3446 spin_lock_init(&conf->resync_lock);
3422 init_waitqueue_head(&conf->wait_barrier); 3447 init_waitqueue_head(&conf->wait_barrier);
3423 3448
3424 conf->thread = md_register_thread(raid10d, mddev, NULL); 3449 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3425 if (!conf->thread) 3450 if (!conf->thread)
3426 goto out; 3451 goto out;
3427 3452
@@ -3446,7 +3471,7 @@ static int run(struct mddev *mddev)
3446{ 3471{
3447 struct r10conf *conf; 3472 struct r10conf *conf;
3448 int i, disk_idx, chunk_size; 3473 int i, disk_idx, chunk_size;
3449 struct mirror_info *disk; 3474 struct raid10_info *disk;
3450 struct md_rdev *rdev; 3475 struct md_rdev *rdev;
3451 sector_t size; 3476 sector_t size;
3452 sector_t min_offset_diff = 0; 3477 sector_t min_offset_diff = 0;
@@ -3466,12 +3491,14 @@ static int run(struct mddev *mddev)
3466 conf->thread = NULL; 3491 conf->thread = NULL;
3467 3492
3468 chunk_size = mddev->chunk_sectors << 9; 3493 chunk_size = mddev->chunk_sectors << 9;
3469 blk_queue_io_min(mddev->queue, chunk_size); 3494 if (mddev->queue) {
3470 if (conf->geo.raid_disks % conf->geo.near_copies) 3495 blk_queue_io_min(mddev->queue, chunk_size);
3471 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); 3496 if (conf->geo.raid_disks % conf->geo.near_copies)
3472 else 3497 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3473 blk_queue_io_opt(mddev->queue, chunk_size * 3498 else
3474 (conf->geo.raid_disks / conf->geo.near_copies)); 3499 blk_queue_io_opt(mddev->queue, chunk_size *
3500 (conf->geo.raid_disks / conf->geo.near_copies));
3501 }
3475 3502
3476 rdev_for_each(rdev, mddev) { 3503 rdev_for_each(rdev, mddev) {
3477 long long diff; 3504 long long diff;
@@ -3505,8 +3532,9 @@ static int run(struct mddev *mddev)
3505 if (first || diff < min_offset_diff) 3532 if (first || diff < min_offset_diff)
3506 min_offset_diff = diff; 3533 min_offset_diff = diff;
3507 3534
3508 disk_stack_limits(mddev->gendisk, rdev->bdev, 3535 if (mddev->gendisk)
3509 rdev->data_offset << 9); 3536 disk_stack_limits(mddev->gendisk, rdev->bdev,
3537 rdev->data_offset << 9);
3510 3538
3511 disk->head_position = 0; 3539 disk->head_position = 0;
3512 } 3540 }
@@ -3569,22 +3597,22 @@ static int run(struct mddev *mddev)
3569 md_set_array_sectors(mddev, size); 3597 md_set_array_sectors(mddev, size);
3570 mddev->resync_max_sectors = size; 3598 mddev->resync_max_sectors = size;
3571 3599
3572 mddev->queue->backing_dev_info.congested_fn = raid10_congested; 3600 if (mddev->queue) {
3573 mddev->queue->backing_dev_info.congested_data = mddev;
3574
3575 /* Calculate max read-ahead size.
3576 * We need to readahead at least twice a whole stripe....
3577 * maybe...
3578 */
3579 {
3580 int stripe = conf->geo.raid_disks * 3601 int stripe = conf->geo.raid_disks *
3581 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 3602 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3603 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3604 mddev->queue->backing_dev_info.congested_data = mddev;
3605
3606 /* Calculate max read-ahead size.
3607 * We need to readahead at least twice a whole stripe....
3608 * maybe...
3609 */
3582 stripe /= conf->geo.near_copies; 3610 stripe /= conf->geo.near_copies;
3583 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 3611 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3584 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 3612 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3613 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3585 } 3614 }
3586 3615
3587 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3588 3616
3589 if (md_integrity_register(mddev)) 3617 if (md_integrity_register(mddev))
3590 goto out_free_conf; 3618 goto out_free_conf;
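The mddev->queue and mddev->gendisk checks added to run() and stop() above exist because, when RAID10 is driven by dm-raid, device-mapper owns the request queue and those pointers are NULL. The general guard looks like the following sketch (my_configure_queue is a hypothetical name):

/* Hypothetical sketch: only touch block-queue state that md owns. */
static void my_configure_queue(struct mddev *mddev, int chunk_sectors,
                               int data_disks)
{
        if (!mddev->queue)
                return;         /* running under dm-raid: dm owns the queue */

        blk_queue_io_min(mddev->queue, chunk_sectors << 9);
        blk_queue_io_opt(mddev->queue, (chunk_sectors << 9) * data_disks);
}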
@@ -3635,7 +3663,10 @@ static int stop(struct mddev *mddev)
3635 lower_barrier(conf); 3663 lower_barrier(conf);
3636 3664
3637 md_unregister_thread(&mddev->thread); 3665 md_unregister_thread(&mddev->thread);
3638 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 3666 if (mddev->queue)
3667 /* the unplug fn references 'conf'*/
3668 blk_sync_queue(mddev->queue);
3669
3639 if (conf->r10bio_pool) 3670 if (conf->r10bio_pool)
3640 mempool_destroy(conf->r10bio_pool); 3671 mempool_destroy(conf->r10bio_pool);
3641 kfree(conf->mirrors); 3672 kfree(conf->mirrors);
@@ -3799,7 +3830,7 @@ static int raid10_check_reshape(struct mddev *mddev)
3799 if (mddev->delta_disks > 0) { 3830 if (mddev->delta_disks > 0) {
3800 /* allocate new 'mirrors' list */ 3831 /* allocate new 'mirrors' list */
3801 conf->mirrors_new = kzalloc( 3832 conf->mirrors_new = kzalloc(
3802 sizeof(struct mirror_info) 3833 sizeof(struct raid10_info)
3803 *(mddev->raid_disks + 3834 *(mddev->raid_disks +
3804 mddev->delta_disks), 3835 mddev->delta_disks),
3805 GFP_KERNEL); 3836 GFP_KERNEL);
@@ -3924,7 +3955,7 @@ static int raid10_start_reshape(struct mddev *mddev)
3924 spin_lock_irq(&conf->device_lock); 3955 spin_lock_irq(&conf->device_lock);
3925 if (conf->mirrors_new) { 3956 if (conf->mirrors_new) {
3926 memcpy(conf->mirrors_new, conf->mirrors, 3957 memcpy(conf->mirrors_new, conf->mirrors,
3927 sizeof(struct mirror_info)*conf->prev.raid_disks); 3958 sizeof(struct raid10_info)*conf->prev.raid_disks);
3928 smp_mb(); 3959 smp_mb();
3929 kfree(conf->mirrors_old); /* FIXME and elsewhere */ 3960 kfree(conf->mirrors_old); /* FIXME and elsewhere */
3930 conf->mirrors_old = conf->mirrors; 3961 conf->mirrors_old = conf->mirrors;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 135b1b0a1554..007c2c68dd83 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -1,7 +1,7 @@
1#ifndef _RAID10_H 1#ifndef _RAID10_H
2#define _RAID10_H 2#define _RAID10_H
3 3
4struct mirror_info { 4struct raid10_info {
5 struct md_rdev *rdev, *replacement; 5 struct md_rdev *rdev, *replacement;
6 sector_t head_position; 6 sector_t head_position;
7 int recovery_disabled; /* matches 7 int recovery_disabled; /* matches
@@ -13,8 +13,8 @@ struct mirror_info {
13 13
14struct r10conf { 14struct r10conf {
15 struct mddev *mddev; 15 struct mddev *mddev;
16 struct mirror_info *mirrors; 16 struct raid10_info *mirrors;
17 struct mirror_info *mirrors_new, *mirrors_old; 17 struct raid10_info *mirrors_new, *mirrors_old;
18 spinlock_t device_lock; 18 spinlock_t device_lock;
19 19
20 /* geometry */ 20 /* geometry */
@@ -123,20 +123,6 @@ struct r10bio {
123 } devs[0]; 123 } devs[0];
124}; 124};
125 125
126/* when we get a read error on a read-only array, we redirect to another
127 * device without failing the first device, or trying to over-write to
128 * correct the read error. To keep track of bad blocks on a per-bio
129 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
130 */
131#define IO_BLOCKED ((struct bio*)1)
132/* When we successfully write to a known bad-block, we need to remove the
133 * bad-block marking which must be done from process context. So we record
134 * the success by setting devs[n].bio to IO_MADE_GOOD
135 */
136#define IO_MADE_GOOD ((struct bio *)2)
137
138#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
139
140/* bits for r10bio.state */ 126/* bits for r10bio.state */
141enum r10bio_state { 127enum r10bio_state {
142 R10BIO_Uptodate, 128 R10BIO_Uptodate,
@@ -159,4 +145,7 @@ enum r10bio_state {
159 */ 145 */
160 R10BIO_Previous, 146 R10BIO_Previous,
161}; 147};
148
149extern int md_raid10_congested(struct mddev *mddev, int bits);
150
162#endif 151#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d26767246d26..adda94df5eb2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -99,34 +99,40 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
99 * We maintain a biased count of active stripes in the bottom 16 bits of 99 * We maintain a biased count of active stripes in the bottom 16 bits of
100 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 100 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
101 */ 101 */
102static inline int raid5_bi_phys_segments(struct bio *bio) 102static inline int raid5_bi_processed_stripes(struct bio *bio)
103{ 103{
104 return bio->bi_phys_segments & 0xffff; 104 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
105 return (atomic_read(segments) >> 16) & 0xffff;
105} 106}
106 107
107static inline int raid5_bi_hw_segments(struct bio *bio) 108static inline int raid5_dec_bi_active_stripes(struct bio *bio)
108{ 109{
109 return (bio->bi_phys_segments >> 16) & 0xffff; 110 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
111 return atomic_sub_return(1, segments) & 0xffff;
110} 112}
111 113
112static inline int raid5_dec_bi_phys_segments(struct bio *bio) 114static inline void raid5_inc_bi_active_stripes(struct bio *bio)
113{ 115{
114 --bio->bi_phys_segments; 116 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
115 return raid5_bi_phys_segments(bio); 117 atomic_inc(segments);
116} 118}
117 119
118static inline int raid5_dec_bi_hw_segments(struct bio *bio) 120static inline void raid5_set_bi_processed_stripes(struct bio *bio,
121 unsigned int cnt)
119{ 122{
120 unsigned short val = raid5_bi_hw_segments(bio); 123 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
124 int old, new;
121 125
122 --val; 126 do {
123 bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); 127 old = atomic_read(segments);
124 return val; 128 new = (old & 0xffff) | (cnt << 16);
129 } while (atomic_cmpxchg(segments, old, new) != old);
125} 130}
126 131
127static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) 132static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
128{ 133{
129 bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); 134 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
135 atomic_set(segments, cnt);
130} 136}
131 137
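The rewritten helpers above stop treating bi_phys_segments as a plain integer protected by device_lock and instead treat it as an atomic_t packing two 16-bit counters: the low half counts stripes the bio is still active on, the high half counts stripes already processed. A stand-alone sketch of the same packing scheme, with hypothetical names:

#include <linux/atomic.h>

/* Hypothetical illustration of the 16+16 bit packing used above:
 * low 16 bits = active count, high 16 bits = processed count. */
static inline int packed_active(atomic_t *v)
{
        return atomic_read(v) & 0xffff;
}

static inline int packed_processed(atomic_t *v)
{
        return (atomic_read(v) >> 16) & 0xffff;
}

static inline int packed_dec_active(atomic_t *v)
{
        /* returns the remaining active count */
        return atomic_sub_return(1, v) & 0xffff;
}

static inline void packed_set_processed(atomic_t *v, unsigned int cnt)
{
        int old, new;

        /* cmpxchg loop: replace only the high half, keep the active half */
        do {
                old = atomic_read(v);
                new = (old & 0xffff) | (cnt << 16);
        } while (atomic_cmpxchg(v, old, new) != old);
}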
132/* Find first data disk in a raid6 stripe */ 138/* Find first data disk in a raid6 stripe */
@@ -190,47 +196,56 @@ static int stripe_operations_active(struct stripe_head *sh)
190 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 196 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
191} 197}
192 198
193static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 199static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
194{ 200{
195 if (atomic_dec_and_test(&sh->count)) { 201 BUG_ON(!list_empty(&sh->lru));
196 BUG_ON(!list_empty(&sh->lru)); 202 BUG_ON(atomic_read(&conf->active_stripes)==0);
197 BUG_ON(atomic_read(&conf->active_stripes)==0); 203 if (test_bit(STRIPE_HANDLE, &sh->state)) {
198 if (test_bit(STRIPE_HANDLE, &sh->state)) { 204 if (test_bit(STRIPE_DELAYED, &sh->state) &&
199 if (test_bit(STRIPE_DELAYED, &sh->state)) 205 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
200 list_add_tail(&sh->lru, &conf->delayed_list); 206 list_add_tail(&sh->lru, &conf->delayed_list);
201 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 207 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
202 sh->bm_seq - conf->seq_write > 0) 208 sh->bm_seq - conf->seq_write > 0)
203 list_add_tail(&sh->lru, &conf->bitmap_list); 209 list_add_tail(&sh->lru, &conf->bitmap_list);
204 else { 210 else {
205 clear_bit(STRIPE_BIT_DELAY, &sh->state); 211 clear_bit(STRIPE_DELAYED, &sh->state);
206 list_add_tail(&sh->lru, &conf->handle_list); 212 clear_bit(STRIPE_BIT_DELAY, &sh->state);
207 } 213 list_add_tail(&sh->lru, &conf->handle_list);
208 md_wakeup_thread(conf->mddev->thread); 214 }
209 } else { 215 md_wakeup_thread(conf->mddev->thread);
210 BUG_ON(stripe_operations_active(sh)); 216 } else {
211 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 217 BUG_ON(stripe_operations_active(sh));
212 if (atomic_dec_return(&conf->preread_active_stripes) 218 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
213 < IO_THRESHOLD) 219 if (atomic_dec_return(&conf->preread_active_stripes)
214 md_wakeup_thread(conf->mddev->thread); 220 < IO_THRESHOLD)
215 atomic_dec(&conf->active_stripes); 221 md_wakeup_thread(conf->mddev->thread);
216 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 222 atomic_dec(&conf->active_stripes);
217 list_add_tail(&sh->lru, &conf->inactive_list); 223 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
218 wake_up(&conf->wait_for_stripe); 224 list_add_tail(&sh->lru, &conf->inactive_list);
219 if (conf->retry_read_aligned) 225 wake_up(&conf->wait_for_stripe);
220 md_wakeup_thread(conf->mddev->thread); 226 if (conf->retry_read_aligned)
221 } 227 md_wakeup_thread(conf->mddev->thread);
222 } 228 }
223 } 229 }
224} 230}
225 231
232static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
233{
234 if (atomic_dec_and_test(&sh->count))
235 do_release_stripe(conf, sh);
236}
237
226static void release_stripe(struct stripe_head *sh) 238static void release_stripe(struct stripe_head *sh)
227{ 239{
228 struct r5conf *conf = sh->raid_conf; 240 struct r5conf *conf = sh->raid_conf;
229 unsigned long flags; 241 unsigned long flags;
230 242
231 spin_lock_irqsave(&conf->device_lock, flags); 243 local_irq_save(flags);
232 __release_stripe(conf, sh); 244 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
233 spin_unlock_irqrestore(&conf->device_lock, flags); 245 do_release_stripe(conf, sh);
246 spin_unlock(&conf->device_lock);
247 }
248 local_irq_restore(flags);
234} 249}
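The new release_stripe() above relies on atomic_dec_and_lock(): the count is dropped without the lock in the common case, and conf->device_lock is only taken (and returned held) when the decrement reaches zero and the stripe has to be moved onto a list. A minimal sketch of the idiom, with a hypothetical object type:

#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/irqflags.h>

struct my_obj {
        atomic_t count;
        /* list membership, flags, ... */
};

static DEFINE_SPINLOCK(my_lock);        /* protects the shared lists (hypothetical) */

static void my_put(struct my_obj *obj)
{
        unsigned long flags;

        local_irq_save(flags);
        /* atomic_dec_and_lock() decrements obj->count and, only when it
         * reaches zero, acquires my_lock and returns true with the lock
         * held; all other puts stay lock-free. */
        if (atomic_dec_and_lock(&obj->count, &my_lock)) {
                /* last reference: requeue or free the object under the lock */
                spin_unlock(&my_lock);
        }
        local_irq_restore(flags);
}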
235 250
236static inline void remove_hash(struct stripe_head *sh) 251static inline void remove_hash(struct stripe_head *sh)
@@ -469,7 +484,8 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
469 } else { 484 } else {
470 if (atomic_read(&sh->count)) { 485 if (atomic_read(&sh->count)) {
471 BUG_ON(!list_empty(&sh->lru) 486 BUG_ON(!list_empty(&sh->lru)
472 && !test_bit(STRIPE_EXPANDING, &sh->state)); 487 && !test_bit(STRIPE_EXPANDING, &sh->state)
488 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state));
473 } else { 489 } else {
474 if (!test_bit(STRIPE_HANDLE, &sh->state)) 490 if (!test_bit(STRIPE_HANDLE, &sh->state))
475 atomic_inc(&conf->active_stripes); 491 atomic_inc(&conf->active_stripes);
@@ -606,6 +622,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
606 * a chance*/ 622 * a chance*/
607 md_check_recovery(conf->mddev); 623 md_check_recovery(conf->mddev);
608 } 624 }
625 /*
626 * Because md_wait_for_blocked_rdev
627 * will dec nr_pending, we must
628 * increment it first.
629 */
630 atomic_inc(&rdev->nr_pending);
609 md_wait_for_blocked_rdev(rdev, conf->mddev); 631 md_wait_for_blocked_rdev(rdev, conf->mddev);
610 } else { 632 } else {
611 /* Acknowledged bad block - skip the write */ 633 /* Acknowledged bad block - skip the write */
@@ -632,6 +654,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
632 else 654 else
633 bi->bi_sector = (sh->sector 655 bi->bi_sector = (sh->sector
634 + rdev->data_offset); 656 + rdev->data_offset);
657 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
658 bi->bi_rw |= REQ_FLUSH;
659
635 bi->bi_flags = 1 << BIO_UPTODATE; 660 bi->bi_flags = 1 << BIO_UPTODATE;
636 bi->bi_idx = 0; 661 bi->bi_idx = 0;
637 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 662 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -741,14 +766,12 @@ static void ops_complete_biofill(void *stripe_head_ref)
741{ 766{
742 struct stripe_head *sh = stripe_head_ref; 767 struct stripe_head *sh = stripe_head_ref;
743 struct bio *return_bi = NULL; 768 struct bio *return_bi = NULL;
744 struct r5conf *conf = sh->raid_conf;
745 int i; 769 int i;
746 770
747 pr_debug("%s: stripe %llu\n", __func__, 771 pr_debug("%s: stripe %llu\n", __func__,
748 (unsigned long long)sh->sector); 772 (unsigned long long)sh->sector);
749 773
750 /* clear completed biofills */ 774 /* clear completed biofills */
751 spin_lock_irq(&conf->device_lock);
752 for (i = sh->disks; i--; ) { 775 for (i = sh->disks; i--; ) {
753 struct r5dev *dev = &sh->dev[i]; 776 struct r5dev *dev = &sh->dev[i];
754 777
@@ -766,7 +789,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
766 while (rbi && rbi->bi_sector < 789 while (rbi && rbi->bi_sector <
767 dev->sector + STRIPE_SECTORS) { 790 dev->sector + STRIPE_SECTORS) {
768 rbi2 = r5_next_bio(rbi, dev->sector); 791 rbi2 = r5_next_bio(rbi, dev->sector);
769 if (!raid5_dec_bi_phys_segments(rbi)) { 792 if (!raid5_dec_bi_active_stripes(rbi)) {
770 rbi->bi_next = return_bi; 793 rbi->bi_next = return_bi;
771 return_bi = rbi; 794 return_bi = rbi;
772 } 795 }
@@ -774,7 +797,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
774 } 797 }
775 } 798 }
776 } 799 }
777 spin_unlock_irq(&conf->device_lock);
778 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 800 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
779 801
780 return_io(return_bi); 802 return_io(return_bi);
@@ -786,7 +808,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
786static void ops_run_biofill(struct stripe_head *sh) 808static void ops_run_biofill(struct stripe_head *sh)
787{ 809{
788 struct dma_async_tx_descriptor *tx = NULL; 810 struct dma_async_tx_descriptor *tx = NULL;
789 struct r5conf *conf = sh->raid_conf;
790 struct async_submit_ctl submit; 811 struct async_submit_ctl submit;
791 int i; 812 int i;
792 813
@@ -797,10 +818,10 @@ static void ops_run_biofill(struct stripe_head *sh)
797 struct r5dev *dev = &sh->dev[i]; 818 struct r5dev *dev = &sh->dev[i];
798 if (test_bit(R5_Wantfill, &dev->flags)) { 819 if (test_bit(R5_Wantfill, &dev->flags)) {
799 struct bio *rbi; 820 struct bio *rbi;
800 spin_lock_irq(&conf->device_lock); 821 spin_lock_irq(&sh->stripe_lock);
801 dev->read = rbi = dev->toread; 822 dev->read = rbi = dev->toread;
802 dev->toread = NULL; 823 dev->toread = NULL;
803 spin_unlock_irq(&conf->device_lock); 824 spin_unlock_irq(&sh->stripe_lock);
804 while (rbi && rbi->bi_sector < 825 while (rbi && rbi->bi_sector <
805 dev->sector + STRIPE_SECTORS) { 826 dev->sector + STRIPE_SECTORS) {
806 tx = async_copy_data(0, rbi, dev->page, 827 tx = async_copy_data(0, rbi, dev->page,
@@ -1136,12 +1157,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1136 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1157 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1137 struct bio *wbi; 1158 struct bio *wbi;
1138 1159
1139 spin_lock_irq(&sh->raid_conf->device_lock); 1160 spin_lock_irq(&sh->stripe_lock);
1140 chosen = dev->towrite; 1161 chosen = dev->towrite;
1141 dev->towrite = NULL; 1162 dev->towrite = NULL;
1142 BUG_ON(dev->written); 1163 BUG_ON(dev->written);
1143 wbi = dev->written = chosen; 1164 wbi = dev->written = chosen;
1144 spin_unlock_irq(&sh->raid_conf->device_lock); 1165 spin_unlock_irq(&sh->stripe_lock);
1145 1166
1146 while (wbi && wbi->bi_sector < 1167 while (wbi && wbi->bi_sector <
1147 dev->sector + STRIPE_SECTORS) { 1168 dev->sector + STRIPE_SECTORS) {
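Several hunks above (ops_run_biofill(), ops_run_biodrain(), and later add_stripe_bio()) swap the array-wide conf->device_lock for the new per-stripe sh->stripe_lock whenever only one stripe's private bio lists are being manipulated, which narrows the critical section and removes contention on the global lock. A hypothetical sketch of the same idea:

#include <linux/spinlock.h>
#include <linux/bio.h>

/* Hypothetical object with its own lock for its private bio lists. */
struct my_stripe {
        spinlock_t lock;                /* init with spin_lock_init() at allocation */
        struct bio *towrite;
        struct bio *toread;
};

static struct bio *detach_writes(struct my_stripe *sh)
{
        struct bio *chosen;

        spin_lock_irq(&sh->lock);       /* per-stripe lock, not the array lock */
        chosen = sh->towrite;
        sh->towrite = NULL;
        spin_unlock_irq(&sh->lock);
        return chosen;
}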
@@ -1446,6 +1467,8 @@ static int grow_one_stripe(struct r5conf *conf)
1446 init_waitqueue_head(&sh->ops.wait_for_ops); 1467 init_waitqueue_head(&sh->ops.wait_for_ops);
1447 #endif 1468 #endif
1448 1469
1470 spin_lock_init(&sh->stripe_lock);
1471
1449 if (grow_buffers(sh)) { 1472 if (grow_buffers(sh)) {
1450 shrink_buffers(sh); 1473 shrink_buffers(sh);
1451 kmem_cache_free(conf->slab_cache, sh); 1474 kmem_cache_free(conf->slab_cache, sh);
@@ -1731,12 +1754,15 @@ static void raid5_end_read_request(struct bio * bi, int error)
1731 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1754 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1732 clear_bit(R5_ReadError, &sh->dev[i].flags); 1755 clear_bit(R5_ReadError, &sh->dev[i].flags);
1733 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1756 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1734 } 1757 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
1758 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1759
1735 if (atomic_read(&rdev->read_errors)) 1760 if (atomic_read(&rdev->read_errors))
1736 atomic_set(&rdev->read_errors, 0); 1761 atomic_set(&rdev->read_errors, 0);
1737 } else { 1762 } else {
1738 const char *bdn = bdevname(rdev->bdev, b); 1763 const char *bdn = bdevname(rdev->bdev, b);
1739 int retry = 0; 1764 int retry = 0;
1765 int set_bad = 0;
1740 1766
1741 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1767 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1742 atomic_inc(&rdev->read_errors); 1768 atomic_inc(&rdev->read_errors);
@@ -1748,7 +1774,8 @@ static void raid5_end_read_request(struct bio * bi, int error)
1748 mdname(conf->mddev), 1774 mdname(conf->mddev),
1749 (unsigned long long)s, 1775 (unsigned long long)s,
1750 bdn); 1776 bdn);
1751 else if (conf->mddev->degraded >= conf->max_degraded) 1777 else if (conf->mddev->degraded >= conf->max_degraded) {
1778 set_bad = 1;
1752 printk_ratelimited( 1779 printk_ratelimited(
1753 KERN_WARNING 1780 KERN_WARNING
1754 "md/raid:%s: read error not correctable " 1781 "md/raid:%s: read error not correctable "
@@ -1756,8 +1783,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
1756 mdname(conf->mddev), 1783 mdname(conf->mddev),
1757 (unsigned long long)s, 1784 (unsigned long long)s,
1758 bdn); 1785 bdn);
1759 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1786 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
1760 /* Oh, no!!! */ 1787 /* Oh, no!!! */
1788 set_bad = 1;
1761 printk_ratelimited( 1789 printk_ratelimited(
1762 KERN_WARNING 1790 KERN_WARNING
1763 "md/raid:%s: read error NOT corrected!! " 1791 "md/raid:%s: read error NOT corrected!! "
@@ -1765,7 +1793,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1765 mdname(conf->mddev), 1793 mdname(conf->mddev),
1766 (unsigned long long)s, 1794 (unsigned long long)s,
1767 bdn); 1795 bdn);
1768 else if (atomic_read(&rdev->read_errors) 1796 } else if (atomic_read(&rdev->read_errors)
1769 > conf->max_nr_stripes) 1797 > conf->max_nr_stripes)
1770 printk(KERN_WARNING 1798 printk(KERN_WARNING
1771 "md/raid:%s: Too many read errors, failing device %s.\n", 1799 "md/raid:%s: Too many read errors, failing device %s.\n",
@@ -1773,11 +1801,19 @@ static void raid5_end_read_request(struct bio * bi, int error)
1773 else 1801 else
1774 retry = 1; 1802 retry = 1;
1775 if (retry) 1803 if (retry)
1776 set_bit(R5_ReadError, &sh->dev[i].flags); 1804 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
1805 set_bit(R5_ReadError, &sh->dev[i].flags);
1806 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1807 } else
1808 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1777 else { 1809 else {
1778 clear_bit(R5_ReadError, &sh->dev[i].flags); 1810 clear_bit(R5_ReadError, &sh->dev[i].flags);
1779 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1811 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1780 md_error(conf->mddev, rdev); 1812 if (!(set_bad
1813 && test_bit(In_sync, &rdev->flags)
1814 && rdev_set_badblocks(
1815 rdev, sh->sector, STRIPE_SECTORS, 0)))
1816 md_error(conf->mddev, rdev);
1781 } 1817 }
1782 } 1818 }
1783 rdev_dec_pending(rdev, conf->mddev); 1819 rdev_dec_pending(rdev, conf->mddev);
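The retry branch above implements a two-stage policy: the first failed read sets R5_ReadNoMerge so the next attempt goes out as a stand-alone (unmergeable) request, and only a second failure with that flag already set escalates to R5_ReadError. Reduced to a hedged sketch with hypothetical flag names:

#include <linux/bitops.h>

/* Hypothetical two-stage read-retry state machine. */
enum { DEV_RETRY_NOMERGE, DEV_READ_ERROR };

static void note_read_failure(unsigned long *flags)
{
        if (test_bit(DEV_RETRY_NOMERGE, flags)) {
                /* second failure: stop quiet retries and report an error */
                set_bit(DEV_READ_ERROR, flags);
                clear_bit(DEV_RETRY_NOMERGE, flags);
        } else {
                /* first failure: retry once as an unmerged request */
                set_bit(DEV_RETRY_NOMERGE, flags);
        }
}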
@@ -2325,11 +2361,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2325 (unsigned long long)bi->bi_sector, 2361 (unsigned long long)bi->bi_sector,
2326 (unsigned long long)sh->sector); 2362 (unsigned long long)sh->sector);
2327 2363
2328 2364 /*
2329 spin_lock_irq(&conf->device_lock); 2365 * If several bios share a stripe, the bio bi_phys_segments field acts as a
2366 * reference count to avoid races. The reference count should already be
2367 * increased before this function is called (for example, in
2368 * make_request()), so other bios sharing this stripe will not free the
2369 * stripe. If a stripe is only referenced by one bio, the stripe lock will
2370 * protect it.
2371 */
2372 spin_lock_irq(&sh->stripe_lock);
2330 if (forwrite) { 2373 if (forwrite) {
2331 bip = &sh->dev[dd_idx].towrite; 2374 bip = &sh->dev[dd_idx].towrite;
2332 if (*bip == NULL && sh->dev[dd_idx].written == NULL) 2375 if (*bip == NULL)
2333 firstwrite = 1; 2376 firstwrite = 1;
2334 } else 2377 } else
2335 bip = &sh->dev[dd_idx].toread; 2378 bip = &sh->dev[dd_idx].toread;
@@ -2345,7 +2388,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2345 if (*bip) 2388 if (*bip)
2346 bi->bi_next = *bip; 2389 bi->bi_next = *bip;
2347 *bip = bi; 2390 *bip = bi;
2348 bi->bi_phys_segments++; 2391 raid5_inc_bi_active_stripes(bi);
2349 2392
2350 if (forwrite) { 2393 if (forwrite) {
2351 /* check if page is covered */ 2394 /* check if page is covered */
@@ -2360,7 +2403,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2360 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2403 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2361 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2404 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2362 } 2405 }
2363 spin_unlock_irq(&conf->device_lock); 2406 spin_unlock_irq(&sh->stripe_lock);
2364 2407
2365 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2408 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2366 (unsigned long long)(*bip)->bi_sector, 2409 (unsigned long long)(*bip)->bi_sector,
@@ -2376,7 +2419,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2376 2419
2377 overlap: 2420 overlap:
2378 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2421 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2379 spin_unlock_irq(&conf->device_lock); 2422 spin_unlock_irq(&sh->stripe_lock);
2380 return 0; 2423 return 0;
2381} 2424}
2382 2425
@@ -2426,10 +2469,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2426 rdev_dec_pending(rdev, conf->mddev); 2469 rdev_dec_pending(rdev, conf->mddev);
2427 } 2470 }
2428 } 2471 }
2429 spin_lock_irq(&conf->device_lock); 2472 spin_lock_irq(&sh->stripe_lock);
2430 /* fail all writes first */ 2473 /* fail all writes first */
2431 bi = sh->dev[i].towrite; 2474 bi = sh->dev[i].towrite;
2432 sh->dev[i].towrite = NULL; 2475 sh->dev[i].towrite = NULL;
2476 spin_unlock_irq(&sh->stripe_lock);
2433 if (bi) { 2477 if (bi) {
2434 s->to_write--; 2478 s->to_write--;
2435 bitmap_end = 1; 2479 bitmap_end = 1;
@@ -2442,13 +2486,17 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2442 sh->dev[i].sector + STRIPE_SECTORS) { 2486 sh->dev[i].sector + STRIPE_SECTORS) {
2443 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2487 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2444 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2488 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2445 if (!raid5_dec_bi_phys_segments(bi)) { 2489 if (!raid5_dec_bi_active_stripes(bi)) {
2446 md_write_end(conf->mddev); 2490 md_write_end(conf->mddev);
2447 bi->bi_next = *return_bi; 2491 bi->bi_next = *return_bi;
2448 *return_bi = bi; 2492 *return_bi = bi;
2449 } 2493 }
2450 bi = nextbi; 2494 bi = nextbi;
2451 } 2495 }
2496 if (bitmap_end)
2497 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2498 STRIPE_SECTORS, 0, 0);
2499 bitmap_end = 0;
2452 /* and fail all 'written' */ 2500 /* and fail all 'written' */
2453 bi = sh->dev[i].written; 2501 bi = sh->dev[i].written;
2454 sh->dev[i].written = NULL; 2502 sh->dev[i].written = NULL;
@@ -2457,7 +2505,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2457 sh->dev[i].sector + STRIPE_SECTORS) { 2505 sh->dev[i].sector + STRIPE_SECTORS) {
2458 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2506 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2459 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2507 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2460 if (!raid5_dec_bi_phys_segments(bi)) { 2508 if (!raid5_dec_bi_active_stripes(bi)) {
2461 md_write_end(conf->mddev); 2509 md_write_end(conf->mddev);
2462 bi->bi_next = *return_bi; 2510 bi->bi_next = *return_bi;
2463 *return_bi = bi; 2511 *return_bi = bi;
@@ -2481,14 +2529,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2481 struct bio *nextbi = 2529 struct bio *nextbi =
2482 r5_next_bio(bi, sh->dev[i].sector); 2530 r5_next_bio(bi, sh->dev[i].sector);
2483 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2531 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2484 if (!raid5_dec_bi_phys_segments(bi)) { 2532 if (!raid5_dec_bi_active_stripes(bi)) {
2485 bi->bi_next = *return_bi; 2533 bi->bi_next = *return_bi;
2486 *return_bi = bi; 2534 *return_bi = bi;
2487 } 2535 }
2488 bi = nextbi; 2536 bi = nextbi;
2489 } 2537 }
2490 } 2538 }
2491 spin_unlock_irq(&conf->device_lock);
2492 if (bitmap_end) 2539 if (bitmap_end)
2493 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2540 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2494 STRIPE_SECTORS, 0, 0); 2541 STRIPE_SECTORS, 0, 0);
@@ -2692,30 +2739,23 @@ static void handle_stripe_clean_event(struct r5conf *conf,
2692 test_bit(R5_UPTODATE, &dev->flags)) { 2739 test_bit(R5_UPTODATE, &dev->flags)) {
2693 /* We can return any write requests */ 2740 /* We can return any write requests */
2694 struct bio *wbi, *wbi2; 2741 struct bio *wbi, *wbi2;
2695 int bitmap_end = 0;
2696 pr_debug("Return write for disc %d\n", i); 2742 pr_debug("Return write for disc %d\n", i);
2697 spin_lock_irq(&conf->device_lock);
2698 wbi = dev->written; 2743 wbi = dev->written;
2699 dev->written = NULL; 2744 dev->written = NULL;
2700 while (wbi && wbi->bi_sector < 2745 while (wbi && wbi->bi_sector <
2701 dev->sector + STRIPE_SECTORS) { 2746 dev->sector + STRIPE_SECTORS) {
2702 wbi2 = r5_next_bio(wbi, dev->sector); 2747 wbi2 = r5_next_bio(wbi, dev->sector);
2703 if (!raid5_dec_bi_phys_segments(wbi)) { 2748 if (!raid5_dec_bi_active_stripes(wbi)) {
2704 md_write_end(conf->mddev); 2749 md_write_end(conf->mddev);
2705 wbi->bi_next = *return_bi; 2750 wbi->bi_next = *return_bi;
2706 *return_bi = wbi; 2751 *return_bi = wbi;
2707 } 2752 }
2708 wbi = wbi2; 2753 wbi = wbi2;
2709 } 2754 }
2710 if (dev->towrite == NULL) 2755 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2711 bitmap_end = 1; 2756 STRIPE_SECTORS,
2712 spin_unlock_irq(&conf->device_lock);
2713 if (bitmap_end)
2714 bitmap_endwrite(conf->mddev->bitmap,
2715 sh->sector,
2716 STRIPE_SECTORS,
2717 !test_bit(STRIPE_DEGRADED, &sh->state), 2757 !test_bit(STRIPE_DEGRADED, &sh->state),
2718 0); 2758 0);
2719 } 2759 }
2720 } 2760 }
2721 2761
@@ -3167,7 +3207,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3167 3207
3168 /* Now to look around and see what can be done */ 3208 /* Now to look around and see what can be done */
3169 rcu_read_lock(); 3209 rcu_read_lock();
3170 spin_lock_irq(&conf->device_lock);
3171 for (i=disks; i--; ) { 3210 for (i=disks; i--; ) {
3172 struct md_rdev *rdev; 3211 struct md_rdev *rdev;
3173 sector_t first_bad; 3212 sector_t first_bad;
@@ -3313,7 +3352,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3313 do_recovery = 1; 3352 do_recovery = 1;
3314 } 3353 }
3315 } 3354 }
3316 spin_unlock_irq(&conf->device_lock);
3317 if (test_bit(STRIPE_SYNCING, &sh->state)) { 3355 if (test_bit(STRIPE_SYNCING, &sh->state)) {
3318 /* If there is a failed device being replaced, 3356 /* If there is a failed device being replaced,
3319 * we must be recovering. 3357 * we must be recovering.
@@ -3582,8 +3620,18 @@ static void handle_stripe(struct stripe_head *sh)
3582 3620
3583finish: 3621finish:
3584 /* wait for this device to become unblocked */ 3622 /* wait for this device to become unblocked */
3585 if (conf->mddev->external && unlikely(s.blocked_rdev)) 3623 if (unlikely(s.blocked_rdev)) {
3586 md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); 3624 if (conf->mddev->external)
3625 md_wait_for_blocked_rdev(s.blocked_rdev,
3626 conf->mddev);
3627 else
3628 /* Internal metadata will immediately
3629 * be written by raid5d, so we don't
3630 * need to wait here.
3631 */
3632 rdev_dec_pending(s.blocked_rdev,
3633 conf->mddev);
3634 }
3587 3635
3588 if (s.handle_bad_blocks) 3636 if (s.handle_bad_blocks)
3589 for (i = disks; i--; ) { 3637 for (i = disks; i--; ) {
@@ -3766,7 +3814,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
3766 * this sets the active stripe count to 1 and the processed 3814
3767 * stripe count to zero (upper 16 bits) 3815
3768 */ 3816 */
3769 bi->bi_phys_segments = 1; /* biased count of active stripes */ 3817 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
3770 } 3818 }
3771 3819
3772 return bi; 3820 return bi;
@@ -3881,8 +3929,6 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3881 raid_bio->bi_next = (void*)rdev; 3929 raid_bio->bi_next = (void*)rdev;
3882 align_bi->bi_bdev = rdev->bdev; 3930 align_bi->bi_bdev = rdev->bdev;
3883 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3931 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3884 /* No reshape active, so we can trust rdev->data_offset */
3885 align_bi->bi_sector += rdev->data_offset;
3886 3932
3887 if (!bio_fits_rdev(align_bi) || 3933 if (!bio_fits_rdev(align_bi) ||
3888 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, 3934 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
@@ -3893,6 +3939,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3893 return 0; 3939 return 0;
3894 } 3940 }
3895 3941
3942 /* No reshape active, so we can trust rdev->data_offset */
3943 align_bi->bi_sector += rdev->data_offset;
3944
3896 spin_lock_irq(&conf->device_lock); 3945 spin_lock_irq(&conf->device_lock);
3897 wait_event_lock_irq(conf->wait_for_stripe, 3946 wait_event_lock_irq(conf->wait_for_stripe,
3898 conf->quiesce == 0, 3947 conf->quiesce == 0,
@@ -3962,6 +4011,62 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
3962 return sh; 4011 return sh;
3963} 4012}
3964 4013
4014struct raid5_plug_cb {
4015 struct blk_plug_cb cb;
4016 struct list_head list;
4017};
4018
4019static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4020{
4021 struct raid5_plug_cb *cb = container_of(
4022 blk_cb, struct raid5_plug_cb, cb);
4023 struct stripe_head *sh;
4024 struct mddev *mddev = cb->cb.data;
4025 struct r5conf *conf = mddev->private;
4026
4027 if (cb->list.next && !list_empty(&cb->list)) {
4028 spin_lock_irq(&conf->device_lock);
4029 while (!list_empty(&cb->list)) {
4030 sh = list_first_entry(&cb->list, struct stripe_head, lru);
4031 list_del_init(&sh->lru);
4032 /*
4033 * avoid a race where release_stripe_plug() sees
4034 * STRIPE_ON_UNPLUG_LIST clear but the stripe
4035 * is still on our list
4036 */
4037 smp_mb__before_clear_bit();
4038 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
4039 __release_stripe(conf, sh);
4040 }
4041 spin_unlock_irq(&conf->device_lock);
4042 }
4043 kfree(cb);
4044}
4045
4046static void release_stripe_plug(struct mddev *mddev,
4047 struct stripe_head *sh)
4048{
4049 struct blk_plug_cb *blk_cb = blk_check_plugged(
4050 raid5_unplug, mddev,
4051 sizeof(struct raid5_plug_cb));
4052 struct raid5_plug_cb *cb;
4053
4054 if (!blk_cb) {
4055 release_stripe(sh);
4056 return;
4057 }
4058
4059 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
4060
4061 if (cb->list.next == NULL)
4062 INIT_LIST_HEAD(&cb->list);
4063
4064 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
4065 list_add_tail(&sh->lru, &cb->list);
4066 else
4067 release_stripe(sh);
4068}
4069
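raid5_unplug() and release_stripe_plug() above hook the md device into the generic per-task plugging machinery: blk_check_plugged() attaches at most one raid5_plug_cb per plug window, release_stripe_plug() parks stripes on cb->list instead of releasing them immediately, and the callback runs when the submitter unplugs. A sketch of the caller side that drives this path; the function and variable names below are hypothetical, not part of this patch:

	/*
	 * Illustrative only: a submitter that brackets its bios with a
	 * blk_plug lets release_stripe_plug() queue stripes on cb->list;
	 * blk_finish_plug() then calls raid5_unplug() once for the batch.
	 */
	static void example_submit_window(struct bio **bios, int nr)
	{
		struct blk_plug plug;
		int i;

		blk_start_plug(&plug);
		for (i = 0; i < nr; i++)
			generic_make_request(bios[i]);	/* reaches make_request() */
		blk_finish_plug(&plug);			/* runs raid5_unplug() */
	}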
3965static void make_request(struct mddev *mddev, struct bio * bi) 4070static void make_request(struct mddev *mddev, struct bio * bi)
3966{ 4071{
3967 struct r5conf *conf = mddev->private; 4072 struct r5conf *conf = mddev->private;
@@ -3971,7 +4076,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
3971 struct stripe_head *sh; 4076 struct stripe_head *sh;
3972 const int rw = bio_data_dir(bi); 4077 const int rw = bio_data_dir(bi);
3973 int remaining; 4078 int remaining;
3974 int plugged;
3975 4079
3976 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 4080 if (unlikely(bi->bi_rw & REQ_FLUSH)) {
3977 md_flush_request(mddev, bi); 4081 md_flush_request(mddev, bi);
@@ -3990,7 +4094,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
3990 bi->bi_next = NULL; 4094 bi->bi_next = NULL;
3991 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4095 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
3992 4096
3993 plugged = mddev_check_plugged(mddev);
3994 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 4097 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
3995 DEFINE_WAIT(w); 4098 DEFINE_WAIT(w);
3996 int previous; 4099 int previous;
@@ -4089,24 +4192,19 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4089 finish_wait(&conf->wait_for_overlap, &w); 4192 finish_wait(&conf->wait_for_overlap, &w);
4090 set_bit(STRIPE_HANDLE, &sh->state); 4193 set_bit(STRIPE_HANDLE, &sh->state);
4091 clear_bit(STRIPE_DELAYED, &sh->state); 4194 clear_bit(STRIPE_DELAYED, &sh->state);
4092 if ((bi->bi_rw & REQ_SYNC) && 4195 if ((bi->bi_rw & REQ_NOIDLE) &&
4093 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4196 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4094 atomic_inc(&conf->preread_active_stripes); 4197 atomic_inc(&conf->preread_active_stripes);
4095 release_stripe(sh); 4198 release_stripe_plug(mddev, sh);
4096 } else { 4199 } else {
4097 /* cannot get stripe for read-ahead, just give up */ 4200 /* cannot get stripe for read-ahead, just give up */
4098 clear_bit(BIO_UPTODATE, &bi->bi_flags); 4201 clear_bit(BIO_UPTODATE, &bi->bi_flags);
4099 finish_wait(&conf->wait_for_overlap, &w); 4202 finish_wait(&conf->wait_for_overlap, &w);
4100 break; 4203 break;
4101 } 4204 }
4102
4103 } 4205 }
4104 if (!plugged)
4105 md_wakeup_thread(mddev->thread);
4106 4206
4107 spin_lock_irq(&conf->device_lock); 4207 remaining = raid5_dec_bi_active_stripes(bi);
4108 remaining = raid5_dec_bi_phys_segments(bi);
4109 spin_unlock_irq(&conf->device_lock);
4110 if (remaining == 0) { 4208 if (remaining == 0) {
4111 4209
4112 if ( rw == WRITE ) 4210 if ( rw == WRITE )
@@ -4462,7 +4560,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4462 sector += STRIPE_SECTORS, 4560 sector += STRIPE_SECTORS,
4463 scnt++) { 4561 scnt++) {
4464 4562
4465 if (scnt < raid5_bi_hw_segments(raid_bio)) 4563 if (scnt < raid5_bi_processed_stripes(raid_bio))
4466 /* already done this stripe */ 4564 /* already done this stripe */
4467 continue; 4565 continue;
4468 4566
@@ -4470,25 +4568,24 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4470 4568
4471 if (!sh) { 4569 if (!sh) {
4472 /* failed to get a stripe - must wait */ 4570 /* failed to get a stripe - must wait */
4473 raid5_set_bi_hw_segments(raid_bio, scnt); 4571 raid5_set_bi_processed_stripes(raid_bio, scnt);
4474 conf->retry_read_aligned = raid_bio; 4572 conf->retry_read_aligned = raid_bio;
4475 return handled; 4573 return handled;
4476 } 4574 }
4477 4575
4478 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4576 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4479 release_stripe(sh); 4577 release_stripe(sh);
4480 raid5_set_bi_hw_segments(raid_bio, scnt); 4578 raid5_set_bi_processed_stripes(raid_bio, scnt);
4481 conf->retry_read_aligned = raid_bio; 4579 conf->retry_read_aligned = raid_bio;
4482 return handled; 4580 return handled;
4483 } 4581 }
4484 4582
4583 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
4485 handle_stripe(sh); 4584 handle_stripe(sh);
4486 release_stripe(sh); 4585 release_stripe(sh);
4487 handled++; 4586 handled++;
4488 } 4587 }
4489 spin_lock_irq(&conf->device_lock); 4588 remaining = raid5_dec_bi_active_stripes(raid_bio);
4490 remaining = raid5_dec_bi_phys_segments(raid_bio);
4491 spin_unlock_irq(&conf->device_lock);
4492 if (remaining == 0) 4589 if (remaining == 0)
4493 bio_endio(raid_bio, 0); 4590 bio_endio(raid_bio, 0);
4494 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4591 if (atomic_dec_and_test(&conf->active_aligned_reads))
@@ -4496,6 +4593,30 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4496 return handled; 4593 return handled;
4497} 4594}
4498 4595
4596#define MAX_STRIPE_BATCH 8
4597static int handle_active_stripes(struct r5conf *conf)
4598{
4599 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
4600 int i, batch_size = 0;
4601
4602 while (batch_size < MAX_STRIPE_BATCH &&
4603 (sh = __get_priority_stripe(conf)) != NULL)
4604 batch[batch_size++] = sh;
4605
4606 if (batch_size == 0)
4607 return batch_size;
4608 spin_unlock_irq(&conf->device_lock);
4609
4610 for (i = 0; i < batch_size; i++)
4611 handle_stripe(batch[i]);
4612
4613 cond_resched();
4614
4615 spin_lock_irq(&conf->device_lock);
4616 for (i = 0; i < batch_size; i++)
4617 __release_stripe(conf, batch[i]);
4618 return batch_size;
4619}
4499 4620
4500/* 4621/*
4501 * This is our raid5 kernel thread. 4622 * This is our raid5 kernel thread.
@@ -4506,7 +4627,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4506 */ 4627 */
4507static void raid5d(struct mddev *mddev) 4628static void raid5d(struct mddev *mddev)
4508{ 4629{
4509 struct stripe_head *sh;
4510 struct r5conf *conf = mddev->private; 4630 struct r5conf *conf = mddev->private;
4511 int handled; 4631 int handled;
4512 struct blk_plug plug; 4632 struct blk_plug plug;
@@ -4520,8 +4640,9 @@ static void raid5d(struct mddev *mddev)
4520 spin_lock_irq(&conf->device_lock); 4640 spin_lock_irq(&conf->device_lock);
4521 while (1) { 4641 while (1) {
4522 struct bio *bio; 4642 struct bio *bio;
4643 int batch_size;
4523 4644
4524 if (atomic_read(&mddev->plug_cnt) == 0 && 4645 if (
4525 !list_empty(&conf->bitmap_list)) { 4646 !list_empty(&conf->bitmap_list)) {
4526 /* Now is a good time to flush some bitmap updates */ 4647 /* Now is a good time to flush some bitmap updates */
4527 conf->seq_flush++; 4648 conf->seq_flush++;
@@ -4531,8 +4652,7 @@ static void raid5d(struct mddev *mddev)
4531 conf->seq_write = conf->seq_flush; 4652 conf->seq_write = conf->seq_flush;
4532 activate_bit_delay(conf); 4653 activate_bit_delay(conf);
4533 } 4654 }
4534 if (atomic_read(&mddev->plug_cnt) == 0) 4655 raid5_activate_delayed(conf);
4535 raid5_activate_delayed(conf);
4536 4656
4537 while ((bio = remove_bio_from_retry(conf))) { 4657 while ((bio = remove_bio_from_retry(conf))) {
4538 int ok; 4658 int ok;
@@ -4544,21 +4664,16 @@ static void raid5d(struct mddev *mddev)
4544 handled++; 4664 handled++;
4545 } 4665 }
4546 4666
4547 sh = __get_priority_stripe(conf); 4667 batch_size = handle_active_stripes(conf);
4548 4668 if (!batch_size)
4549 if (!sh)
4550 break; 4669 break;
4551 spin_unlock_irq(&conf->device_lock); 4670 handled += batch_size;
4552
4553 handled++;
4554 handle_stripe(sh);
4555 release_stripe(sh);
4556 cond_resched();
4557 4671
4558 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) 4672 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) {
4673 spin_unlock_irq(&conf->device_lock);
4559 md_check_recovery(mddev); 4674 md_check_recovery(mddev);
4560 4675 spin_lock_irq(&conf->device_lock);
4561 spin_lock_irq(&conf->device_lock); 4676 }
4562 } 4677 }
4563 pr_debug("%d stripes handled\n", handled); 4678 pr_debug("%d stripes handled\n", handled);
4564 4679
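Because handle_active_stripes() is entered and left with conf->device_lock held, raid5d now has to drop that spinlock explicitly around md_check_recovery(), which can sleep (for instance while updating the superblock), and re-take it afterwards, as the hunk above shows. The pattern in isolation:

	spin_lock_irq(&conf->device_lock);
	/* ... stripe and bitmap work under the lock ... */
	if (mddev->flags & ~(1 << MD_CHANGE_PENDING)) {
		spin_unlock_irq(&conf->device_lock);
		md_check_recovery(mddev);	/* may sleep */
		spin_lock_irq(&conf->device_lock);
	}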
@@ -4823,6 +4938,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
4823 int raid_disk, memory, max_disks; 4938 int raid_disk, memory, max_disks;
4824 struct md_rdev *rdev; 4939 struct md_rdev *rdev;
4825 struct disk_info *disk; 4940 struct disk_info *disk;
4941 char pers_name[6];
4826 4942
4827 if (mddev->new_level != 5 4943 if (mddev->new_level != 5
4828 && mddev->new_level != 4 4944 && mddev->new_level != 4
@@ -4946,7 +5062,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
4946 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 5062 printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
4947 mdname(mddev), memory); 5063 mdname(mddev), memory);
4948 5064
4949 conf->thread = md_register_thread(raid5d, mddev, NULL); 5065 sprintf(pers_name, "raid%d", mddev->new_level);
5066 conf->thread = md_register_thread(raid5d, mddev, pers_name);
4950 if (!conf->thread) { 5067 if (!conf->thread) {
4951 printk(KERN_ERR 5068 printk(KERN_ERR
4952 "md/raid:%s: couldn't allocate thread.\n", 5069 "md/raid:%s: couldn't allocate thread.\n",
@@ -5465,10 +5582,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5465 if (rdev->saved_raid_disk >= 0 && 5582 if (rdev->saved_raid_disk >= 0 &&
5466 rdev->saved_raid_disk >= first && 5583 rdev->saved_raid_disk >= first &&
5467 conf->disks[rdev->saved_raid_disk].rdev == NULL) 5584 conf->disks[rdev->saved_raid_disk].rdev == NULL)
5468 disk = rdev->saved_raid_disk; 5585 first = rdev->saved_raid_disk;
5469 else 5586
5470 disk = first; 5587 for (disk = first; disk <= last; disk++) {
5471 for ( ; disk <= last ; disk++) {
5472 p = conf->disks + disk; 5588 p = conf->disks + disk;
5473 if (p->rdev == NULL) { 5589 if (p->rdev == NULL) {
5474 clear_bit(In_sync, &rdev->flags); 5590 clear_bit(In_sync, &rdev->flags);
@@ -5477,8 +5593,11 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5477 if (rdev->saved_raid_disk != disk) 5593 if (rdev->saved_raid_disk != disk)
5478 conf->fullsync = 1; 5594 conf->fullsync = 1;
5479 rcu_assign_pointer(p->rdev, rdev); 5595 rcu_assign_pointer(p->rdev, rdev);
5480 break; 5596 goto out;
5481 } 5597 }
5598 }
5599 for (disk = first; disk <= last; disk++) {
5600 p = conf->disks + disk;
5482 if (test_bit(WantReplacement, &p->rdev->flags) && 5601 if (test_bit(WantReplacement, &p->rdev->flags) &&
5483 p->replacement == NULL) { 5602 p->replacement == NULL) {
5484 clear_bit(In_sync, &rdev->flags); 5603 clear_bit(In_sync, &rdev->flags);
@@ -5490,6 +5609,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5490 break; 5609 break;
5491 } 5610 }
5492 } 5611 }
5612out:
5493 print_raid5_conf(conf); 5613 print_raid5_conf(conf);
5494 return err; 5614 return err;
5495} 5615}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2164021f3b5f..a9fc24901eda 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -210,6 +210,7 @@ struct stripe_head {
210 int disks; /* disks in stripe */ 210 int disks; /* disks in stripe */
211 enum check_states check_state; 211 enum check_states check_state;
212 enum reconstruct_states reconstruct_state; 212 enum reconstruct_states reconstruct_state;
213 spinlock_t stripe_lock;
213 /** 214 /**
214 * struct stripe_operations 215 * struct stripe_operations
215 * @target - STRIPE_OP_COMPUTE_BLK target 216 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -273,6 +274,7 @@ enum r5dev_flags {
273 R5_Wantwrite, 274 R5_Wantwrite,
274 R5_Overlap, /* There is a pending overlapping request 275 R5_Overlap, /* There is a pending overlapping request
275 * on this block */ 276 * on this block */
277 R5_ReadNoMerge, /* prevent bio from merging in block-layer */
276 R5_ReadError, /* seen a read error here recently */ 278 R5_ReadError, /* seen a read error here recently */
277 R5_ReWrite, /* have tried to over-write the readerror */ 279 R5_ReWrite, /* have tried to over-write the readerror */
278 280
@@ -319,6 +321,7 @@ enum {
319 STRIPE_BIOFILL_RUN, 321 STRIPE_BIOFILL_RUN,
320 STRIPE_COMPUTE_RUN, 322 STRIPE_COMPUTE_RUN,
321 STRIPE_OPS_REQ_PENDING, 323 STRIPE_OPS_REQ_PENDING,
324 STRIPE_ON_UNPLUG_LIST,
322}; 325};
323 326
324/* 327/*
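The new stripe_lock field is only declared in this hunk; the code that takes it belongs to other patches in this series. It appears intended to serialize updates to a single stripe's bio lists (sh->dev[i].toread/towrite) so that this bookkeeping no longer has to hold the array-wide conf->device_lock. A hedged sketch of that kind of use; the helper name and the exact fields it touches are assumptions, not taken from this diff:

	/* hypothetical helper, illustrative only */
	static void example_queue_write_to_stripe(struct stripe_head *sh,
						  int dd_idx, struct bio *bi)
	{
		spin_lock_irq(&sh->stripe_lock);
		/* a full implementation would keep this list ordered by sector */
		bi->bi_next = sh->dev[dd_idx].towrite;
		sh->dev[dd_idx].towrite = bi;
		spin_unlock_irq(&sh->stripe_lock);
	}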