Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                                      9
-rw-r--r--  drivers/md/dm-crypt.c                                 219
-rw-r--r--  drivers/md/dm-delay.c                                   2
-rw-r--r--  drivers/md/dm-exception-store.c                        13
-rw-r--r--  drivers/md/dm-flakey.c                                  2
-rw-r--r--  drivers/md/dm-ioctl.c                                   5
-rw-r--r--  drivers/md/dm-linear.c                                  2
-rw-r--r--  drivers/md/dm-log.c                                    13
-rw-r--r--  drivers/md/dm-mpath.c                                  49
-rw-r--r--  drivers/md/dm-raid.c                                  147
-rw-r--r--  drivers/md/dm-raid1.c                                  10
-rw-r--r--  drivers/md/dm-snap.c                                   34
-rw-r--r--  drivers/md/dm-stripe.c                                 87
-rw-r--r--  drivers/md/dm-table.c                                   3
-rw-r--r--  drivers/md/dm-thin-metadata.c                         769
-rw-r--r--  drivers/md/dm-thin-metadata.h                          25
-rw-r--r--  drivers/md/dm-thin.c                                  542
-rw-r--r--  drivers/md/dm-verity.c                                  2
-rw-r--r--  drivers/md/dm.c                                        40
-rw-r--r--  drivers/md/dm.h                                         5
-rw-r--r--  drivers/md/md.c                                         8
-rw-r--r--  drivers/md/persistent-data/Makefile                     1
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c         105
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.h          21
-rw-r--r--  drivers/md/persistent-data/dm-space-map-checker.c     446
-rw-r--r--  drivers/md/persistent-data/dm-space-map-checker.h      26
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c       12
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.h        1
-rw-r--r--  drivers/md/persistent-data/dm-space-map-disk.c         34
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.c    91
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.h    11
-rw-r--r--  drivers/md/raid1.c                                    164
-rw-r--r--  drivers/md/raid1.h                                     30
-rw-r--r--  drivers/md/raid10.c                                    92
-rw-r--r--  drivers/md/raid10.h                                    23
-rw-r--r--  drivers/md/raid5.c                                    205
-rw-r--r--  drivers/md/raid5.h                                      2
37 files changed, 1721 insertions, 1529 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 1de441a6c55f..d949b781f6f8 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -260,15 +260,6 @@ config DM_DEBUG_BLOCK_STACK_TRACING
260 260
261 If unsure, say N. 261 If unsure, say N.
262 262
263config DM_DEBUG_SPACE_MAPS
264 boolean "Extra validation for thin provisioning space maps"
265 depends on DM_THIN_PROVISIONING
266 ---help---
267 Enable this for messages that may help debug problems with the
268 space maps used by thin provisioning.
269
270 If unsure, say N.
271
272config DM_MIRROR 263config DM_MIRROR
273 tristate "Mirror target" 264 tristate "Mirror target"
274 depends on BLK_DEV_DM 265 depends on BLK_DEV_DM
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 3f06df59fd82..664743d6a6cd 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -42,21 +42,21 @@ struct convert_context {
42 unsigned int offset_out; 42 unsigned int offset_out;
43 unsigned int idx_in; 43 unsigned int idx_in;
44 unsigned int idx_out; 44 unsigned int idx_out;
45 sector_t sector; 45 sector_t cc_sector;
46 atomic_t pending; 46 atomic_t cc_pending;
47}; 47};
48 48
49/* 49/*
50 * per bio private data 50 * per bio private data
51 */ 51 */
52struct dm_crypt_io { 52struct dm_crypt_io {
53 struct dm_target *target; 53 struct crypt_config *cc;
54 struct bio *base_bio; 54 struct bio *base_bio;
55 struct work_struct work; 55 struct work_struct work;
56 56
57 struct convert_context ctx; 57 struct convert_context ctx;
58 58
59 atomic_t pending; 59 atomic_t io_pending;
60 int error; 60 int error;
61 sector_t sector; 61 sector_t sector;
62 struct dm_crypt_io *base_io; 62 struct dm_crypt_io *base_io;
@@ -109,9 +109,6 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
109 */ 109 */
110struct crypt_cpu { 110struct crypt_cpu {
111 struct ablkcipher_request *req; 111 struct ablkcipher_request *req;
112 /* ESSIV: struct crypto_cipher *essiv_tfm */
113 void *iv_private;
114 struct crypto_ablkcipher *tfms[0];
115}; 112};
116 113
117/* 114/*
@@ -151,6 +148,10 @@ struct crypt_config {
151 * per_cpu_ptr() only. 148 * per_cpu_ptr() only.
152 */ 149 */
153 struct crypt_cpu __percpu *cpu; 150 struct crypt_cpu __percpu *cpu;
151
152 /* ESSIV: struct crypto_cipher *essiv_tfm */
153 void *iv_private;
154 struct crypto_ablkcipher **tfms;
154 unsigned tfms_count; 155 unsigned tfms_count;
155 156
156 /* 157 /*
@@ -193,7 +194,7 @@ static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
193 */ 194 */
194static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) 195static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
195{ 196{
196 return __this_cpu_ptr(cc->cpu)->tfms[0]; 197 return cc->tfms[0];
197} 198}
198 199
199/* 200/*
@@ -258,7 +259,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
258 struct hash_desc desc; 259 struct hash_desc desc;
259 struct scatterlist sg; 260 struct scatterlist sg;
260 struct crypto_cipher *essiv_tfm; 261 struct crypto_cipher *essiv_tfm;
261 int err, cpu; 262 int err;
262 263
263 sg_init_one(&sg, cc->key, cc->key_size); 264 sg_init_one(&sg, cc->key, cc->key_size);
264 desc.tfm = essiv->hash_tfm; 265 desc.tfm = essiv->hash_tfm;
@@ -268,14 +269,12 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
268 if (err) 269 if (err)
269 return err; 270 return err;
270 271
271 for_each_possible_cpu(cpu) { 272 essiv_tfm = cc->iv_private;
272 essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private,
273 273
274 err = crypto_cipher_setkey(essiv_tfm, essiv->salt, 274 err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
275 crypto_hash_digestsize(essiv->hash_tfm)); 275 crypto_hash_digestsize(essiv->hash_tfm));
276 if (err) 276 if (err)
277 return err; 277 return err;
278 }
279 278
280 return 0; 279 return 0;
281} 280}
@@ -286,16 +285,14 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
286 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 285 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
287 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); 286 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
288 struct crypto_cipher *essiv_tfm; 287 struct crypto_cipher *essiv_tfm;
289 int cpu, r, err = 0; 288 int r, err = 0;
290 289
291 memset(essiv->salt, 0, salt_size); 290 memset(essiv->salt, 0, salt_size);
292 291
293 for_each_possible_cpu(cpu) { 292 essiv_tfm = cc->iv_private;
294 essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private; 293 r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
295 r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); 294 if (r)
296 if (r) 295 err = r;
297 err = r;
298 }
299 296
300 return err; 297 return err;
301} 298}
@@ -335,8 +332,6 @@ static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
335 332
336static void crypt_iv_essiv_dtr(struct crypt_config *cc) 333static void crypt_iv_essiv_dtr(struct crypt_config *cc)
337{ 334{
338 int cpu;
339 struct crypt_cpu *cpu_cc;
340 struct crypto_cipher *essiv_tfm; 335 struct crypto_cipher *essiv_tfm;
341 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 336 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
342 337
@@ -346,15 +341,12 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc)
346 kzfree(essiv->salt); 341 kzfree(essiv->salt);
347 essiv->salt = NULL; 342 essiv->salt = NULL;
348 343
349 for_each_possible_cpu(cpu) { 344 essiv_tfm = cc->iv_private;
350 cpu_cc = per_cpu_ptr(cc->cpu, cpu);
351 essiv_tfm = cpu_cc->iv_private;
352 345
353 if (essiv_tfm) 346 if (essiv_tfm)
354 crypto_free_cipher(essiv_tfm); 347 crypto_free_cipher(essiv_tfm);
355 348
356 cpu_cc->iv_private = NULL; 349 cc->iv_private = NULL;
357 }
358} 350}
359 351
360static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, 352static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -363,7 +355,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
363 struct crypto_cipher *essiv_tfm = NULL; 355 struct crypto_cipher *essiv_tfm = NULL;
364 struct crypto_hash *hash_tfm = NULL; 356 struct crypto_hash *hash_tfm = NULL;
365 u8 *salt = NULL; 357 u8 *salt = NULL;
366 int err, cpu; 358 int err;
367 359
368 if (!opts) { 360 if (!opts) {
369 ti->error = "Digest algorithm missing for ESSIV mode"; 361 ti->error = "Digest algorithm missing for ESSIV mode";
@@ -388,15 +380,13 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
388 cc->iv_gen_private.essiv.salt = salt; 380 cc->iv_gen_private.essiv.salt = salt;
389 cc->iv_gen_private.essiv.hash_tfm = hash_tfm; 381 cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
390 382
391 for_each_possible_cpu(cpu) { 383 essiv_tfm = setup_essiv_cpu(cc, ti, salt,
392 essiv_tfm = setup_essiv_cpu(cc, ti, salt, 384 crypto_hash_digestsize(hash_tfm));
393 crypto_hash_digestsize(hash_tfm)); 385 if (IS_ERR(essiv_tfm)) {
394 if (IS_ERR(essiv_tfm)) { 386 crypt_iv_essiv_dtr(cc);
395 crypt_iv_essiv_dtr(cc); 387 return PTR_ERR(essiv_tfm);
396 return PTR_ERR(essiv_tfm);
397 }
398 per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm;
399 } 388 }
389 cc->iv_private = essiv_tfm;
400 390
401 return 0; 391 return 0;
402 392
@@ -410,7 +400,7 @@ bad:
410static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, 400static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
411 struct dm_crypt_request *dmreq) 401 struct dm_crypt_request *dmreq)
412{ 402{
413 struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; 403 struct crypto_cipher *essiv_tfm = cc->iv_private;
414 404
415 memset(iv, 0, cc->iv_size); 405 memset(iv, 0, cc->iv_size);
416 *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); 406 *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
@@ -664,7 +654,7 @@ static void crypt_convert_init(struct crypt_config *cc,
664 ctx->offset_out = 0; 654 ctx->offset_out = 0;
665 ctx->idx_in = bio_in ? bio_in->bi_idx : 0; 655 ctx->idx_in = bio_in ? bio_in->bi_idx : 0;
666 ctx->idx_out = bio_out ? bio_out->bi_idx : 0; 656 ctx->idx_out = bio_out ? bio_out->bi_idx : 0;
667 ctx->sector = sector + cc->iv_offset; 657 ctx->cc_sector = sector + cc->iv_offset;
668 init_completion(&ctx->restart); 658 init_completion(&ctx->restart);
669} 659}
670 660
@@ -695,12 +685,12 @@ static int crypt_convert_block(struct crypt_config *cc,
695 struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); 685 struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
696 struct dm_crypt_request *dmreq; 686 struct dm_crypt_request *dmreq;
697 u8 *iv; 687 u8 *iv;
698 int r = 0; 688 int r;
699 689
700 dmreq = dmreq_of_req(cc, req); 690 dmreq = dmreq_of_req(cc, req);
701 iv = iv_of_dmreq(cc, dmreq); 691 iv = iv_of_dmreq(cc, dmreq);
702 692
703 dmreq->iv_sector = ctx->sector; 693 dmreq->iv_sector = ctx->cc_sector;
704 dmreq->ctx = ctx; 694 dmreq->ctx = ctx;
705 sg_init_table(&dmreq->sg_in, 1); 695 sg_init_table(&dmreq->sg_in, 1);
706 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, 696 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -749,12 +739,12 @@ static void crypt_alloc_req(struct crypt_config *cc,
749 struct convert_context *ctx) 739 struct convert_context *ctx)
750{ 740{
751 struct crypt_cpu *this_cc = this_crypt_config(cc); 741 struct crypt_cpu *this_cc = this_crypt_config(cc);
752 unsigned key_index = ctx->sector & (cc->tfms_count - 1); 742 unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
753 743
754 if (!this_cc->req) 744 if (!this_cc->req)
755 this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); 745 this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
756 746
757 ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]); 747 ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]);
758 ablkcipher_request_set_callback(this_cc->req, 748 ablkcipher_request_set_callback(this_cc->req,
759 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 749 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
760 kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); 750 kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
@@ -769,14 +759,14 @@ static int crypt_convert(struct crypt_config *cc,
769 struct crypt_cpu *this_cc = this_crypt_config(cc); 759 struct crypt_cpu *this_cc = this_crypt_config(cc);
770 int r; 760 int r;
771 761
772 atomic_set(&ctx->pending, 1); 762 atomic_set(&ctx->cc_pending, 1);
773 763
774 while(ctx->idx_in < ctx->bio_in->bi_vcnt && 764 while(ctx->idx_in < ctx->bio_in->bi_vcnt &&
775 ctx->idx_out < ctx->bio_out->bi_vcnt) { 765 ctx->idx_out < ctx->bio_out->bi_vcnt) {
776 766
777 crypt_alloc_req(cc, ctx); 767 crypt_alloc_req(cc, ctx);
778 768
779 atomic_inc(&ctx->pending); 769 atomic_inc(&ctx->cc_pending);
780 770
781 r = crypt_convert_block(cc, ctx, this_cc->req); 771 r = crypt_convert_block(cc, ctx, this_cc->req);
782 772
@@ -788,19 +778,19 @@ static int crypt_convert(struct crypt_config *cc,
788 /* fall through*/ 778 /* fall through*/
789 case -EINPROGRESS: 779 case -EINPROGRESS:
790 this_cc->req = NULL; 780 this_cc->req = NULL;
791 ctx->sector++; 781 ctx->cc_sector++;
792 continue; 782 continue;
793 783
794 /* sync */ 784 /* sync */
795 case 0: 785 case 0:
796 atomic_dec(&ctx->pending); 786 atomic_dec(&ctx->cc_pending);
797 ctx->sector++; 787 ctx->cc_sector++;
798 cond_resched(); 788 cond_resched();
799 continue; 789 continue;
800 790
801 /* error */ 791 /* error */
802 default: 792 default:
803 atomic_dec(&ctx->pending); 793 atomic_dec(&ctx->cc_pending);
804 return r; 794 return r;
805 } 795 }
806 } 796 }
@@ -811,7 +801,7 @@ static int crypt_convert(struct crypt_config *cc,
811static void dm_crypt_bio_destructor(struct bio *bio) 801static void dm_crypt_bio_destructor(struct bio *bio)
812{ 802{
813 struct dm_crypt_io *io = bio->bi_private; 803 struct dm_crypt_io *io = bio->bi_private;
814 struct crypt_config *cc = io->target->private; 804 struct crypt_config *cc = io->cc;
815 805
816 bio_free(bio, cc->bs); 806 bio_free(bio, cc->bs);
817} 807}
@@ -825,7 +815,7 @@ static void dm_crypt_bio_destructor(struct bio *bio)
825static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, 815static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
826 unsigned *out_of_pages) 816 unsigned *out_of_pages)
827{ 817{
828 struct crypt_config *cc = io->target->private; 818 struct crypt_config *cc = io->cc;
829 struct bio *clone; 819 struct bio *clone;
830 unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 820 unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
831 gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; 821 gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
@@ -884,26 +874,25 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
884 } 874 }
885} 875}
886 876
887static struct dm_crypt_io *crypt_io_alloc(struct dm_target *ti, 877static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
888 struct bio *bio, sector_t sector) 878 struct bio *bio, sector_t sector)
889{ 879{
890 struct crypt_config *cc = ti->private;
891 struct dm_crypt_io *io; 880 struct dm_crypt_io *io;
892 881
893 io = mempool_alloc(cc->io_pool, GFP_NOIO); 882 io = mempool_alloc(cc->io_pool, GFP_NOIO);
894 io->target = ti; 883 io->cc = cc;
895 io->base_bio = bio; 884 io->base_bio = bio;
896 io->sector = sector; 885 io->sector = sector;
897 io->error = 0; 886 io->error = 0;
898 io->base_io = NULL; 887 io->base_io = NULL;
899 atomic_set(&io->pending, 0); 888 atomic_set(&io->io_pending, 0);
900 889
901 return io; 890 return io;
902} 891}
903 892
904static void crypt_inc_pending(struct dm_crypt_io *io) 893static void crypt_inc_pending(struct dm_crypt_io *io)
905{ 894{
906 atomic_inc(&io->pending); 895 atomic_inc(&io->io_pending);
907} 896}
908 897
909/* 898/*
@@ -913,12 +902,12 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
913 */ 902 */
914static void crypt_dec_pending(struct dm_crypt_io *io) 903static void crypt_dec_pending(struct dm_crypt_io *io)
915{ 904{
916 struct crypt_config *cc = io->target->private; 905 struct crypt_config *cc = io->cc;
917 struct bio *base_bio = io->base_bio; 906 struct bio *base_bio = io->base_bio;
918 struct dm_crypt_io *base_io = io->base_io; 907 struct dm_crypt_io *base_io = io->base_io;
919 int error = io->error; 908 int error = io->error;
920 909
921 if (!atomic_dec_and_test(&io->pending)) 910 if (!atomic_dec_and_test(&io->io_pending))
922 return; 911 return;
923 912
924 mempool_free(io, cc->io_pool); 913 mempool_free(io, cc->io_pool);
@@ -952,7 +941,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
952static void crypt_endio(struct bio *clone, int error) 941static void crypt_endio(struct bio *clone, int error)
953{ 942{
954 struct dm_crypt_io *io = clone->bi_private; 943 struct dm_crypt_io *io = clone->bi_private;
955 struct crypt_config *cc = io->target->private; 944 struct crypt_config *cc = io->cc;
956 unsigned rw = bio_data_dir(clone); 945 unsigned rw = bio_data_dir(clone);
957 946
958 if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) 947 if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error))
@@ -979,7 +968,7 @@ static void crypt_endio(struct bio *clone, int error)
979 968
980static void clone_init(struct dm_crypt_io *io, struct bio *clone) 969static void clone_init(struct dm_crypt_io *io, struct bio *clone)
981{ 970{
982 struct crypt_config *cc = io->target->private; 971 struct crypt_config *cc = io->cc;
983 972
984 clone->bi_private = io; 973 clone->bi_private = io;
985 clone->bi_end_io = crypt_endio; 974 clone->bi_end_io = crypt_endio;
@@ -990,7 +979,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
990 979
991static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) 980static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
992{ 981{
993 struct crypt_config *cc = io->target->private; 982 struct crypt_config *cc = io->cc;
994 struct bio *base_bio = io->base_bio; 983 struct bio *base_bio = io->base_bio;
995 struct bio *clone; 984 struct bio *clone;
996 985
@@ -1038,7 +1027,7 @@ static void kcryptd_io(struct work_struct *work)
1038 1027
1039static void kcryptd_queue_io(struct dm_crypt_io *io) 1028static void kcryptd_queue_io(struct dm_crypt_io *io)
1040{ 1029{
1041 struct crypt_config *cc = io->target->private; 1030 struct crypt_config *cc = io->cc;
1042 1031
1043 INIT_WORK(&io->work, kcryptd_io); 1032 INIT_WORK(&io->work, kcryptd_io);
1044 queue_work(cc->io_queue, &io->work); 1033 queue_work(cc->io_queue, &io->work);
@@ -1047,7 +1036,7 @@ static void kcryptd_queue_io(struct dm_crypt_io *io)
1047static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) 1036static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
1048{ 1037{
1049 struct bio *clone = io->ctx.bio_out; 1038 struct bio *clone = io->ctx.bio_out;
1050 struct crypt_config *cc = io->target->private; 1039 struct crypt_config *cc = io->cc;
1051 1040
1052 if (unlikely(io->error < 0)) { 1041 if (unlikely(io->error < 0)) {
1053 crypt_free_buffer_pages(cc, clone); 1042 crypt_free_buffer_pages(cc, clone);
@@ -1069,7 +1058,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
1069 1058
1070static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) 1059static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1071{ 1060{
1072 struct crypt_config *cc = io->target->private; 1061 struct crypt_config *cc = io->cc;
1073 struct bio *clone; 1062 struct bio *clone;
1074 struct dm_crypt_io *new_io; 1063 struct dm_crypt_io *new_io;
1075 int crypt_finished; 1064 int crypt_finished;
@@ -1107,7 +1096,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1107 if (r < 0) 1096 if (r < 0)
1108 io->error = -EIO; 1097 io->error = -EIO;
1109 1098
1110 crypt_finished = atomic_dec_and_test(&io->ctx.pending); 1099 crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
1111 1100
1112 /* Encryption was already finished, submit io now */ 1101 /* Encryption was already finished, submit io now */
1113 if (crypt_finished) { 1102 if (crypt_finished) {
@@ -1135,7 +1124,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1135 * between fragments, so switch to a new dm_crypt_io structure. 1124 * between fragments, so switch to a new dm_crypt_io structure.
1136 */ 1125 */
1137 if (unlikely(!crypt_finished && remaining)) { 1126 if (unlikely(!crypt_finished && remaining)) {
1138 new_io = crypt_io_alloc(io->target, io->base_bio, 1127 new_io = crypt_io_alloc(io->cc, io->base_bio,
1139 sector); 1128 sector);
1140 crypt_inc_pending(new_io); 1129 crypt_inc_pending(new_io);
1141 crypt_convert_init(cc, &new_io->ctx, NULL, 1130 crypt_convert_init(cc, &new_io->ctx, NULL,
@@ -1169,7 +1158,7 @@ static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
1169 1158
1170static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) 1159static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
1171{ 1160{
1172 struct crypt_config *cc = io->target->private; 1161 struct crypt_config *cc = io->cc;
1173 int r = 0; 1162 int r = 0;
1174 1163
1175 crypt_inc_pending(io); 1164 crypt_inc_pending(io);
@@ -1181,7 +1170,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
1181 if (r < 0) 1170 if (r < 0)
1182 io->error = -EIO; 1171 io->error = -EIO;
1183 1172
1184 if (atomic_dec_and_test(&io->ctx.pending)) 1173 if (atomic_dec_and_test(&io->ctx.cc_pending))
1185 kcryptd_crypt_read_done(io); 1174 kcryptd_crypt_read_done(io);
1186 1175
1187 crypt_dec_pending(io); 1176 crypt_dec_pending(io);
@@ -1193,7 +1182,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
1193 struct dm_crypt_request *dmreq = async_req->data; 1182 struct dm_crypt_request *dmreq = async_req->data;
1194 struct convert_context *ctx = dmreq->ctx; 1183 struct convert_context *ctx = dmreq->ctx;
1195 struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); 1184 struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
1196 struct crypt_config *cc = io->target->private; 1185 struct crypt_config *cc = io->cc;
1197 1186
1198 if (error == -EINPROGRESS) { 1187 if (error == -EINPROGRESS) {
1199 complete(&ctx->restart); 1188 complete(&ctx->restart);
@@ -1208,7 +1197,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
1208 1197
1209 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); 1198 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
1210 1199
1211 if (!atomic_dec_and_test(&ctx->pending)) 1200 if (!atomic_dec_and_test(&ctx->cc_pending))
1212 return; 1201 return;
1213 1202
1214 if (bio_data_dir(io->base_bio) == READ) 1203 if (bio_data_dir(io->base_bio) == READ)
@@ -1229,7 +1218,7 @@ static void kcryptd_crypt(struct work_struct *work)
1229 1218
1230static void kcryptd_queue_crypt(struct dm_crypt_io *io) 1219static void kcryptd_queue_crypt(struct dm_crypt_io *io)
1231{ 1220{
1232 struct crypt_config *cc = io->target->private; 1221 struct crypt_config *cc = io->cc;
1233 1222
1234 INIT_WORK(&io->work, kcryptd_crypt); 1223 INIT_WORK(&io->work, kcryptd_crypt);
1235 queue_work(cc->crypt_queue, &io->work); 1224 queue_work(cc->crypt_queue, &io->work);
@@ -1241,7 +1230,6 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io)
1241static int crypt_decode_key(u8 *key, char *hex, unsigned int size) 1230static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
1242{ 1231{
1243 char buffer[3]; 1232 char buffer[3];
1244 char *endp;
1245 unsigned int i; 1233 unsigned int i;
1246 1234
1247 buffer[2] = '\0'; 1235 buffer[2] = '\0';
@@ -1250,9 +1238,7 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
1250 buffer[0] = *hex++; 1238 buffer[0] = *hex++;
1251 buffer[1] = *hex++; 1239 buffer[1] = *hex++;
1252 1240
1253 key[i] = (u8)simple_strtoul(buffer, &endp, 16); 1241 if (kstrtou8(buffer, 16, &key[i]))
1254
1255 if (endp != &buffer[2])
1256 return -EINVAL; 1242 return -EINVAL;
1257 } 1243 }
1258 1244
@@ -1276,29 +1262,38 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
1276 } 1262 }
1277} 1263}
1278 1264
1279static void crypt_free_tfms(struct crypt_config *cc, int cpu) 1265static void crypt_free_tfms(struct crypt_config *cc)
1280{ 1266{
1281 struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1282 unsigned i; 1267 unsigned i;
1283 1268
1269 if (!cc->tfms)
1270 return;
1271
1284 for (i = 0; i < cc->tfms_count; i++) 1272 for (i = 0; i < cc->tfms_count; i++)
1285 if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) { 1273 if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) {
1286 crypto_free_ablkcipher(cpu_cc->tfms[i]); 1274 crypto_free_ablkcipher(cc->tfms[i]);
1287 cpu_cc->tfms[i] = NULL; 1275 cc->tfms[i] = NULL;
1288 } 1276 }
1277
1278 kfree(cc->tfms);
1279 cc->tfms = NULL;
1289} 1280}
1290 1281
1291static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode) 1282static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
1292{ 1283{
1293 struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1294 unsigned i; 1284 unsigned i;
1295 int err; 1285 int err;
1296 1286
1287 cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_ablkcipher *),
1288 GFP_KERNEL);
1289 if (!cc->tfms)
1290 return -ENOMEM;
1291
1297 for (i = 0; i < cc->tfms_count; i++) { 1292 for (i = 0; i < cc->tfms_count; i++) {
1298 cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); 1293 cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
1299 if (IS_ERR(cpu_cc->tfms[i])) { 1294 if (IS_ERR(cc->tfms[i])) {
1300 err = PTR_ERR(cpu_cc->tfms[i]); 1295 err = PTR_ERR(cc->tfms[i]);
1301 crypt_free_tfms(cc, cpu); 1296 crypt_free_tfms(cc);
1302 return err; 1297 return err;
1303 } 1298 }
1304 } 1299 }
@@ -1309,15 +1304,14 @@ static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
1309static int crypt_setkey_allcpus(struct crypt_config *cc) 1304static int crypt_setkey_allcpus(struct crypt_config *cc)
1310{ 1305{
1311 unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); 1306 unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
1312 int cpu, err = 0, i, r; 1307 int err = 0, i, r;
1313 1308
1314 for_each_possible_cpu(cpu) { 1309 for (i = 0; i < cc->tfms_count; i++) {
1315 for (i = 0; i < cc->tfms_count; i++) { 1310 r = crypto_ablkcipher_setkey(cc->tfms[i],
1316 r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i], 1311 cc->key + (i * subkey_size),
1317 cc->key + (i * subkey_size), subkey_size); 1312 subkey_size);
1318 if (r) 1313 if (r)
1319 err = r; 1314 err = r;
1320 }
1321 } 1315 }
1322 1316
1323 return err; 1317 return err;
@@ -1379,9 +1373,10 @@ static void crypt_dtr(struct dm_target *ti)
1379 cpu_cc = per_cpu_ptr(cc->cpu, cpu); 1373 cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1380 if (cpu_cc->req) 1374 if (cpu_cc->req)
1381 mempool_free(cpu_cc->req, cc->req_pool); 1375 mempool_free(cpu_cc->req, cc->req_pool);
1382 crypt_free_tfms(cc, cpu);
1383 } 1376 }
1384 1377
1378 crypt_free_tfms(cc);
1379
1385 if (cc->bs) 1380 if (cc->bs)
1386 bioset_free(cc->bs); 1381 bioset_free(cc->bs);
1387 1382
@@ -1414,7 +1409,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1414 struct crypt_config *cc = ti->private; 1409 struct crypt_config *cc = ti->private;
1415 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; 1410 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
1416 char *cipher_api = NULL; 1411 char *cipher_api = NULL;
1417 int cpu, ret = -EINVAL; 1412 int ret = -EINVAL;
1418 char dummy; 1413 char dummy;
1419 1414
1420 /* Convert to crypto api definition? */ 1415 /* Convert to crypto api definition? */
@@ -1455,8 +1450,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1455 if (tmp) 1450 if (tmp)
1456 DMWARN("Ignoring unexpected additional cipher options"); 1451 DMWARN("Ignoring unexpected additional cipher options");
1457 1452
1458 cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) + 1453 cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)),
1459 cc->tfms_count * sizeof(*(cc->cpu->tfms)),
1460 __alignof__(struct crypt_cpu)); 1454 __alignof__(struct crypt_cpu));
1461 if (!cc->cpu) { 1455 if (!cc->cpu) {
1462 ti->error = "Cannot allocate per cpu state"; 1456 ti->error = "Cannot allocate per cpu state";
@@ -1489,12 +1483,10 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1489 } 1483 }
1490 1484
1491 /* Allocate cipher */ 1485 /* Allocate cipher */
1492 for_each_possible_cpu(cpu) { 1486 ret = crypt_alloc_tfms(cc, cipher_api);
1493 ret = crypt_alloc_tfms(cc, cpu, cipher_api); 1487 if (ret < 0) {
1494 if (ret < 0) { 1488 ti->error = "Error allocating crypto tfm";
1495 ti->error = "Error allocating crypto tfm"; 1489 goto bad;
1496 goto bad;
1497 }
1498 } 1490 }
1499 1491
1500 /* Initialize and set key */ 1492 /* Initialize and set key */
@@ -1702,7 +1694,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1702 } 1694 }
1703 1695
1704 ti->num_flush_requests = 1; 1696 ti->num_flush_requests = 1;
1705 ti->discard_zeroes_data_unsupported = 1; 1697 ti->discard_zeroes_data_unsupported = true;
1706 1698
1707 return 0; 1699 return 0;
1708 1700
@@ -1715,7 +1707,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
1715 union map_info *map_context) 1707 union map_info *map_context)
1716{ 1708{
1717 struct dm_crypt_io *io; 1709 struct dm_crypt_io *io;
1718 struct crypt_config *cc; 1710 struct crypt_config *cc = ti->private;
1719 1711
1720 /* 1712 /*
1721 * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues. 1713 * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues.
@@ -1723,14 +1715,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
1723 * - for REQ_DISCARD caller must use flush if IO ordering matters 1715 * - for REQ_DISCARD caller must use flush if IO ordering matters
1724 */ 1716 */
1725 if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { 1717 if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
1726 cc = ti->private;
1727 bio->bi_bdev = cc->dev->bdev; 1718 bio->bi_bdev = cc->dev->bdev;
1728 if (bio_sectors(bio)) 1719 if (bio_sectors(bio))
1729 bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); 1720 bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector);
1730 return DM_MAPIO_REMAPPED; 1721 return DM_MAPIO_REMAPPED;
1731 } 1722 }
1732 1723
1733 io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); 1724 io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_sector));
1734 1725
1735 if (bio_data_dir(io->base_bio) == READ) { 1726 if (bio_data_dir(io->base_bio) == READ) {
1736 if (kcryptd_io_read(io, GFP_NOWAIT)) 1727 if (kcryptd_io_read(io, GFP_NOWAIT))
@@ -1742,7 +1733,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
1742} 1733}
1743 1734
1744static int crypt_status(struct dm_target *ti, status_type_t type, 1735static int crypt_status(struct dm_target *ti, status_type_t type,
1745 char *result, unsigned int maxlen) 1736 unsigned status_flags, char *result, unsigned maxlen)
1746{ 1737{
1747 struct crypt_config *cc = ti->private; 1738 struct crypt_config *cc = ti->private;
1748 unsigned int sz = 0; 1739 unsigned int sz = 0;
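For reference, the dm-crypt key parsing hunk above swaps the open-coded simple_strtoul()/endp check for kstrtou8(), which rejects malformed hex on its own. Below is a minimal user-space sketch of the same two-hex-characters-per-byte decode; strtoul() stands in for the kernel-only kstrtou8(), and crypt_decode_key_demo() is a made-up wrapper name, not code from the patch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Decode a hex string into 'size' bytes; returns 0 on success, -1 on bad input. */
static int crypt_decode_key_demo(unsigned char *key, const char *hex, unsigned int size)
{
	char buffer[3];
	char *endp;
	unsigned int i;

	if (strlen(hex) != 2 * size)
		return -1;

	buffer[2] = '\0';

	for (i = 0; i < size; i++) {
		buffer[0] = *hex++;
		buffer[1] = *hex++;

		key[i] = (unsigned char)strtoul(buffer, &endp, 16);
		if (endp != &buffer[2])	/* a non-hex character was seen */
			return -1;
	}

	return 0;
}

int main(void)
{
	unsigned char key[4];

	if (!crypt_decode_key_demo(key, "deadbeef", sizeof(key)))
		printf("%02x %02x %02x %02x\n", key[0], key[1], key[2], key[3]);
	return 0;
}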
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 2dc22dddb2ae..f53846f9ab50 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -295,7 +295,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio,
295} 295}
296 296
297static int delay_status(struct dm_target *ti, status_type_t type, 297static int delay_status(struct dm_target *ti, status_type_t type,
298 char *result, unsigned maxlen) 298 unsigned status_flags, char *result, unsigned maxlen)
299{ 299{
300 struct delay_c *dc = ti->private; 300 struct delay_c *dc = ti->private;
301 int sz = 0; 301 int sz = 0;
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index aa70f7d43a1a..ebaa4f803eec 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -142,24 +142,19 @@ EXPORT_SYMBOL(dm_exception_store_type_unregister);
142static int set_chunk_size(struct dm_exception_store *store, 142static int set_chunk_size(struct dm_exception_store *store,
143 const char *chunk_size_arg, char **error) 143 const char *chunk_size_arg, char **error)
144{ 144{
145 unsigned long chunk_size_ulong; 145 unsigned chunk_size;
146 char *value;
147 146
148 chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10); 147 if (kstrtouint(chunk_size_arg, 10, &chunk_size)) {
149 if (*chunk_size_arg == '\0' || *value != '\0' ||
150 chunk_size_ulong > UINT_MAX) {
151 *error = "Invalid chunk size"; 148 *error = "Invalid chunk size";
152 return -EINVAL; 149 return -EINVAL;
153 } 150 }
154 151
155 if (!chunk_size_ulong) { 152 if (!chunk_size) {
156 store->chunk_size = store->chunk_mask = store->chunk_shift = 0; 153 store->chunk_size = store->chunk_mask = store->chunk_shift = 0;
157 return 0; 154 return 0;
158 } 155 }
159 156
160 return dm_exception_store_set_chunk_size(store, 157 return dm_exception_store_set_chunk_size(store, chunk_size, error);
161 (unsigned) chunk_size_ulong,
162 error);
163} 158}
164 159
165int dm_exception_store_set_chunk_size(struct dm_exception_store *store, 160int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index ac49c01f1a44..cc15543a6ad7 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -333,7 +333,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio,
333} 333}
334 334
335static int flakey_status(struct dm_target *ti, status_type_t type, 335static int flakey_status(struct dm_target *ti, status_type_t type,
336 char *result, unsigned int maxlen) 336 unsigned status_flags, char *result, unsigned maxlen)
337{ 337{
338 unsigned sz = 0; 338 unsigned sz = 0;
339 struct flakey_c *fc = ti->private; 339 struct flakey_c *fc = ti->private;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index a1a3e6df17b8..afd95986d099 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1054,6 +1054,7 @@ static void retrieve_status(struct dm_table *table,
1054 char *outbuf, *outptr; 1054 char *outbuf, *outptr;
1055 status_type_t type; 1055 status_type_t type;
1056 size_t remaining, len, used = 0; 1056 size_t remaining, len, used = 0;
1057 unsigned status_flags = 0;
1057 1058
1058 outptr = outbuf = get_result_buffer(param, param_size, &len); 1059 outptr = outbuf = get_result_buffer(param, param_size, &len);
1059 1060
@@ -1090,7 +1091,9 @@ static void retrieve_status(struct dm_table *table,
1090 1091
1091 /* Get the status/table string from the target driver */ 1092 /* Get the status/table string from the target driver */
1092 if (ti->type->status) { 1093 if (ti->type->status) {
1093 if (ti->type->status(ti, type, outptr, remaining)) { 1094 if (param->flags & DM_NOFLUSH_FLAG)
1095 status_flags |= DM_STATUS_NOFLUSH_FLAG;
1096 if (ti->type->status(ti, type, status_flags, outptr, remaining)) {
1094 param->flags |= DM_BUFFER_FULL_FLAG; 1097 param->flags |= DM_BUFFER_FULL_FLAG;
1095 break; 1098 break;
1096 } 1099 }
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 3639eeab6042..1bf19a93eef0 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -96,7 +96,7 @@ static int linear_map(struct dm_target *ti, struct bio *bio,
96} 96}
97 97
98static int linear_status(struct dm_target *ti, status_type_t type, 98static int linear_status(struct dm_target *ti, status_type_t type,
99 char *result, unsigned int maxlen) 99 unsigned status_flags, char *result, unsigned maxlen)
100{ 100{
101 struct linear_c *lc = (struct linear_c *) ti->private; 101 struct linear_c *lc = (struct linear_c *) ti->private;
102 102
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 65ebaebf502b..627d19186d5a 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -571,16 +571,6 @@ static void disk_dtr(struct dm_dirty_log *log)
571 destroy_log_context(lc); 571 destroy_log_context(lc);
572} 572}
573 573
574static int count_bits32(uint32_t *addr, unsigned size)
575{
576 int count = 0, i;
577
578 for (i = 0; i < size; i++) {
579 count += hweight32(*(addr+i));
580 }
581 return count;
582}
583
584static void fail_log_device(struct log_c *lc) 574static void fail_log_device(struct log_c *lc)
585{ 575{
586 if (lc->log_dev_failed) 576 if (lc->log_dev_failed)
@@ -629,7 +619,8 @@ static int disk_resume(struct dm_dirty_log *log)
629 619
630 /* copy clean across to sync */ 620 /* copy clean across to sync */
631 memcpy(lc->sync_bits, lc->clean_bits, size); 621 memcpy(lc->sync_bits, lc->clean_bits, size);
632 lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); 622 lc->sync_count = memweight(lc->clean_bits,
623 lc->bitset_uint32_count * sizeof(uint32_t));
633 lc->sync_search = 0; 624 lc->sync_search = 0;
634 625
635 /* set the correct number of regions in the header */ 626 /* set the correct number of regions in the header */
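For reference, the dm-log hunk above drops the driver-local count_bits32() helper in favour of the generic memweight(), which counts set bits over a byte range. A small user-space sketch of why the two give the same answer; __builtin_popcount() stands in for the kernel's hweight32(), and byteweight() is a hypothetical stand-in for memweight().

#include <stdint.h>
#include <stdio.h>

/* Old style: count set bits word by word, as count_bits32() did. */
static unsigned count_bits32(const uint32_t *addr, unsigned nwords)
{
	unsigned count = 0, i;

	for (i = 0; i < nwords; i++)
		count += __builtin_popcount(addr[i]);
	return count;
}

/* New style: count set bits over a byte range, as memweight() does. */
static unsigned byteweight(const void *ptr, size_t bytes)
{
	const unsigned char *p = ptr;
	unsigned count = 0;
	size_t i;

	for (i = 0; i < bytes; i++)
		count += __builtin_popcount(p[i]);
	return count;
}

int main(void)
{
	uint32_t clean_bits[2] = { 0xF0F0F0F0u, 0x00000003u };

	/* Both report 18 set bits for the same bitmap. */
	printf("%u %u\n", count_bits32(clean_bits, 2),
	       byteweight(clean_bits, sizeof(clean_bits)));
	return 0;
}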
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 638dae048b4f..d8abb90a6c2f 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -85,6 +85,7 @@ struct multipath {
85 unsigned queue_io:1; /* Must we queue all I/O? */ 85 unsigned queue_io:1; /* Must we queue all I/O? */
86 unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ 86 unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */
87 unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ 87 unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
88 unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
88 89
89 unsigned pg_init_retries; /* Number of times to retry pg_init */ 90 unsigned pg_init_retries; /* Number of times to retry pg_init */
90 unsigned pg_init_count; /* Number of times pg_init called */ 91 unsigned pg_init_count; /* Number of times pg_init called */
@@ -568,6 +569,8 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
568 int r; 569 int r;
569 struct pgpath *p; 570 struct pgpath *p;
570 struct multipath *m = ti->private; 571 struct multipath *m = ti->private;
572 struct request_queue *q = NULL;
573 const char *attached_handler_name;
571 574
572 /* we need at least a path arg */ 575 /* we need at least a path arg */
573 if (as->argc < 1) { 576 if (as->argc < 1) {
@@ -586,13 +589,37 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
586 goto bad; 589 goto bad;
587 } 590 }
588 591
589 if (m->hw_handler_name) { 592 if (m->retain_attached_hw_handler || m->hw_handler_name)
590 struct request_queue *q = bdev_get_queue(p->path.dev->bdev); 593 q = bdev_get_queue(p->path.dev->bdev);
594
595 if (m->retain_attached_hw_handler) {
596 attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
597 if (attached_handler_name) {
598 /*
599 * Reset hw_handler_name to match the attached handler
600 * and clear any hw_handler_params associated with the
601 * ignored handler.
602 *
603 * NB. This modifies the table line to show the actual
604 * handler instead of the original table passed in.
605 */
606 kfree(m->hw_handler_name);
607 m->hw_handler_name = attached_handler_name;
608
609 kfree(m->hw_handler_params);
610 m->hw_handler_params = NULL;
611 }
612 }
591 613
614 if (m->hw_handler_name) {
615 /*
616 * Increments scsi_dh reference, even when using an
617 * already-attached handler.
618 */
592 r = scsi_dh_attach(q, m->hw_handler_name); 619 r = scsi_dh_attach(q, m->hw_handler_name);
593 if (r == -EBUSY) { 620 if (r == -EBUSY) {
594 /* 621 /*
595 * Already attached to different hw_handler, 622 * Already attached to different hw_handler:
596 * try to reattach with correct one. 623 * try to reattach with correct one.
597 */ 624 */
598 scsi_dh_detach(q); 625 scsi_dh_detach(q);
@@ -760,7 +787,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
760 const char *arg_name; 787 const char *arg_name;
761 788
762 static struct dm_arg _args[] = { 789 static struct dm_arg _args[] = {
763 {0, 5, "invalid number of feature args"}, 790 {0, 6, "invalid number of feature args"},
764 {1, 50, "pg_init_retries must be between 1 and 50"}, 791 {1, 50, "pg_init_retries must be between 1 and 50"},
765 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, 792 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
766 }; 793 };
@@ -781,6 +808,11 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
781 continue; 808 continue;
782 } 809 }
783 810
811 if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
812 m->retain_attached_hw_handler = 1;
813 continue;
814 }
815
784 if (!strcasecmp(arg_name, "pg_init_retries") && 816 if (!strcasecmp(arg_name, "pg_init_retries") &&
785 (argc >= 1)) { 817 (argc >= 1)) {
786 r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); 818 r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
@@ -1346,7 +1378,7 @@ static void multipath_resume(struct dm_target *ti)
1346 * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ 1378 * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1347 */ 1379 */
1348static int multipath_status(struct dm_target *ti, status_type_t type, 1380static int multipath_status(struct dm_target *ti, status_type_t type,
1349 char *result, unsigned int maxlen) 1381 unsigned status_flags, char *result, unsigned maxlen)
1350{ 1382{
1351 int sz = 0; 1383 int sz = 0;
1352 unsigned long flags; 1384 unsigned long flags;
@@ -1364,13 +1396,16 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
1364 else { 1396 else {
1365 DMEMIT("%u ", m->queue_if_no_path + 1397 DMEMIT("%u ", m->queue_if_no_path +
1366 (m->pg_init_retries > 0) * 2 + 1398 (m->pg_init_retries > 0) * 2 +
1367 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2); 1399 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
1400 m->retain_attached_hw_handler);
1368 if (m->queue_if_no_path) 1401 if (m->queue_if_no_path)
1369 DMEMIT("queue_if_no_path "); 1402 DMEMIT("queue_if_no_path ");
1370 if (m->pg_init_retries) 1403 if (m->pg_init_retries)
1371 DMEMIT("pg_init_retries %u ", m->pg_init_retries); 1404 DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1372 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) 1405 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1373 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); 1406 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1407 if (m->retain_attached_hw_handler)
1408 DMEMIT("retain_attached_hw_handler ");
1374 } 1409 }
1375 1410
1376 if (!m->hw_handler_name || type == STATUSTYPE_INFO) 1411 if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1656,7 +1691,7 @@ out:
1656 *---------------------------------------------------------------*/ 1691 *---------------------------------------------------------------*/
1657static struct target_type multipath_target = { 1692static struct target_type multipath_target = {
1658 .name = "multipath", 1693 .name = "multipath",
1659 .version = {1, 4, 0}, 1694 .version = {1, 5, 0},
1660 .module = THIS_MODULE, 1695 .module = THIS_MODULE,
1661 .ctr = multipath_ctr, 1696 .ctr = multipath_ctr,
1662 .dtr = multipath_dtr, 1697 .dtr = multipath_dtr,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 017c34d78d61..982e3e390c45 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -11,6 +11,7 @@
11#include "md.h" 11#include "md.h"
12#include "raid1.h" 12#include "raid1.h"
13#include "raid5.h" 13#include "raid5.h"
14#include "raid10.h"
14#include "bitmap.h" 15#include "bitmap.h"
15 16
16#include <linux/device-mapper.h> 17#include <linux/device-mapper.h>
@@ -52,7 +53,10 @@ struct raid_dev {
52#define DMPF_MAX_RECOVERY_RATE 0x20 53#define DMPF_MAX_RECOVERY_RATE 0x20
53#define DMPF_MAX_WRITE_BEHIND 0x40 54#define DMPF_MAX_WRITE_BEHIND 0x40
54#define DMPF_STRIPE_CACHE 0x80 55#define DMPF_STRIPE_CACHE 0x80
55#define DMPF_REGION_SIZE 0X100 56#define DMPF_REGION_SIZE 0x100
57#define DMPF_RAID10_COPIES 0x200
58#define DMPF_RAID10_FORMAT 0x400
59
56struct raid_set { 60struct raid_set {
57 struct dm_target *ti; 61 struct dm_target *ti;
58 62
@@ -76,6 +80,7 @@ static struct raid_type {
76 const unsigned algorithm; /* RAID algorithm. */ 80 const unsigned algorithm; /* RAID algorithm. */
77} raid_types[] = { 81} raid_types[] = {
78 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, 82 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
83 {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */},
79 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, 84 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
80 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, 85 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
81 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, 86 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -86,6 +91,17 @@ static struct raid_type {
86 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} 91 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
87}; 92};
88 93
94static unsigned raid10_md_layout_to_copies(int layout)
95{
96 return layout & 0xFF;
97}
98
99static int raid10_format_to_md_layout(char *format, unsigned copies)
100{
101 /* 1 "far" copy, and 'copies' "near" copies */
102 return (1 << 8) | (copies & 0xFF);
103}
104
89static struct raid_type *get_raid_type(char *name) 105static struct raid_type *get_raid_type(char *name)
90{ 106{
91 int i; 107 int i;
@@ -101,20 +117,12 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
101{ 117{
102 unsigned i; 118 unsigned i;
103 struct raid_set *rs; 119 struct raid_set *rs;
104 sector_t sectors_per_dev;
105 120
106 if (raid_devs <= raid_type->parity_devs) { 121 if (raid_devs <= raid_type->parity_devs) {
107 ti->error = "Insufficient number of devices"; 122 ti->error = "Insufficient number of devices";
108 return ERR_PTR(-EINVAL); 123 return ERR_PTR(-EINVAL);
109 } 124 }
110 125
111 sectors_per_dev = ti->len;
112 if ((raid_type->level > 1) &&
113 sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
114 ti->error = "Target length not divisible by number of data devices";
115 return ERR_PTR(-EINVAL);
116 }
117
118 rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); 126 rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
119 if (!rs) { 127 if (!rs) {
120 ti->error = "Cannot allocate raid context"; 128 ti->error = "Cannot allocate raid context";
@@ -128,7 +136,6 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
128 rs->md.raid_disks = raid_devs; 136 rs->md.raid_disks = raid_devs;
129 rs->md.level = raid_type->level; 137 rs->md.level = raid_type->level;
130 rs->md.new_level = rs->md.level; 138 rs->md.new_level = rs->md.level;
131 rs->md.dev_sectors = sectors_per_dev;
132 rs->md.layout = raid_type->algorithm; 139 rs->md.layout = raid_type->algorithm;
133 rs->md.new_layout = rs->md.layout; 140 rs->md.new_layout = rs->md.layout;
134 rs->md.delta_disks = 0; 141 rs->md.delta_disks = 0;
@@ -143,6 +150,7 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
143 * rs->md.external 150 * rs->md.external
144 * rs->md.chunk_sectors 151 * rs->md.chunk_sectors
145 * rs->md.new_chunk_sectors 152 * rs->md.new_chunk_sectors
153 * rs->md.dev_sectors
146 */ 154 */
147 155
148 return rs; 156 return rs;
@@ -347,12 +355,20 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
347 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) 355 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
348 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs 356 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
349 * [region_size <sectors>] Defines granularity of bitmap 357 * [region_size <sectors>] Defines granularity of bitmap
358 *
359 * RAID10-only options:
360 * [raid10_copies <# copies>] Number of copies. (Default: 2)
361 * [raid10_format <near>] Layout algorithm. (Default: near)
350 */ 362 */
351static int parse_raid_params(struct raid_set *rs, char **argv, 363static int parse_raid_params(struct raid_set *rs, char **argv,
352 unsigned num_raid_params) 364 unsigned num_raid_params)
353{ 365{
366 char *raid10_format = "near";
367 unsigned raid10_copies = 2;
354 unsigned i, rebuild_cnt = 0; 368 unsigned i, rebuild_cnt = 0;
355 unsigned long value, region_size = 0; 369 unsigned long value, region_size = 0;
370 sector_t sectors_per_dev = rs->ti->len;
371 sector_t max_io_len;
356 char *key; 372 char *key;
357 373
358 /* 374 /*
@@ -422,20 +438,53 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
422 } 438 }
423 439
424 key = argv[i++]; 440 key = argv[i++];
441
442 /* Parameters that take a string value are checked here. */
443 if (!strcasecmp(key, "raid10_format")) {
444 if (rs->raid_type->level != 10) {
445 rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
446 return -EINVAL;
447 }
448 if (strcmp("near", argv[i])) {
449 rs->ti->error = "Invalid 'raid10_format' value given";
450 return -EINVAL;
451 }
452 raid10_format = argv[i];
453 rs->print_flags |= DMPF_RAID10_FORMAT;
454 continue;
455 }
456
425 if (strict_strtoul(argv[i], 10, &value) < 0) { 457 if (strict_strtoul(argv[i], 10, &value) < 0) {
426 rs->ti->error = "Bad numerical argument given in raid params"; 458 rs->ti->error = "Bad numerical argument given in raid params";
427 return -EINVAL; 459 return -EINVAL;
428 } 460 }
429 461
462 /* Parameters that take a numeric value are checked here */
430 if (!strcasecmp(key, "rebuild")) { 463 if (!strcasecmp(key, "rebuild")) {
431 rebuild_cnt++; 464 rebuild_cnt++;
432 if (((rs->raid_type->level != 1) && 465
433 (rebuild_cnt > rs->raid_type->parity_devs)) || 466 switch (rs->raid_type->level) {
434 ((rs->raid_type->level == 1) && 467 case 1:
435 (rebuild_cnt > (rs->md.raid_disks - 1)))) { 468 if (rebuild_cnt >= rs->md.raid_disks) {
436 rs->ti->error = "Too many rebuild devices specified for given RAID type"; 469 rs->ti->error = "Too many rebuild devices specified";
470 return -EINVAL;
471 }
472 break;
473 case 4:
474 case 5:
475 case 6:
476 if (rebuild_cnt > rs->raid_type->parity_devs) {
477 rs->ti->error = "Too many rebuild devices specified for given RAID type";
478 return -EINVAL;
479 }
480 break;
481 case 10:
482 default:
483 DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
484 rs->ti->error = "Rebuild not supported for this RAID type";
437 return -EINVAL; 485 return -EINVAL;
438 } 486 }
487
439 if (value > rs->md.raid_disks) { 488 if (value > rs->md.raid_disks) {
440 rs->ti->error = "Invalid rebuild index given"; 489 rs->ti->error = "Invalid rebuild index given";
441 return -EINVAL; 490 return -EINVAL;
@@ -486,7 +535,8 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
486 */ 535 */
487 value /= 2; 536 value /= 2;
488 537
489 if (rs->raid_type->level < 5) { 538 if ((rs->raid_type->level != 5) &&
539 (rs->raid_type->level != 6)) {
490 rs->ti->error = "Inappropriate argument: stripe_cache"; 540 rs->ti->error = "Inappropriate argument: stripe_cache";
491 return -EINVAL; 541 return -EINVAL;
492 } 542 }
@@ -511,6 +561,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
511 } else if (!strcasecmp(key, "region_size")) { 561 } else if (!strcasecmp(key, "region_size")) {
512 rs->print_flags |= DMPF_REGION_SIZE; 562 rs->print_flags |= DMPF_REGION_SIZE;
513 region_size = value; 563 region_size = value;
564 } else if (!strcasecmp(key, "raid10_copies") &&
565 (rs->raid_type->level == 10)) {
566 if ((value < 2) || (value > 0xFF)) {
567 rs->ti->error = "Bad value for 'raid10_copies'";
568 return -EINVAL;
569 }
570 rs->print_flags |= DMPF_RAID10_COPIES;
571 raid10_copies = value;
514 } else { 572 } else {
515 DMERR("Unable to parse RAID parameter: %s", key); 573 DMERR("Unable to parse RAID parameter: %s", key);
516 rs->ti->error = "Unable to parse RAID parameters"; 574 rs->ti->error = "Unable to parse RAID parameters";
@@ -522,14 +580,33 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
522 return -EINVAL; 580 return -EINVAL;
523 581
524 if (rs->md.chunk_sectors) 582 if (rs->md.chunk_sectors)
525 rs->ti->split_io = rs->md.chunk_sectors; 583 max_io_len = rs->md.chunk_sectors;
526 else 584 else
527 rs->ti->split_io = region_size; 585 max_io_len = region_size;
528 586
529 if (rs->md.chunk_sectors) 587 if (dm_set_target_max_io_len(rs->ti, max_io_len))
530 rs->ti->split_io = rs->md.chunk_sectors; 588 return -EINVAL;
531 else 589
532 rs->ti->split_io = region_size; 590 if (rs->raid_type->level == 10) {
591 if (raid10_copies > rs->md.raid_disks) {
592 rs->ti->error = "Not enough devices to satisfy specification";
593 return -EINVAL;
594 }
595
596 /* (Len * #mirrors) / #devices */
597 sectors_per_dev = rs->ti->len * raid10_copies;
598 sector_div(sectors_per_dev, rs->md.raid_disks);
599
600 rs->md.layout = raid10_format_to_md_layout(raid10_format,
601 raid10_copies);
602 rs->md.new_layout = rs->md.layout;
603 } else if ((rs->raid_type->level > 1) &&
604 sector_div(sectors_per_dev,
605 (rs->md.raid_disks - rs->raid_type->parity_devs))) {
606 rs->ti->error = "Target length not divisible by number of data devices";
607 return -EINVAL;
608 }
609 rs->md.dev_sectors = sectors_per_dev;
533 610
534 /* Assume there are no metadata devices until the drives are parsed */ 611 /* Assume there are no metadata devices until the drives are parsed */
535 rs->md.persistent = 0; 612 rs->md.persistent = 0;
@@ -552,6 +629,9 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
552 if (rs->raid_type->level == 1) 629 if (rs->raid_type->level == 1)
553 return md_raid1_congested(&rs->md, bits); 630 return md_raid1_congested(&rs->md, bits);
554 631
632 if (rs->raid_type->level == 10)
633 return md_raid10_congested(&rs->md, bits);
634
555 return md_raid5_congested(&rs->md, bits); 635 return md_raid5_congested(&rs->md, bits);
556} 636}
557 637
@@ -870,6 +950,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
870 case 6: 950 case 6:
871 redundancy = rs->raid_type->parity_devs; 951 redundancy = rs->raid_type->parity_devs;
872 break; 952 break;
953 case 10:
954 redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
955 break;
873 default: 956 default:
874 ti->error = "Unknown RAID type"; 957 ti->error = "Unknown RAID type";
875 return -EINVAL; 958 return -EINVAL;
@@ -1035,12 +1118,19 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
1035 goto bad; 1118 goto bad;
1036 } 1119 }
1037 1120
1121 if (ti->len != rs->md.array_sectors) {
1122 ti->error = "Array size does not match requested target length";
1123 ret = -EINVAL;
1124 goto size_mismatch;
1125 }
1038 rs->callbacks.congested_fn = raid_is_congested; 1126 rs->callbacks.congested_fn = raid_is_congested;
1039 dm_table_add_target_callbacks(ti->table, &rs->callbacks); 1127 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
1040 1128
1041 mddev_suspend(&rs->md); 1129 mddev_suspend(&rs->md);
1042 return 0; 1130 return 0;
1043 1131
1132size_mismatch:
1133 md_stop(&rs->md);
1044bad: 1134bad:
1045 context_free(rs); 1135 context_free(rs);
1046 1136
@@ -1067,7 +1157,7 @@ static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_c
1067} 1157}
1068 1158
1069static int raid_status(struct dm_target *ti, status_type_t type, 1159static int raid_status(struct dm_target *ti, status_type_t type,
1070 char *result, unsigned maxlen) 1160 unsigned status_flags, char *result, unsigned maxlen)
1071{ 1161{
1072 struct raid_set *rs = ti->private; 1162 struct raid_set *rs = ti->private;
1073 unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ 1163 unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
@@ -1189,6 +1279,13 @@ static int raid_status(struct dm_target *ti, status_type_t type,
1189 DMEMIT(" region_size %lu", 1279 DMEMIT(" region_size %lu",
1190 rs->md.bitmap_info.chunksize >> 9); 1280 rs->md.bitmap_info.chunksize >> 9);
1191 1281
1282 if (rs->print_flags & DMPF_RAID10_COPIES)
1283 DMEMIT(" raid10_copies %u",
1284 raid10_md_layout_to_copies(rs->md.layout));
1285
1286 if (rs->print_flags & DMPF_RAID10_FORMAT)
1287 DMEMIT(" raid10_format near");
1288
1192 DMEMIT(" %d", rs->md.raid_disks); 1289 DMEMIT(" %d", rs->md.raid_disks);
1193 for (i = 0; i < rs->md.raid_disks; i++) { 1290 for (i = 0; i < rs->md.raid_disks; i++) {
1194 if (rs->dev[i].meta_dev) 1291 if (rs->dev[i].meta_dev)
@@ -1263,7 +1360,7 @@ static void raid_resume(struct dm_target *ti)
1263 1360
1264static struct target_type raid_target = { 1361static struct target_type raid_target = {
1265 .name = "raid", 1362 .name = "raid",
1266 .version = {1, 2, 0}, 1363 .version = {1, 3, 0},
1267 .module = THIS_MODULE, 1364 .module = THIS_MODULE,
1268 .ctr = raid_ctr, 1365 .ctr = raid_ctr,
1269 .dtr = raid_dtr, 1366 .dtr = raid_dtr,
@@ -1290,6 +1387,8 @@ module_init(dm_raid_init);
1290module_exit(dm_raid_exit); 1387module_exit(dm_raid_exit);
1291 1388
1292MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); 1389MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
1390MODULE_ALIAS("dm-raid1");
1391MODULE_ALIAS("dm-raid10");
1293MODULE_ALIAS("dm-raid4"); 1392MODULE_ALIAS("dm-raid4");
1294MODULE_ALIAS("dm-raid5"); 1393MODULE_ALIAS("dm-raid5");
1295MODULE_ALIAS("dm-raid6"); 1394MODULE_ALIAS("dm-raid6");
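Note on the dm-raid hunks above: the new case 10 in analyse_superblocks() treats a raid10 set with N copies as tolerating N - 1 failed devices, and raid_is_congested() now dispatches to md_raid10_congested() for level 10. A minimal standalone sketch of that redundancy rule follows; the raid10_md_layout_to_copies() helper belongs to dm-raid/MD and is approximated here by taking the low byte of the layout word, which is how the "near" format is assumed to encode its copy count (an illustration, not the driver's code).

#include <stdio.h>

/*
 * Standalone sketch of the redundancy rule used when validating
 * superblocks: raid4/5 tolerate one failed device, raid6 tolerates
 * its number of parity devices, and raid10 tolerates (copies - 1).
 * Assumes the "near" raid10 layout, where the low byte of the md
 * layout word holds the copy count.
 */
static unsigned raid10_layout_to_copies(int layout)
{
	return layout & 0xFF;
}

static int max_failed_devices(int level, int parity_devs, int layout)
{
	switch (level) {
	case 4:
	case 5:
		return 1;
	case 6:
		return parity_devs;		/* 2 for raid6 */
	case 10:
		return raid10_layout_to_copies(layout) - 1;
	default:
		return -1;			/* unknown RAID level */
	}
}

int main(void)
{
	/* raid10, near layout, 2 copies -> survives one failure */
	printf("raid10 n2 redundancy: %d\n", max_failed_devices(10, 0, 0x102));
	/* raid6 with 2 parity devices -> survives two failures */
	printf("raid6 redundancy:     %d\n", max_failed_devices(6, 2, 0));
	return 0;
}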
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index b58b7a33914a..bc5ddba8045b 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1081,10 +1081,14 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1081 } 1081 }
1082 1082
1083 ti->private = ms; 1083 ti->private = ms;
1084 ti->split_io = dm_rh_get_region_size(ms->rh); 1084
1085 r = dm_set_target_max_io_len(ti, dm_rh_get_region_size(ms->rh));
1086 if (r)
1087 goto err_free_context;
1088
1085 ti->num_flush_requests = 1; 1089 ti->num_flush_requests = 1;
1086 ti->num_discard_requests = 1; 1090 ti->num_discard_requests = 1;
1087 ti->discard_zeroes_data_unsupported = 1; 1091 ti->discard_zeroes_data_unsupported = true;
1088 1092
1089 ms->kmirrord_wq = alloc_workqueue("kmirrord", 1093 ms->kmirrord_wq = alloc_workqueue("kmirrord",
1090 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); 1094 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
@@ -1363,7 +1367,7 @@ static char device_status_char(struct mirror *m)
1363 1367
1364 1368
1365static int mirror_status(struct dm_target *ti, status_type_t type, 1369static int mirror_status(struct dm_target *ti, status_type_t type,
1366 char *result, unsigned int maxlen) 1370 unsigned status_flags, char *result, unsigned maxlen)
1367{ 1371{
1368 unsigned int m, sz = 0; 1372 unsigned int m, sz = 0;
1369 struct mirror_set *ms = (struct mirror_set *) ti->private; 1373 struct mirror_set *ms = (struct mirror_set *) ti->private;
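Note on the dm-raid1 hunk above: mirror_ctr() now goes through dm_set_target_max_io_len() instead of assigning ti->split_io directly, and bails out if the helper rejects the region size. Below is a hedged sketch of what such a checked setter looks like, with simplified stand-in types; the zero and 32-bit bounds are assumptions for illustration, not the dm core's exact checks.

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of a checked setter in the spirit of dm_set_target_max_io_len():
 * rather than callers assigning a split size directly, the length is
 * validated once and stored in a fixed-width field.
 */
struct fake_target {
	uint32_t max_io_len;	/* sectors */
	const char *error;
};

static int set_target_max_io_len(struct fake_target *ti, uint64_t len)
{
	/* Assumed bounds: non-zero and representable in 32 bits. */
	if (!len || len > UINT32_MAX) {
		ti->error = "Maximum size of target IO is invalid";
		return -1;		/* stands in for -EINVAL */
	}
	ti->max_io_len = (uint32_t)len;
	return 0;
}

int main(void)
{
	struct fake_target ti = { 0, NULL };

	if (set_target_max_io_len(&ti, 8192))	/* e.g. a 4 MiB region size */
		fprintf(stderr, "constructor would fail: %s\n", ti.error);
	else
		printf("max_io_len = %u sectors\n", ti.max_io_len);
	return 0;
}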
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 6f758870fc19..a143921feaf6 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -691,7 +691,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
691 * Return a minimum chunk size of all snapshots that have the specified origin. 691 * Return a minimum chunk size of all snapshots that have the specified origin.
692 * Return zero if the origin has no snapshots. 692 * Return zero if the origin has no snapshots.
693 */ 693 */
694static sector_t __minimum_chunk_size(struct origin *o) 694static uint32_t __minimum_chunk_size(struct origin *o)
695{ 695{
696 struct dm_snapshot *snap; 696 struct dm_snapshot *snap;
697 unsigned chunk_size = 0; 697 unsigned chunk_size = 0;
@@ -701,7 +701,7 @@ static sector_t __minimum_chunk_size(struct origin *o)
701 chunk_size = min_not_zero(chunk_size, 701 chunk_size = min_not_zero(chunk_size,
702 snap->store->chunk_size); 702 snap->store->chunk_size);
703 703
704 return chunk_size; 704 return (uint32_t) chunk_size;
705} 705}
706 706
707/* 707/*
@@ -1172,7 +1172,10 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1172 ti->error = "Chunk size not set"; 1172 ti->error = "Chunk size not set";
1173 goto bad_read_metadata; 1173 goto bad_read_metadata;
1174 } 1174 }
1175 ti->split_io = s->store->chunk_size; 1175
1176 r = dm_set_target_max_io_len(ti, s->store->chunk_size);
1177 if (r)
1178 goto bad_read_metadata;
1176 1179
1177 return 0; 1180 return 0;
1178 1181
@@ -1239,7 +1242,7 @@ static void __handover_exceptions(struct dm_snapshot *snap_src,
1239 snap_dest->store->snap = snap_dest; 1242 snap_dest->store->snap = snap_dest;
1240 snap_src->store->snap = snap_src; 1243 snap_src->store->snap = snap_src;
1241 1244
1242 snap_dest->ti->split_io = snap_dest->store->chunk_size; 1245 snap_dest->ti->max_io_len = snap_dest->store->chunk_size;
1243 snap_dest->valid = snap_src->valid; 1246 snap_dest->valid = snap_src->valid;
1244 1247
1245 /* 1248 /*
@@ -1817,9 +1820,9 @@ static void snapshot_resume(struct dm_target *ti)
1817 up_write(&s->lock); 1820 up_write(&s->lock);
1818} 1821}
1819 1822
1820static sector_t get_origin_minimum_chunksize(struct block_device *bdev) 1823static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
1821{ 1824{
1822 sector_t min_chunksize; 1825 uint32_t min_chunksize;
1823 1826
1824 down_read(&_origins_lock); 1827 down_read(&_origins_lock);
1825 min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); 1828 min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
@@ -1838,15 +1841,15 @@ static void snapshot_merge_resume(struct dm_target *ti)
1838 snapshot_resume(ti); 1841 snapshot_resume(ti);
1839 1842
1840 /* 1843 /*
1841 * snapshot-merge acts as an origin, so set ti->split_io 1844 * snapshot-merge acts as an origin, so set ti->max_io_len
1842 */ 1845 */
1843 ti->split_io = get_origin_minimum_chunksize(s->origin->bdev); 1846 ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev);
1844 1847
1845 start_merge(s); 1848 start_merge(s);
1846} 1849}
1847 1850
1848static int snapshot_status(struct dm_target *ti, status_type_t type, 1851static int snapshot_status(struct dm_target *ti, status_type_t type,
1849 char *result, unsigned int maxlen) 1852 unsigned status_flags, char *result, unsigned maxlen)
1850{ 1853{
1851 unsigned sz = 0; 1854 unsigned sz = 0;
1852 struct dm_snapshot *snap = ti->private; 1855 struct dm_snapshot *snap = ti->private;
@@ -2073,12 +2076,12 @@ static int origin_write_extent(struct dm_snapshot *merging_snap,
2073 struct origin *o; 2076 struct origin *o;
2074 2077
2075 /* 2078 /*
2076 * The origin's __minimum_chunk_size() got stored in split_io 2079 * The origin's __minimum_chunk_size() got stored in max_io_len
2077 * by snapshot_merge_resume(). 2080 * by snapshot_merge_resume().
2078 */ 2081 */
2079 down_read(&_origins_lock); 2082 down_read(&_origins_lock);
2080 o = __lookup_origin(merging_snap->origin->bdev); 2083 o = __lookup_origin(merging_snap->origin->bdev);
2081 for (n = 0; n < size; n += merging_snap->ti->split_io) 2084 for (n = 0; n < size; n += merging_snap->ti->max_io_len)
2082 if (__origin_write(&o->snapshots, sector + n, NULL) == 2085 if (__origin_write(&o->snapshots, sector + n, NULL) ==
2083 DM_MAPIO_SUBMITTED) 2086 DM_MAPIO_SUBMITTED)
2084 must_wait = 1; 2087 must_wait = 1;
@@ -2138,18 +2141,18 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
2138} 2141}
2139 2142
2140/* 2143/*
2141 * Set the target "split_io" field to the minimum of all the snapshots' 2144 * Set the target "max_io_len" field to the minimum of all the snapshots'
2142 * chunk sizes. 2145 * chunk sizes.
2143 */ 2146 */
2144static void origin_resume(struct dm_target *ti) 2147static void origin_resume(struct dm_target *ti)
2145{ 2148{
2146 struct dm_dev *dev = ti->private; 2149 struct dm_dev *dev = ti->private;
2147 2150
2148 ti->split_io = get_origin_minimum_chunksize(dev->bdev); 2151 ti->max_io_len = get_origin_minimum_chunksize(dev->bdev);
2149} 2152}
2150 2153
2151static int origin_status(struct dm_target *ti, status_type_t type, char *result, 2154static int origin_status(struct dm_target *ti, status_type_t type,
2152 unsigned int maxlen) 2155 unsigned status_flags, char *result, unsigned maxlen)
2153{ 2156{
2154 struct dm_dev *dev = ti->private; 2157 struct dm_dev *dev = ti->private;
2155 2158
@@ -2176,7 +2179,6 @@ static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2176 return max_size; 2179 return max_size;
2177 2180
2178 bvm->bi_bdev = dev->bdev; 2181 bvm->bi_bdev = dev->bdev;
2179 bvm->bi_sector = bvm->bi_sector;
2180 2182
2181 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2183 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2182} 2184}
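Note on the dm-snap hunks above: the chunk-size bookkeeping is narrowed to uint32_t and routed through dm_set_target_max_io_len()/ti->max_io_len, while the origin and snapshot-merge targets still use the smallest non-zero chunk size across all snapshots of an origin. A small standalone sketch of that minimum rule follows, with min_not_zero() re-implemented for illustration.

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the origin's "minimum chunk size" rule: take the smallest
 * non-zero chunk size across all snapshots of the origin, where zero
 * means "no snapshots, no constraint".  min_not_zero() mirrors the
 * semantics of the kernel macro of the same name.
 */
static uint32_t min_not_zero(uint32_t a, uint32_t b)
{
	if (!a)
		return b;
	if (!b)
		return a;
	return a < b ? a : b;
}

int main(void)
{
	uint32_t chunk_sizes[] = { 16, 8, 32 };	/* sectors, one per snapshot */
	uint32_t min = 0;
	unsigned i;

	for (i = 0; i < sizeof(chunk_sizes) / sizeof(chunk_sizes[0]); i++)
		min = min_not_zero(min, chunk_sizes[i]);

	/* This value would become the origin target's max_io_len. */
	printf("minimum chunk size: %u sectors\n", min);
	return 0;
}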
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 35c94ff24ad5..a087bf2a8d66 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -26,14 +26,12 @@ struct stripe {
26struct stripe_c { 26struct stripe_c {
27 uint32_t stripes; 27 uint32_t stripes;
28 int stripes_shift; 28 int stripes_shift;
29 sector_t stripes_mask;
30 29
31 /* The size of this target / num. stripes */ 30 /* The size of this target / num. stripes */
32 sector_t stripe_width; 31 sector_t stripe_width;
33 32
34 /* stripe chunk size */ 33 uint32_t chunk_size;
35 uint32_t chunk_shift; 34 int chunk_size_shift;
36 sector_t chunk_mask;
37 35
38 /* Needed for handling events */ 36 /* Needed for handling events */
39 struct dm_target *ti; 37 struct dm_target *ti;
@@ -91,7 +89,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
91 89
92/* 90/*
93 * Construct a striped mapping. 91 * Construct a striped mapping.
94 * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+ 92 * <number of stripes> <chunk size> [<dev_path> <offset>]+
95 */ 93 */
96static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) 94static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
97{ 95{
@@ -99,7 +97,6 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
99 sector_t width; 97 sector_t width;
100 uint32_t stripes; 98 uint32_t stripes;
101 uint32_t chunk_size; 99 uint32_t chunk_size;
102 char *end;
103 int r; 100 int r;
104 unsigned int i; 101 unsigned int i;
105 102
@@ -108,34 +105,23 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
108 return -EINVAL; 105 return -EINVAL;
109 } 106 }
110 107
111 stripes = simple_strtoul(argv[0], &end, 10); 108 if (kstrtouint(argv[0], 10, &stripes) || !stripes) {
112 if (!stripes || *end) {
113 ti->error = "Invalid stripe count"; 109 ti->error = "Invalid stripe count";
114 return -EINVAL; 110 return -EINVAL;
115 } 111 }
116 112
117 chunk_size = simple_strtoul(argv[1], &end, 10); 113 if (kstrtouint(argv[1], 10, &chunk_size) || !chunk_size) {
118 if (*end) {
119 ti->error = "Invalid chunk_size"; 114 ti->error = "Invalid chunk_size";
120 return -EINVAL; 115 return -EINVAL;
121 } 116 }
122 117
123 /* 118 width = ti->len;
124 * chunk_size is a power of two 119 if (sector_div(width, chunk_size)) {
125 */
126 if (!is_power_of_2(chunk_size) ||
127 (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) {
128 ti->error = "Invalid chunk size";
129 return -EINVAL;
130 }
131
132 if (ti->len & (chunk_size - 1)) {
133 ti->error = "Target length not divisible by " 120 ti->error = "Target length not divisible by "
134 "chunk size"; 121 "chunk size";
135 return -EINVAL; 122 return -EINVAL;
136 } 123 }
137 124
138 width = ti->len;
139 if (sector_div(width, stripes)) { 125 if (sector_div(width, stripes)) {
140 ti->error = "Target length not divisible by " 126 ti->error = "Target length not divisible by "
141 "number of stripes"; 127 "number of stripes";
@@ -167,17 +153,21 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
167 153
168 if (stripes & (stripes - 1)) 154 if (stripes & (stripes - 1))
169 sc->stripes_shift = -1; 155 sc->stripes_shift = -1;
170 else { 156 else
171 sc->stripes_shift = ffs(stripes) - 1; 157 sc->stripes_shift = __ffs(stripes);
172 sc->stripes_mask = ((sector_t) stripes) - 1; 158
173 } 159 r = dm_set_target_max_io_len(ti, chunk_size);
160 if (r)
161 return r;
174 162
175 ti->split_io = chunk_size;
176 ti->num_flush_requests = stripes; 163 ti->num_flush_requests = stripes;
177 ti->num_discard_requests = stripes; 164 ti->num_discard_requests = stripes;
178 165
179 sc->chunk_shift = ffs(chunk_size) - 1; 166 sc->chunk_size = chunk_size;
180 sc->chunk_mask = ((sector_t) chunk_size) - 1; 167 if (chunk_size & (chunk_size - 1))
168 sc->chunk_size_shift = -1;
169 else
170 sc->chunk_size_shift = __ffs(chunk_size);
181 171
182 /* 172 /*
183 * Get the stripe destinations. 173 * Get the stripe destinations.
@@ -216,17 +206,29 @@ static void stripe_dtr(struct dm_target *ti)
216static void stripe_map_sector(struct stripe_c *sc, sector_t sector, 206static void stripe_map_sector(struct stripe_c *sc, sector_t sector,
217 uint32_t *stripe, sector_t *result) 207 uint32_t *stripe, sector_t *result)
218{ 208{
219 sector_t offset = dm_target_offset(sc->ti, sector); 209 sector_t chunk = dm_target_offset(sc->ti, sector);
220 sector_t chunk = offset >> sc->chunk_shift; 210 sector_t chunk_offset;
211
212 if (sc->chunk_size_shift < 0)
213 chunk_offset = sector_div(chunk, sc->chunk_size);
214 else {
215 chunk_offset = chunk & (sc->chunk_size - 1);
216 chunk >>= sc->chunk_size_shift;
217 }
221 218
222 if (sc->stripes_shift < 0) 219 if (sc->stripes_shift < 0)
223 *stripe = sector_div(chunk, sc->stripes); 220 *stripe = sector_div(chunk, sc->stripes);
224 else { 221 else {
225 *stripe = chunk & sc->stripes_mask; 222 *stripe = chunk & (sc->stripes - 1);
226 chunk >>= sc->stripes_shift; 223 chunk >>= sc->stripes_shift;
227 } 224 }
228 225
229 *result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask); 226 if (sc->chunk_size_shift < 0)
227 chunk *= sc->chunk_size;
228 else
229 chunk <<= sc->chunk_size_shift;
230
231 *result = chunk + chunk_offset;
230} 232}
231 233
232static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, 234static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector,
@@ -237,9 +239,16 @@ static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector,
237 stripe_map_sector(sc, sector, &stripe, result); 239 stripe_map_sector(sc, sector, &stripe, result);
238 if (stripe == target_stripe) 240 if (stripe == target_stripe)
239 return; 241 return;
240 *result &= ~sc->chunk_mask; /* round down */ 242
243 /* round down */
244 sector = *result;
245 if (sc->chunk_size_shift < 0)
246 *result -= sector_div(sector, sc->chunk_size);
247 else
248 *result = sector & ~(sector_t)(sc->chunk_size - 1);
249
241 if (target_stripe < stripe) 250 if (target_stripe < stripe)
242 *result += sc->chunk_mask + 1; /* next chunk */ 251 *result += sc->chunk_size; /* next chunk */
243} 252}
244 253
245static int stripe_map_discard(struct stripe_c *sc, struct bio *bio, 254static int stripe_map_discard(struct stripe_c *sc, struct bio *bio,
@@ -302,8 +311,8 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
302 * 311 *
303 */ 312 */
304 313
305static int stripe_status(struct dm_target *ti, 314static int stripe_status(struct dm_target *ti, status_type_t type,
306 status_type_t type, char *result, unsigned int maxlen) 315 unsigned status_flags, char *result, unsigned maxlen)
307{ 316{
308 struct stripe_c *sc = (struct stripe_c *) ti->private; 317 struct stripe_c *sc = (struct stripe_c *) ti->private;
309 char buffer[sc->stripes + 1]; 318 char buffer[sc->stripes + 1];
@@ -324,7 +333,7 @@ static int stripe_status(struct dm_target *ti,
324 333
325 case STATUSTYPE_TABLE: 334 case STATUSTYPE_TABLE:
326 DMEMIT("%d %llu", sc->stripes, 335 DMEMIT("%d %llu", sc->stripes,
327 (unsigned long long)sc->chunk_mask + 1); 336 (unsigned long long)sc->chunk_size);
328 for (i = 0; i < sc->stripes; i++) 337 for (i = 0; i < sc->stripes; i++)
329 DMEMIT(" %s %llu", sc->stripe[i].dev->name, 338 DMEMIT(" %s %llu", sc->stripe[i].dev->name,
330 (unsigned long long)sc->stripe[i].physical_start); 339 (unsigned long long)sc->stripe[i].physical_start);
@@ -391,7 +400,7 @@ static void stripe_io_hints(struct dm_target *ti,
391 struct queue_limits *limits) 400 struct queue_limits *limits)
392{ 401{
393 struct stripe_c *sc = ti->private; 402 struct stripe_c *sc = ti->private;
394 unsigned chunk_size = (sc->chunk_mask + 1) << 9; 403 unsigned chunk_size = sc->chunk_size << SECTOR_SHIFT;
395 404
396 blk_limits_io_min(limits, chunk_size); 405 blk_limits_io_min(limits, chunk_size);
397 blk_limits_io_opt(limits, chunk_size * sc->stripes); 406 blk_limits_io_opt(limits, chunk_size * sc->stripes);
@@ -419,7 +428,7 @@ static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
419 428
420static struct target_type stripe_target = { 429static struct target_type stripe_target = {
421 .name = "striped", 430 .name = "striped",
422 .version = {1, 4, 0}, 431 .version = {1, 5, 0},
423 .module = THIS_MODULE, 432 .module = THIS_MODULE,
424 .ctr = stripe_ctr, 433 .ctr = stripe_ctr,
425 .dtr = stripe_dtr, 434 .dtr = stripe_dtr,
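Note on the dm-stripe hunks above: the rework drops the power-of-2 restriction on chunk size. stripe_map_sector() falls back to sector_div() when chunk_size_shift is -1 and keeps the mask/shift fast path otherwise. Below is a standalone sketch of that arithmetic using plain division and modulo in place of sector_div(); the struct is a simplified stand-in for stripe_c, not the target's definition.

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone sketch of the reworked stripe_map_sector() arithmetic.
 * Division and modulo stand in for the kernel's sector_div(); the
 * power-of-2 fast path uses mask/shift exactly as the target does.
 */
struct stripe_conf {
	uint32_t stripes;
	int stripes_shift;	/* -1 if stripes is not a power of 2 */
	uint32_t chunk_size;	/* sectors */
	int chunk_size_shift;	/* -1 if chunk_size is not a power of 2 */
};

static void map_sector(const struct stripe_conf *sc, uint64_t offset,
		       uint32_t *stripe, uint64_t *result)
{
	uint64_t chunk = offset;
	uint64_t chunk_offset;

	/* Split the target-relative offset into chunk index + offset. */
	if (sc->chunk_size_shift < 0) {
		chunk_offset = chunk % sc->chunk_size;
		chunk /= sc->chunk_size;
	} else {
		chunk_offset = chunk & (sc->chunk_size - 1);
		chunk >>= sc->chunk_size_shift;
	}

	/* Pick the stripe and the chunk index within that stripe. */
	if (sc->stripes_shift < 0) {
		*stripe = chunk % sc->stripes;
		chunk /= sc->stripes;
	} else {
		*stripe = chunk & (sc->stripes - 1);
		chunk >>= sc->stripes_shift;
	}

	/* Reassemble the sector within the chosen stripe. */
	if (sc->chunk_size_shift < 0)
		chunk *= sc->chunk_size;
	else
		chunk <<= sc->chunk_size_shift;

	*result = chunk + chunk_offset;
}

int main(void)
{
	/* 3 stripes of 24-sector chunks: neither value is a power of 2 */
	struct stripe_conf sc = { 3, -1, 24, -1 };
	uint32_t stripe;
	uint64_t result;

	map_sector(&sc, 100, &stripe, &result);
	printf("offset 100 -> stripe %u, sector %llu within stripe\n",
	       stripe, (unsigned long long)result);
	return 0;
}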
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2e227fbf1622..f90069029aae 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1319,6 +1319,9 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
1319 if (!ti->num_flush_requests) 1319 if (!ti->num_flush_requests)
1320 continue; 1320 continue;
1321 1321
1322 if (ti->flush_supported)
1323 return 1;
1324
1322 if (ti->type->iterate_devices && 1325 if (ti->type->iterate_devices &&
1323 ti->type->iterate_devices(ti, device_flush_capable, &flush)) 1326 ti->type->iterate_devices(ti, device_flush_capable, &flush))
1324 return 1; 1327 return 1;
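Note on the dm-table hunk above: a target can now declare flush support directly through a flush_supported flag, short-circuiting the per-device capability walk. A simplified sketch of that decision follows, with stand-in structures rather than the dm-table API.

#include <stdbool.h>
#include <stdio.h>

/*
 * Sketch of the dm_table_supports_flush() short-circuit: a target that
 * sets flush_supported makes the table advertise flush support without
 * consulting its underlying devices.
 */
struct fake_target {
	unsigned num_flush_requests;
	bool flush_supported;		/* set by the target itself */
	bool devices_flush_capable;	/* result of the device walk */
};

static bool table_supports_flush(const struct fake_target *targets, unsigned n)
{
	unsigned i;

	for (i = 0; i < n; i++) {
		const struct fake_target *ti = &targets[i];

		if (!ti->num_flush_requests)
			continue;
		if (ti->flush_supported)
			return true;
		if (ti->devices_flush_capable)
			return true;
	}
	return false;
}

int main(void)
{
	struct fake_target t[] = {
		{ 1, true,  false },	/* advertises flush itself */
		{ 1, false, false },
	};
	printf("table supports flush: %d\n", table_supports_flush(t, 2));
	return 0;
}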
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 3e2907f0bc46..693e149e9727 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2011 Red Hat, Inc. 2 * Copyright (C) 2011-2012 Red Hat, Inc.
3 * 3 *
4 * This file is released under the GPL. 4 * This file is released under the GPL.
5 */ 5 */
@@ -80,6 +80,12 @@
80#define THIN_METADATA_CACHE_SIZE 64 80#define THIN_METADATA_CACHE_SIZE 64
81#define SECTOR_TO_BLOCK_SHIFT 3 81#define SECTOR_TO_BLOCK_SHIFT 3
82 82
83/*
84 * 3 for btree insert +
85 * 2 for btree lookup used within space map
86 */
87#define THIN_MAX_CONCURRENT_LOCKS 5
88
83/* This should be plenty */ 89/* This should be plenty */
84#define SPACE_MAP_ROOT_SIZE 128 90#define SPACE_MAP_ROOT_SIZE 128
85 91
@@ -172,13 +178,20 @@ struct dm_pool_metadata {
172 178
173 struct rw_semaphore root_lock; 179 struct rw_semaphore root_lock;
174 uint32_t time; 180 uint32_t time;
175 int need_commit;
176 dm_block_t root; 181 dm_block_t root;
177 dm_block_t details_root; 182 dm_block_t details_root;
178 struct list_head thin_devices; 183 struct list_head thin_devices;
179 uint64_t trans_id; 184 uint64_t trans_id;
180 unsigned long flags; 185 unsigned long flags;
181 sector_t data_block_size; 186 sector_t data_block_size;
187 bool read_only:1;
188
189 /*
190 * Set if a transaction has to be aborted but the attempt to roll back
191 * to the previous (good) transaction failed. The only pool metadata
192 * operation possible in this state is the closing of the device.
193 */
194 bool fail_io:1;
182}; 195};
183 196
184struct dm_thin_device { 197struct dm_thin_device {
@@ -187,7 +200,8 @@ struct dm_thin_device {
187 dm_thin_id id; 200 dm_thin_id id;
188 201
189 int open_count; 202 int open_count;
190 int changed; 203 bool changed:1;
204 bool aborted_with_changes:1;
191 uint64_t mapped_blocks; 205 uint64_t mapped_blocks;
192 uint64_t transaction_id; 206 uint64_t transaction_id;
193 uint32_t creation_time; 207 uint32_t creation_time;
@@ -338,7 +352,21 @@ static int subtree_equal(void *context, void *value1_le, void *value2_le)
338 352
339/*----------------------------------------------------------------*/ 353/*----------------------------------------------------------------*/
340 354
341static int superblock_all_zeroes(struct dm_block_manager *bm, int *result) 355static int superblock_lock_zero(struct dm_pool_metadata *pmd,
356 struct dm_block **sblock)
357{
358 return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
359 &sb_validator, sblock);
360}
361
362static int superblock_lock(struct dm_pool_metadata *pmd,
363 struct dm_block **sblock)
364{
365 return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
366 &sb_validator, sblock);
367}
368
369static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
342{ 370{
343 int r; 371 int r;
344 unsigned i; 372 unsigned i;
@@ -365,72 +393,9 @@ static int superblock_all_zeroes(struct dm_block_manager *bm, int *result)
365 return dm_bm_unlock(b); 393 return dm_bm_unlock(b);
366} 394}
367 395
368static int init_pmd(struct dm_pool_metadata *pmd, 396static void __setup_btree_details(struct dm_pool_metadata *pmd)
369 struct dm_block_manager *bm,
370 dm_block_t nr_blocks, int create)
371{ 397{
372 int r; 398 pmd->info.tm = pmd->tm;
373 struct dm_space_map *sm, *data_sm;
374 struct dm_transaction_manager *tm;
375 struct dm_block *sblock;
376
377 if (create) {
378 r = dm_tm_create_with_sm(bm, THIN_SUPERBLOCK_LOCATION,
379 &sb_validator, &tm, &sm, &sblock);
380 if (r < 0) {
381 DMERR("tm_create_with_sm failed");
382 return r;
383 }
384
385 data_sm = dm_sm_disk_create(tm, nr_blocks);
386 if (IS_ERR(data_sm)) {
387 DMERR("sm_disk_create failed");
388 dm_tm_unlock(tm, sblock);
389 r = PTR_ERR(data_sm);
390 goto bad;
391 }
392 } else {
393 struct thin_disk_superblock *disk_super = NULL;
394 size_t space_map_root_offset =
395 offsetof(struct thin_disk_superblock, metadata_space_map_root);
396
397 r = dm_tm_open_with_sm(bm, THIN_SUPERBLOCK_LOCATION,
398 &sb_validator, space_map_root_offset,
399 SPACE_MAP_ROOT_SIZE, &tm, &sm, &sblock);
400 if (r < 0) {
401 DMERR("tm_open_with_sm failed");
402 return r;
403 }
404
405 disk_super = dm_block_data(sblock);
406 data_sm = dm_sm_disk_open(tm, disk_super->data_space_map_root,
407 sizeof(disk_super->data_space_map_root));
408 if (IS_ERR(data_sm)) {
409 DMERR("sm_disk_open failed");
410 r = PTR_ERR(data_sm);
411 goto bad;
412 }
413 }
414
415
416 r = dm_tm_unlock(tm, sblock);
417 if (r < 0) {
418 DMERR("couldn't unlock superblock");
419 goto bad_data_sm;
420 }
421
422 pmd->bm = bm;
423 pmd->metadata_sm = sm;
424 pmd->data_sm = data_sm;
425 pmd->tm = tm;
426 pmd->nb_tm = dm_tm_create_non_blocking_clone(tm);
427 if (!pmd->nb_tm) {
428 DMERR("could not create clone tm");
429 r = -ENOMEM;
430 goto bad_data_sm;
431 }
432
433 pmd->info.tm = tm;
434 pmd->info.levels = 2; 399 pmd->info.levels = 2;
435 pmd->info.value_type.context = pmd->data_sm; 400 pmd->info.value_type.context = pmd->data_sm;
436 pmd->info.value_type.size = sizeof(__le64); 401 pmd->info.value_type.size = sizeof(__le64);
@@ -441,7 +406,7 @@ static int init_pmd(struct dm_pool_metadata *pmd,
441 memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info)); 406 memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
442 pmd->nb_info.tm = pmd->nb_tm; 407 pmd->nb_info.tm = pmd->nb_tm;
443 408
444 pmd->tl_info.tm = tm; 409 pmd->tl_info.tm = pmd->tm;
445 pmd->tl_info.levels = 1; 410 pmd->tl_info.levels = 1;
446 pmd->tl_info.value_type.context = &pmd->info; 411 pmd->tl_info.value_type.context = &pmd->info;
447 pmd->tl_info.value_type.size = sizeof(__le64); 412 pmd->tl_info.value_type.size = sizeof(__le64);
@@ -449,7 +414,7 @@ static int init_pmd(struct dm_pool_metadata *pmd,
449 pmd->tl_info.value_type.dec = subtree_dec; 414 pmd->tl_info.value_type.dec = subtree_dec;
450 pmd->tl_info.value_type.equal = subtree_equal; 415 pmd->tl_info.value_type.equal = subtree_equal;
451 416
452 pmd->bl_info.tm = tm; 417 pmd->bl_info.tm = pmd->tm;
453 pmd->bl_info.levels = 1; 418 pmd->bl_info.levels = 1;
454 pmd->bl_info.value_type.context = pmd->data_sm; 419 pmd->bl_info.value_type.context = pmd->data_sm;
455 pmd->bl_info.value_type.size = sizeof(__le64); 420 pmd->bl_info.value_type.size = sizeof(__le64);
@@ -457,48 +422,266 @@ static int init_pmd(struct dm_pool_metadata *pmd,
457 pmd->bl_info.value_type.dec = data_block_dec; 422 pmd->bl_info.value_type.dec = data_block_dec;
458 pmd->bl_info.value_type.equal = data_block_equal; 423 pmd->bl_info.value_type.equal = data_block_equal;
459 424
460 pmd->details_info.tm = tm; 425 pmd->details_info.tm = pmd->tm;
461 pmd->details_info.levels = 1; 426 pmd->details_info.levels = 1;
462 pmd->details_info.value_type.context = NULL; 427 pmd->details_info.value_type.context = NULL;
463 pmd->details_info.value_type.size = sizeof(struct disk_device_details); 428 pmd->details_info.value_type.size = sizeof(struct disk_device_details);
464 pmd->details_info.value_type.inc = NULL; 429 pmd->details_info.value_type.inc = NULL;
465 pmd->details_info.value_type.dec = NULL; 430 pmd->details_info.value_type.dec = NULL;
466 pmd->details_info.value_type.equal = NULL; 431 pmd->details_info.value_type.equal = NULL;
432}
467 433
468 pmd->root = 0; 434static int __write_initial_superblock(struct dm_pool_metadata *pmd)
435{
436 int r;
437 struct dm_block *sblock;
438 size_t metadata_len, data_len;
439 struct thin_disk_superblock *disk_super;
440 sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
469 441
470 init_rwsem(&pmd->root_lock); 442 if (bdev_size > THIN_METADATA_MAX_SECTORS)
471 pmd->time = 0; 443 bdev_size = THIN_METADATA_MAX_SECTORS;
472 pmd->need_commit = 0; 444
473 pmd->details_root = 0; 445 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
474 pmd->trans_id = 0; 446 if (r < 0)
475 pmd->flags = 0; 447 return r;
476 INIT_LIST_HEAD(&pmd->thin_devices); 448
449 r = dm_sm_root_size(pmd->data_sm, &data_len);
450 if (r < 0)
451 return r;
452
453 r = dm_sm_commit(pmd->data_sm);
454 if (r < 0)
455 return r;
456
457 r = dm_tm_pre_commit(pmd->tm);
458 if (r < 0)
459 return r;
460
461 r = superblock_lock_zero(pmd, &sblock);
462 if (r)
463 return r;
464
465 disk_super = dm_block_data(sblock);
466 disk_super->flags = 0;
467 memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
468 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
469 disk_super->version = cpu_to_le32(THIN_VERSION);
470 disk_super->time = 0;
471 disk_super->trans_id = 0;
472 disk_super->held_root = 0;
473
474 r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
475 metadata_len);
476 if (r < 0)
477 goto bad_locked;
478
479 r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
480 data_len);
481 if (r < 0)
482 goto bad_locked;
483
484 disk_super->data_mapping_root = cpu_to_le64(pmd->root);
485 disk_super->device_details_root = cpu_to_le64(pmd->details_root);
486 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
487 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
488 disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
489
490 return dm_tm_commit(pmd->tm, sblock);
491
492bad_locked:
493 dm_bm_unlock(sblock);
494 return r;
495}
496
497static int __format_metadata(struct dm_pool_metadata *pmd)
498{
499 int r;
500
501 r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
502 &pmd->tm, &pmd->metadata_sm);
503 if (r < 0) {
504 DMERR("tm_create_with_sm failed");
505 return r;
506 }
507
508 pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
509 if (IS_ERR(pmd->data_sm)) {
510 DMERR("sm_disk_create failed");
511 r = PTR_ERR(pmd->data_sm);
512 goto bad_cleanup_tm;
513 }
514
515 pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
516 if (!pmd->nb_tm) {
517 DMERR("could not create non-blocking clone tm");
518 r = -ENOMEM;
519 goto bad_cleanup_data_sm;
520 }
521
522 __setup_btree_details(pmd);
523
524 r = dm_btree_empty(&pmd->info, &pmd->root);
525 if (r < 0)
526 goto bad_cleanup_nb_tm;
527
528 r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
529 if (r < 0) {
530 DMERR("couldn't create devices root");
531 goto bad_cleanup_nb_tm;
532 }
533
534 r = __write_initial_superblock(pmd);
535 if (r)
536 goto bad_cleanup_nb_tm;
477 537
478 return 0; 538 return 0;
479 539
480bad_data_sm: 540bad_cleanup_nb_tm:
481 dm_sm_destroy(data_sm); 541 dm_tm_destroy(pmd->nb_tm);
482bad: 542bad_cleanup_data_sm:
483 dm_tm_destroy(tm); 543 dm_sm_destroy(pmd->data_sm);
484 dm_sm_destroy(sm); 544bad_cleanup_tm:
545 dm_tm_destroy(pmd->tm);
546 dm_sm_destroy(pmd->metadata_sm);
547
548 return r;
549}
550
551static int __check_incompat_features(struct thin_disk_superblock *disk_super,
552 struct dm_pool_metadata *pmd)
553{
554 uint32_t features;
555
556 features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
557 if (features) {
558 DMERR("could not access metadata due to unsupported optional features (%lx).",
559 (unsigned long)features);
560 return -EINVAL;
561 }
562
563 /*
564 * Check for read-only metadata to skip the following RDWR checks.
565 */
566 if (get_disk_ro(pmd->bdev->bd_disk))
567 return 0;
568
569 features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
570 if (features) {
571 DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
572 (unsigned long)features);
573 return -EINVAL;
574 }
575
576 return 0;
577}
578
579static int __open_metadata(struct dm_pool_metadata *pmd)
580{
581 int r;
582 struct dm_block *sblock;
583 struct thin_disk_superblock *disk_super;
584
585 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
586 &sb_validator, &sblock);
587 if (r < 0) {
588 DMERR("couldn't read superblock");
589 return r;
590 }
591
592 disk_super = dm_block_data(sblock);
593
594 r = __check_incompat_features(disk_super, pmd);
595 if (r < 0)
596 goto bad_unlock_sblock;
597
598 r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
599 disk_super->metadata_space_map_root,
600 sizeof(disk_super->metadata_space_map_root),
601 &pmd->tm, &pmd->metadata_sm);
602 if (r < 0) {
603 DMERR("tm_open_with_sm failed");
604 goto bad_unlock_sblock;
605 }
606
607 pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
608 sizeof(disk_super->data_space_map_root));
609 if (IS_ERR(pmd->data_sm)) {
610 DMERR("sm_disk_open failed");
611 r = PTR_ERR(pmd->data_sm);
612 goto bad_cleanup_tm;
613 }
614
615 pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
616 if (!pmd->nb_tm) {
617 DMERR("could not create non-blocking clone tm");
618 r = -ENOMEM;
619 goto bad_cleanup_data_sm;
620 }
621
622 __setup_btree_details(pmd);
623 return dm_bm_unlock(sblock);
624
625bad_cleanup_data_sm:
626 dm_sm_destroy(pmd->data_sm);
627bad_cleanup_tm:
628 dm_tm_destroy(pmd->tm);
629 dm_sm_destroy(pmd->metadata_sm);
630bad_unlock_sblock:
631 dm_bm_unlock(sblock);
632
633 return r;
634}
635
636static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
637{
638 int r, unformatted;
639
640 r = __superblock_all_zeroes(pmd->bm, &unformatted);
641 if (r)
642 return r;
643
644 if (unformatted)
645 return format_device ? __format_metadata(pmd) : -EPERM;
646
647 return __open_metadata(pmd);
648}
649
650static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
651{
652 int r;
653
654 pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE,
655 THIN_METADATA_CACHE_SIZE,
656 THIN_MAX_CONCURRENT_LOCKS);
657 if (IS_ERR(pmd->bm)) {
658 DMERR("could not create block manager");
659 return PTR_ERR(pmd->bm);
660 }
661
662 r = __open_or_format_metadata(pmd, format_device);
663 if (r)
664 dm_block_manager_destroy(pmd->bm);
485 665
486 return r; 666 return r;
487} 667}
488 668
669static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
670{
671 dm_sm_destroy(pmd->data_sm);
672 dm_sm_destroy(pmd->metadata_sm);
673 dm_tm_destroy(pmd->nb_tm);
674 dm_tm_destroy(pmd->tm);
675 dm_block_manager_destroy(pmd->bm);
676}
677
489static int __begin_transaction(struct dm_pool_metadata *pmd) 678static int __begin_transaction(struct dm_pool_metadata *pmd)
490{ 679{
491 int r; 680 int r;
492 u32 features;
493 struct thin_disk_superblock *disk_super; 681 struct thin_disk_superblock *disk_super;
494 struct dm_block *sblock; 682 struct dm_block *sblock;
495 683
496 /* 684 /*
497 * __maybe_commit_transaction() resets these
498 */
499 WARN_ON(pmd->need_commit);
500
501 /*
502 * We re-read the superblock every time. Shouldn't need to do this 685 * We re-read the superblock every time. Shouldn't need to do this
503 * really. 686 * really.
504 */ 687 */
@@ -515,32 +698,8 @@ static int __begin_transaction(struct dm_pool_metadata *pmd)
515 pmd->flags = le32_to_cpu(disk_super->flags); 698 pmd->flags = le32_to_cpu(disk_super->flags);
516 pmd->data_block_size = le32_to_cpu(disk_super->data_block_size); 699 pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
517 700
518 features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
519 if (features) {
520 DMERR("could not access metadata due to "
521 "unsupported optional features (%lx).",
522 (unsigned long)features);
523 r = -EINVAL;
524 goto out;
525 }
526
527 /*
528 * Check for read-only metadata to skip the following RDWR checks.
529 */
530 if (get_disk_ro(pmd->bdev->bd_disk))
531 goto out;
532
533 features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
534 if (features) {
535 DMERR("could not access metadata RDWR due to "
536 "unsupported optional features (%lx).",
537 (unsigned long)features);
538 r = -EINVAL;
539 }
540
541out:
542 dm_bm_unlock(sblock); 701 dm_bm_unlock(sblock);
543 return r; 702 return 0;
544} 703}
545 704
546static int __write_changed_details(struct dm_pool_metadata *pmd) 705static int __write_changed_details(struct dm_pool_metadata *pmd)
@@ -573,8 +732,6 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
573 list_del(&td->list); 732 list_del(&td->list);
574 kfree(td); 733 kfree(td);
575 } 734 }
576
577 pmd->need_commit = 1;
578 } 735 }
579 736
580 return 0; 737 return 0;
@@ -582,9 +739,6 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
582 739
583static int __commit_transaction(struct dm_pool_metadata *pmd) 740static int __commit_transaction(struct dm_pool_metadata *pmd)
584{ 741{
585 /*
586 * FIXME: Associated pool should be made read-only on failure.
587 */
588 int r; 742 int r;
589 size_t metadata_len, data_len; 743 size_t metadata_len, data_len;
590 struct thin_disk_superblock *disk_super; 744 struct thin_disk_superblock *disk_super;
@@ -597,31 +751,27 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
597 751
598 r = __write_changed_details(pmd); 752 r = __write_changed_details(pmd);
599 if (r < 0) 753 if (r < 0)
600 goto out; 754 return r;
601
602 if (!pmd->need_commit)
603 goto out;
604 755
605 r = dm_sm_commit(pmd->data_sm); 756 r = dm_sm_commit(pmd->data_sm);
606 if (r < 0) 757 if (r < 0)
607 goto out; 758 return r;
608 759
609 r = dm_tm_pre_commit(pmd->tm); 760 r = dm_tm_pre_commit(pmd->tm);
610 if (r < 0) 761 if (r < 0)
611 goto out; 762 return r;
612 763
613 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); 764 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
614 if (r < 0) 765 if (r < 0)
615 goto out; 766 return r;
616 767
617 r = dm_sm_root_size(pmd->data_sm, &data_len); 768 r = dm_sm_root_size(pmd->data_sm, &data_len);
618 if (r < 0) 769 if (r < 0)
619 goto out; 770 return r;
620 771
621 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 772 r = superblock_lock(pmd, &sblock);
622 &sb_validator, &sblock);
623 if (r) 773 if (r)
624 goto out; 774 return r;
625 775
626 disk_super = dm_block_data(sblock); 776 disk_super = dm_block_data(sblock);
627 disk_super->time = cpu_to_le32(pmd->time); 777 disk_super->time = cpu_to_le32(pmd->time);
@@ -640,12 +790,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
640 if (r < 0) 790 if (r < 0)
641 goto out_locked; 791 goto out_locked;
642 792
643 r = dm_tm_commit(pmd->tm, sblock); 793 return dm_tm_commit(pmd->tm, sblock);
644 if (!r)
645 pmd->need_commit = 0;
646
647out:
648 return r;
649 794
650out_locked: 795out_locked:
651 dm_bm_unlock(sblock); 796 dm_bm_unlock(sblock);
@@ -653,15 +798,11 @@ out_locked:
653} 798}
654 799
655struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, 800struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
656 sector_t data_block_size) 801 sector_t data_block_size,
802 bool format_device)
657{ 803{
658 int r; 804 int r;
659 struct thin_disk_superblock *disk_super;
660 struct dm_pool_metadata *pmd; 805 struct dm_pool_metadata *pmd;
661 sector_t bdev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
662 struct dm_block_manager *bm;
663 int create;
664 struct dm_block *sblock;
665 806
666 pmd = kmalloc(sizeof(*pmd), GFP_KERNEL); 807 pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
667 if (!pmd) { 808 if (!pmd) {
@@ -669,90 +810,28 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
669 return ERR_PTR(-ENOMEM); 810 return ERR_PTR(-ENOMEM);
670 } 811 }
671 812
672 /* 813 init_rwsem(&pmd->root_lock);
673 * Max hex locks: 814 pmd->time = 0;
674 * 3 for btree insert + 815 INIT_LIST_HEAD(&pmd->thin_devices);
675 * 2 for btree lookup used within space map 816 pmd->read_only = false;
676 */ 817 pmd->fail_io = false;
677 bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE, 818 pmd->bdev = bdev;
678 THIN_METADATA_CACHE_SIZE, 5); 819 pmd->data_block_size = data_block_size;
679 if (!bm) {
680 DMERR("could not create block manager");
681 kfree(pmd);
682 return ERR_PTR(-ENOMEM);
683 }
684
685 r = superblock_all_zeroes(bm, &create);
686 if (r) {
687 dm_block_manager_destroy(bm);
688 kfree(pmd);
689 return ERR_PTR(r);
690 }
691
692 820
693 r = init_pmd(pmd, bm, 0, create); 821 r = __create_persistent_data_objects(pmd, format_device);
694 if (r) { 822 if (r) {
695 dm_block_manager_destroy(bm);
696 kfree(pmd); 823 kfree(pmd);
697 return ERR_PTR(r); 824 return ERR_PTR(r);
698 } 825 }
699 pmd->bdev = bdev;
700
701 if (!create) {
702 r = __begin_transaction(pmd);
703 if (r < 0)
704 goto bad;
705 return pmd;
706 }
707
708 /*
709 * Create.
710 */
711 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
712 &sb_validator, &sblock);
713 if (r)
714 goto bad;
715
716 if (bdev_size > THIN_METADATA_MAX_SECTORS)
717 bdev_size = THIN_METADATA_MAX_SECTORS;
718
719 disk_super = dm_block_data(sblock);
720 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
721 disk_super->version = cpu_to_le32(THIN_VERSION);
722 disk_super->time = 0;
723 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
724 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
725 disk_super->data_block_size = cpu_to_le32(data_block_size);
726
727 r = dm_bm_unlock(sblock);
728 if (r < 0)
729 goto bad;
730
731 r = dm_btree_empty(&pmd->info, &pmd->root);
732 if (r < 0)
733 goto bad;
734
735 r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
736 if (r < 0) {
737 DMERR("couldn't create devices root");
738 goto bad;
739 }
740 826
741 pmd->flags = 0; 827 r = __begin_transaction(pmd);
742 pmd->need_commit = 1;
743 r = dm_pool_commit_metadata(pmd);
744 if (r < 0) { 828 if (r < 0) {
745 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 829 if (dm_pool_metadata_close(pmd) < 0)
746 __func__, r); 830 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
747 goto bad; 831 return ERR_PTR(r);
748 } 832 }
749 833
750 return pmd; 834 return pmd;
751
752bad:
753 if (dm_pool_metadata_close(pmd) < 0)
754 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
755 return ERR_PTR(r);
756} 835}
757 836
758int dm_pool_metadata_close(struct dm_pool_metadata *pmd) 837int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
@@ -778,18 +857,17 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
778 return -EBUSY; 857 return -EBUSY;
779 } 858 }
780 859
781 r = __commit_transaction(pmd); 860 if (!pmd->read_only && !pmd->fail_io) {
782 if (r < 0) 861 r = __commit_transaction(pmd);
783 DMWARN("%s: __commit_transaction() failed, error = %d", 862 if (r < 0)
784 __func__, r); 863 DMWARN("%s: __commit_transaction() failed, error = %d",
864 __func__, r);
865 }
785 866
786 dm_tm_destroy(pmd->tm); 867 if (!pmd->fail_io)
787 dm_tm_destroy(pmd->nb_tm); 868 __destroy_persistent_data_objects(pmd);
788 dm_block_manager_destroy(pmd->bm);
789 dm_sm_destroy(pmd->metadata_sm);
790 dm_sm_destroy(pmd->data_sm);
791 kfree(pmd);
792 869
870 kfree(pmd);
793 return 0; 871 return 0;
794} 872}
795 873
@@ -850,6 +928,7 @@ static int __open_device(struct dm_pool_metadata *pmd,
850 (*td)->id = dev; 928 (*td)->id = dev;
851 (*td)->open_count = 1; 929 (*td)->open_count = 1;
852 (*td)->changed = changed; 930 (*td)->changed = changed;
931 (*td)->aborted_with_changes = false;
853 (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks); 932 (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
854 (*td)->transaction_id = le64_to_cpu(details_le.transaction_id); 933 (*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
855 (*td)->creation_time = le32_to_cpu(details_le.creation_time); 934 (*td)->creation_time = le32_to_cpu(details_le.creation_time);
@@ -911,10 +990,11 @@ static int __create_thin(struct dm_pool_metadata *pmd,
911 990
912int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) 991int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
913{ 992{
914 int r; 993 int r = -EINVAL;
915 994
916 down_write(&pmd->root_lock); 995 down_write(&pmd->root_lock);
917 r = __create_thin(pmd, dev); 996 if (!pmd->fail_io)
997 r = __create_thin(pmd, dev);
918 up_write(&pmd->root_lock); 998 up_write(&pmd->root_lock);
919 999
920 return r; 1000 return r;
@@ -1001,10 +1081,11 @@ int dm_pool_create_snap(struct dm_pool_metadata *pmd,
1001 dm_thin_id dev, 1081 dm_thin_id dev,
1002 dm_thin_id origin) 1082 dm_thin_id origin)
1003{ 1083{
1004 int r; 1084 int r = -EINVAL;
1005 1085
1006 down_write(&pmd->root_lock); 1086 down_write(&pmd->root_lock);
1007 r = __create_snap(pmd, dev, origin); 1087 if (!pmd->fail_io)
1088 r = __create_snap(pmd, dev, origin);
1008 up_write(&pmd->root_lock); 1089 up_write(&pmd->root_lock);
1009 1090
1010 return r; 1091 return r;
@@ -1037,18 +1118,17 @@ static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
1037 if (r) 1118 if (r)
1038 return r; 1119 return r;
1039 1120
1040 pmd->need_commit = 1;
1041
1042 return 0; 1121 return 0;
1043} 1122}
1044 1123
1045int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, 1124int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
1046 dm_thin_id dev) 1125 dm_thin_id dev)
1047{ 1126{
1048 int r; 1127 int r = -EINVAL;
1049 1128
1050 down_write(&pmd->root_lock); 1129 down_write(&pmd->root_lock);
1051 r = __delete_device(pmd, dev); 1130 if (!pmd->fail_io)
1131 r = __delete_device(pmd, dev);
1052 up_write(&pmd->root_lock); 1132 up_write(&pmd->root_lock);
1053 1133
1054 return r; 1134 return r;
@@ -1058,28 +1138,40 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
1058 uint64_t current_id, 1138 uint64_t current_id,
1059 uint64_t new_id) 1139 uint64_t new_id)
1060{ 1140{
1141 int r = -EINVAL;
1142
1061 down_write(&pmd->root_lock); 1143 down_write(&pmd->root_lock);
1144
1145 if (pmd->fail_io)
1146 goto out;
1147
1062 if (pmd->trans_id != current_id) { 1148 if (pmd->trans_id != current_id) {
1063 up_write(&pmd->root_lock);
1064 DMERR("mismatched transaction id"); 1149 DMERR("mismatched transaction id");
1065 return -EINVAL; 1150 goto out;
1066 } 1151 }
1067 1152
1068 pmd->trans_id = new_id; 1153 pmd->trans_id = new_id;
1069 pmd->need_commit = 1; 1154 r = 0;
1155
1156out:
1070 up_write(&pmd->root_lock); 1157 up_write(&pmd->root_lock);
1071 1158
1072 return 0; 1159 return r;
1073} 1160}
1074 1161
1075int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, 1162int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
1076 uint64_t *result) 1163 uint64_t *result)
1077{ 1164{
1165 int r = -EINVAL;
1166
1078 down_read(&pmd->root_lock); 1167 down_read(&pmd->root_lock);
1079 *result = pmd->trans_id; 1168 if (!pmd->fail_io) {
1169 *result = pmd->trans_id;
1170 r = 0;
1171 }
1080 up_read(&pmd->root_lock); 1172 up_read(&pmd->root_lock);
1081 1173
1082 return 0; 1174 return r;
1083} 1175}
1084 1176
1085static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) 1177static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
@@ -1108,8 +1200,6 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
1108 1200
1109 dm_tm_dec(pmd->tm, held_root); 1201 dm_tm_dec(pmd->tm, held_root);
1110 dm_tm_unlock(pmd->tm, copy); 1202 dm_tm_unlock(pmd->tm, copy);
1111 pmd->need_commit = 1;
1112
1113 return -EBUSY; 1203 return -EBUSY;
1114 } 1204 }
1115 1205
@@ -1131,29 +1221,25 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
1131 /* 1221 /*
1132 * Write the held root into the superblock. 1222 * Write the held root into the superblock.
1133 */ 1223 */
1134 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 1224 r = superblock_lock(pmd, &sblock);
1135 &sb_validator, &sblock);
1136 if (r) { 1225 if (r) {
1137 dm_tm_dec(pmd->tm, held_root); 1226 dm_tm_dec(pmd->tm, held_root);
1138 pmd->need_commit = 1;
1139 return r; 1227 return r;
1140 } 1228 }
1141 1229
1142 disk_super = dm_block_data(sblock); 1230 disk_super = dm_block_data(sblock);
1143 disk_super->held_root = cpu_to_le64(held_root); 1231 disk_super->held_root = cpu_to_le64(held_root);
1144 dm_bm_unlock(sblock); 1232 dm_bm_unlock(sblock);
1145
1146 pmd->need_commit = 1;
1147
1148 return 0; 1233 return 0;
1149} 1234}
1150 1235
1151int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd) 1236int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
1152{ 1237{
1153 int r; 1238 int r = -EINVAL;
1154 1239
1155 down_write(&pmd->root_lock); 1240 down_write(&pmd->root_lock);
1156 r = __reserve_metadata_snap(pmd); 1241 if (!pmd->fail_io)
1242 r = __reserve_metadata_snap(pmd);
1157 up_write(&pmd->root_lock); 1243 up_write(&pmd->root_lock);
1158 1244
1159 return r; 1245 return r;
@@ -1166,15 +1252,13 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd)
1166 struct dm_block *sblock, *copy; 1252 struct dm_block *sblock, *copy;
1167 dm_block_t held_root; 1253 dm_block_t held_root;
1168 1254
1169 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 1255 r = superblock_lock(pmd, &sblock);
1170 &sb_validator, &sblock);
1171 if (r) 1256 if (r)
1172 return r; 1257 return r;
1173 1258
1174 disk_super = dm_block_data(sblock); 1259 disk_super = dm_block_data(sblock);
1175 held_root = le64_to_cpu(disk_super->held_root); 1260 held_root = le64_to_cpu(disk_super->held_root);
1176 disk_super->held_root = cpu_to_le64(0); 1261 disk_super->held_root = cpu_to_le64(0);
1177 pmd->need_commit = 1;
1178 1262
1179 dm_bm_unlock(sblock); 1263 dm_bm_unlock(sblock);
1180 1264
@@ -1197,10 +1281,11 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd)
1197 1281
1198int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd) 1282int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
1199{ 1283{
1200 int r; 1284 int r = -EINVAL;
1201 1285
1202 down_write(&pmd->root_lock); 1286 down_write(&pmd->root_lock);
1203 r = __release_metadata_snap(pmd); 1287 if (!pmd->fail_io)
1288 r = __release_metadata_snap(pmd);
1204 up_write(&pmd->root_lock); 1289 up_write(&pmd->root_lock);
1205 1290
1206 return r; 1291 return r;
@@ -1227,10 +1312,11 @@ static int __get_metadata_snap(struct dm_pool_metadata *pmd,
1227int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, 1312int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
1228 dm_block_t *result) 1313 dm_block_t *result)
1229{ 1314{
1230 int r; 1315 int r = -EINVAL;
1231 1316
1232 down_read(&pmd->root_lock); 1317 down_read(&pmd->root_lock);
1233 r = __get_metadata_snap(pmd, result); 1318 if (!pmd->fail_io)
1319 r = __get_metadata_snap(pmd, result);
1234 up_read(&pmd->root_lock); 1320 up_read(&pmd->root_lock);
1235 1321
1236 return r; 1322 return r;
@@ -1239,10 +1325,11 @@ int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
1239int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, 1325int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
1240 struct dm_thin_device **td) 1326 struct dm_thin_device **td)
1241{ 1327{
1242 int r; 1328 int r = -EINVAL;
1243 1329
1244 down_write(&pmd->root_lock); 1330 down_write(&pmd->root_lock);
1245 r = __open_device(pmd, dev, 0, td); 1331 if (!pmd->fail_io)
1332 r = __open_device(pmd, dev, 0, td);
1246 up_write(&pmd->root_lock); 1333 up_write(&pmd->root_lock);
1247 1334
1248 return r; 1335 return r;
@@ -1262,7 +1349,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1262 return td->id; 1349 return td->id;
1263} 1350}
1264 1351
1265static int __snapshotted_since(struct dm_thin_device *td, uint32_t time) 1352static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1266{ 1353{
1267 return td->snapshotted_time > time; 1354 return td->snapshotted_time > time;
1268} 1355}
@@ -1270,28 +1357,31 @@ static int __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1270int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, 1357int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
1271 int can_block, struct dm_thin_lookup_result *result) 1358 int can_block, struct dm_thin_lookup_result *result)
1272{ 1359{
1273 int r; 1360 int r = -EINVAL;
1274 uint64_t block_time = 0; 1361 uint64_t block_time = 0;
1275 __le64 value; 1362 __le64 value;
1276 struct dm_pool_metadata *pmd = td->pmd; 1363 struct dm_pool_metadata *pmd = td->pmd;
1277 dm_block_t keys[2] = { td->id, block }; 1364 dm_block_t keys[2] = { td->id, block };
1365 struct dm_btree_info *info;
1278 1366
1279 if (can_block) { 1367 if (can_block) {
1280 down_read(&pmd->root_lock); 1368 down_read(&pmd->root_lock);
1281 r = dm_btree_lookup(&pmd->info, pmd->root, keys, &value); 1369 info = &pmd->info;
1282 if (!r) 1370 } else if (down_read_trylock(&pmd->root_lock))
1283 block_time = le64_to_cpu(value); 1371 info = &pmd->nb_info;
1284 up_read(&pmd->root_lock); 1372 else
1285
1286 } else if (down_read_trylock(&pmd->root_lock)) {
1287 r = dm_btree_lookup(&pmd->nb_info, pmd->root, keys, &value);
1288 if (!r)
1289 block_time = le64_to_cpu(value);
1290 up_read(&pmd->root_lock);
1291
1292 } else
1293 return -EWOULDBLOCK; 1373 return -EWOULDBLOCK;
1294 1374
1375 if (pmd->fail_io)
1376 goto out;
1377
1378 r = dm_btree_lookup(info, pmd->root, keys, &value);
1379 if (!r)
1380 block_time = le64_to_cpu(value);
1381
1382out:
1383 up_read(&pmd->root_lock);
1384
1295 if (!r) { 1385 if (!r) {
1296 dm_block_t exception_block; 1386 dm_block_t exception_block;
1297 uint32_t exception_time; 1387 uint32_t exception_time;
@@ -1312,7 +1402,6 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
1312 struct dm_pool_metadata *pmd = td->pmd; 1402 struct dm_pool_metadata *pmd = td->pmd;
1313 dm_block_t keys[2] = { td->id, block }; 1403 dm_block_t keys[2] = { td->id, block };
1314 1404
1315 pmd->need_commit = 1;
1316 value = cpu_to_le64(pack_block_time(data_block, pmd->time)); 1405 value = cpu_to_le64(pack_block_time(data_block, pmd->time));
1317 __dm_bless_for_disk(&value); 1406 __dm_bless_for_disk(&value);
1318 1407
@@ -1321,10 +1410,9 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
1321 if (r) 1410 if (r)
1322 return r; 1411 return r;
1323 1412
1324 if (inserted) { 1413 td->changed = 1;
1414 if (inserted)
1325 td->mapped_blocks++; 1415 td->mapped_blocks++;
1326 td->changed = 1;
1327 }
1328 1416
1329 return 0; 1417 return 0;
1330} 1418}
@@ -1332,10 +1420,11 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
1332int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, 1420int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
1333 dm_block_t data_block) 1421 dm_block_t data_block)
1334{ 1422{
1335 int r; 1423 int r = -EINVAL;
1336 1424
1337 down_write(&td->pmd->root_lock); 1425 down_write(&td->pmd->root_lock);
1338 r = __insert(td, block, data_block); 1426 if (!td->pmd->fail_io)
1427 r = __insert(td, block, data_block);
1339 up_write(&td->pmd->root_lock); 1428 up_write(&td->pmd->root_lock);
1340 1429
1341 return r; 1430 return r;
@@ -1353,31 +1442,51 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
1353 1442
1354 td->mapped_blocks--; 1443 td->mapped_blocks--;
1355 td->changed = 1; 1444 td->changed = 1;
1356 pmd->need_commit = 1;
1357 1445
1358 return 0; 1446 return 0;
1359} 1447}
1360 1448
1361int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) 1449int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
1362{ 1450{
1363 int r; 1451 int r = -EINVAL;
1364 1452
1365 down_write(&td->pmd->root_lock); 1453 down_write(&td->pmd->root_lock);
1366 r = __remove(td, block); 1454 if (!td->pmd->fail_io)
1455 r = __remove(td, block);
1367 up_write(&td->pmd->root_lock); 1456 up_write(&td->pmd->root_lock);
1368 1457
1369 return r; 1458 return r;
1370} 1459}
1371 1460
1372int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) 1461bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1373{ 1462{
1374 int r; 1463 int r;
1375 1464
1376 down_write(&pmd->root_lock); 1465 down_read(&td->pmd->root_lock);
1466 r = td->changed;
1467 up_read(&td->pmd->root_lock);
1377 1468
1378 r = dm_sm_new_block(pmd->data_sm, result); 1469 return r;
1379 pmd->need_commit = 1; 1470}
1471
1472bool dm_thin_aborted_changes(struct dm_thin_device *td)
1473{
1474 bool r;
1380 1475
1476 down_read(&td->pmd->root_lock);
1477 r = td->aborted_with_changes;
1478 up_read(&td->pmd->root_lock);
1479
1480 return r;
1481}
1482
1483int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
1484{
1485 int r = -EINVAL;
1486
1487 down_write(&pmd->root_lock);
1488 if (!pmd->fail_io)
1489 r = dm_sm_new_block(pmd->data_sm, result);
1381 up_write(&pmd->root_lock); 1490 up_write(&pmd->root_lock);
1382 1491
1383 return r; 1492 return r;
@@ -1385,9 +1494,11 @@ int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
1385 1494
1386int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) 1495int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
1387{ 1496{
1388 int r; 1497 int r = -EINVAL;
1389 1498
1390 down_write(&pmd->root_lock); 1499 down_write(&pmd->root_lock);
1500 if (pmd->fail_io)
1501 goto out;
1391 1502
1392 r = __commit_transaction(pmd); 1503 r = __commit_transaction(pmd);
1393 if (r <= 0) 1504 if (r <= 0)
@@ -1402,12 +1513,41 @@ out:
1402 return r; 1513 return r;
1403} 1514}
1404 1515
1516static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
1517{
1518 struct dm_thin_device *td;
1519
1520 list_for_each_entry(td, &pmd->thin_devices, list)
1521 td->aborted_with_changes = td->changed;
1522}
1523
1524int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
1525{
1526 int r = -EINVAL;
1527
1528 down_write(&pmd->root_lock);
1529 if (pmd->fail_io)
1530 goto out;
1531
1532 __set_abort_with_changes_flags(pmd);
1533 __destroy_persistent_data_objects(pmd);
1534 r = __create_persistent_data_objects(pmd, false);
1535 if (r)
1536 pmd->fail_io = true;
1537
1538out:
1539 up_write(&pmd->root_lock);
1540
1541 return r;
1542}
1543
1405int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result) 1544int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
1406{ 1545{
1407 int r; 1546 int r = -EINVAL;
1408 1547
1409 down_read(&pmd->root_lock); 1548 down_read(&pmd->root_lock);
1410 r = dm_sm_get_nr_free(pmd->data_sm, result); 1549 if (!pmd->fail_io)
1550 r = dm_sm_get_nr_free(pmd->data_sm, result);
1411 up_read(&pmd->root_lock); 1551 up_read(&pmd->root_lock);
1412 1552
1413 return r; 1553 return r;
@@ -1416,10 +1556,11 @@ int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *resul
1416int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, 1556int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
1417 dm_block_t *result) 1557 dm_block_t *result)
1418{ 1558{
1419 int r; 1559 int r = -EINVAL;
1420 1560
1421 down_read(&pmd->root_lock); 1561 down_read(&pmd->root_lock);
1422 r = dm_sm_get_nr_free(pmd->metadata_sm, result); 1562 if (!pmd->fail_io)
1563 r = dm_sm_get_nr_free(pmd->metadata_sm, result);
1423 up_read(&pmd->root_lock); 1564 up_read(&pmd->root_lock);
1424 1565
1425 return r; 1566 return r;
@@ -1428,10 +1569,11 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
1428int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, 1569int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
1429 dm_block_t *result) 1570 dm_block_t *result)
1430{ 1571{
1431 int r; 1572 int r = -EINVAL;
1432 1573
1433 down_read(&pmd->root_lock); 1574 down_read(&pmd->root_lock);
1434 r = dm_sm_get_nr_blocks(pmd->metadata_sm, result); 1575 if (!pmd->fail_io)
1576 r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
1435 up_read(&pmd->root_lock); 1577 up_read(&pmd->root_lock);
1436 1578
1437 return r; 1579 return r;
@@ -1448,10 +1590,11 @@ int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result)
1448 1590
1449int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) 1591int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
1450{ 1592{
1451 int r; 1593 int r = -EINVAL;
1452 1594
1453 down_read(&pmd->root_lock); 1595 down_read(&pmd->root_lock);
1454 r = dm_sm_get_nr_blocks(pmd->data_sm, result); 1596 if (!pmd->fail_io)
1597 r = dm_sm_get_nr_blocks(pmd->data_sm, result);
1455 up_read(&pmd->root_lock); 1598 up_read(&pmd->root_lock);
1456 1599
1457 return r; 1600 return r;
@@ -1459,13 +1602,17 @@ int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
1459 1602
1460int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result) 1603int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
1461{ 1604{
1605 int r = -EINVAL;
1462 struct dm_pool_metadata *pmd = td->pmd; 1606 struct dm_pool_metadata *pmd = td->pmd;
1463 1607
1464 down_read(&pmd->root_lock); 1608 down_read(&pmd->root_lock);
1465 *result = td->mapped_blocks; 1609 if (!pmd->fail_io) {
1610 *result = td->mapped_blocks;
1611 r = 0;
1612 }
1466 up_read(&pmd->root_lock); 1613 up_read(&pmd->root_lock);
1467 1614
1468 return 0; 1615 return r;
1469} 1616}
1470 1617
1471static int __highest_block(struct dm_thin_device *td, dm_block_t *result) 1618static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
@@ -1487,11 +1634,12 @@ static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
1487int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, 1634int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
1488 dm_block_t *result) 1635 dm_block_t *result)
1489{ 1636{
1490 int r; 1637 int r = -EINVAL;
1491 struct dm_pool_metadata *pmd = td->pmd; 1638 struct dm_pool_metadata *pmd = td->pmd;
1492 1639
1493 down_read(&pmd->root_lock); 1640 down_read(&pmd->root_lock);
1494 r = __highest_block(td, result); 1641 if (!pmd->fail_io)
1642 r = __highest_block(td, result);
1495 up_read(&pmd->root_lock); 1643 up_read(&pmd->root_lock);
1496 1644
1497 return r; 1645 return r;
@@ -1514,20 +1662,25 @@ static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1514 return -EINVAL; 1662 return -EINVAL;
1515 } 1663 }
1516 1664
1517 r = dm_sm_extend(pmd->data_sm, new_count - old_count); 1665 return dm_sm_extend(pmd->data_sm, new_count - old_count);
1518 if (!r)
1519 pmd->need_commit = 1;
1520
1521 return r;
1522} 1666}
1523 1667
1524int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) 1668int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1525{ 1669{
1526 int r; 1670 int r = -EINVAL;
1527 1671
1528 down_write(&pmd->root_lock); 1672 down_write(&pmd->root_lock);
1529 r = __resize_data_dev(pmd, new_count); 1673 if (!pmd->fail_io)
1674 r = __resize_data_dev(pmd, new_count);
1530 up_write(&pmd->root_lock); 1675 up_write(&pmd->root_lock);
1531 1676
1532 return r; 1677 return r;
1533} 1678}
1679
1680void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
1681{
1682 down_write(&pmd->root_lock);
1683 pmd->read_only = true;
1684 dm_bm_set_read_only(pmd->bm);
1685 up_write(&pmd->root_lock);
1686}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index b88918ccdaf6..0cecc3702885 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -38,7 +38,8 @@ typedef uint64_t dm_thin_id;
38 * Reopens or creates a new, empty metadata volume. 38 * Reopens or creates a new, empty metadata volume.
39 */ 39 */
40struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, 40struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
41 sector_t data_block_size); 41 sector_t data_block_size,
42 bool format_device);
42 43
43int dm_pool_metadata_close(struct dm_pool_metadata *pmd); 44int dm_pool_metadata_close(struct dm_pool_metadata *pmd);
44 45
@@ -79,6 +80,16 @@ int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
79int dm_pool_commit_metadata(struct dm_pool_metadata *pmd); 80int dm_pool_commit_metadata(struct dm_pool_metadata *pmd);
80 81
81/* 82/*
83 * Discards all uncommitted changes. Rereads the superblock, rolling back
84 * to the last good transaction. Thin devices remain open.
85 * dm_thin_aborted_changes() tells you if they had uncommitted changes.
86 *
87 * If this call fails it's only useful to call dm_pool_metadata_close().
88 * All other methods will fail with -EINVAL.
89 */
90int dm_pool_abort_metadata(struct dm_pool_metadata *pmd);
91
92/*
82 * Set/get userspace transaction id. 93 * Set/get userspace transaction id.
83 */ 94 */
84int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, 95int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
@@ -119,7 +130,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td);
119 130
120struct dm_thin_lookup_result { 131struct dm_thin_lookup_result {
121 dm_block_t block; 132 dm_block_t block;
122 int shared; 133 unsigned shared:1;
123}; 134};
124 135
125/* 136/*
@@ -147,6 +158,10 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
147/* 158/*
148 * Queries. 159 * Queries.
149 */ 160 */
161bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
162
163bool dm_thin_aborted_changes(struct dm_thin_device *td);
164
150int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, 165int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
151 dm_block_t *highest_mapped); 166 dm_block_t *highest_mapped);
152 167
@@ -171,6 +186,12 @@ int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
171 */ 186 */
172int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); 187int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size);
173 188
189/*
190 * Flicks the underlying block manager into read only mode, so you know
191 * that nothing is changing.
192 */
193void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
194
174/*----------------------------------------------------------------*/ 195/*----------------------------------------------------------------*/
175 196
176#endif 197#endif
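
The comment block added above fixes the contract for dm_pool_abort_metadata(): uncommitted changes are thrown away, open thin devices survive, and if the abort itself fails only dm_pool_metadata_close() remains useful while every other call returns -EINVAL. A small stand-alone model of that life cycle (hypothetical names, not the kernel API):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum md_state { MD_OPEN, MD_FAILED, MD_CLOSED };

struct md {
	enum md_state state;
	bool changed;        /* uncommitted changes in the current transaction */
};

int md_abort(struct md *m, bool rebuild_ok)
{
	if (m->state != MD_OPEN)
		return -EINVAL;

	m->changed = false;            /* roll back to the last good transaction */
	if (!rebuild_ok) {
		m->state = MD_FAILED;  /* from here only md_close() is useful */
		return -EINVAL;
	}
	return 0;
}

int md_query(const struct md *m)
{
	return m->state == MD_OPEN ? 0 : -EINVAL;
}

int main(void)
{
	struct md m = { MD_OPEN, true };

	printf("abort: %d, later query: %d\n", md_abort(&m, false), md_query(&m));
	return 0;
}
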
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 68694da0d21d..af1fc3b2c2ad 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1,10 +1,11 @@
1/* 1/*
2 * Copyright (C) 2011 Red Hat UK. 2 * Copyright (C) 2011-2012 Red Hat UK.
3 * 3 *
4 * This file is released under the GPL. 4 * This file is released under the GPL.
5 */ 5 */
6 6
7#include "dm-thin-metadata.h" 7#include "dm-thin-metadata.h"
8#include "dm.h"
8 9
9#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
10#include <linux/dm-io.h> 11#include <linux/dm-io.h>
@@ -19,7 +20,7 @@
19/* 20/*
20 * Tunable constants 21 * Tunable constants
21 */ 22 */
22#define ENDIO_HOOK_POOL_SIZE 10240 23#define ENDIO_HOOK_POOL_SIZE 1024
23#define DEFERRED_SET_SIZE 64 24#define DEFERRED_SET_SIZE 64
24#define MAPPING_POOL_SIZE 1024 25#define MAPPING_POOL_SIZE 1024
25#define PRISON_CELLS 1024 26#define PRISON_CELLS 1024
@@ -496,12 +497,27 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
496 */ 497 */
497struct dm_thin_new_mapping; 498struct dm_thin_new_mapping;
498 499
500/*
501 * The pool runs in 3 modes. Ordered in degraded order for comparisons.
502 */
503enum pool_mode {
504 PM_WRITE, /* metadata may be changed */
505 PM_READ_ONLY, /* metadata may not be changed */
506 PM_FAIL, /* all I/O fails */
507};
508
499struct pool_features { 509struct pool_features {
510 enum pool_mode mode;
511
500 unsigned zero_new_blocks:1; 512 unsigned zero_new_blocks:1;
501 unsigned discard_enabled:1; 513 unsigned discard_enabled:1;
502 unsigned discard_passdown:1; 514 unsigned discard_passdown:1;
503}; 515};
504 516
517struct thin_c;
518typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
519typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
520
505struct pool { 521struct pool {
506 struct list_head list; 522 struct list_head list;
507 struct dm_target *ti; /* Only set if a pool target is bound */ 523 struct dm_target *ti; /* Only set if a pool target is bound */
@@ -510,10 +526,9 @@ struct pool {
510 struct block_device *md_dev; 526 struct block_device *md_dev;
511 struct dm_pool_metadata *pmd; 527 struct dm_pool_metadata *pmd;
512 528
513 uint32_t sectors_per_block;
514 unsigned block_shift;
515 dm_block_t offset_mask;
516 dm_block_t low_water_blocks; 529 dm_block_t low_water_blocks;
530 uint32_t sectors_per_block;
531 int sectors_per_block_shift;
517 532
518 struct pool_features pf; 533 struct pool_features pf;
519 unsigned low_water_triggered:1; /* A dm event has been sent */ 534 unsigned low_water_triggered:1; /* A dm event has been sent */
@@ -526,8 +541,8 @@ struct pool {
526 struct work_struct worker; 541 struct work_struct worker;
527 struct delayed_work waker; 542 struct delayed_work waker;
528 543
529 unsigned ref_count;
530 unsigned long last_commit_jiffies; 544 unsigned long last_commit_jiffies;
545 unsigned ref_count;
531 546
532 spinlock_t lock; 547 spinlock_t lock;
533 struct bio_list deferred_bios; 548 struct bio_list deferred_bios;
@@ -543,8 +558,17 @@ struct pool {
543 struct dm_thin_new_mapping *next_mapping; 558 struct dm_thin_new_mapping *next_mapping;
544 mempool_t *mapping_pool; 559 mempool_t *mapping_pool;
545 mempool_t *endio_hook_pool; 560 mempool_t *endio_hook_pool;
561
562 process_bio_fn process_bio;
563 process_bio_fn process_discard;
564
565 process_mapping_fn process_prepared_mapping;
566 process_mapping_fn process_prepared_discard;
546}; 567};
547 568
569static enum pool_mode get_pool_mode(struct pool *pool);
570static void set_pool_mode(struct pool *pool, enum pool_mode mode);
571
548/* 572/*
549 * Target context for a pool. 573 * Target context for a pool.
550 */ 574 */
@@ -679,16 +703,28 @@ static void requeue_io(struct thin_c *tc)
679 703
680static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) 704static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
681{ 705{
682 return bio->bi_sector >> tc->pool->block_shift; 706 sector_t block_nr = bio->bi_sector;
707
708 if (tc->pool->sectors_per_block_shift < 0)
709 (void) sector_div(block_nr, tc->pool->sectors_per_block);
710 else
711 block_nr >>= tc->pool->sectors_per_block_shift;
712
713 return block_nr;
683} 714}
684 715
685static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) 716static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
686{ 717{
687 struct pool *pool = tc->pool; 718 struct pool *pool = tc->pool;
719 sector_t bi_sector = bio->bi_sector;
688 720
689 bio->bi_bdev = tc->pool_dev->bdev; 721 bio->bi_bdev = tc->pool_dev->bdev;
690 bio->bi_sector = (block << pool->block_shift) + 722 if (tc->pool->sectors_per_block_shift < 0)
691 (bio->bi_sector & pool->offset_mask); 723 bio->bi_sector = (block * pool->sectors_per_block) +
724 sector_div(bi_sector, pool->sectors_per_block);
725 else
726 bio->bi_sector = (block << pool->sectors_per_block_shift) |
727 (bi_sector & (pool->sectors_per_block - 1));
692} 728}
693 729
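
get_bio_block() and remap() above are the core of the non-power-of-two block size support: when sectors_per_block_shift is negative the pool divides (sector_div), otherwise it keeps the cheaper shift/mask. A stand-alone check that the two forms agree for power-of-two sizes, with plain / and % standing in for sector_div:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Which pool block holds the given 512-byte sector. */
uint64_t bio_to_block(uint64_t sector, uint32_t sectors_per_block, int shift)
{
	return shift < 0 ? sector / sectors_per_block : sector >> shift;
}

/* Rebuild a pool-device sector from (block, offset within the block). */
uint64_t remap_sector(uint64_t sector, uint64_t block,
		      uint32_t sectors_per_block, int shift)
{
	if (shift < 0)
		return block * sectors_per_block + sector % sectors_per_block;
	return (block << shift) | (sector & (sectors_per_block - 1));
}

int main(void)
{
	/* 128-sector (64KiB) blocks: shift path and division path agree. */
	assert(bio_to_block(1000, 128, 7) == bio_to_block(1000, 128, -1));
	assert(remap_sector(1000, 7, 128, 7) == remap_sector(1000, 7, 128, -1));

	/* 384-sector blocks are not a power of two, so only the division path applies. */
	uint64_t b = bio_to_block(1000, 384, -1);
	printf("block %llu, remapped sector %llu\n",
	       (unsigned long long)b,
	       (unsigned long long)remap_sector(1000, b, 384, -1));  /* 2 and 1000 */
	return 0;
}
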
694static void remap_to_origin(struct thin_c *tc, struct bio *bio) 730static void remap_to_origin(struct thin_c *tc, struct bio *bio)
@@ -696,21 +732,39 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio)
696 bio->bi_bdev = tc->origin_dev->bdev; 732 bio->bi_bdev = tc->origin_dev->bdev;
697} 733}
698 734
735static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
736{
737 return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
738 dm_thin_changed_this_transaction(tc->td);
739}
740
699static void issue(struct thin_c *tc, struct bio *bio) 741static void issue(struct thin_c *tc, struct bio *bio)
700{ 742{
701 struct pool *pool = tc->pool; 743 struct pool *pool = tc->pool;
702 unsigned long flags; 744 unsigned long flags;
703 745
746 if (!bio_triggers_commit(tc, bio)) {
747 generic_make_request(bio);
748 return;
749 }
750
704 /* 751 /*
705 * Batch together any FUA/FLUSH bios we find and then issue 752 * Complete bio with an error if earlier I/O caused changes to
706 * a single commit for them in process_deferred_bios(). 753 * the metadata that can't be committed, e.g. due to I/O errors
754 * on the metadata device.
707 */ 755 */
708 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { 756 if (dm_thin_aborted_changes(tc->td)) {
709 spin_lock_irqsave(&pool->lock, flags); 757 bio_io_error(bio);
710 bio_list_add(&pool->deferred_flush_bios, bio); 758 return;
711 spin_unlock_irqrestore(&pool->lock, flags); 759 }
712 } else 760
713 generic_make_request(bio); 761 /*
762 * Batch together any bios that trigger commits and then issue a
763 * single commit for them in process_deferred_bios().
764 */
765 spin_lock_irqsave(&pool->lock, flags);
766 bio_list_add(&pool->deferred_flush_bios, bio);
767 spin_unlock_irqrestore(&pool->lock, flags);
714} 768}
715 769
716static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) 770static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
@@ -847,6 +901,14 @@ static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell
847 wake_worker(pool); 901 wake_worker(pool);
848} 902}
849 903
904static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
905{
906 if (m->bio)
907 m->bio->bi_end_io = m->saved_bi_end_io;
908 cell_error(m->cell);
909 list_del(&m->list);
910 mempool_free(m, m->tc->pool->mapping_pool);
911}
850static void process_prepared_mapping(struct dm_thin_new_mapping *m) 912static void process_prepared_mapping(struct dm_thin_new_mapping *m)
851{ 913{
852 struct thin_c *tc = m->tc; 914 struct thin_c *tc = m->tc;
@@ -859,7 +921,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
859 921
860 if (m->err) { 922 if (m->err) {
861 cell_error(m->cell); 923 cell_error(m->cell);
862 return; 924 goto out;
863 } 925 }
864 926
865 /* 927 /*
@@ -871,7 +933,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
871 if (r) { 933 if (r) {
872 DMERR("dm_thin_insert_block() failed"); 934 DMERR("dm_thin_insert_block() failed");
873 cell_error(m->cell); 935 cell_error(m->cell);
874 return; 936 goto out;
875 } 937 }
876 938
877 /* 939 /*
@@ -886,22 +948,25 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
886 } else 948 } else
887 cell_defer(tc, m->cell, m->data_block); 949 cell_defer(tc, m->cell, m->data_block);
888 950
951out:
889 list_del(&m->list); 952 list_del(&m->list);
890 mempool_free(m, tc->pool->mapping_pool); 953 mempool_free(m, tc->pool->mapping_pool);
891} 954}
892 955
893static void process_prepared_discard(struct dm_thin_new_mapping *m) 956static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
894{ 957{
895 int r;
896 struct thin_c *tc = m->tc; 958 struct thin_c *tc = m->tc;
897 959
898 r = dm_thin_remove_block(tc->td, m->virt_block); 960 bio_io_error(m->bio);
899 if (r) 961 cell_defer_except(tc, m->cell);
900 DMERR("dm_thin_remove_block() failed"); 962 cell_defer_except(tc, m->cell2);
963 mempool_free(m, tc->pool->mapping_pool);
964}
965
966static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
967{
968 struct thin_c *tc = m->tc;
901 969
902 /*
903 * Pass the discard down to the underlying device?
904 */
905 if (m->pass_discard) 970 if (m->pass_discard)
906 remap_and_issue(tc, m->bio, m->data_block); 971 remap_and_issue(tc, m->bio, m->data_block);
907 else 972 else
@@ -912,8 +977,20 @@ static void process_prepared_discard(struct dm_thin_new_mapping *m)
912 mempool_free(m, tc->pool->mapping_pool); 977 mempool_free(m, tc->pool->mapping_pool);
913} 978}
914 979
980static void process_prepared_discard(struct dm_thin_new_mapping *m)
981{
982 int r;
983 struct thin_c *tc = m->tc;
984
985 r = dm_thin_remove_block(tc->td, m->virt_block);
986 if (r)
987 DMERR("dm_thin_remove_block() failed");
988
989 process_prepared_discard_passdown(m);
990}
991
915static void process_prepared(struct pool *pool, struct list_head *head, 992static void process_prepared(struct pool *pool, struct list_head *head,
916 void (*fn)(struct dm_thin_new_mapping *)) 993 process_mapping_fn *fn)
917{ 994{
918 unsigned long flags; 995 unsigned long flags;
919 struct list_head maps; 996 struct list_head maps;
@@ -925,7 +1002,7 @@ static void process_prepared(struct pool *pool, struct list_head *head,
925 spin_unlock_irqrestore(&pool->lock, flags); 1002 spin_unlock_irqrestore(&pool->lock, flags);
926 1003
927 list_for_each_entry_safe(m, tmp, &maps, list) 1004 list_for_each_entry_safe(m, tmp, &maps, list)
928 fn(m); 1005 (*fn)(m);
929} 1006}
930 1007
931/* 1008/*
@@ -933,9 +1010,7 @@ static void process_prepared(struct pool *pool, struct list_head *head,
933 */ 1010 */
934static int io_overlaps_block(struct pool *pool, struct bio *bio) 1011static int io_overlaps_block(struct pool *pool, struct bio *bio)
935{ 1012{
936 return !(bio->bi_sector & pool->offset_mask) && 1013 return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
937 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
938
939} 1014}
940 1015
941static int io_overwrites_block(struct pool *pool, struct bio *bio) 1016static int io_overwrites_block(struct pool *pool, struct bio *bio)
@@ -1093,6 +1168,35 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
1093 } 1168 }
1094} 1169}
1095 1170
1171static int commit(struct pool *pool)
1172{
1173 int r;
1174
1175 r = dm_pool_commit_metadata(pool->pmd);
1176 if (r)
1177 DMERR("commit failed, error = %d", r);
1178
1179 return r;
1180}
1181
1182/*
1183 * A non-zero return indicates read_only or fail_io mode.
1184 * Many callers don't care about the return value.
1185 */
1186static int commit_or_fallback(struct pool *pool)
1187{
1188 int r;
1189
1190 if (get_pool_mode(pool) != PM_WRITE)
1191 return -EINVAL;
1192
1193 r = commit(pool);
1194 if (r)
1195 set_pool_mode(pool, PM_READ_ONLY);
1196
1197 return r;
1198}
1199
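
commit_or_fallback() above concentrates the failure policy in one helper: a pool only commits while it is still in PM_WRITE mode, and the first failed commit demotes it to PM_READ_ONLY so the metadata stops being dirtied. A compact sketch of the same policy with illustrative types:

#include <errno.h>

enum pool_mode { PM_WRITE, PM_READ_ONLY, PM_FAIL };

struct pool {
	enum pool_mode mode;
	int metadata_ok;       /* pretend health of the metadata device */
};

static int do_commit(struct pool *p)
{
	return p->metadata_ok ? 0 : -EIO;
}

int commit_or_fallback(struct pool *p)
{
	int r;

	if (p->mode != PM_WRITE)
		return -EINVAL;            /* read-only and failed pools never commit */

	r = do_commit(p);
	if (r)
		p->mode = PM_READ_ONLY;    /* degrade rather than keep writing */

	return r;
}

Callers such as process_deferred_bios() can therefore ignore the return value: the mode change itself reroutes later bios through the read-only handlers.
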
1096static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 1200static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1097{ 1201{
1098 int r; 1202 int r;
@@ -1121,12 +1225,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1121 * Try to commit to see if that will free up some 1225 * Try to commit to see if that will free up some
1122 * more space. 1226 * more space.
1123 */ 1227 */
1124 r = dm_pool_commit_metadata(pool->pmd); 1228 (void) commit_or_fallback(pool);
1125 if (r) {
1126 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1127 __func__, r);
1128 return r;
1129 }
1130 1229
1131 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1230 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1132 if (r) 1231 if (r)
@@ -1218,7 +1317,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
1218 */ 1317 */
1219 m = get_next_mapping(pool); 1318 m = get_next_mapping(pool);
1220 m->tc = tc; 1319 m->tc = tc;
1221 m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; 1320 m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
1222 m->virt_block = block; 1321 m->virt_block = block;
1223 m->data_block = lookup_result.block; 1322 m->data_block = lookup_result.block;
1224 m->cell = cell; 1323 m->cell = cell;
@@ -1234,15 +1333,10 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
1234 } 1333 }
1235 } else { 1334 } else {
1236 /* 1335 /*
1237 * This path is hit if people are ignoring 1336 * The DM core makes sure that the discard doesn't span
1238 * limits->discard_granularity. It ignores any 1337 * a block boundary. So we submit the discard of a
1239 * part of the discard that is in a subsequent 1338 * partial block appropriately.
1240 * block.
1241 */ 1339 */
1242 sector_t offset = bio->bi_sector - (block << pool->block_shift);
1243 unsigned remaining = (pool->sectors_per_block - offset) << 9;
1244 bio->bi_size = min(bio->bi_size, remaining);
1245
1246 cell_release_singleton(cell, bio); 1340 cell_release_singleton(cell, bio);
1247 cell_release_singleton(cell2, bio); 1341 cell_release_singleton(cell2, bio);
1248 if ((!lookup_result.shared) && pool->pf.discard_passdown) 1342 if ((!lookup_result.shared) && pool->pf.discard_passdown)
@@ -1310,7 +1404,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1310 if (bio_detain(pool->prison, &key, bio, &cell)) 1404 if (bio_detain(pool->prison, &key, bio, &cell))
1311 return; 1405 return;
1312 1406
1313 if (bio_data_dir(bio) == WRITE) 1407 if (bio_data_dir(bio) == WRITE && bio->bi_size)
1314 break_sharing(tc, bio, block, &key, lookup_result, cell); 1408 break_sharing(tc, bio, block, &key, lookup_result, cell);
1315 else { 1409 else {
1316 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1410 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
@@ -1362,6 +1456,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1362 1456
1363 default: 1457 default:
1364 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1458 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1459 set_pool_mode(tc->pool, PM_READ_ONLY);
1365 cell_error(cell); 1460 cell_error(cell);
1366 break; 1461 break;
1367 } 1462 }
@@ -1419,6 +1514,49 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
1419 } 1514 }
1420} 1515}
1421 1516
1517static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1518{
1519 int r;
1520 int rw = bio_data_dir(bio);
1521 dm_block_t block = get_bio_block(tc, bio);
1522 struct dm_thin_lookup_result lookup_result;
1523
1524 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1525 switch (r) {
1526 case 0:
1527 if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
1528 bio_io_error(bio);
1529 else
1530 remap_and_issue(tc, bio, lookup_result.block);
1531 break;
1532
1533 case -ENODATA:
1534 if (rw != READ) {
1535 bio_io_error(bio);
1536 break;
1537 }
1538
1539 if (tc->origin_dev) {
1540 remap_to_origin_and_issue(tc, bio);
1541 break;
1542 }
1543
1544 zero_fill_bio(bio);
1545 bio_endio(bio, 0);
1546 break;
1547
1548 default:
1549 DMERR("dm_thin_find_block() failed, error = %d", r);
1550 bio_io_error(bio);
1551 break;
1552 }
1553}
1554
1555static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1556{
1557 bio_io_error(bio);
1558}
1559
1422static int need_commit_due_to_time(struct pool *pool) 1560static int need_commit_due_to_time(struct pool *pool)
1423{ 1561{
1424 return jiffies < pool->last_commit_jiffies || 1562 return jiffies < pool->last_commit_jiffies ||
@@ -1430,7 +1568,6 @@ static void process_deferred_bios(struct pool *pool)
1430 unsigned long flags; 1568 unsigned long flags;
1431 struct bio *bio; 1569 struct bio *bio;
1432 struct bio_list bios; 1570 struct bio_list bios;
1433 int r;
1434 1571
1435 bio_list_init(&bios); 1572 bio_list_init(&bios);
1436 1573
@@ -1457,9 +1594,9 @@ static void process_deferred_bios(struct pool *pool)
1457 } 1594 }
1458 1595
1459 if (bio->bi_rw & REQ_DISCARD) 1596 if (bio->bi_rw & REQ_DISCARD)
1460 process_discard(tc, bio); 1597 pool->process_discard(tc, bio);
1461 else 1598 else
1462 process_bio(tc, bio); 1599 pool->process_bio(tc, bio);
1463 } 1600 }
1464 1601
1465 /* 1602 /*
@@ -1475,10 +1612,7 @@ static void process_deferred_bios(struct pool *pool)
1475 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) 1612 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1476 return; 1613 return;
1477 1614
1478 r = dm_pool_commit_metadata(pool->pmd); 1615 if (commit_or_fallback(pool)) {
1479 if (r) {
1480 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1481 __func__, r);
1482 while ((bio = bio_list_pop(&bios))) 1616 while ((bio = bio_list_pop(&bios)))
1483 bio_io_error(bio); 1617 bio_io_error(bio);
1484 return; 1618 return;
@@ -1493,8 +1627,8 @@ static void do_worker(struct work_struct *ws)
1493{ 1627{
1494 struct pool *pool = container_of(ws, struct pool, worker); 1628 struct pool *pool = container_of(ws, struct pool, worker);
1495 1629
1496 process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); 1630 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1497 process_prepared(pool, &pool->prepared_discards, process_prepared_discard); 1631 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1498 process_deferred_bios(pool); 1632 process_deferred_bios(pool);
1499} 1633}
1500 1634
@@ -1511,6 +1645,52 @@ static void do_waker(struct work_struct *ws)
1511 1645
1512/*----------------------------------------------------------------*/ 1646/*----------------------------------------------------------------*/
1513 1647
1648static enum pool_mode get_pool_mode(struct pool *pool)
1649{
1650 return pool->pf.mode;
1651}
1652
1653static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1654{
1655 int r;
1656
1657 pool->pf.mode = mode;
1658
1659 switch (mode) {
1660 case PM_FAIL:
1661 DMERR("switching pool to failure mode");
1662 pool->process_bio = process_bio_fail;
1663 pool->process_discard = process_bio_fail;
1664 pool->process_prepared_mapping = process_prepared_mapping_fail;
1665 pool->process_prepared_discard = process_prepared_discard_fail;
1666 break;
1667
1668 case PM_READ_ONLY:
1669 DMERR("switching pool to read-only mode");
1670 r = dm_pool_abort_metadata(pool->pmd);
1671 if (r) {
1672 DMERR("aborting transaction failed");
1673 set_pool_mode(pool, PM_FAIL);
1674 } else {
1675 dm_pool_metadata_read_only(pool->pmd);
1676 pool->process_bio = process_bio_read_only;
1677 pool->process_discard = process_discard;
1678 pool->process_prepared_mapping = process_prepared_mapping_fail;
1679 pool->process_prepared_discard = process_prepared_discard_passdown;
1680 }
1681 break;
1682
1683 case PM_WRITE:
1684 pool->process_bio = process_bio;
1685 pool->process_discard = process_discard;
1686 pool->process_prepared_mapping = process_prepared_mapping;
1687 pool->process_prepared_discard = process_prepared_discard;
1688 break;
1689 }
1690}
1691
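
set_pool_mode() above changes behaviour by repointing the pool's process_* callbacks instead of testing the mode in every I/O path; entering PM_READ_ONLY also aborts the open transaction and freezes the block manager. A stand-alone sketch of the same dispatch idea (the never-upgrade comparison that bind_control_target() applies is folded into this helper for brevity; all names are illustrative):

#include <stdio.h>

enum pool_mode { PM_WRITE, PM_READ_ONLY, PM_FAIL };   /* ordered by degradation */

struct bio;
typedef void (*process_bio_fn)(struct bio *bio);

static void process_bio_normal(struct bio *bio)    { printf("remap and issue\n"); }
static void process_bio_read_only(struct bio *bio) { printf("serve reads, error new writes\n"); }
static void process_bio_fail(struct bio *bio)      { printf("error every bio\n"); }

struct pool {
	enum pool_mode mode;
	process_bio_fn process_bio;
};

void set_pool_mode(struct pool *pool, enum pool_mode mode)
{
	if (pool->mode > mode)
		mode = pool->mode;     /* never upgrade a degraded pool */

	pool->mode = mode;
	switch (mode) {
	case PM_WRITE:     pool->process_bio = process_bio_normal;    break;
	case PM_READ_ONLY: pool->process_bio = process_bio_read_only; break;
	case PM_FAIL:      pool->process_bio = process_bio_fail;      break;
	}
}

int main(void)
{
	struct pool p = { PM_WRITE, process_bio_normal };

	set_pool_mode(&p, PM_READ_ONLY);
	set_pool_mode(&p, PM_WRITE);   /* ignored: the pool stays read-only */
	p.process_bio(NULL);
	return 0;
}
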
1692/*----------------------------------------------------------------*/
1693
1514/* 1694/*
1515 * Mapping functions. 1695 * Mapping functions.
1516 */ 1696 */
@@ -1556,6 +1736,12 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1556 struct dm_thin_lookup_result result; 1736 struct dm_thin_lookup_result result;
1557 1737
1558 map_context->ptr = thin_hook_bio(tc, bio); 1738 map_context->ptr = thin_hook_bio(tc, bio);
1739
1740 if (get_pool_mode(tc->pool) == PM_FAIL) {
1741 bio_io_error(bio);
1742 return DM_MAPIO_SUBMITTED;
1743 }
1744
1559 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { 1745 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1560 thin_defer_bio(tc, bio); 1746 thin_defer_bio(tc, bio);
1561 return DM_MAPIO_SUBMITTED; 1747 return DM_MAPIO_SUBMITTED;
@@ -1592,14 +1778,35 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1592 break; 1778 break;
1593 1779
1594 case -ENODATA: 1780 case -ENODATA:
1781 if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1782 /*
1783 * This block isn't provisioned, and we have no way
1784 * of doing so. Just error it.
1785 */
1786 bio_io_error(bio);
1787 r = DM_MAPIO_SUBMITTED;
1788 break;
1789 }
1790 /* fall through */
1791
1792 case -EWOULDBLOCK:
1595 /* 1793 /*
1596 * In future, the failed dm_thin_find_block above could 1794 * In future, the failed dm_thin_find_block above could
1597 * provide the hint to load the metadata into cache. 1795 * provide the hint to load the metadata into cache.
1598 */ 1796 */
1599 case -EWOULDBLOCK:
1600 thin_defer_bio(tc, bio); 1797 thin_defer_bio(tc, bio);
1601 r = DM_MAPIO_SUBMITTED; 1798 r = DM_MAPIO_SUBMITTED;
1602 break; 1799 break;
1800
1801 default:
1802 /*
1803 * Must always call bio_io_error on failure.
1804 * dm_thin_find_block can fail with -EINVAL if the
1805 * pool is switched to fail-io mode.
1806 */
1807 bio_io_error(bio);
1808 r = DM_MAPIO_SUBMITTED;
1809 break;
1603 } 1810 }
1604 1811
1605 return r; 1812 return r;
@@ -1636,15 +1843,26 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1636{ 1843{
1637 struct pool_c *pt = ti->private; 1844 struct pool_c *pt = ti->private;
1638 1845
1846 /*
1847 * We want to make sure that degraded pools are never upgraded.
1848 */
1849 enum pool_mode old_mode = pool->pf.mode;
1850 enum pool_mode new_mode = pt->pf.mode;
1851
1852 if (old_mode > new_mode)
1853 new_mode = old_mode;
1854
1639 pool->ti = ti; 1855 pool->ti = ti;
1640 pool->low_water_blocks = pt->low_water_blocks; 1856 pool->low_water_blocks = pt->low_water_blocks;
1641 pool->pf = pt->pf; 1857 pool->pf = pt->pf;
1858 set_pool_mode(pool, new_mode);
1642 1859
1643 /* 1860 /*
1644 * If discard_passdown was enabled verify that the data device 1861 * If discard_passdown was enabled verify that the data device
1645 * supports discards. Disable discard_passdown if not; otherwise 1862 * supports discards. Disable discard_passdown if not; otherwise
1646 * -EOPNOTSUPP will be returned. 1863 * -EOPNOTSUPP will be returned.
1647 */ 1864 */
1865 /* FIXME: pull this out into a sep fn. */
1648 if (pt->pf.discard_passdown) { 1866 if (pt->pf.discard_passdown) {
1649 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1867 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1650 if (!q || !blk_queue_discard(q)) { 1868 if (!q || !blk_queue_discard(q)) {
@@ -1670,6 +1888,7 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1670/* Initialize pool features. */ 1888/* Initialize pool features. */
1671static void pool_features_init(struct pool_features *pf) 1889static void pool_features_init(struct pool_features *pf)
1672{ 1890{
1891 pf->mode = PM_WRITE;
1673 pf->zero_new_blocks = 1; 1892 pf->zero_new_blocks = 1;
1674 pf->discard_enabled = 1; 1893 pf->discard_enabled = 1;
1675 pf->discard_passdown = 1; 1894 pf->discard_passdown = 1;
@@ -1700,14 +1919,16 @@ static struct kmem_cache *_endio_hook_cache;
1700 1919
1701static struct pool *pool_create(struct mapped_device *pool_md, 1920static struct pool *pool_create(struct mapped_device *pool_md,
1702 struct block_device *metadata_dev, 1921 struct block_device *metadata_dev,
1703 unsigned long block_size, char **error) 1922 unsigned long block_size,
1923 int read_only, char **error)
1704{ 1924{
1705 int r; 1925 int r;
1706 void *err_p; 1926 void *err_p;
1707 struct pool *pool; 1927 struct pool *pool;
1708 struct dm_pool_metadata *pmd; 1928 struct dm_pool_metadata *pmd;
1929 bool format_device = read_only ? false : true;
1709 1930
1710 pmd = dm_pool_metadata_open(metadata_dev, block_size); 1931 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
1711 if (IS_ERR(pmd)) { 1932 if (IS_ERR(pmd)) {
1712 *error = "Error creating metadata object"; 1933 *error = "Error creating metadata object";
1713 return (struct pool *)pmd; 1934 return (struct pool *)pmd;
@@ -1722,8 +1943,10 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1722 1943
1723 pool->pmd = pmd; 1944 pool->pmd = pmd;
1724 pool->sectors_per_block = block_size; 1945 pool->sectors_per_block = block_size;
1725 pool->block_shift = ffs(block_size) - 1; 1946 if (block_size & (block_size - 1))
1726 pool->offset_mask = block_size - 1; 1947 pool->sectors_per_block_shift = -1;
1948 else
1949 pool->sectors_per_block_shift = __ffs(block_size);
1727 pool->low_water_blocks = 0; 1950 pool->low_water_blocks = 0;
1728 pool_features_init(&pool->pf); 1951 pool_features_init(&pool->pf);
1729 pool->prison = prison_create(PRISON_CELLS); 1952 pool->prison = prison_create(PRISON_CELLS);
@@ -1822,25 +2045,29 @@ static void __pool_dec(struct pool *pool)
1822 2045
1823static struct pool *__pool_find(struct mapped_device *pool_md, 2046static struct pool *__pool_find(struct mapped_device *pool_md,
1824 struct block_device *metadata_dev, 2047 struct block_device *metadata_dev,
1825 unsigned long block_size, char **error, 2048 unsigned long block_size, int read_only,
1826 int *created) 2049 char **error, int *created)
1827{ 2050{
1828 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 2051 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1829 2052
1830 if (pool) { 2053 if (pool) {
1831 if (pool->pool_md != pool_md) 2054 if (pool->pool_md != pool_md) {
2055 *error = "metadata device already in use by a pool";
1832 return ERR_PTR(-EBUSY); 2056 return ERR_PTR(-EBUSY);
2057 }
1833 __pool_inc(pool); 2058 __pool_inc(pool);
1834 2059
1835 } else { 2060 } else {
1836 pool = __pool_table_lookup(pool_md); 2061 pool = __pool_table_lookup(pool_md);
1837 if (pool) { 2062 if (pool) {
1838 if (pool->md_dev != metadata_dev) 2063 if (pool->md_dev != metadata_dev) {
2064 *error = "different pool cannot replace a pool";
1839 return ERR_PTR(-EINVAL); 2065 return ERR_PTR(-EINVAL);
2066 }
1840 __pool_inc(pool); 2067 __pool_inc(pool);
1841 2068
1842 } else { 2069 } else {
1843 pool = pool_create(pool_md, metadata_dev, block_size, error); 2070 pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
1844 *created = 1; 2071 *created = 1;
1845 } 2072 }
1846 } 2073 }
@@ -1891,19 +2118,23 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1891 arg_name = dm_shift_arg(as); 2118 arg_name = dm_shift_arg(as);
1892 argc--; 2119 argc--;
1893 2120
1894 if (!strcasecmp(arg_name, "skip_block_zeroing")) { 2121 if (!strcasecmp(arg_name, "skip_block_zeroing"))
1895 pf->zero_new_blocks = 0; 2122 pf->zero_new_blocks = 0;
1896 continue; 2123
1897 } else if (!strcasecmp(arg_name, "ignore_discard")) { 2124 else if (!strcasecmp(arg_name, "ignore_discard"))
1898 pf->discard_enabled = 0; 2125 pf->discard_enabled = 0;
1899 continue; 2126
1900 } else if (!strcasecmp(arg_name, "no_discard_passdown")) { 2127 else if (!strcasecmp(arg_name, "no_discard_passdown"))
1901 pf->discard_passdown = 0; 2128 pf->discard_passdown = 0;
1902 continue;
1903 }
1904 2129
1905 ti->error = "Unrecognised pool feature requested"; 2130 else if (!strcasecmp(arg_name, "read_only"))
1906 r = -EINVAL; 2131 pf->mode = PM_READ_ONLY;
2132
2133 else {
2134 ti->error = "Unrecognised pool feature requested";
2135 r = -EINVAL;
2136 break;
2137 }
1907 } 2138 }
1908 2139
1909 return r; 2140 return r;
@@ -1967,7 +2198,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1967 if (kstrtoul(argv[2], 10, &block_size) || !block_size || 2198 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1968 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2199 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1969 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2200 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1970 !is_power_of_2(block_size)) { 2201 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1971 ti->error = "Invalid block size"; 2202 ti->error = "Invalid block size";
1972 r = -EINVAL; 2203 r = -EINVAL;
1973 goto out; 2204 goto out;
@@ -1996,7 +2227,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1996 } 2227 }
1997 2228
1998 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 2229 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1999 block_size, &ti->error, &pool_created); 2230 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
2000 if (IS_ERR(pool)) { 2231 if (IS_ERR(pool)) {
2001 r = PTR_ERR(pool); 2232 r = PTR_ERR(pool);
2002 goto out_free_pt; 2233 goto out_free_pt;
@@ -2014,6 +2245,15 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2014 goto out_flags_changed; 2245 goto out_flags_changed;
2015 } 2246 }
2016 2247
2248 /*
2249 * The block layer requires discard_granularity to be a power of 2.
2250 */
2251 if (pf.discard_enabled && !is_power_of_2(block_size)) {
2252 ti->error = "Discard support must be disabled when the block size is not a power of 2";
2253 r = -EINVAL;
2254 goto out_flags_changed;
2255 }
2256
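
Taken together, the two checks above replace the old blanket is_power_of_2() requirement: any block size that is a multiple of DATA_DEV_BLOCK_SIZE_MIN_SECTORS and inside the min/max range is accepted, but enabling discards still demands a power of two because the block layer insists discard_granularity be one. A small validator capturing both rules; the 64KiB/1GiB limits are assumed from the surrounding dm-thin.c definitions:

#include <stdbool.h>
#include <stdio.h>

#define BLOCK_SIZE_MIN_SECTORS 128u        /* 64KiB, assumed */
#define BLOCK_SIZE_MAX_SECTORS 2097152u    /* 1GiB, assumed */

static bool is_power_of_2(unsigned long n)
{
	return n && !(n & (n - 1));
}

bool block_size_valid(unsigned long block_size, bool discard_enabled)
{
	if (block_size < BLOCK_SIZE_MIN_SECTORS ||
	    block_size > BLOCK_SIZE_MAX_SECTORS ||
	    (block_size & (BLOCK_SIZE_MIN_SECTORS - 1)))
		return false;              /* must be a multiple of the minimum */

	if (discard_enabled && !is_power_of_2(block_size))
		return false;              /* discard_granularity must be a power of 2 */

	return true;
}

int main(void)
{
	printf("384 sectors, no discard: %d\n", block_size_valid(384, false));  /* 1 */
	printf("384 sectors, discard:    %d\n", block_size_valid(384, true));   /* 0 */
	printf("256 sectors, discard:    %d\n", block_size_valid(256, true));   /* 1 */
	return 0;
}
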
2017 pt->pool = pool; 2257 pt->pool = pool;
2018 pt->ti = ti; 2258 pt->ti = ti;
2019 pt->metadata_dev = metadata_dev; 2259 pt->metadata_dev = metadata_dev;
@@ -2033,7 +2273,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2033 * stacking of discard limits (this keeps the pool and 2273 * stacking of discard limits (this keeps the pool and
2034 * thin devices' discard limits consistent). 2274 * thin devices' discard limits consistent).
2035 */ 2275 */
2036 ti->discards_supported = 1; 2276 ti->discards_supported = true;
2037 } 2277 }
2038 ti->private = pt; 2278 ti->private = pt;
2039 2279
@@ -2093,7 +2333,8 @@ static int pool_preresume(struct dm_target *ti)
2093 int r; 2333 int r;
2094 struct pool_c *pt = ti->private; 2334 struct pool_c *pt = ti->private;
2095 struct pool *pool = pt->pool; 2335 struct pool *pool = pt->pool;
2096 dm_block_t data_size, sb_data_size; 2336 sector_t data_size = ti->len;
2337 dm_block_t sb_data_size;
2097 2338
2098 /* 2339 /*
2099 * Take control of the pool object. 2340 * Take control of the pool object.
@@ -2102,7 +2343,8 @@ static int pool_preresume(struct dm_target *ti)
2102 if (r) 2343 if (r)
2103 return r; 2344 return r;
2104 2345
2105 data_size = ti->len >> pool->block_shift; 2346 (void) sector_div(data_size, pool->sectors_per_block);
2347
2106 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); 2348 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2107 if (r) { 2349 if (r) {
2108 DMERR("failed to retrieve data device size"); 2350 DMERR("failed to retrieve data device size");
@@ -2111,22 +2353,19 @@ static int pool_preresume(struct dm_target *ti)
2111 2353
2112 if (data_size < sb_data_size) { 2354 if (data_size < sb_data_size) {
2113 DMERR("pool target too small, is %llu blocks (expected %llu)", 2355 DMERR("pool target too small, is %llu blocks (expected %llu)",
2114 data_size, sb_data_size); 2356 (unsigned long long)data_size, sb_data_size);
2115 return -EINVAL; 2357 return -EINVAL;
2116 2358
2117 } else if (data_size > sb_data_size) { 2359 } else if (data_size > sb_data_size) {
2118 r = dm_pool_resize_data_dev(pool->pmd, data_size); 2360 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2119 if (r) { 2361 if (r) {
2120 DMERR("failed to resize data device"); 2362 DMERR("failed to resize data device");
2363 /* FIXME Stricter than necessary: Rollback transaction instead here */
2364 set_pool_mode(pool, PM_READ_ONLY);
2121 return r; 2365 return r;
2122 } 2366 }
2123 2367
2124 r = dm_pool_commit_metadata(pool->pmd); 2368 (void) commit_or_fallback(pool);
2125 if (r) {
2126 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
2127 __func__, r);
2128 return r;
2129 }
2130 } 2369 }
2131 2370
2132 return 0; 2371 return 0;
@@ -2149,19 +2388,12 @@ static void pool_resume(struct dm_target *ti)
2149 2388
2150static void pool_postsuspend(struct dm_target *ti) 2389static void pool_postsuspend(struct dm_target *ti)
2151{ 2390{
2152 int r;
2153 struct pool_c *pt = ti->private; 2391 struct pool_c *pt = ti->private;
2154 struct pool *pool = pt->pool; 2392 struct pool *pool = pt->pool;
2155 2393
2156 cancel_delayed_work(&pool->waker); 2394 cancel_delayed_work(&pool->waker);
2157 flush_workqueue(pool->wq); 2395 flush_workqueue(pool->wq);
2158 2396 (void) commit_or_fallback(pool);
2159 r = dm_pool_commit_metadata(pool->pmd);
2160 if (r < 0) {
2161 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
2162 __func__, r);
2163 /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
2164 }
2165} 2397}
2166 2398
2167static int check_arg_count(unsigned argc, unsigned args_required) 2399static int check_arg_count(unsigned argc, unsigned args_required)
@@ -2295,12 +2527,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct
2295 if (r) 2527 if (r)
2296 return r; 2528 return r;
2297 2529
2298 r = dm_pool_commit_metadata(pool->pmd); 2530 (void) commit_or_fallback(pool);
2299 if (r) {
2300 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
2301 __func__, r);
2302 return r;
2303 }
2304 2531
2305 r = dm_pool_reserve_metadata_snap(pool->pmd); 2532 r = dm_pool_reserve_metadata_snap(pool->pmd);
2306 if (r) 2533 if (r)
@@ -2361,25 +2588,41 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2361 else 2588 else
2362 DMWARN("Unrecognised thin pool target message received: %s", argv[0]); 2589 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2363 2590
2364 if (!r) { 2591 if (!r)
2365 r = dm_pool_commit_metadata(pool->pmd); 2592 (void) commit_or_fallback(pool);
2366 if (r)
2367 DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
2368 argv[0], r);
2369 }
2370 2593
2371 return r; 2594 return r;
2372} 2595}
2373 2596
2597static void emit_flags(struct pool_features *pf, char *result,
2598 unsigned sz, unsigned maxlen)
2599{
2600 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2601 !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
2602 DMEMIT("%u ", count);
2603
2604 if (!pf->zero_new_blocks)
2605 DMEMIT("skip_block_zeroing ");
2606
2607 if (!pf->discard_enabled)
2608 DMEMIT("ignore_discard ");
2609
2610 if (!pf->discard_passdown)
2611 DMEMIT("no_discard_passdown ");
2612
2613 if (pf->mode == PM_READ_ONLY)
2614 DMEMIT("read_only ");
2615}
2616
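
emit_flags() prints the number of feature words before the words themselves, so the count is simply the sum of the conditions that will emit something, now including read_only. A stand-alone version that builds the same table-line fragment into a buffer, with snprintf bookkeeping in place of the DMEMIT macro:

#include <stdio.h>

struct pool_features {
	int zero_new_blocks, discard_enabled, discard_passdown, read_only;
};

void emit_flags(const struct pool_features *pf, char *result, size_t maxlen)
{
	unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
			 !pf->discard_passdown + pf->read_only;
	int sz = snprintf(result, maxlen, "%u ", count);

	if (!pf->zero_new_blocks)
		sz += snprintf(result + sz, maxlen - sz, "skip_block_zeroing ");
	if (!pf->discard_enabled)
		sz += snprintf(result + sz, maxlen - sz, "ignore_discard ");
	if (!pf->discard_passdown)
		sz += snprintf(result + sz, maxlen - sz, "no_discard_passdown ");
	if (pf->read_only)
		sz += snprintf(result + sz, maxlen - sz, "read_only ");
}

int main(void)
{
	char buf[64];
	struct pool_features pf = { 1, 1, 0, 1 };   /* passdown off, read-only on */

	emit_flags(&pf, buf, sizeof(buf));
	printf("%s\n", buf);                        /* "2 no_discard_passdown read_only " */
	return 0;
}
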
2374/* 2617/*
2375 * Status line is: 2618 * Status line is:
2376 * <transaction id> <used metadata sectors>/<total metadata sectors> 2619 * <transaction id> <used metadata sectors>/<total metadata sectors>
2377 * <used data sectors>/<total data sectors> <held metadata root> 2620 * <used data sectors>/<total data sectors> <held metadata root>
2378 */ 2621 */
2379static int pool_status(struct dm_target *ti, status_type_t type, 2622static int pool_status(struct dm_target *ti, status_type_t type,
2380 char *result, unsigned maxlen) 2623 unsigned status_flags, char *result, unsigned maxlen)
2381{ 2624{
2382 int r, count; 2625 int r;
2383 unsigned sz = 0; 2626 unsigned sz = 0;
2384 uint64_t transaction_id; 2627 uint64_t transaction_id;
2385 dm_block_t nr_free_blocks_data; 2628 dm_block_t nr_free_blocks_data;
@@ -2394,6 +2637,15 @@ static int pool_status(struct dm_target *ti, status_type_t type,
2394 2637
2395 switch (type) { 2638 switch (type) {
2396 case STATUSTYPE_INFO: 2639 case STATUSTYPE_INFO:
2640 if (get_pool_mode(pool) == PM_FAIL) {
2641 DMEMIT("Fail");
2642 break;
2643 }
2644
2645 /* Commit to ensure statistics aren't out-of-date */
2646 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2647 (void) commit_or_fallback(pool);
2648
2397 r = dm_pool_get_metadata_transaction_id(pool->pmd, 2649 r = dm_pool_get_metadata_transaction_id(pool->pmd,
2398 &transaction_id); 2650 &transaction_id);
2399 if (r) 2651 if (r)
@@ -2429,9 +2681,19 @@ static int pool_status(struct dm_target *ti, status_type_t type,
2429 (unsigned long long)nr_blocks_data); 2681 (unsigned long long)nr_blocks_data);
2430 2682
2431 if (held_root) 2683 if (held_root)
2432 DMEMIT("%llu", held_root); 2684 DMEMIT("%llu ", held_root);
2685 else
2686 DMEMIT("- ");
2687
2688 if (pool->pf.mode == PM_READ_ONLY)
2689 DMEMIT("ro ");
2690 else
2691 DMEMIT("rw ");
2692
2693 if (pool->pf.discard_enabled && pool->pf.discard_passdown)
2694 DMEMIT("discard_passdown");
2433 else 2695 else
2434 DMEMIT("-"); 2696 DMEMIT("no_discard_passdown");
2435 2697
2436 break; 2698 break;
2437 2699
@@ -2441,20 +2703,7 @@ static int pool_status(struct dm_target *ti, status_type_t type,
2441 format_dev_t(buf2, pt->data_dev->bdev->bd_dev), 2703 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2442 (unsigned long)pool->sectors_per_block, 2704 (unsigned long)pool->sectors_per_block,
2443 (unsigned long long)pt->low_water_blocks); 2705 (unsigned long long)pt->low_water_blocks);
2444 2706 emit_flags(&pt->pf, result, sz, maxlen);
2445 count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
2446 !pt->pf.discard_passdown;
2447 DMEMIT("%u ", count);
2448
2449 if (!pool->pf.zero_new_blocks)
2450 DMEMIT("skip_block_zeroing ");
2451
2452 if (!pool->pf.discard_enabled)
2453 DMEMIT("ignore_discard ");
2454
2455 if (!pt->pf.discard_passdown)
2456 DMEMIT("no_discard_passdown ");
2457
2458 break; 2707 break;
2459 } 2708 }
2460 2709
@@ -2492,7 +2741,8 @@ static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
2492 2741
2493 /* 2742 /*
2494 * This is just a hint, and not enforced. We have to cope with 2743 * This is just a hint, and not enforced. We have to cope with
2495 * bios that overlap 2 blocks. 2744 * bios that cover a block partially. A discard that spans a block
2745 * boundary is not sent to this target.
2496 */ 2746 */
2497 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; 2747 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2498 limits->discard_zeroes_data = pool->pf.zero_new_blocks; 2748 limits->discard_zeroes_data = pool->pf.zero_new_blocks;
@@ -2513,7 +2763,7 @@ static struct target_type pool_target = {
2513 .name = "thin-pool", 2763 .name = "thin-pool",
2514 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2764 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2515 DM_TARGET_IMMUTABLE, 2765 DM_TARGET_IMMUTABLE,
2516 .version = {1, 2, 0}, 2766 .version = {1, 3, 0},
2517 .module = THIS_MODULE, 2767 .module = THIS_MODULE,
2518 .ctr = pool_ctr, 2768 .ctr = pool_ctr,
2519 .dtr = pool_dtr, 2769 .dtr = pool_dtr,
@@ -2618,20 +2868,31 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2618 } 2868 }
2619 __pool_inc(tc->pool); 2869 __pool_inc(tc->pool);
2620 2870
2871 if (get_pool_mode(tc->pool) == PM_FAIL) {
2872 ti->error = "Couldn't open thin device, Pool is in fail mode";
2873 goto bad_thin_open;
2874 }
2875
2621 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); 2876 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2622 if (r) { 2877 if (r) {
2623 ti->error = "Couldn't open thin internal device"; 2878 ti->error = "Couldn't open thin internal device";
2624 goto bad_thin_open; 2879 goto bad_thin_open;
2625 } 2880 }
2626 2881
2627 ti->split_io = tc->pool->sectors_per_block; 2882 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2883 if (r)
2884 goto bad_thin_open;
2885
2628 ti->num_flush_requests = 1; 2886 ti->num_flush_requests = 1;
2887 ti->flush_supported = true;
2629 2888
2630 /* In case the pool supports discards, pass them on. */ 2889 /* In case the pool supports discards, pass them on. */
2631 if (tc->pool->pf.discard_enabled) { 2890 if (tc->pool->pf.discard_enabled) {
2632 ti->discards_supported = 1; 2891 ti->discards_supported = true;
2633 ti->num_discard_requests = 1; 2892 ti->num_discard_requests = 1;
2634 ti->discard_zeroes_data_unsupported = 1; 2893 ti->discard_zeroes_data_unsupported = true;
2894 /* Discard requests must be split on a block boundary */
2895 ti->split_discard_requests = true;
2635 } 2896 }
2636 2897
2637 dm_put(pool_md); 2898 dm_put(pool_md);
@@ -2712,7 +2973,7 @@ static void thin_postsuspend(struct dm_target *ti)
2712 * <nr mapped sectors> <highest mapped sector> 2973 * <nr mapped sectors> <highest mapped sector>
2713 */ 2974 */
2714static int thin_status(struct dm_target *ti, status_type_t type, 2975static int thin_status(struct dm_target *ti, status_type_t type,
2715 char *result, unsigned maxlen) 2976 unsigned status_flags, char *result, unsigned maxlen)
2716{ 2977{
2717 int r; 2978 int r;
2718 ssize_t sz = 0; 2979 ssize_t sz = 0;
@@ -2720,6 +2981,11 @@ static int thin_status(struct dm_target *ti, status_type_t type,
2720 char buf[BDEVNAME_SIZE]; 2981 char buf[BDEVNAME_SIZE];
2721 struct thin_c *tc = ti->private; 2982 struct thin_c *tc = ti->private;
2722 2983
2984 if (get_pool_mode(tc->pool) == PM_FAIL) {
2985 DMEMIT("Fail");
2986 return 0;
2987 }
2988
2723 if (!tc->td) 2989 if (!tc->td)
2724 DMEMIT("-"); 2990 DMEMIT("-");
2725 else { 2991 else {
@@ -2757,19 +3023,21 @@ static int thin_status(struct dm_target *ti, status_type_t type,
2757static int thin_iterate_devices(struct dm_target *ti, 3023static int thin_iterate_devices(struct dm_target *ti,
2758 iterate_devices_callout_fn fn, void *data) 3024 iterate_devices_callout_fn fn, void *data)
2759{ 3025{
2760 dm_block_t blocks; 3026 sector_t blocks;
2761 struct thin_c *tc = ti->private; 3027 struct thin_c *tc = ti->private;
3028 struct pool *pool = tc->pool;
2762 3029
2763 /* 3030 /*
2764 * We can't call dm_pool_get_data_dev_size() since that blocks. So 3031 * We can't call dm_pool_get_data_dev_size() since that blocks. So
2765 * we follow a more convoluted path through to the pool's target. 3032 * we follow a more convoluted path through to the pool's target.
2766 */ 3033 */
2767 if (!tc->pool->ti) 3034 if (!pool->ti)
2768 return 0; /* nothing is bound */ 3035 return 0; /* nothing is bound */
2769 3036
2770 blocks = tc->pool->ti->len >> tc->pool->block_shift; 3037 blocks = pool->ti->len;
3038 (void) sector_div(blocks, pool->sectors_per_block);
2771 if (blocks) 3039 if (blocks)
2772 return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data); 3040 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
2773 3041
2774 return 0; 3042 return 0;
2775} 3043}
@@ -2786,7 +3054,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2786 3054
2787static struct target_type thin_target = { 3055static struct target_type thin_target = {
2788 .name = "thin", 3056 .name = "thin",
2789 .version = {1, 1, 0}, 3057 .version = {1, 3, 0},
2790 .module = THIS_MODULE, 3058 .module = THIS_MODULE,
2791 .ctr = thin_ctr, 3059 .ctr = thin_ctr,
2792 .dtr = thin_dtr, 3060 .dtr = thin_dtr,
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index fa365d39b612..254d19268ad2 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -515,7 +515,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio,
515 * Status: V (valid) or C (corruption found) 515 * Status: V (valid) or C (corruption found)
516 */ 516 */
517static int verity_status(struct dm_target *ti, status_type_t type, 517static int verity_status(struct dm_target *ti, status_type_t type,
518 char *result, unsigned maxlen) 518 unsigned status_flags, char *result, unsigned maxlen)
519{ 519{
520 struct dm_verity *v = ti->private; 520 struct dm_verity *v = ti->private;
521 unsigned sz = 0; 521 unsigned sz = 0;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e24143cc2040..4e09b6ff5b49 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -968,22 +968,41 @@ static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti
968static sector_t max_io_len(sector_t sector, struct dm_target *ti) 968static sector_t max_io_len(sector_t sector, struct dm_target *ti)
969{ 969{
970 sector_t len = max_io_len_target_boundary(sector, ti); 970 sector_t len = max_io_len_target_boundary(sector, ti);
971 sector_t offset, max_len;
971 972
972 /* 973 /*
973 * Does the target need to split even further ? 974 * Does the target need to split even further?
974 */ 975 */
975 if (ti->split_io) { 976 if (ti->max_io_len) {
976 sector_t boundary; 977 offset = dm_target_offset(ti, sector);
977 sector_t offset = dm_target_offset(ti, sector); 978 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
978 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) 979 max_len = sector_div(offset, ti->max_io_len);
979 - offset; 980 else
980 if (len > boundary) 981 max_len = offset & (ti->max_io_len - 1);
981 len = boundary; 982 max_len = ti->max_io_len - max_len;
983
984 if (len > max_len)
985 len = max_len;
982 } 986 }
983 987
984 return len; 988 return len;
985} 989}
986 990
991int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
992{
993 if (len > UINT_MAX) {
994 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
995 (unsigned long long)len, UINT_MAX);
996 ti->error = "Maximum size of target IO is too large";
997 return -EINVAL;
998 }
999
1000 ti->max_io_len = (uint32_t) len;
1001
1002 return 0;
1003}
1004EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1005
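
max_io_len() above computes how far the current sector is from the next max_io_len boundary, masking when max_io_len is a power of two and dividing otherwise, and dm_set_target_max_io_len() refuses anything over UINT_MAX. A stand-alone version of the boundary arithmetic, with plain % standing in for sector_div:

#include <assert.h>
#include <stdint.h>

/* Sectors that may still be issued at 'offset' before crossing a boundary. */
uint64_t len_to_boundary(uint64_t offset, uint32_t max_io_len)
{
	uint64_t used;

	if (max_io_len & (max_io_len - 1))
		used = offset % max_io_len;          /* non-power-of-two: divide */
	else
		used = offset & (max_io_len - 1);    /* power of two: mask */

	return max_io_len - used;
}

int main(void)
{
	assert(len_to_boundary(1000, 128) == 24);    /* next boundary is 1024 */
	assert(len_to_boundary(1000, 384) == 152);   /* next boundary is 1152 */
	assert(len_to_boundary(1024, 128) == 128);   /* already on a boundary */
	return 0;
}

The same helper is what lets __clone_and_map_discard() split discards on thin-pool block boundaries once a target sets split_discard_requests.
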
987static void __map_bio(struct dm_target *ti, struct bio *clone, 1006static void __map_bio(struct dm_target *ti, struct bio *clone,
988 struct dm_target_io *tio) 1007 struct dm_target_io *tio)
989{ 1008{
@@ -1196,7 +1215,10 @@ static int __clone_and_map_discard(struct clone_info *ci)
1196 if (!ti->num_discard_requests) 1215 if (!ti->num_discard_requests)
1197 return -EOPNOTSUPP; 1216 return -EOPNOTSUPP;
1198 1217
1199 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1218 if (!ti->split_discard_requests)
1219 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1220 else
1221 len = min(ci->sector_count, max_io_len(ci->sector, ti));
1200 1222
1201 __issue_target_requests(ci, ti, ti->num_discard_requests, len); 1223 __issue_target_requests(ci, ti, ti->num_discard_requests, len);
1202 1224
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index b7dacd59d8d7..52eef493d266 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -23,6 +23,11 @@
23#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) 23#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
24 24
25/* 25/*
26 * Status feature flags
27 */
28#define DM_STATUS_NOFLUSH_FLAG (1 << 0)
29
30/*
26 * Type of table and mapped_device's mempool 31 * Type of table and mapped_device's mempool
27 */ 32 */
28#define DM_TYPE_NONE 0 33#define DM_TYPE_NONE 0
diff --git a/drivers/md/md.c b/drivers/md/md.c
index db02d2efb76f..fcd098794d37 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3893,17 +3893,13 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3893 break; 3893 break;
3894 case clear: 3894 case clear:
3895 /* stopping an active array */ 3895 /* stopping an active array */
3896 if (atomic_read(&mddev->openers) > 0)
3897 return -EBUSY;
3898 err = do_md_stop(mddev, 0, NULL); 3896 err = do_md_stop(mddev, 0, NULL);
3899 break; 3897 break;
3900 case inactive: 3898 case inactive:
3901 /* stopping an active array */ 3899 /* stopping an active array */
3902 if (mddev->pers) { 3900 if (mddev->pers)
3903 if (atomic_read(&mddev->openers) > 0)
3904 return -EBUSY;
3905 err = do_md_stop(mddev, 2, NULL); 3901 err = do_md_stop(mddev, 2, NULL);
3906 } else 3902 else
3907 err = 0; /* already inactive */ 3903 err = 0; /* already inactive */
3908 break; 3904 break;
3909 case suspended: 3905 case suspended:
diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile
index cfa95f662230..d8e7cb767c1e 100644
--- a/drivers/md/persistent-data/Makefile
+++ b/drivers/md/persistent-data/Makefile
@@ -1,7 +1,6 @@
1obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o 1obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
2dm-persistent-data-objs := \ 2dm-persistent-data-objs := \
3 dm-block-manager.o \ 3 dm-block-manager.o \
4 dm-space-map-checker.o \
5 dm-space-map-common.o \ 4 dm-space-map-common.o \
6 dm-space-map-disk.o \ 5 dm-space-map-disk.o \
7 dm-space-map-metadata.o \ 6 dm-space-map-metadata.o \
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 0317ecdc6e53..5ba277768d99 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -325,11 +325,6 @@ static struct dm_buffer *to_buffer(struct dm_block *b)
325 return (struct dm_buffer *) b; 325 return (struct dm_buffer *) b;
326} 326}
327 327
328static struct dm_bufio_client *to_bufio(struct dm_block_manager *bm)
329{
330 return (struct dm_bufio_client *) bm;
331}
332
333dm_block_t dm_block_location(struct dm_block *b) 328dm_block_t dm_block_location(struct dm_block *b)
334{ 329{
335 return dm_bufio_get_block_number(to_buffer(b)); 330 return dm_bufio_get_block_number(to_buffer(b));
@@ -367,34 +362,60 @@ static void dm_block_manager_write_callback(struct dm_buffer *buf)
367/*---------------------------------------------------------------- 362/*----------------------------------------------------------------
368 * Public interface 363 * Public interface
369 *--------------------------------------------------------------*/ 364 *--------------------------------------------------------------*/
365struct dm_block_manager {
366 struct dm_bufio_client *bufio;
367 bool read_only:1;
368};
369
370struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, 370struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
371 unsigned block_size, 371 unsigned block_size,
372 unsigned cache_size, 372 unsigned cache_size,
373 unsigned max_held_per_thread) 373 unsigned max_held_per_thread)
374{ 374{
375 return (struct dm_block_manager *) 375 int r;
376 dm_bufio_client_create(bdev, block_size, max_held_per_thread, 376 struct dm_block_manager *bm;
377 sizeof(struct buffer_aux), 377
378 dm_block_manager_alloc_callback, 378 bm = kmalloc(sizeof(*bm), GFP_KERNEL);
379 dm_block_manager_write_callback); 379 if (!bm) {
380 r = -ENOMEM;
381 goto bad;
382 }
383
384 bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread,
385 sizeof(struct buffer_aux),
386 dm_block_manager_alloc_callback,
387 dm_block_manager_write_callback);
388 if (IS_ERR(bm->bufio)) {
389 r = PTR_ERR(bm->bufio);
390 kfree(bm);
391 goto bad;
392 }
393
394 bm->read_only = false;
395
396 return bm;
397
398bad:
399 return ERR_PTR(r);
380} 400}
381EXPORT_SYMBOL_GPL(dm_block_manager_create); 401EXPORT_SYMBOL_GPL(dm_block_manager_create);
382 402
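
dm_block_manager_create() now allocates a real struct dm_block_manager and reports failure with the kernel's ERR_PTR convention, so callers keep using IS_ERR()/PTR_ERR() exactly as they did when the bufio client pointer was returned directly; the new read_only flag is what dm_bm_set_read_only() flips so the write-lock paths below can refuse with -EPERM. A user-space model of the error-pointer idiom (the helpers mirror include/linux/err.h but are redefined here):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long error)     { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct block_manager { int read_only; };

struct block_manager *block_manager_create(int simulate_failure)
{
	struct block_manager *bm;

	if (simulate_failure)
		return ERR_PTR(-ENOMEM);   /* the error code travels in the pointer */

	bm = malloc(sizeof(*bm));
	if (!bm)
		return ERR_PTR(-ENOMEM);

	bm->read_only = 0;
	return bm;
}

int main(void)
{
	struct block_manager *bm = block_manager_create(1);

	if (IS_ERR(bm))
		printf("create failed: %ld\n", PTR_ERR(bm));   /* -12, i.e. ENOMEM */
	else
		free(bm);
	return 0;
}
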
383void dm_block_manager_destroy(struct dm_block_manager *bm) 403void dm_block_manager_destroy(struct dm_block_manager *bm)
384{ 404{
385 return dm_bufio_client_destroy(to_bufio(bm)); 405 dm_bufio_client_destroy(bm->bufio);
406 kfree(bm);
386} 407}
387EXPORT_SYMBOL_GPL(dm_block_manager_destroy); 408EXPORT_SYMBOL_GPL(dm_block_manager_destroy);
388 409
389unsigned dm_bm_block_size(struct dm_block_manager *bm) 410unsigned dm_bm_block_size(struct dm_block_manager *bm)
390{ 411{
391 return dm_bufio_get_block_size(to_bufio(bm)); 412 return dm_bufio_get_block_size(bm->bufio);
392} 413}
393EXPORT_SYMBOL_GPL(dm_bm_block_size); 414EXPORT_SYMBOL_GPL(dm_bm_block_size);
394 415
395dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm) 416dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
396{ 417{
397 return dm_bufio_get_device_size(to_bufio(bm)); 418 return dm_bufio_get_device_size(bm->bufio);
398} 419}
399 420
400static int dm_bm_validate_buffer(struct dm_block_manager *bm, 421static int dm_bm_validate_buffer(struct dm_block_manager *bm,
@@ -406,7 +427,7 @@ static int dm_bm_validate_buffer(struct dm_block_manager *bm,
406 int r; 427 int r;
407 if (!v) 428 if (!v)
408 return 0; 429 return 0;
409 r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(to_bufio(bm))); 430 r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio));
410 if (unlikely(r)) 431 if (unlikely(r))
411 return r; 432 return r;
412 aux->validator = v; 433 aux->validator = v;
@@ -430,7 +451,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
430 void *p; 451 void *p;
431 int r; 452 int r;
432 453
433 p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result); 454 p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
434 if (unlikely(IS_ERR(p))) 455 if (unlikely(IS_ERR(p)))
435 return PTR_ERR(p); 456 return PTR_ERR(p);
436 457
@@ -463,7 +484,10 @@ int dm_bm_write_lock(struct dm_block_manager *bm,
463 void *p; 484 void *p;
464 int r; 485 int r;
465 486
466 p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result); 487 if (bm->read_only)
488 return -EPERM;
489
490 p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
467 if (unlikely(IS_ERR(p))) 491 if (unlikely(IS_ERR(p)))
468 return PTR_ERR(p); 492 return PTR_ERR(p);
469 493
@@ -496,7 +520,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm,
496 void *p; 520 void *p;
497 int r; 521 int r;
498 522
499 p = dm_bufio_get(to_bufio(bm), b, (struct dm_buffer **) result); 523 p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
500 if (unlikely(IS_ERR(p))) 524 if (unlikely(IS_ERR(p)))
501 return PTR_ERR(p); 525 return PTR_ERR(p);
502 if (unlikely(!p)) 526 if (unlikely(!p))
@@ -529,7 +553,10 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
529 struct buffer_aux *aux; 553 struct buffer_aux *aux;
530 void *p; 554 void *p;
531 555
532 p = dm_bufio_new(to_bufio(bm), b, (struct dm_buffer **) result); 556 if (bm->read_only)
557 return -EPERM;
558
559 p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
533 if (unlikely(IS_ERR(p))) 560 if (unlikely(IS_ERR(p)))
534 return PTR_ERR(p); 561 return PTR_ERR(p);
535 562
@@ -547,6 +574,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
547 574
548 return 0; 575 return 0;
549} 576}
577EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero);
550 578
551int dm_bm_unlock(struct dm_block *b) 579int dm_bm_unlock(struct dm_block *b)
552{ 580{
@@ -565,45 +593,30 @@ int dm_bm_unlock(struct dm_block *b)
565} 593}
566EXPORT_SYMBOL_GPL(dm_bm_unlock); 594EXPORT_SYMBOL_GPL(dm_bm_unlock);
567 595
568int dm_bm_unlock_move(struct dm_block *b, dm_block_t n)
569{
570 struct buffer_aux *aux;
571
572 aux = dm_bufio_get_aux_data(to_buffer(b));
573
574 if (aux->write_locked) {
575 dm_bufio_mark_buffer_dirty(to_buffer(b));
576 bl_up_write(&aux->lock);
577 } else
578 bl_up_read(&aux->lock);
579
580 dm_bufio_release_move(to_buffer(b), n);
581 return 0;
582}
583
584int dm_bm_flush_and_unlock(struct dm_block_manager *bm, 596int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
585 struct dm_block *superblock) 597 struct dm_block *superblock)
586{ 598{
587 int r; 599 int r;
588 600
589 r = dm_bufio_write_dirty_buffers(to_bufio(bm)); 601 if (bm->read_only)
590 if (unlikely(r)) 602 return -EPERM;
591 return r; 603
592 r = dm_bufio_issue_flush(to_bufio(bm)); 604 r = dm_bufio_write_dirty_buffers(bm->bufio);
593 if (unlikely(r)) 605 if (unlikely(r)) {
606 dm_bm_unlock(superblock);
594 return r; 607 return r;
608 }
595 609
596 dm_bm_unlock(superblock); 610 dm_bm_unlock(superblock);
597 611
598 r = dm_bufio_write_dirty_buffers(to_bufio(bm)); 612 return dm_bufio_write_dirty_buffers(bm->bufio);
599 if (unlikely(r)) 613}
600 return r;
601 r = dm_bufio_issue_flush(to_bufio(bm));
602 if (unlikely(r))
603 return r;
604 614
605 return 0; 615void dm_bm_set_read_only(struct dm_block_manager *bm)
616{
617 bm->read_only = true;
606} 618}
619EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
607 620
608u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) 621u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
609{ 622{
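For orientation, here is a minimal sketch of how a client might use the reworked public interface above, now that struct dm_block_manager is a real wrapper around a dm_bufio client rather than a cast of it. The device, block size, cache sizing and the NULL validator are assumptions made for the example, not requirements of the API.

    #include "dm-block-manager.h"

    /* Illustrative only: 4KiB blocks, small cache, one held block per thread. */
    static int example_read_one_block(struct block_device *bdev, dm_block_t b)
    {
            struct dm_block_manager *bm;
            struct dm_block *blk;
            int r;

            bm = dm_block_manager_create(bdev, 4096, 16, 1);
            if (IS_ERR(bm))
                    return PTR_ERR(bm);

            r = dm_bm_read_lock(bm, b, NULL, &blk);   /* NULL validator: no checking */
            if (!r) {
                    void *data = dm_block_data(blk);  /* raw 4096 byte payload */

                    (void) data;                      /* inspect the block here */
                    dm_bm_unlock(blk);
            }

            dm_block_manager_destroy(bm);
            return r;
    }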
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 924833d2dfa6..be5bff61be28 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -97,14 +97,6 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b,
97int dm_bm_unlock(struct dm_block *b); 97int dm_bm_unlock(struct dm_block *b);
98 98
99/* 99/*
100 * An optimisation; we often want to copy a block's contents to a new
101 * block. eg, as part of the shadowing operation. It's far better for
102 * bufio to do this move behind the scenes than hold 2 locks and memcpy the
103 * data.
104 */
105int dm_bm_unlock_move(struct dm_block *b, dm_block_t n);
106
107/*
108 * It's a common idiom to have a superblock that should be committed last. 100 * It's a common idiom to have a superblock that should be committed last.
109 * 101 *
110 * @superblock should be write-locked on entry. It will be unlocked during 102 * @superblock should be write-locked on entry. It will be unlocked during
@@ -116,6 +108,19 @@ int dm_bm_unlock_move(struct dm_block *b, dm_block_t n);
116int dm_bm_flush_and_unlock(struct dm_block_manager *bm, 108int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
117 struct dm_block *superblock); 109 struct dm_block *superblock);
118 110
111/*
112 * Switches the bm to read-only mode. Once read-only mode
113 * has been entered, the following functions will return -EPERM:
114 *
115 * dm_bm_write_lock
116 * dm_bm_write_lock_zero
117 * dm_bm_flush_and_unlock
118 *
119 * Additionally, you should not use dm_bm_unlock_move; however, no error
120 * will be returned if you do.
121 */
122void dm_bm_set_read_only(struct dm_block_manager *bm);
123
119u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); 124u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
120 125
121/*----------------------------------------------------------------*/ 126/*----------------------------------------------------------------*/
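The read-only mode described above is intended as a fail-safe once the metadata is known to be suspect. A rough sketch of the expected usage pattern follows; degrading after a failed commit is an assumed policy for the example, only the -EPERM behaviour comes from the interface itself.

    /* Sketch: degrade the block manager after a failed commit (assumed policy). */
    static void example_degrade_on_commit_failure(struct dm_block_manager *bm,
                                                  struct dm_block *sblock)
    {
            struct dm_block *b;
            int r;

            r = dm_bm_flush_and_unlock(bm, sblock);
            if (!r)
                    return;

            dm_bm_set_read_only(bm);

            /* Further attempts to dirty metadata now fail fast with -EPERM. */
            r = dm_bm_write_lock_zero(bm, 1, NULL, &b);
            WARN_ON(r != -EPERM);
    }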
diff --git a/drivers/md/persistent-data/dm-space-map-checker.c b/drivers/md/persistent-data/dm-space-map-checker.c
deleted file mode 100644
index fc90c11620ad..000000000000
--- a/drivers/md/persistent-data/dm-space-map-checker.c
+++ /dev/null
@@ -1,446 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map-checker.h"
8
9#include <linux/device-mapper.h>
10#include <linux/export.h>
11#include <linux/vmalloc.h>
12
13#ifdef CONFIG_DM_DEBUG_SPACE_MAPS
14
15#define DM_MSG_PREFIX "space map checker"
16
17/*----------------------------------------------------------------*/
18
19struct count_array {
20 dm_block_t nr;
21 dm_block_t nr_free;
22
23 uint32_t *counts;
24};
25
26static int ca_get_count(struct count_array *ca, dm_block_t b, uint32_t *count)
27{
28 if (b >= ca->nr)
29 return -EINVAL;
30
31 *count = ca->counts[b];
32 return 0;
33}
34
35static int ca_count_more_than_one(struct count_array *ca, dm_block_t b, int *r)
36{
37 if (b >= ca->nr)
38 return -EINVAL;
39
40 *r = ca->counts[b] > 1;
41 return 0;
42}
43
44static int ca_set_count(struct count_array *ca, dm_block_t b, uint32_t count)
45{
46 uint32_t old_count;
47
48 if (b >= ca->nr)
49 return -EINVAL;
50
51 old_count = ca->counts[b];
52
53 if (!count && old_count)
54 ca->nr_free++;
55
56 else if (count && !old_count)
57 ca->nr_free--;
58
59 ca->counts[b] = count;
60 return 0;
61}
62
63static int ca_inc_block(struct count_array *ca, dm_block_t b)
64{
65 if (b >= ca->nr)
66 return -EINVAL;
67
68 ca_set_count(ca, b, ca->counts[b] + 1);
69 return 0;
70}
71
72static int ca_dec_block(struct count_array *ca, dm_block_t b)
73{
74 if (b >= ca->nr)
75 return -EINVAL;
76
77 BUG_ON(ca->counts[b] == 0);
78 ca_set_count(ca, b, ca->counts[b] - 1);
79 return 0;
80}
81
82static int ca_create(struct count_array *ca, struct dm_space_map *sm)
83{
84 int r;
85 dm_block_t nr_blocks;
86
87 r = dm_sm_get_nr_blocks(sm, &nr_blocks);
88 if (r)
89 return r;
90
91 ca->nr = nr_blocks;
92 ca->nr_free = nr_blocks;
93
94 if (!nr_blocks)
95 ca->counts = NULL;
96 else {
97 ca->counts = vzalloc(sizeof(*ca->counts) * nr_blocks);
98 if (!ca->counts)
99 return -ENOMEM;
100 }
101
102 return 0;
103}
104
105static void ca_destroy(struct count_array *ca)
106{
107 vfree(ca->counts);
108}
109
110static int ca_load(struct count_array *ca, struct dm_space_map *sm)
111{
112 int r;
113 uint32_t count;
114 dm_block_t nr_blocks, i;
115
116 r = dm_sm_get_nr_blocks(sm, &nr_blocks);
117 if (r)
118 return r;
119
120 BUG_ON(ca->nr != nr_blocks);
121
122 DMWARN("Loading debug space map from disk. This may take some time");
123 for (i = 0; i < nr_blocks; i++) {
124 r = dm_sm_get_count(sm, i, &count);
125 if (r) {
126 DMERR("load failed");
127 return r;
128 }
129
130 ca_set_count(ca, i, count);
131 }
132 DMWARN("Load complete");
133
134 return 0;
135}
136
137static int ca_extend(struct count_array *ca, dm_block_t extra_blocks)
138{
139 dm_block_t nr_blocks = ca->nr + extra_blocks;
140 uint32_t *counts = vzalloc(sizeof(*counts) * nr_blocks);
141 if (!counts)
142 return -ENOMEM;
143
144 if (ca->counts) {
145 memcpy(counts, ca->counts, sizeof(*counts) * ca->nr);
146 ca_destroy(ca);
147 }
148 ca->nr = nr_blocks;
149 ca->nr_free += extra_blocks;
150 ca->counts = counts;
151 return 0;
152}
153
154static int ca_commit(struct count_array *old, struct count_array *new)
155{
156 if (old->nr != new->nr) {
157 BUG_ON(old->nr > new->nr);
158 ca_extend(old, new->nr - old->nr);
159 }
160
161 BUG_ON(old->nr != new->nr);
162 old->nr_free = new->nr_free;
163 memcpy(old->counts, new->counts, sizeof(*old->counts) * old->nr);
164 return 0;
165}
166
167/*----------------------------------------------------------------*/
168
169struct sm_checker {
170 struct dm_space_map sm;
171
172 struct count_array old_counts;
173 struct count_array counts;
174
175 struct dm_space_map *real_sm;
176};
177
178static void sm_checker_destroy(struct dm_space_map *sm)
179{
180 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
181
182 dm_sm_destroy(smc->real_sm);
183 ca_destroy(&smc->old_counts);
184 ca_destroy(&smc->counts);
185 kfree(smc);
186}
187
188static int sm_checker_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
189{
190 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
191 int r = dm_sm_get_nr_blocks(smc->real_sm, count);
192 if (!r)
193 BUG_ON(smc->old_counts.nr != *count);
194 return r;
195}
196
197static int sm_checker_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
198{
199 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
200 int r = dm_sm_get_nr_free(smc->real_sm, count);
201 if (!r) {
202 /*
203 * Slow, but we know it's correct.
204 */
205 dm_block_t b, n = 0;
206 for (b = 0; b < smc->old_counts.nr; b++)
207 if (smc->old_counts.counts[b] == 0 &&
208 smc->counts.counts[b] == 0)
209 n++;
210
211 if (n != *count)
212 DMERR("free block counts differ, checker %u, sm-disk:%u",
213 (unsigned) n, (unsigned) *count);
214 }
215 return r;
216}
217
218static int sm_checker_new_block(struct dm_space_map *sm, dm_block_t *b)
219{
220 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
221 int r = dm_sm_new_block(smc->real_sm, b);
222
223 if (!r) {
224 BUG_ON(*b >= smc->old_counts.nr);
225 BUG_ON(smc->old_counts.counts[*b] != 0);
226 BUG_ON(*b >= smc->counts.nr);
227 BUG_ON(smc->counts.counts[*b] != 0);
228 ca_set_count(&smc->counts, *b, 1);
229 }
230
231 return r;
232}
233
234static int sm_checker_inc_block(struct dm_space_map *sm, dm_block_t b)
235{
236 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
237 int r = dm_sm_inc_block(smc->real_sm, b);
238 int r2 = ca_inc_block(&smc->counts, b);
239 BUG_ON(r != r2);
240 return r;
241}
242
243static int sm_checker_dec_block(struct dm_space_map *sm, dm_block_t b)
244{
245 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
246 int r = dm_sm_dec_block(smc->real_sm, b);
247 int r2 = ca_dec_block(&smc->counts, b);
248 BUG_ON(r != r2);
249 return r;
250}
251
252static int sm_checker_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result)
253{
254 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
255 uint32_t result2 = 0;
256 int r = dm_sm_get_count(smc->real_sm, b, result);
257 int r2 = ca_get_count(&smc->counts, b, &result2);
258
259 BUG_ON(r != r2);
260 if (!r)
261 BUG_ON(*result != result2);
262 return r;
263}
264
265static int sm_checker_count_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result)
266{
267 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
268 int result2 = 0;
269 int r = dm_sm_count_is_more_than_one(smc->real_sm, b, result);
270 int r2 = ca_count_more_than_one(&smc->counts, b, &result2);
271
272 BUG_ON(r != r2);
273 if (!r)
274 BUG_ON(!(*result) && result2);
275 return r;
276}
277
278static int sm_checker_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count)
279{
280 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
281 uint32_t old_rc;
282 int r = dm_sm_set_count(smc->real_sm, b, count);
283 int r2;
284
285 BUG_ON(b >= smc->counts.nr);
286 old_rc = smc->counts.counts[b];
287 r2 = ca_set_count(&smc->counts, b, count);
288 BUG_ON(r != r2);
289
290 return r;
291}
292
293static int sm_checker_commit(struct dm_space_map *sm)
294{
295 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
296 int r;
297
298 r = dm_sm_commit(smc->real_sm);
299 if (r)
300 return r;
301
302 r = ca_commit(&smc->old_counts, &smc->counts);
303 if (r)
304 return r;
305
306 return 0;
307}
308
309static int sm_checker_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
310{
311 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
312 int r = dm_sm_extend(smc->real_sm, extra_blocks);
313 if (r)
314 return r;
315
316 return ca_extend(&smc->counts, extra_blocks);
317}
318
319static int sm_checker_root_size(struct dm_space_map *sm, size_t *result)
320{
321 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
322 return dm_sm_root_size(smc->real_sm, result);
323}
324
325static int sm_checker_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len)
326{
327 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
328 return dm_sm_copy_root(smc->real_sm, copy_to_here_le, len);
329}
330
331/*----------------------------------------------------------------*/
332
333static struct dm_space_map ops_ = {
334 .destroy = sm_checker_destroy,
335 .get_nr_blocks = sm_checker_get_nr_blocks,
336 .get_nr_free = sm_checker_get_nr_free,
337 .inc_block = sm_checker_inc_block,
338 .dec_block = sm_checker_dec_block,
339 .new_block = sm_checker_new_block,
340 .get_count = sm_checker_get_count,
341 .count_is_more_than_one = sm_checker_count_more_than_one,
342 .set_count = sm_checker_set_count,
343 .commit = sm_checker_commit,
344 .extend = sm_checker_extend,
345 .root_size = sm_checker_root_size,
346 .copy_root = sm_checker_copy_root
347};
348
349struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
350{
351 int r;
352 struct sm_checker *smc;
353
354 if (IS_ERR_OR_NULL(sm))
355 return ERR_PTR(-EINVAL);
356
357 smc = kmalloc(sizeof(*smc), GFP_KERNEL);
358 if (!smc)
359 return ERR_PTR(-ENOMEM);
360
361 memcpy(&smc->sm, &ops_, sizeof(smc->sm));
362 r = ca_create(&smc->old_counts, sm);
363 if (r) {
364 kfree(smc);
365 return ERR_PTR(r);
366 }
367
368 r = ca_create(&smc->counts, sm);
369 if (r) {
370 ca_destroy(&smc->old_counts);
371 kfree(smc);
372 return ERR_PTR(r);
373 }
374
375 smc->real_sm = sm;
376
377 r = ca_load(&smc->counts, sm);
378 if (r) {
379 ca_destroy(&smc->counts);
380 ca_destroy(&smc->old_counts);
381 kfree(smc);
382 return ERR_PTR(r);
383 }
384
385 r = ca_commit(&smc->old_counts, &smc->counts);
386 if (r) {
387 ca_destroy(&smc->counts);
388 ca_destroy(&smc->old_counts);
389 kfree(smc);
390 return ERR_PTR(r);
391 }
392
393 return &smc->sm;
394}
395EXPORT_SYMBOL_GPL(dm_sm_checker_create);
396
397struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
398{
399 int r;
400 struct sm_checker *smc;
401
402 if (IS_ERR_OR_NULL(sm))
403 return ERR_PTR(-EINVAL);
404
405 smc = kmalloc(sizeof(*smc), GFP_KERNEL);
406 if (!smc)
407 return ERR_PTR(-ENOMEM);
408
409 memcpy(&smc->sm, &ops_, sizeof(smc->sm));
410 r = ca_create(&smc->old_counts, sm);
411 if (r) {
412 kfree(smc);
413 return ERR_PTR(r);
414 }
415
416 r = ca_create(&smc->counts, sm);
417 if (r) {
418 ca_destroy(&smc->old_counts);
419 kfree(smc);
420 return ERR_PTR(r);
421 }
422
423 smc->real_sm = sm;
424 return &smc->sm;
425}
426EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
427
428/*----------------------------------------------------------------*/
429
430#else
431
432struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
433{
434 return sm;
435}
436EXPORT_SYMBOL_GPL(dm_sm_checker_create);
437
438struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
439{
440 return sm;
441}
442EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
443
444/*----------------------------------------------------------------*/
445
446#endif
diff --git a/drivers/md/persistent-data/dm-space-map-checker.h b/drivers/md/persistent-data/dm-space-map-checker.h
deleted file mode 100644
index 444dccf6688c..000000000000
--- a/drivers/md/persistent-data/dm-space-map-checker.h
+++ /dev/null
@@ -1,26 +0,0 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef SNAPSHOTS_SPACE_MAP_CHECKER_H
8#define SNAPSHOTS_SPACE_MAP_CHECKER_H
9
10#include "dm-space-map.h"
11
12/*----------------------------------------------------------------*/
13
14/*
15 * This space map wraps a real on-disk space map, and verifies all of its
16 * operations. It uses a lot of memory, so only use if you have a specific
17 * problem that you're debugging.
18 *
19 * Ownership of @sm passes.
20 */
21struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm);
22struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm);
23
24/*----------------------------------------------------------------*/
25
26#endif
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index ff3beed6ad2d..d77602d63c83 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -224,6 +224,7 @@ static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
224 ll->nr_blocks = 0; 224 ll->nr_blocks = 0;
225 ll->bitmap_root = 0; 225 ll->bitmap_root = 0;
226 ll->ref_count_root = 0; 226 ll->ref_count_root = 0;
227 ll->bitmap_index_changed = false;
227 228
228 return 0; 229 return 0;
229} 230}
@@ -476,7 +477,15 @@ int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
476 477
477int sm_ll_commit(struct ll_disk *ll) 478int sm_ll_commit(struct ll_disk *ll)
478{ 479{
479 return ll->commit(ll); 480 int r = 0;
481
482 if (ll->bitmap_index_changed) {
483 r = ll->commit(ll);
484 if (!r)
485 ll->bitmap_index_changed = false;
486 }
487
488 return r;
480} 489}
481 490
482/*----------------------------------------------------------------*/ 491/*----------------------------------------------------------------*/
@@ -491,6 +500,7 @@ static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index,
491static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index, 500static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index,
492 struct disk_index_entry *ie) 501 struct disk_index_entry *ie)
493{ 502{
503 ll->bitmap_index_changed = true;
494 memcpy(ll->mi_le.index + index, ie, sizeof(*ie)); 504 memcpy(ll->mi_le.index + index, ie, sizeof(*ie));
495 return 0; 505 return 0;
496} 506}
diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h
index 8f220821a9a9..b3078d5eda0c 100644
--- a/drivers/md/persistent-data/dm-space-map-common.h
+++ b/drivers/md/persistent-data/dm-space-map-common.h
@@ -78,6 +78,7 @@ struct ll_disk {
78 open_index_fn open_index; 78 open_index_fn open_index;
79 max_index_entries_fn max_entries; 79 max_index_entries_fn max_entries;
80 commit_fn commit; 80 commit_fn commit;
81 bool bitmap_index_changed:1;
81}; 82};
82 83
83struct disk_sm_root { 84struct disk_sm_root {
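The bitmap_index_changed flag added above turns sm_ll_commit() into a no-op when no index entry has been rewritten since the last commit. A stand-alone model of that pattern (nothing here is kernel code; the names just mirror the fields above):

    #include <stdbool.h>
    #include <stdio.h>

    struct ll_model {
            bool bitmap_index_changed;
            unsigned commits_written;       /* counts real index writes */
    };

    static void save_ie_model(struct ll_model *ll)
    {
            ll->bitmap_index_changed = true;        /* an index entry was updated */
    }

    static int commit_model(struct ll_model *ll)
    {
            if (!ll->bitmap_index_changed)
                    return 0;                       /* nothing dirty: skip the write */

            ll->commits_written++;                  /* stands in for ll->commit(ll) */
            ll->bitmap_index_changed = false;
            return 0;
    }

    int main(void)
    {
            struct ll_model ll = { false, 0 };

            commit_model(&ll);              /* clean: no write */
            save_ie_model(&ll);
            commit_model(&ll);              /* dirty: one write */
            commit_model(&ll);              /* clean again: no write */
            printf("index writes: %u\n", ll.commits_written);   /* prints 1 */
            return 0;
    }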
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index 3d0ed5332883..f6d29e614ab7 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -4,7 +4,6 @@
4 * This file is released under the GPL. 4 * This file is released under the GPL.
5 */ 5 */
6 6
7#include "dm-space-map-checker.h"
8#include "dm-space-map-common.h" 7#include "dm-space-map-common.h"
9#include "dm-space-map-disk.h" 8#include "dm-space-map-disk.h"
10#include "dm-space-map.h" 9#include "dm-space-map.h"
@@ -252,9 +251,8 @@ static struct dm_space_map ops = {
252 .copy_root = sm_disk_copy_root 251 .copy_root = sm_disk_copy_root
253}; 252};
254 253
255static struct dm_space_map *dm_sm_disk_create_real( 254struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
256 struct dm_transaction_manager *tm, 255 dm_block_t nr_blocks)
257 dm_block_t nr_blocks)
258{ 256{
259 int r; 257 int r;
260 struct sm_disk *smd; 258 struct sm_disk *smd;
@@ -285,27 +283,10 @@ bad:
285 kfree(smd); 283 kfree(smd);
286 return ERR_PTR(r); 284 return ERR_PTR(r);
287} 285}
288
289struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
290 dm_block_t nr_blocks)
291{
292 struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks);
293 struct dm_space_map *smc;
294
295 if (IS_ERR_OR_NULL(sm))
296 return sm;
297
298 smc = dm_sm_checker_create_fresh(sm);
299 if (IS_ERR(smc))
300 dm_sm_destroy(sm);
301
302 return smc;
303}
304EXPORT_SYMBOL_GPL(dm_sm_disk_create); 286EXPORT_SYMBOL_GPL(dm_sm_disk_create);
305 287
306static struct dm_space_map *dm_sm_disk_open_real( 288struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
307 struct dm_transaction_manager *tm, 289 void *root_le, size_t len)
308 void *root_le, size_t len)
309{ 290{
310 int r; 291 int r;
311 struct sm_disk *smd; 292 struct sm_disk *smd;
@@ -332,13 +313,6 @@ bad:
332 kfree(smd); 313 kfree(smd);
333 return ERR_PTR(r); 314 return ERR_PTR(r);
334} 315}
335
336struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
337 void *root_le, size_t len)
338{
339 return dm_sm_checker_create(
340 dm_sm_disk_open_real(tm, root_le, len));
341}
342EXPORT_SYMBOL_GPL(dm_sm_disk_open); 316EXPORT_SYMBOL_GPL(dm_sm_disk_open);
343 317
344/*----------------------------------------------------------------*/ 318/*----------------------------------------------------------------*/
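With the checker wrappers removed, dm_sm_disk_create() and dm_sm_disk_open() hand the sm_disk object straight back to the caller, so the only error handling left is the usual IS_ERR() check. A minimal sketch (the transaction manager and block count are assumed to exist already):

    static int example_create_data_sm(struct dm_transaction_manager *tm,
                                      dm_block_t nr_blocks,
                                      struct dm_space_map **result)
    {
            struct dm_space_map *sm = dm_sm_disk_create(tm, nr_blocks);

            if (IS_ERR(sm))
                    return PTR_ERR(sm);     /* no wrapper left to unwind */

            *result = sm;
            return 0;
    }

Previously the checker could still fail after the real space map had been built, which is why the old create path had to destroy the inner object when dm_sm_checker_create_fresh() returned an error.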
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index e5604b32d91f..d247a35da3c6 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -5,7 +5,6 @@
5 */ 5 */
6#include "dm-transaction-manager.h" 6#include "dm-transaction-manager.h"
7#include "dm-space-map.h" 7#include "dm-space-map.h"
8#include "dm-space-map-checker.h"
9#include "dm-space-map-disk.h" 8#include "dm-space-map-disk.h"
10#include "dm-space-map-metadata.h" 9#include "dm-space-map-metadata.h"
11#include "dm-persistent-data-internal.h" 10#include "dm-persistent-data-internal.h"
@@ -220,13 +219,24 @@ static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
220 if (r < 0) 219 if (r < 0)
221 return r; 220 return r;
222 221
223 r = dm_bm_unlock_move(orig_block, new); 222 /*
224 if (r < 0) { 223 * It would be tempting to use dm_bm_unlock_move here, but some
224 * code, such as the space maps, keeps using the old data structures
225 * secure in the knowledge they won't be changed until the next
226 * transaction. Using unlock_move would force a synchronous read
227 * since the old block would no longer be in the cache.
228 */
229 r = dm_bm_write_lock_zero(tm->bm, new, v, result);
230 if (r) {
225 dm_bm_unlock(orig_block); 231 dm_bm_unlock(orig_block);
226 return r; 232 return r;
227 } 233 }
228 234
229 return dm_bm_write_lock(tm->bm, new, v, result); 235 memcpy(dm_block_data(*result), dm_block_data(orig_block),
236 dm_bm_block_size(tm->bm));
237
238 dm_bm_unlock(orig_block);
239 return r;
230} 240}
231 241
232int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, 242int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
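A sketch of how a metadata client consumes the shadowing path above; it assumes the usual dm_tm_shadow_block() prototype from dm-transaction-manager.h (validator, result block and inc_children out-parameter), which is unchanged by this patch.

    /* Sketch: copy-on-write update of one metadata block. */
    static int example_modify_block(struct dm_transaction_manager *tm,
                                    dm_block_t orig, dm_block_t *new_loc)
    {
            struct dm_block *b;
            int inc_children;
            int r;

            r = dm_tm_shadow_block(tm, orig, NULL, &b, &inc_children);
            if (r)
                    return r;

            /*
             * The shadow already contains a copy of the original data (made by
             * the explicit memcpy above rather than dm_bm_unlock_move), so the
             * caller can simply edit dm_block_data(b) in place.  inc_children
             * tells the caller whether child reference counts need bumping.
             */
            *new_loc = dm_block_location(b);

            dm_tm_unlock(tm, b);
            return 0;
    }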
@@ -311,98 +321,61 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
311 321
312static int dm_tm_create_internal(struct dm_block_manager *bm, 322static int dm_tm_create_internal(struct dm_block_manager *bm,
313 dm_block_t sb_location, 323 dm_block_t sb_location,
314 struct dm_block_validator *sb_validator,
315 size_t root_offset, size_t root_max_len,
316 struct dm_transaction_manager **tm, 324 struct dm_transaction_manager **tm,
317 struct dm_space_map **sm, 325 struct dm_space_map **sm,
318 struct dm_block **sblock, 326 int create,
319 int create) 327 void *sm_root, size_t sm_len)
320{ 328{
321 int r; 329 int r;
322 struct dm_space_map *inner;
323 330
324 inner = dm_sm_metadata_init(); 331 *sm = dm_sm_metadata_init();
325 if (IS_ERR(inner)) 332 if (IS_ERR(*sm))
326 return PTR_ERR(inner); 333 return PTR_ERR(*sm);
327 334
328 *tm = dm_tm_create(bm, inner); 335 *tm = dm_tm_create(bm, *sm);
329 if (IS_ERR(*tm)) { 336 if (IS_ERR(*tm)) {
330 dm_sm_destroy(inner); 337 dm_sm_destroy(*sm);
331 return PTR_ERR(*tm); 338 return PTR_ERR(*tm);
332 } 339 }
333 340
334 if (create) { 341 if (create) {
335 r = dm_bm_write_lock_zero(dm_tm_get_bm(*tm), sb_location, 342 r = dm_sm_metadata_create(*sm, *tm, dm_bm_nr_blocks(bm),
336 sb_validator, sblock);
337 if (r < 0) {
338 DMERR("couldn't lock superblock");
339 goto bad1;
340 }
341
342 r = dm_sm_metadata_create(inner, *tm, dm_bm_nr_blocks(bm),
343 sb_location); 343 sb_location);
344 if (r) { 344 if (r) {
345 DMERR("couldn't create metadata space map"); 345 DMERR("couldn't create metadata space map");
346 goto bad2; 346 goto bad;
347 }
348
349 *sm = dm_sm_checker_create(inner);
350 if (IS_ERR(*sm)) {
351 r = PTR_ERR(*sm);
352 goto bad2;
353 } 347 }
354 348
355 } else { 349 } else {
356 r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location, 350 r = dm_sm_metadata_open(*sm, *tm, sm_root, sm_len);
357 sb_validator, sblock);
358 if (r < 0) {
359 DMERR("couldn't lock superblock");
360 goto bad1;
361 }
362
363 r = dm_sm_metadata_open(inner, *tm,
364 dm_block_data(*sblock) + root_offset,
365 root_max_len);
366 if (r) { 351 if (r) {
367 DMERR("couldn't open metadata space map"); 352 DMERR("couldn't open metadata space map");
368 goto bad2; 353 goto bad;
369 }
370
371 *sm = dm_sm_checker_create(inner);
372 if (IS_ERR(*sm)) {
373 r = PTR_ERR(*sm);
374 goto bad2;
375 } 354 }
376 } 355 }
377 356
378 return 0; 357 return 0;
379 358
380bad2: 359bad:
381 dm_tm_unlock(*tm, *sblock);
382bad1:
383 dm_tm_destroy(*tm); 360 dm_tm_destroy(*tm);
384 dm_sm_destroy(inner); 361 dm_sm_destroy(*sm);
385 return r; 362 return r;
386} 363}
387 364
388int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, 365int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
389 struct dm_block_validator *sb_validator,
390 struct dm_transaction_manager **tm, 366 struct dm_transaction_manager **tm,
391 struct dm_space_map **sm, struct dm_block **sblock) 367 struct dm_space_map **sm)
392{ 368{
393 return dm_tm_create_internal(bm, sb_location, sb_validator, 369 return dm_tm_create_internal(bm, sb_location, tm, sm, 1, NULL, 0);
394 0, 0, tm, sm, sblock, 1);
395} 370}
396EXPORT_SYMBOL_GPL(dm_tm_create_with_sm); 371EXPORT_SYMBOL_GPL(dm_tm_create_with_sm);
397 372
398int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, 373int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
399 struct dm_block_validator *sb_validator, 374 void *sm_root, size_t root_len,
400 size_t root_offset, size_t root_max_len,
401 struct dm_transaction_manager **tm, 375 struct dm_transaction_manager **tm,
402 struct dm_space_map **sm, struct dm_block **sblock) 376 struct dm_space_map **sm)
403{ 377{
404 return dm_tm_create_internal(bm, sb_location, sb_validator, root_offset, 378 return dm_tm_create_internal(bm, sb_location, tm, sm, 0, sm_root, root_len);
405 root_max_len, tm, sm, sblock, 0);
406} 379}
407EXPORT_SYMBOL_GPL(dm_tm_open_with_sm); 380EXPORT_SYMBOL_GPL(dm_tm_open_with_sm);
408 381
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index 6da784871db4..b5b139076ca5 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -115,16 +115,17 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
115 * 115 *
116 * Returns a tm that has an open transaction to write the new disk sm. 116 * Returns a tm that has an open transaction to write the new disk sm.
117 * Caller should store the new sm root and commit. 117 * Caller should store the new sm root and commit.
118 *
119 * The superblock location is passed so the metadata space map knows it
120 * shouldn't be used.
118 */ 121 */
119int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, 122int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
120 struct dm_block_validator *sb_validator,
121 struct dm_transaction_manager **tm, 123 struct dm_transaction_manager **tm,
122 struct dm_space_map **sm, struct dm_block **sblock); 124 struct dm_space_map **sm);
123 125
124int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, 126int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
125 struct dm_block_validator *sb_validator, 127 void *sm_root, size_t root_len,
126 size_t root_offset, size_t root_max_len,
127 struct dm_transaction_manager **tm, 128 struct dm_transaction_manager **tm,
128 struct dm_space_map **sm, struct dm_block **sblock); 129 struct dm_space_map **sm);
129 130
130#endif /* _LINUX_DM_TRANSACTION_MANAGER_H */ 131#endif /* _LINUX_DM_TRANSACTION_MANAGER_H */
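Because the sb_validator and root_offset arguments are gone, a caller of dm_tm_open_with_sm() now reads its own superblock first and passes the saved space-map root in by pointer. A sketch of that convention follows; the superblock struct and field name are made up for the example, and real users would keep their own block validator.

    struct example_superblock {
            __u8 metadata_space_map_root[128];      /* hypothetical on-disk layout */
            /* ... */
    };

    static int example_open(struct dm_block_manager *bm, dm_block_t sb_location,
                            struct dm_transaction_manager **tm,
                            struct dm_space_map **sm)
    {
            struct dm_block *sblock;
            struct example_superblock *sb;
            int r;

            r = dm_bm_read_lock(bm, sb_location, NULL, &sblock);
            if (r)
                    return r;

            sb = dm_block_data(sblock);
            r = dm_tm_open_with_sm(bm, sb_location,
                                   sb->metadata_space_map_root,
                                   sizeof(sb->metadata_space_map_root),
                                   tm, sm);

            dm_bm_unlock(sblock);
            return r;
    }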
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 9f01870d031c..611b5f797618 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -46,6 +46,20 @@
46 */ 46 */
47#define NR_RAID1_BIOS 256 47#define NR_RAID1_BIOS 256
48 48
49/* when we get a read error on a read-only array, we redirect to another
50 * device without failing the first device, or trying to over-write to
51 * correct the read error. To keep track of bad blocks on a per-bio
52 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
53 */
54#define IO_BLOCKED ((struct bio *)1)
55/* When we successfully write to a known bad-block, we need to remove the
56 * bad-block marking which must be done from process context. So we record
57 * the success by setting bios[n] to IO_MADE_GOOD
58 */
59#define IO_MADE_GOOD ((struct bio *)2)
60
61#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
62
49/* When there are this many requests queue to be written by 63/* When there are this many requests queue to be written by
50 * the raid1 thread, we become 'congested' to provide back-pressure 64 * the raid1 thread, we become 'congested' to provide back-pressure
51 * for writeback. 65 * for writeback.
@@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
483 const sector_t this_sector = r1_bio->sector; 497 const sector_t this_sector = r1_bio->sector;
484 int sectors; 498 int sectors;
485 int best_good_sectors; 499 int best_good_sectors;
486 int start_disk; 500 int best_disk, best_dist_disk, best_pending_disk;
487 int best_disk; 501 int has_nonrot_disk;
488 int i; 502 int disk;
489 sector_t best_dist; 503 sector_t best_dist;
504 unsigned int min_pending;
490 struct md_rdev *rdev; 505 struct md_rdev *rdev;
491 int choose_first; 506 int choose_first;
507 int choose_next_idle;
492 508
493 rcu_read_lock(); 509 rcu_read_lock();
494 /* 510 /*
@@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
499 retry: 515 retry:
500 sectors = r1_bio->sectors; 516 sectors = r1_bio->sectors;
501 best_disk = -1; 517 best_disk = -1;
518 best_dist_disk = -1;
502 best_dist = MaxSector; 519 best_dist = MaxSector;
520 best_pending_disk = -1;
521 min_pending = UINT_MAX;
503 best_good_sectors = 0; 522 best_good_sectors = 0;
523 has_nonrot_disk = 0;
524 choose_next_idle = 0;
504 525
505 if (conf->mddev->recovery_cp < MaxSector && 526 if (conf->mddev->recovery_cp < MaxSector &&
506 (this_sector + sectors >= conf->next_resync)) { 527 (this_sector + sectors >= conf->next_resync))
507 choose_first = 1; 528 choose_first = 1;
508 start_disk = 0; 529 else
509 } else {
510 choose_first = 0; 530 choose_first = 0;
511 start_disk = conf->last_used;
512 }
513 531
514 for (i = 0 ; i < conf->raid_disks * 2 ; i++) { 532 for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
515 sector_t dist; 533 sector_t dist;
516 sector_t first_bad; 534 sector_t first_bad;
517 int bad_sectors; 535 int bad_sectors;
518 536 unsigned int pending;
519 int disk = start_disk + i; 537 bool nonrot;
520 if (disk >= conf->raid_disks * 2)
521 disk -= conf->raid_disks * 2;
522 538
523 rdev = rcu_dereference(conf->mirrors[disk].rdev); 539 rdev = rcu_dereference(conf->mirrors[disk].rdev);
524 if (r1_bio->bios[disk] == IO_BLOCKED 540 if (r1_bio->bios[disk] == IO_BLOCKED
@@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
577 } else 593 } else
578 best_good_sectors = sectors; 594 best_good_sectors = sectors;
579 595
596 nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
597 has_nonrot_disk |= nonrot;
598 pending = atomic_read(&rdev->nr_pending);
580 dist = abs(this_sector - conf->mirrors[disk].head_position); 599 dist = abs(this_sector - conf->mirrors[disk].head_position);
581 if (choose_first 600 if (choose_first) {
582 /* Don't change to another disk for sequential reads */ 601 best_disk = disk;
583 || conf->next_seq_sect == this_sector 602 break;
584 || dist == 0 603 }
585 /* If device is idle, use it */ 604 /* Don't change to another disk for sequential reads */
586 || atomic_read(&rdev->nr_pending) == 0) { 605 if (conf->mirrors[disk].next_seq_sect == this_sector
606 || dist == 0) {
607 int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
608 struct raid1_info *mirror = &conf->mirrors[disk];
609
610 best_disk = disk;
611 /*
612 * If the buffered sequential IO size exceeds the optimal
613 * iosize, check whether there is an idle disk and, if so,
614 * choose it. read_balance may already have chosen an idle
615 * disk before noticing this is sequential IO on this disk.
616 * That doesn't matter: this disk will go idle, and it will
617 * be used again once the first disk's IO size exceeds the
618 * optimal iosize. This way the first disk's iosize is at
619 * least the optimal iosize. The second disk's iosize might
620 * be small, but that is not a big deal since, by the time
621 * the second disk starts IO, the first disk is likely
622 * still busy.
623 */
624 if (nonrot && opt_iosize > 0 &&
625 mirror->seq_start != MaxSector &&
626 mirror->next_seq_sect > opt_iosize &&
627 mirror->next_seq_sect - opt_iosize >=
628 mirror->seq_start) {
629 choose_next_idle = 1;
630 continue;
631 }
632 break;
633 }
634 /* If device is idle, use it */
635 if (pending == 0) {
587 best_disk = disk; 636 best_disk = disk;
588 break; 637 break;
589 } 638 }
639
640 if (choose_next_idle)
641 continue;
642
643 if (min_pending > pending) {
644 min_pending = pending;
645 best_pending_disk = disk;
646 }
647
590 if (dist < best_dist) { 648 if (dist < best_dist) {
591 best_dist = dist; 649 best_dist = dist;
592 best_disk = disk; 650 best_dist_disk = disk;
593 } 651 }
594 } 652 }
595 653
654 /*
655 * If all disks are rotational, choose the closest disk. If any disk is
656 * non-rotational, choose the disk with the fewest pending requests even
657 * if that disk is rotational, which may or may not be optimal for arrays
658 * with mixed rotational/non-rotational disks depending on the workload.
659 */
660 if (best_disk == -1) {
661 if (has_nonrot_disk)
662 best_disk = best_pending_disk;
663 else
664 best_disk = best_dist_disk;
665 }
666
596 if (best_disk >= 0) { 667 if (best_disk >= 0) {
597 rdev = rcu_dereference(conf->mirrors[best_disk].rdev); 668 rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
598 if (!rdev) 669 if (!rdev)
@@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
606 goto retry; 677 goto retry;
607 } 678 }
608 sectors = best_good_sectors; 679 sectors = best_good_sectors;
609 conf->next_seq_sect = this_sector + sectors; 680
610 conf->last_used = best_disk; 681 if (conf->mirrors[best_disk].next_seq_sect != this_sector)
682 conf->mirrors[best_disk].seq_start = this_sector;
683
684 conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
611 } 685 }
612 rcu_read_unlock(); 686 rcu_read_unlock();
613 *max_sectors = sectors; 687 *max_sectors = sectors;
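Stripped of bad-block handling, sequential-read detection and the opt_iosize special case, the fallback policy implemented by the hunks above reduces to the following stand-alone model (user-space C, purely illustrative):

    #include <limits.h>
    #include <stdbool.h>

    struct disk_model {
            bool nonrot;            /* non-rotational (SSD)? */
            unsigned pending;       /* in-flight requests */
            unsigned long dist;     /* head distance to the target sector */
    };

    /* Returns the index to read from, or -1 if no disk is usable. */
    static int pick_disk(const struct disk_model *d, int n)
    {
            int best_pending_disk = -1, best_dist_disk = -1;
            unsigned min_pending = UINT_MAX;
            unsigned long best_dist = ULONG_MAX;
            bool has_nonrot = false;
            int i;

            for (i = 0; i < n; i++) {
                    has_nonrot |= d[i].nonrot;
                    if (d[i].pending == 0)
                            return i;               /* idle disk wins outright */
                    if (d[i].pending < min_pending) {
                            min_pending = d[i].pending;
                            best_pending_disk = i;
                    }
                    if (d[i].dist < best_dist) {
                            best_dist = d[i].dist;
                            best_dist_disk = i;
                    }
            }

            /* Any SSD present: least-loaded disk; all rotational: closest head. */
            return has_nonrot ? best_pending_disk : best_dist_disk;
    }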
@@ -911,7 +985,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
911static void make_request(struct mddev *mddev, struct bio * bio) 985static void make_request(struct mddev *mddev, struct bio * bio)
912{ 986{
913 struct r1conf *conf = mddev->private; 987 struct r1conf *conf = mddev->private;
914 struct mirror_info *mirror; 988 struct raid1_info *mirror;
915 struct r1bio *r1_bio; 989 struct r1bio *r1_bio;
916 struct bio *read_bio; 990 struct bio *read_bio;
917 int i, disks; 991 int i, disks;
@@ -1415,7 +1489,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1415 struct r1conf *conf = mddev->private; 1489 struct r1conf *conf = mddev->private;
1416 int err = -EEXIST; 1490 int err = -EEXIST;
1417 int mirror = 0; 1491 int mirror = 0;
1418 struct mirror_info *p; 1492 struct raid1_info *p;
1419 int first = 0; 1493 int first = 0;
1420 int last = conf->raid_disks - 1; 1494 int last = conf->raid_disks - 1;
1421 struct request_queue *q = bdev_get_queue(rdev->bdev); 1495 struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1484,7 +1558,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1484 struct r1conf *conf = mddev->private; 1558 struct r1conf *conf = mddev->private;
1485 int err = 0; 1559 int err = 0;
1486 int number = rdev->raid_disk; 1560 int number = rdev->raid_disk;
1487 struct mirror_info *p = conf->mirrors+ number; 1561 struct raid1_info *p = conf->mirrors + number;
1488 1562
1489 if (rdev != p->rdev) 1563 if (rdev != p->rdev)
1490 p = conf->mirrors + conf->raid_disks + number; 1564 p = conf->mirrors + conf->raid_disks + number;
@@ -2421,6 +2495,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2421 bio->bi_rw = READ; 2495 bio->bi_rw = READ;
2422 bio->bi_end_io = end_sync_read; 2496 bio->bi_end_io = end_sync_read;
2423 read_targets++; 2497 read_targets++;
2498 } else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
2499 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2500 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
2501 /*
2502 * The device is suitable for reading (InSync),
2503 * but has bad block(s) here. Let's try to correct them,
2504 * if we are doing resync or repair. Otherwise, leave
2505 * this device alone for this sync request.
2506 */
2507 bio->bi_rw = WRITE;
2508 bio->bi_end_io = end_sync_write;
2509 write_targets++;
2424 } 2510 }
2425 } 2511 }
2426 if (bio->bi_end_io) { 2512 if (bio->bi_end_io) {
@@ -2478,7 +2564,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2478 /* There is nowhere to write, so all non-sync 2564 /* There is nowhere to write, so all non-sync
2479 * drives must be failed - so we are finished 2565 * drives must be failed - so we are finished
2480 */ 2566 */
2481 sector_t rv = max_sector - sector_nr; 2567 sector_t rv;
2568 if (min_bad > 0)
2569 max_sector = sector_nr + min_bad;
2570 rv = max_sector - sector_nr;
2482 *skipped = 1; 2571 *skipped = 1;
2483 put_buf(r1_bio); 2572 put_buf(r1_bio);
2484 return rv; 2573 return rv;
@@ -2571,7 +2660,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2571{ 2660{
2572 struct r1conf *conf; 2661 struct r1conf *conf;
2573 int i; 2662 int i;
2574 struct mirror_info *disk; 2663 struct raid1_info *disk;
2575 struct md_rdev *rdev; 2664 struct md_rdev *rdev;
2576 int err = -ENOMEM; 2665 int err = -ENOMEM;
2577 2666
@@ -2579,7 +2668,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2579 if (!conf) 2668 if (!conf)
2580 goto abort; 2669 goto abort;
2581 2670
2582 conf->mirrors = kzalloc(sizeof(struct mirror_info) 2671 conf->mirrors = kzalloc(sizeof(struct raid1_info)
2583 * mddev->raid_disks * 2, 2672 * mddev->raid_disks * 2,
2584 GFP_KERNEL); 2673 GFP_KERNEL);
2585 if (!conf->mirrors) 2674 if (!conf->mirrors)
@@ -2622,6 +2711,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2622 mddev->merge_check_needed = 1; 2711 mddev->merge_check_needed = 1;
2623 2712
2624 disk->head_position = 0; 2713 disk->head_position = 0;
2714 disk->seq_start = MaxSector;
2625 } 2715 }
2626 conf->raid_disks = mddev->raid_disks; 2716 conf->raid_disks = mddev->raid_disks;
2627 conf->mddev = mddev; 2717 conf->mddev = mddev;
@@ -2635,7 +2725,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2635 conf->recovery_disabled = mddev->recovery_disabled - 1; 2725 conf->recovery_disabled = mddev->recovery_disabled - 1;
2636 2726
2637 err = -EIO; 2727 err = -EIO;
2638 conf->last_used = -1;
2639 for (i = 0; i < conf->raid_disks * 2; i++) { 2728 for (i = 0; i < conf->raid_disks * 2; i++) {
2640 2729
2641 disk = conf->mirrors + i; 2730 disk = conf->mirrors + i;
@@ -2661,19 +2750,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2661 if (disk->rdev && 2750 if (disk->rdev &&
2662 (disk->rdev->saved_raid_disk < 0)) 2751 (disk->rdev->saved_raid_disk < 0))
2663 conf->fullsync = 1; 2752 conf->fullsync = 1;
2664 } else if (conf->last_used < 0) 2753 }
2665 /*
2666 * The first working device is used as a
2667 * starting point to read balancing.
2668 */
2669 conf->last_used = i;
2670 } 2754 }
2671 2755
2672 if (conf->last_used < 0) {
2673 printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
2674 mdname(mddev));
2675 goto abort;
2676 }
2677 err = -ENOMEM; 2756 err = -ENOMEM;
2678 conf->thread = md_register_thread(raid1d, mddev, "raid1"); 2757 conf->thread = md_register_thread(raid1d, mddev, "raid1");
2679 if (!conf->thread) { 2758 if (!conf->thread) {
@@ -2848,7 +2927,7 @@ static int raid1_reshape(struct mddev *mddev)
2848 */ 2927 */
2849 mempool_t *newpool, *oldpool; 2928 mempool_t *newpool, *oldpool;
2850 struct pool_info *newpoolinfo; 2929 struct pool_info *newpoolinfo;
2851 struct mirror_info *newmirrors; 2930 struct raid1_info *newmirrors;
2852 struct r1conf *conf = mddev->private; 2931 struct r1conf *conf = mddev->private;
2853 int cnt, raid_disks; 2932 int cnt, raid_disks;
2854 unsigned long flags; 2933 unsigned long flags;
@@ -2891,7 +2970,7 @@ static int raid1_reshape(struct mddev *mddev)
2891 kfree(newpoolinfo); 2970 kfree(newpoolinfo);
2892 return -ENOMEM; 2971 return -ENOMEM;
2893 } 2972 }
2894 newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2, 2973 newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
2895 GFP_KERNEL); 2974 GFP_KERNEL);
2896 if (!newmirrors) { 2975 if (!newmirrors) {
2897 kfree(newpoolinfo); 2976 kfree(newpoolinfo);
@@ -2930,7 +3009,6 @@ static int raid1_reshape(struct mddev *mddev)
2930 conf->raid_disks = mddev->raid_disks = raid_disks; 3009 conf->raid_disks = mddev->raid_disks = raid_disks;
2931 mddev->delta_disks = 0; 3010 mddev->delta_disks = 0;
2932 3011
2933 conf->last_used = 0; /* just make sure it is in-range */
2934 lower_barrier(conf); 3012 lower_barrier(conf);
2935 3013
2936 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3014 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 80ded139314c..0ff3715fb7eb 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -1,9 +1,15 @@
1#ifndef _RAID1_H 1#ifndef _RAID1_H
2#define _RAID1_H 2#define _RAID1_H
3 3
4struct mirror_info { 4struct raid1_info {
5 struct md_rdev *rdev; 5 struct md_rdev *rdev;
6 sector_t head_position; 6 sector_t head_position;
7
8 /* When choose the best device for a read (read_balance())
9 * we try to keep sequential reads one the same device
10 */
11 sector_t next_seq_sect;
12 sector_t seq_start;
7}; 13};
8 14
9/* 15/*
@@ -24,17 +30,11 @@ struct pool_info {
24 30
25struct r1conf { 31struct r1conf {
26 struct mddev *mddev; 32 struct mddev *mddev;
27 struct mirror_info *mirrors; /* twice 'raid_disks' to 33 struct raid1_info *mirrors; /* twice 'raid_disks' to
28 * allow for replacements. 34 * allow for replacements.
29 */ 35 */
30 int raid_disks; 36 int raid_disks;
31 37
32 /* When choose the best device for a read (read_balance())
33 * we try to keep sequential reads one the same device
34 * using 'last_used' and 'next_seq_sect'
35 */
36 int last_used;
37 sector_t next_seq_sect;
38 /* During resync, read_balancing is only allowed on the part 38 /* During resync, read_balancing is only allowed on the part
39 * of the array that has been resynced. 'next_resync' tells us 39 * of the array that has been resynced. 'next_resync' tells us
40 * where that is. 40 * where that is.
@@ -135,20 +135,6 @@ struct r1bio {
135 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ 135 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
136}; 136};
137 137
138/* when we get a read error on a read-only array, we redirect to another
139 * device without failing the first device, or trying to over-write to
140 * correct the read error. To keep track of bad blocks on a per-bio
141 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
142 */
143#define IO_BLOCKED ((struct bio *)1)
144/* When we successfully write to a known bad-block, we need to remove the
145 * bad-block marking which must be done from process context. So we record
146 * the success by setting bios[n] to IO_MADE_GOOD
147 */
148#define IO_MADE_GOOD ((struct bio *)2)
149
150#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
151
152/* bits for r1bio.state */ 138/* bits for r1bio.state */
153#define R1BIO_Uptodate 0 139#define R1BIO_Uptodate 0
154#define R1BIO_IsSync 1 140#define R1BIO_IsSync 1
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5d33603a497d..de5ed6fd8806 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -60,7 +60,21 @@
60 */ 60 */
61#define NR_RAID10_BIOS 256 61#define NR_RAID10_BIOS 256
62 62
63/* When there are this many requests queue to be written by 63/* when we get a read error on a read-only array, we redirect to another
64 * device without failing the first device, or trying to over-write to
65 * correct the read error. To keep track of bad blocks on a per-bio
66 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
67 */
68#define IO_BLOCKED ((struct bio *)1)
69/* When we successfully write to a known bad-block, we need to remove the
70 * bad-block marking which must be done from process context. So we record
71 * the success by setting devs[n].bio to IO_MADE_GOOD
72 */
73#define IO_MADE_GOOD ((struct bio *)2)
74
75#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
76
77/* When there are this many requests queued to be written by
64 * the raid10 thread, we become 'congested' to provide back-pressure 78 * the raid10 thread, we become 'congested' to provide back-pressure
65 * for writeback. 79 * for writeback.
66 */ 80 */
@@ -717,7 +731,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
717 int sectors = r10_bio->sectors; 731 int sectors = r10_bio->sectors;
718 int best_good_sectors; 732 int best_good_sectors;
719 sector_t new_distance, best_dist; 733 sector_t new_distance, best_dist;
720 struct md_rdev *rdev, *best_rdev; 734 struct md_rdev *best_rdev, *rdev = NULL;
721 int do_balance; 735 int do_balance;
722 int best_slot; 736 int best_slot;
723 struct geom *geo = &conf->geo; 737 struct geom *geo = &conf->geo;
@@ -839,9 +853,8 @@ retry:
839 return rdev; 853 return rdev;
840} 854}
841 855
842static int raid10_congested(void *data, int bits) 856int md_raid10_congested(struct mddev *mddev, int bits)
843{ 857{
844 struct mddev *mddev = data;
845 struct r10conf *conf = mddev->private; 858 struct r10conf *conf = mddev->private;
846 int i, ret = 0; 859 int i, ret = 0;
847 860
@@ -849,8 +862,6 @@ static int raid10_congested(void *data, int bits)
849 conf->pending_count >= max_queued_requests) 862 conf->pending_count >= max_queued_requests)
850 return 1; 863 return 1;
851 864
852 if (mddev_congested(mddev, bits))
853 return 1;
854 rcu_read_lock(); 865 rcu_read_lock();
855 for (i = 0; 866 for (i = 0;
856 (i < conf->geo.raid_disks || i < conf->prev.raid_disks) 867 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
@@ -866,6 +877,15 @@ static int raid10_congested(void *data, int bits)
866 rcu_read_unlock(); 877 rcu_read_unlock();
867 return ret; 878 return ret;
868} 879}
880EXPORT_SYMBOL_GPL(md_raid10_congested);
881
882static int raid10_congested(void *data, int bits)
883{
884 struct mddev *mddev = data;
885
886 return mddev_congested(mddev, bits) ||
887 md_raid10_congested(mddev, bits);
888}
869 889
870static void flush_pending_writes(struct r10conf *conf) 890static void flush_pending_writes(struct r10conf *conf)
871{ 891{
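md_raid10_congested() is now visible outside raid10.c, so a driver that stacks on top of an MD raid10 personality can fold raid10's congestion state into its own congested callback. A rough sketch (the surrounding structure and callback are hypothetical, not defined by this patch):

    /* Sketch: a hypothetical stacking driver's congested callback. */
    struct example_stack {
            struct mddev md;        /* embedded mddev running the raid10 personality */
    };

    static int example_stack_congested(void *data, int bits)
    {
            struct example_stack *es = data;

            /* Delegate the per-device congestion checks to raid10. */
            return md_raid10_congested(&es->md, bits);
    }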
@@ -1546,7 +1566,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1546static void print_conf(struct r10conf *conf) 1566static void print_conf(struct r10conf *conf)
1547{ 1567{
1548 int i; 1568 int i;
1549 struct mirror_info *tmp; 1569 struct raid10_info *tmp;
1550 1570
1551 printk(KERN_DEBUG "RAID10 conf printout:\n"); 1571 printk(KERN_DEBUG "RAID10 conf printout:\n");
1552 if (!conf) { 1572 if (!conf) {
@@ -1580,7 +1600,7 @@ static int raid10_spare_active(struct mddev *mddev)
1580{ 1600{
1581 int i; 1601 int i;
1582 struct r10conf *conf = mddev->private; 1602 struct r10conf *conf = mddev->private;
1583 struct mirror_info *tmp; 1603 struct raid10_info *tmp;
1584 int count = 0; 1604 int count = 0;
1585 unsigned long flags; 1605 unsigned long flags;
1586 1606
@@ -1655,7 +1675,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1655 else 1675 else
1656 mirror = first; 1676 mirror = first;
1657 for ( ; mirror <= last ; mirror++) { 1677 for ( ; mirror <= last ; mirror++) {
1658 struct mirror_info *p = &conf->mirrors[mirror]; 1678 struct raid10_info *p = &conf->mirrors[mirror];
1659 if (p->recovery_disabled == mddev->recovery_disabled) 1679 if (p->recovery_disabled == mddev->recovery_disabled)
1660 continue; 1680 continue;
1661 if (p->rdev) { 1681 if (p->rdev) {
@@ -1709,7 +1729,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1709 int err = 0; 1729 int err = 0;
1710 int number = rdev->raid_disk; 1730 int number = rdev->raid_disk;
1711 struct md_rdev **rdevp; 1731 struct md_rdev **rdevp;
1712 struct mirror_info *p = conf->mirrors + number; 1732 struct raid10_info *p = conf->mirrors + number;
1713 1733
1714 print_conf(conf); 1734 print_conf(conf);
1715 if (rdev == p->rdev) 1735 if (rdev == p->rdev)
@@ -2875,7 +2895,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2875 sector_t sect; 2895 sector_t sect;
2876 int must_sync; 2896 int must_sync;
2877 int any_working; 2897 int any_working;
2878 struct mirror_info *mirror = &conf->mirrors[i]; 2898 struct raid10_info *mirror = &conf->mirrors[i];
2879 2899
2880 if ((mirror->rdev == NULL || 2900 if ((mirror->rdev == NULL ||
2881 test_bit(In_sync, &mirror->rdev->flags)) 2901 test_bit(In_sync, &mirror->rdev->flags))
@@ -3387,7 +3407,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3387 goto out; 3407 goto out;
3388 3408
3389 /* FIXME calc properly */ 3409 /* FIXME calc properly */
3390 conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks + 3410 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3391 max(0,mddev->delta_disks)), 3411 max(0,mddev->delta_disks)),
3392 GFP_KERNEL); 3412 GFP_KERNEL);
3393 if (!conf->mirrors) 3413 if (!conf->mirrors)
@@ -3451,7 +3471,7 @@ static int run(struct mddev *mddev)
3451{ 3471{
3452 struct r10conf *conf; 3472 struct r10conf *conf;
3453 int i, disk_idx, chunk_size; 3473 int i, disk_idx, chunk_size;
3454 struct mirror_info *disk; 3474 struct raid10_info *disk;
3455 struct md_rdev *rdev; 3475 struct md_rdev *rdev;
3456 sector_t size; 3476 sector_t size;
3457 sector_t min_offset_diff = 0; 3477 sector_t min_offset_diff = 0;
@@ -3471,12 +3491,14 @@ static int run(struct mddev *mddev)
3471 conf->thread = NULL; 3491 conf->thread = NULL;
3472 3492
3473 chunk_size = mddev->chunk_sectors << 9; 3493 chunk_size = mddev->chunk_sectors << 9;
3474 blk_queue_io_min(mddev->queue, chunk_size); 3494 if (mddev->queue) {
3475 if (conf->geo.raid_disks % conf->geo.near_copies) 3495 blk_queue_io_min(mddev->queue, chunk_size);
3476 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); 3496 if (conf->geo.raid_disks % conf->geo.near_copies)
3477 else 3497 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3478 blk_queue_io_opt(mddev->queue, chunk_size * 3498 else
3479 (conf->geo.raid_disks / conf->geo.near_copies)); 3499 blk_queue_io_opt(mddev->queue, chunk_size *
3500 (conf->geo.raid_disks / conf->geo.near_copies));
3501 }
3480 3502
3481 rdev_for_each(rdev, mddev) { 3503 rdev_for_each(rdev, mddev) {
3482 long long diff; 3504 long long diff;
@@ -3510,8 +3532,9 @@ static int run(struct mddev *mddev)
3510 if (first || diff < min_offset_diff) 3532 if (first || diff < min_offset_diff)
3511 min_offset_diff = diff; 3533 min_offset_diff = diff;
3512 3534
3513 disk_stack_limits(mddev->gendisk, rdev->bdev, 3535 if (mddev->gendisk)
3514 rdev->data_offset << 9); 3536 disk_stack_limits(mddev->gendisk, rdev->bdev,
3537 rdev->data_offset << 9);
3515 3538
3516 disk->head_position = 0; 3539 disk->head_position = 0;
3517 } 3540 }
@@ -3574,22 +3597,22 @@ static int run(struct mddev *mddev)
3574 md_set_array_sectors(mddev, size); 3597 md_set_array_sectors(mddev, size);
3575 mddev->resync_max_sectors = size; 3598 mddev->resync_max_sectors = size;
3576 3599
3577 mddev->queue->backing_dev_info.congested_fn = raid10_congested; 3600 if (mddev->queue) {
3578 mddev->queue->backing_dev_info.congested_data = mddev;
3579
3580 /* Calculate max read-ahead size.
3581 * We need to readahead at least twice a whole stripe....
3582 * maybe...
3583 */
3584 {
3585 int stripe = conf->geo.raid_disks * 3601 int stripe = conf->geo.raid_disks *
3586 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 3602 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3603 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3604 mddev->queue->backing_dev_info.congested_data = mddev;
3605
3606 /* Calculate max read-ahead size.
3607 * We need to readahead at least twice a whole stripe....
3608 * maybe...
3609 */
3587 stripe /= conf->geo.near_copies; 3610 stripe /= conf->geo.near_copies;
3588 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 3611 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3589 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 3612 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3613 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3590 } 3614 }
3591 3615
3592 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3593 3616
3594 if (md_integrity_register(mddev)) 3617 if (md_integrity_register(mddev))
3595 goto out_free_conf; 3618 goto out_free_conf;
@@ -3640,7 +3663,10 @@ static int stop(struct mddev *mddev)
3640 lower_barrier(conf); 3663 lower_barrier(conf);
3641 3664
3642 md_unregister_thread(&mddev->thread); 3665 md_unregister_thread(&mddev->thread);
3643 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 3666 if (mddev->queue)
3667 /* the unplug fn references 'conf'*/
3668 blk_sync_queue(mddev->queue);
3669
3644 if (conf->r10bio_pool) 3670 if (conf->r10bio_pool)
3645 mempool_destroy(conf->r10bio_pool); 3671 mempool_destroy(conf->r10bio_pool);
3646 kfree(conf->mirrors); 3672 kfree(conf->mirrors);
@@ -3804,7 +3830,7 @@ static int raid10_check_reshape(struct mddev *mddev)
3804 if (mddev->delta_disks > 0) { 3830 if (mddev->delta_disks > 0) {
3805 /* allocate new 'mirrors' list */ 3831 /* allocate new 'mirrors' list */
3806 conf->mirrors_new = kzalloc( 3832 conf->mirrors_new = kzalloc(
3807 sizeof(struct mirror_info) 3833 sizeof(struct raid10_info)
3808 *(mddev->raid_disks + 3834 *(mddev->raid_disks +
3809 mddev->delta_disks), 3835 mddev->delta_disks),
3810 GFP_KERNEL); 3836 GFP_KERNEL);
@@ -3929,7 +3955,7 @@ static int raid10_start_reshape(struct mddev *mddev)
3929 spin_lock_irq(&conf->device_lock); 3955 spin_lock_irq(&conf->device_lock);
3930 if (conf->mirrors_new) { 3956 if (conf->mirrors_new) {
3931 memcpy(conf->mirrors_new, conf->mirrors, 3957 memcpy(conf->mirrors_new, conf->mirrors,
3932 sizeof(struct mirror_info)*conf->prev.raid_disks); 3958 sizeof(struct raid10_info)*conf->prev.raid_disks);
3933 smp_mb(); 3959 smp_mb();
3934 kfree(conf->mirrors_old); /* FIXME and elsewhere */ 3960 kfree(conf->mirrors_old); /* FIXME and elsewhere */
3935 conf->mirrors_old = conf->mirrors; 3961 conf->mirrors_old = conf->mirrors;
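
A minimal userspace sketch of the grow-and-publish pattern visible in raid10_check_reshape()/raid10_start_reshape() above: a larger mirrors array is allocated up front, the old entries are copied across, and a barrier orders the copy before the new array can be observed (the actual pointer swap and the deferred kfree of mirrors_old happen outside this excerpt). The sketch uses C11 atomics in place of the kernel's smp_mb(), and all names are invented for illustration:

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct disk_info { int head_position; };

struct conf {
    _Atomic(struct disk_info *) mirrors;
    int raid_disks;
};

/* Grow the per-disk array: allocate, copy, then publish with a release
 * store so a reader never sees a half-copied array. */
static int grow_mirrors(struct conf *conf, int new_disks)
{
    struct disk_info *old = atomic_load(&conf->mirrors);
    struct disk_info *new = calloc(new_disks, sizeof(*new));

    if (!new)
        return -1;
    memcpy(new, old, sizeof(*new) * conf->raid_disks);
    /* the release store plays the role of smp_mb() + pointer assignment */
    atomic_store_explicit(&conf->mirrors, new, memory_order_release);
    conf->raid_disks = new_disks;
    free(old);              /* the kernel defers this via mirrors_old instead */
    return 0;
}

int main(void)
{
    struct conf conf = { .raid_disks = 2 };
    atomic_store(&conf.mirrors, calloc(2, sizeof(struct disk_info)));
    return grow_mirrors(&conf, 4);
}
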
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 135b1b0a1554..007c2c68dd83 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -1,7 +1,7 @@
1#ifndef _RAID10_H 1#ifndef _RAID10_H
2#define _RAID10_H 2#define _RAID10_H
3 3
4struct mirror_info { 4struct raid10_info {
5 struct md_rdev *rdev, *replacement; 5 struct md_rdev *rdev, *replacement;
6 sector_t head_position; 6 sector_t head_position;
7 int recovery_disabled; /* matches 7 int recovery_disabled; /* matches
@@ -13,8 +13,8 @@ struct mirror_info {
13 13
14struct r10conf { 14struct r10conf {
15 struct mddev *mddev; 15 struct mddev *mddev;
16 struct mirror_info *mirrors; 16 struct raid10_info *mirrors;
17 struct mirror_info *mirrors_new, *mirrors_old; 17 struct raid10_info *mirrors_new, *mirrors_old;
18 spinlock_t device_lock; 18 spinlock_t device_lock;
19 19
20 /* geometry */ 20 /* geometry */
@@ -123,20 +123,6 @@ struct r10bio {
123 } devs[0]; 123 } devs[0];
124}; 124};
125 125
126/* when we get a read error on a read-only array, we redirect to another
127 * device without failing the first device, or trying to over-write to
128 * correct the read error. To keep track of bad blocks on a per-bio
129 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
130 */
131#define IO_BLOCKED ((struct bio*)1)
132/* When we successfully write to a known bad-block, we need to remove the
133 * bad-block marking which must be done from process context. So we record
134 * the success by setting devs[n].bio to IO_MADE_GOOD
135 */
136#define IO_MADE_GOOD ((struct bio *)2)
137
138#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
139
140/* bits for r10bio.state */ 126/* bits for r10bio.state */
141enum r10bio_state { 127enum r10bio_state {
142 R10BIO_Uptodate, 128 R10BIO_Uptodate,
@@ -159,4 +145,7 @@ enum r10bio_state {
159 */ 145 */
160 R10BIO_Previous, 146 R10BIO_Previous,
161}; 147};
148
149extern int md_raid10_congested(struct mddev *mddev, int bits);
150
162#endif 151#endif
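
The IO_BLOCKED / IO_MADE_GOOD / BIO_SPECIAL() definitions are only removed from the header here; where they end up is not part of this excerpt. The trick itself is worth spelling out: tiny integer values cast to a bio pointer mark per-slot state without allocating anything, and BIO_SPECIAL() tells those markers apart from real pointers. A self-contained userspace sketch, with struct bio left opaque and the demo function invented:

#include <stdio.h>

struct bio;                                   /* opaque for this sketch */
#define IO_BLOCKED   ((struct bio *)1)        /* read redirected, slot blocked */
#define IO_MADE_GOOD ((struct bio *)2)        /* bad block cleared after a write */
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

static const char *slot_state(struct bio *b)
{
    if (!b)
        return "idle";
    if (BIO_SPECIAL(b))
        return b == IO_BLOCKED ? "blocked" : "made-good";
    return "real bio in flight";
}

int main(void)
{
    struct bio *slots[3] = { NULL, IO_BLOCKED, IO_MADE_GOOD };
    for (int i = 0; i < 3; i++)
        printf("slot %d: %s\n", i, slot_state(slots[i]));
    return 0;
}
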
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9e41ae37bd40..adda94df5eb2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -99,34 +99,40 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
99 * We maintain a biased count of active stripes in the bottom 16 bits of 99 * We maintain a biased count of active stripes in the bottom 16 bits of
100 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 100 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
101 */ 101 */
102static inline int raid5_bi_phys_segments(struct bio *bio) 102static inline int raid5_bi_processed_stripes(struct bio *bio)
103{ 103{
104 return bio->bi_phys_segments & 0xffff; 104 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
105 return (atomic_read(segments) >> 16) & 0xffff;
105} 106}
106 107
107static inline int raid5_bi_hw_segments(struct bio *bio) 108static inline int raid5_dec_bi_active_stripes(struct bio *bio)
108{ 109{
109 return (bio->bi_phys_segments >> 16) & 0xffff; 110 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
111 return atomic_sub_return(1, segments) & 0xffff;
110} 112}
111 113
112static inline int raid5_dec_bi_phys_segments(struct bio *bio) 114static inline void raid5_inc_bi_active_stripes(struct bio *bio)
113{ 115{
114 --bio->bi_phys_segments; 116 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
115 return raid5_bi_phys_segments(bio); 117 atomic_inc(segments);
116} 118}
117 119
118static inline int raid5_dec_bi_hw_segments(struct bio *bio) 120static inline void raid5_set_bi_processed_stripes(struct bio *bio,
121 unsigned int cnt)
119{ 122{
120 unsigned short val = raid5_bi_hw_segments(bio); 123 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
124 int old, new;
121 125
122 --val; 126 do {
123 bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); 127 old = atomic_read(segments);
124 return val; 128 new = (old & 0xffff) | (cnt << 16);
129 } while (atomic_cmpxchg(segments, old, new) != old);
125} 130}
126 131
127static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) 132static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
128{ 133{
129 bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); 134 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
135 atomic_set(segments, cnt);
130} 136}
131 137
132/* Find first data disk in a raid6 stripe */ 138/* Find first data disk in a raid6 stripe */
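
The rewritten helpers above pack two 16-bit counters into the single 32-bit bi_phys_segments field and update them lock-free: the low half counts active stripes, the high half counts processed stripes, and the setter uses a cmpxchg loop so it never clobbers the other half. A userspace sketch of the same packing with C11 atomics, where a plain global stands in for the bio field:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint segments;        /* stands in for bio->bi_phys_segments */

static unsigned processed_stripes(void)
{
    return (atomic_load(&segments) >> 16) & 0xffff;
}

static unsigned dec_active_stripes(void)
{
    /* return the remaining active count, like the kernel helper */
    return (atomic_fetch_sub(&segments, 1) - 1) & 0xffff;
}

static void inc_active_stripes(void)
{
    atomic_fetch_add(&segments, 1);
}

static void set_processed_stripes(unsigned cnt)
{
    unsigned old, new;
    do {
        old = atomic_load(&segments);
        new = (old & 0xffff) | (cnt << 16);   /* keep the active half intact */
    } while (!atomic_compare_exchange_weak(&segments, &old, new));
}

int main(void)
{
    atomic_store(&segments, 1);     /* biased count of active stripes */
    inc_active_stripes();
    set_processed_stripes(7);
    printf("processed=%u active-after-dec=%u\n",
           processed_stripes(), dec_active_stripes());
    return 0;
}
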
@@ -190,49 +196,56 @@ static int stripe_operations_active(struct stripe_head *sh)
190 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 196 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
191} 197}
192 198
193static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 199static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
194{ 200{
195 if (atomic_dec_and_test(&sh->count)) { 201 BUG_ON(!list_empty(&sh->lru));
196 BUG_ON(!list_empty(&sh->lru)); 202 BUG_ON(atomic_read(&conf->active_stripes)==0);
197 BUG_ON(atomic_read(&conf->active_stripes)==0); 203 if (test_bit(STRIPE_HANDLE, &sh->state)) {
198 if (test_bit(STRIPE_HANDLE, &sh->state)) { 204 if (test_bit(STRIPE_DELAYED, &sh->state) &&
199 if (test_bit(STRIPE_DELAYED, &sh->state) && 205 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
200 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 206 list_add_tail(&sh->lru, &conf->delayed_list);
201 list_add_tail(&sh->lru, &conf->delayed_list); 207 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
202 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 208 sh->bm_seq - conf->seq_write > 0)
203 sh->bm_seq - conf->seq_write > 0) 209 list_add_tail(&sh->lru, &conf->bitmap_list);
204 list_add_tail(&sh->lru, &conf->bitmap_list); 210 else {
205 else { 211 clear_bit(STRIPE_DELAYED, &sh->state);
206 clear_bit(STRIPE_DELAYED, &sh->state); 212 clear_bit(STRIPE_BIT_DELAY, &sh->state);
207 clear_bit(STRIPE_BIT_DELAY, &sh->state); 213 list_add_tail(&sh->lru, &conf->handle_list);
208 list_add_tail(&sh->lru, &conf->handle_list); 214 }
209 } 215 md_wakeup_thread(conf->mddev->thread);
210 md_wakeup_thread(conf->mddev->thread); 216 } else {
211 } else { 217 BUG_ON(stripe_operations_active(sh));
212 BUG_ON(stripe_operations_active(sh)); 218 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
213 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 219 if (atomic_dec_return(&conf->preread_active_stripes)
214 if (atomic_dec_return(&conf->preread_active_stripes) 220 < IO_THRESHOLD)
215 < IO_THRESHOLD) 221 md_wakeup_thread(conf->mddev->thread);
216 md_wakeup_thread(conf->mddev->thread); 222 atomic_dec(&conf->active_stripes);
217 atomic_dec(&conf->active_stripes); 223 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
218 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 224 list_add_tail(&sh->lru, &conf->inactive_list);
219 list_add_tail(&sh->lru, &conf->inactive_list); 225 wake_up(&conf->wait_for_stripe);
220 wake_up(&conf->wait_for_stripe); 226 if (conf->retry_read_aligned)
221 if (conf->retry_read_aligned) 227 md_wakeup_thread(conf->mddev->thread);
222 md_wakeup_thread(conf->mddev->thread);
223 }
224 } 228 }
225 } 229 }
226} 230}
227 231
232static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
233{
234 if (atomic_dec_and_test(&sh->count))
235 do_release_stripe(conf, sh);
236}
237
228static void release_stripe(struct stripe_head *sh) 238static void release_stripe(struct stripe_head *sh)
229{ 239{
230 struct r5conf *conf = sh->raid_conf; 240 struct r5conf *conf = sh->raid_conf;
231 unsigned long flags; 241 unsigned long flags;
232 242
233 spin_lock_irqsave(&conf->device_lock, flags); 243 local_irq_save(flags);
234 __release_stripe(conf, sh); 244 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
235 spin_unlock_irqrestore(&conf->device_lock, flags); 245 do_release_stripe(conf, sh);
246 spin_unlock(&conf->device_lock);
247 }
248 local_irq_restore(flags);
236} 249}
237 250
238static inline void remove_hash(struct stripe_head *sh) 251static inline void remove_hash(struct stripe_head *sh)
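
release_stripe() above now relies on atomic_dec_and_lock(): in the common case, where the stripe count stays above zero, conf->device_lock is never touched, and only the final reference takes the lock and runs the newly split-out do_release_stripe(). A userspace approximation of that primitive with C11 atomics and a pthread mutex; the real kernel helper is more careful than this sketch:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Decrement *count; if it reaches zero, return true with *lock held. */
static bool dec_and_lock(atomic_int *count, pthread_mutex_t *lock)
{
    int old = atomic_load(count);

    /* fast path: clearly not the last reference, decrement without the lock */
    while (old > 1) {
        if (atomic_compare_exchange_weak(count, &old, old - 1))
            return false;           /* lock never taken */
    }
    /* slow path: might be the last reference, decide under the lock */
    pthread_mutex_lock(lock);
    if (atomic_fetch_sub(count, 1) == 1)
        return true;                /* last reference, lock stays held */
    pthread_mutex_unlock(lock);
    return false;
}

int main(void)
{
    pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    atomic_int count = 2;

    dec_and_lock(&count, &lock);    /* not last: lock never taken */
    if (dec_and_lock(&count, &lock))
        pthread_mutex_unlock(&lock);/* last ref: cleanup ran with lock held */
    return 0;
}
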
@@ -641,6 +654,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
641 else 654 else
642 bi->bi_sector = (sh->sector 655 bi->bi_sector = (sh->sector
643 + rdev->data_offset); 656 + rdev->data_offset);
657 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
658 bi->bi_rw |= REQ_FLUSH;
659
644 bi->bi_flags = 1 << BIO_UPTODATE; 660 bi->bi_flags = 1 << BIO_UPTODATE;
645 bi->bi_idx = 0; 661 bi->bi_idx = 0;
646 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 662 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -750,14 +766,12 @@ static void ops_complete_biofill(void *stripe_head_ref)
750{ 766{
751 struct stripe_head *sh = stripe_head_ref; 767 struct stripe_head *sh = stripe_head_ref;
752 struct bio *return_bi = NULL; 768 struct bio *return_bi = NULL;
753 struct r5conf *conf = sh->raid_conf;
754 int i; 769 int i;
755 770
756 pr_debug("%s: stripe %llu\n", __func__, 771 pr_debug("%s: stripe %llu\n", __func__,
757 (unsigned long long)sh->sector); 772 (unsigned long long)sh->sector);
758 773
759 /* clear completed biofills */ 774 /* clear completed biofills */
760 spin_lock_irq(&conf->device_lock);
761 for (i = sh->disks; i--; ) { 775 for (i = sh->disks; i--; ) {
762 struct r5dev *dev = &sh->dev[i]; 776 struct r5dev *dev = &sh->dev[i];
763 777
@@ -775,7 +789,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
775 while (rbi && rbi->bi_sector < 789 while (rbi && rbi->bi_sector <
776 dev->sector + STRIPE_SECTORS) { 790 dev->sector + STRIPE_SECTORS) {
777 rbi2 = r5_next_bio(rbi, dev->sector); 791 rbi2 = r5_next_bio(rbi, dev->sector);
778 if (!raid5_dec_bi_phys_segments(rbi)) { 792 if (!raid5_dec_bi_active_stripes(rbi)) {
779 rbi->bi_next = return_bi; 793 rbi->bi_next = return_bi;
780 return_bi = rbi; 794 return_bi = rbi;
781 } 795 }
@@ -783,7 +797,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
783 } 797 }
784 } 798 }
785 } 799 }
786 spin_unlock_irq(&conf->device_lock);
787 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 800 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
788 801
789 return_io(return_bi); 802 return_io(return_bi);
@@ -795,7 +808,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
795static void ops_run_biofill(struct stripe_head *sh) 808static void ops_run_biofill(struct stripe_head *sh)
796{ 809{
797 struct dma_async_tx_descriptor *tx = NULL; 810 struct dma_async_tx_descriptor *tx = NULL;
798 struct r5conf *conf = sh->raid_conf;
799 struct async_submit_ctl submit; 811 struct async_submit_ctl submit;
800 int i; 812 int i;
801 813
@@ -806,10 +818,10 @@ static void ops_run_biofill(struct stripe_head *sh)
806 struct r5dev *dev = &sh->dev[i]; 818 struct r5dev *dev = &sh->dev[i];
807 if (test_bit(R5_Wantfill, &dev->flags)) { 819 if (test_bit(R5_Wantfill, &dev->flags)) {
808 struct bio *rbi; 820 struct bio *rbi;
809 spin_lock_irq(&conf->device_lock); 821 spin_lock_irq(&sh->stripe_lock);
810 dev->read = rbi = dev->toread; 822 dev->read = rbi = dev->toread;
811 dev->toread = NULL; 823 dev->toread = NULL;
812 spin_unlock_irq(&conf->device_lock); 824 spin_unlock_irq(&sh->stripe_lock);
813 while (rbi && rbi->bi_sector < 825 while (rbi && rbi->bi_sector <
814 dev->sector + STRIPE_SECTORS) { 826 dev->sector + STRIPE_SECTORS) {
815 tx = async_copy_data(0, rbi, dev->page, 827 tx = async_copy_data(0, rbi, dev->page,
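
A userspace sketch of the detach-then-walk pattern the biofill hunks above switch to: the pending chain (dev->toread here) is snapshotted and cleared under the fine-grained per-stripe lock, then walked with no lock held, because once detached nobody else can reach it. All names are invented for the sketch:

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct req { struct req *next; int id; };

struct dev_sketch {
    pthread_mutex_t stripe_lock;    /* plays the role of sh->stripe_lock */
    struct req *toread;             /* pending chain, protected by the lock */
};

static void run_biofill_sketch(struct dev_sketch *dev)
{
    struct req *rbi;

    pthread_mutex_lock(&dev->stripe_lock);
    rbi = dev->toread;              /* detach the whole chain... */
    dev->toread = NULL;
    pthread_mutex_unlock(&dev->stripe_lock);

    while (rbi) {                   /* ...then process it lock-free */
        printf("copying data for request %d\n", rbi->id);
        rbi = rbi->next;
    }
}

int main(void)
{
    struct req r2 = { NULL, 2 }, r1 = { &r2, 1 };
    struct dev_sketch dev = { PTHREAD_MUTEX_INITIALIZER, &r1 };
    run_biofill_sketch(&dev);
    return 0;
}
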
@@ -1145,12 +1157,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1145 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1157 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1146 struct bio *wbi; 1158 struct bio *wbi;
1147 1159
1148 spin_lock_irq(&sh->raid_conf->device_lock); 1160 spin_lock_irq(&sh->stripe_lock);
1149 chosen = dev->towrite; 1161 chosen = dev->towrite;
1150 dev->towrite = NULL; 1162 dev->towrite = NULL;
1151 BUG_ON(dev->written); 1163 BUG_ON(dev->written);
1152 wbi = dev->written = chosen; 1164 wbi = dev->written = chosen;
1153 spin_unlock_irq(&sh->raid_conf->device_lock); 1165 spin_unlock_irq(&sh->stripe_lock);
1154 1166
1155 while (wbi && wbi->bi_sector < 1167 while (wbi && wbi->bi_sector <
1156 dev->sector + STRIPE_SECTORS) { 1168 dev->sector + STRIPE_SECTORS) {
@@ -1455,6 +1467,8 @@ static int grow_one_stripe(struct r5conf *conf)
1455 init_waitqueue_head(&sh->ops.wait_for_ops); 1467 init_waitqueue_head(&sh->ops.wait_for_ops);
1456 #endif 1468 #endif
1457 1469
1470 spin_lock_init(&sh->stripe_lock);
1471
1458 if (grow_buffers(sh)) { 1472 if (grow_buffers(sh)) {
1459 shrink_buffers(sh); 1473 shrink_buffers(sh);
1460 kmem_cache_free(conf->slab_cache, sh); 1474 kmem_cache_free(conf->slab_cache, sh);
@@ -1740,7 +1754,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
1740 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1754 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1741 clear_bit(R5_ReadError, &sh->dev[i].flags); 1755 clear_bit(R5_ReadError, &sh->dev[i].flags);
1742 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1756 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1743 } 1757 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
1758 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1759
1744 if (atomic_read(&rdev->read_errors)) 1760 if (atomic_read(&rdev->read_errors))
1745 atomic_set(&rdev->read_errors, 0); 1761 atomic_set(&rdev->read_errors, 0);
1746 } else { 1762 } else {
@@ -1785,7 +1801,11 @@ static void raid5_end_read_request(struct bio * bi, int error)
1785 else 1801 else
1786 retry = 1; 1802 retry = 1;
1787 if (retry) 1803 if (retry)
1788 set_bit(R5_ReadError, &sh->dev[i].flags); 1804 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
1805 set_bit(R5_ReadError, &sh->dev[i].flags);
1806 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1807 } else
1808 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1789 else { 1809 else {
1790 clear_bit(R5_ReadError, &sh->dev[i].flags); 1810 clear_bit(R5_ReadError, &sh->dev[i].flags);
1791 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1811 clear_bit(R5_ReWrite, &sh->dev[i].flags);
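
The read-error path above now retries in two stages: the first failure only sets R5_ReadNoMerge, so the block is re-read on its own (ops_run_io() ORs in REQ_FLUSH to keep the block layer from merging that bio); only a second failure escalates to R5_ReadError. A small userspace sketch of that flag transition, with everything except the two flags omitted:

#include <stdio.h>

enum { R5_ReadError = 1 << 0, R5_ReadNoMerge = 1 << 1 };

static unsigned handle_failed_read(unsigned flags)
{
    if (flags & R5_ReadNoMerge) {
        /* second failure: escalate to a real read error */
        flags |= R5_ReadError;
        flags &= ~R5_ReadNoMerge;
    } else {
        /* first failure: retry the same block with merging disabled */
        flags |= R5_ReadNoMerge;
    }
    return flags;
}

int main(void)
{
    unsigned flags = 0;
    flags = handle_failed_read(flags);  /* -> retry, R5_ReadNoMerge set */
    flags = handle_failed_read(flags);  /* -> R5_ReadError set */
    printf("final flags: %#x\n", flags);
    return flags & R5_ReadError ? 0 : 1;
}
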
@@ -2341,11 +2361,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2341 (unsigned long long)bi->bi_sector, 2361 (unsigned long long)bi->bi_sector,
2342 (unsigned long long)sh->sector); 2362 (unsigned long long)sh->sector);
2343 2363
2344 2364 /*
2345 spin_lock_irq(&conf->device_lock); 2365 * If several bio share a stripe. The bio bi_phys_segments acts as a
2366 * reference count to avoid race. The reference count should already be
2367 * increased before this function is called (for example, in
2368 * make_request()), so other bio sharing this stripe will not free the
2369 * stripe. If a stripe is owned by one stripe, the stripe lock will
2370 * protect it.
2371 */
2372 spin_lock_irq(&sh->stripe_lock);
2346 if (forwrite) { 2373 if (forwrite) {
2347 bip = &sh->dev[dd_idx].towrite; 2374 bip = &sh->dev[dd_idx].towrite;
2348 if (*bip == NULL && sh->dev[dd_idx].written == NULL) 2375 if (*bip == NULL)
2349 firstwrite = 1; 2376 firstwrite = 1;
2350 } else 2377 } else
2351 bip = &sh->dev[dd_idx].toread; 2378 bip = &sh->dev[dd_idx].toread;
@@ -2361,7 +2388,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2361 if (*bip) 2388 if (*bip)
2362 bi->bi_next = *bip; 2389 bi->bi_next = *bip;
2363 *bip = bi; 2390 *bip = bi;
2364 bi->bi_phys_segments++; 2391 raid5_inc_bi_active_stripes(bi);
2365 2392
2366 if (forwrite) { 2393 if (forwrite) {
2367 /* check if page is covered */ 2394 /* check if page is covered */
@@ -2376,7 +2403,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2376 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2403 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2377 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2404 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2378 } 2405 }
2379 spin_unlock_irq(&conf->device_lock); 2406 spin_unlock_irq(&sh->stripe_lock);
2380 2407
2381 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2408 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2382 (unsigned long long)(*bip)->bi_sector, 2409 (unsigned long long)(*bip)->bi_sector,
@@ -2392,7 +2419,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2392 2419
2393 overlap: 2420 overlap:
2394 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2421 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2395 spin_unlock_irq(&conf->device_lock); 2422 spin_unlock_irq(&sh->stripe_lock);
2396 return 0; 2423 return 0;
2397} 2424}
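
Per the comment added above, add_stripe_bio() can now rely on the caller already holding a reference on the bio (the packed active-stripes count), so queueing the bio onto a device's toread/towrite list only needs the per-stripe sh->stripe_lock rather than the array-wide conf->device_lock. A userspace sketch of that shape, with all names invented and the sorted-insert and overlap checks of the real function omitted:

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct bio_s {
    struct bio_s *bi_next;
    atomic_uint   active_stripes;   /* reference count held by stripes */
};

struct dev_s {
    struct bio_s *towrite;
    struct bio_s *toread;
};

struct stripe_s {
    pthread_mutex_t stripe_lock;    /* plays the role of sh->stripe_lock */
    struct dev_s dev[4];
};

static void add_stripe_bio(struct stripe_s *sh, struct bio_s *bi,
                           int dd_idx, int forwrite)
{
    struct bio_s **bip;

    pthread_mutex_lock(&sh->stripe_lock);
    bip = forwrite ? &sh->dev[dd_idx].towrite : &sh->dev[dd_idx].toread;
    bi->bi_next = *bip;             /* overlap checks omitted in this sketch */
    *bip = bi;
    atomic_fetch_add(&bi->active_stripes, 1);
    pthread_mutex_unlock(&sh->stripe_lock);
}

int main(void)
{
    static struct stripe_s sh = { .stripe_lock = PTHREAD_MUTEX_INITIALIZER };
    struct bio_s bi = { 0 };

    add_stripe_bio(&sh, &bi, 0, 1); /* queue bi as a write on disk 0 */
    return atomic_load(&bi.active_stripes) == 1 ? 0 : 1;
}
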
2398 2425
@@ -2442,10 +2469,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2442 rdev_dec_pending(rdev, conf->mddev); 2469 rdev_dec_pending(rdev, conf->mddev);
2443 } 2470 }
2444 } 2471 }
2445 spin_lock_irq(&conf->device_lock); 2472 spin_lock_irq(&sh->stripe_lock);
2446 /* fail all writes first */ 2473 /* fail all writes first */
2447 bi = sh->dev[i].towrite; 2474 bi = sh->dev[i].towrite;
2448 sh->dev[i].towrite = NULL; 2475 sh->dev[i].towrite = NULL;
2476 spin_unlock_irq(&sh->stripe_lock);
2449 if (bi) { 2477 if (bi) {
2450 s->to_write--; 2478 s->to_write--;
2451 bitmap_end = 1; 2479 bitmap_end = 1;
@@ -2458,13 +2486,17 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2458 sh->dev[i].sector + STRIPE_SECTORS) { 2486 sh->dev[i].sector + STRIPE_SECTORS) {
2459 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2487 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2460 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2488 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2461 if (!raid5_dec_bi_phys_segments(bi)) { 2489 if (!raid5_dec_bi_active_stripes(bi)) {
2462 md_write_end(conf->mddev); 2490 md_write_end(conf->mddev);
2463 bi->bi_next = *return_bi; 2491 bi->bi_next = *return_bi;
2464 *return_bi = bi; 2492 *return_bi = bi;
2465 } 2493 }
2466 bi = nextbi; 2494 bi = nextbi;
2467 } 2495 }
2496 if (bitmap_end)
2497 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2498 STRIPE_SECTORS, 0, 0);
2499 bitmap_end = 0;
2468 /* and fail all 'written' */ 2500 /* and fail all 'written' */
2469 bi = sh->dev[i].written; 2501 bi = sh->dev[i].written;
2470 sh->dev[i].written = NULL; 2502 sh->dev[i].written = NULL;
@@ -2473,7 +2505,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2473 sh->dev[i].sector + STRIPE_SECTORS) { 2505 sh->dev[i].sector + STRIPE_SECTORS) {
2474 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2506 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2475 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2507 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2476 if (!raid5_dec_bi_phys_segments(bi)) { 2508 if (!raid5_dec_bi_active_stripes(bi)) {
2477 md_write_end(conf->mddev); 2509 md_write_end(conf->mddev);
2478 bi->bi_next = *return_bi; 2510 bi->bi_next = *return_bi;
2479 *return_bi = bi; 2511 *return_bi = bi;
@@ -2497,14 +2529,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2497 struct bio *nextbi = 2529 struct bio *nextbi =
2498 r5_next_bio(bi, sh->dev[i].sector); 2530 r5_next_bio(bi, sh->dev[i].sector);
2499 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2531 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2500 if (!raid5_dec_bi_phys_segments(bi)) { 2532 if (!raid5_dec_bi_active_stripes(bi)) {
2501 bi->bi_next = *return_bi; 2533 bi->bi_next = *return_bi;
2502 *return_bi = bi; 2534 *return_bi = bi;
2503 } 2535 }
2504 bi = nextbi; 2536 bi = nextbi;
2505 } 2537 }
2506 } 2538 }
2507 spin_unlock_irq(&conf->device_lock);
2508 if (bitmap_end) 2539 if (bitmap_end)
2509 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2540 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2510 STRIPE_SECTORS, 0, 0); 2541 STRIPE_SECTORS, 0, 0);
@@ -2708,30 +2739,23 @@ static void handle_stripe_clean_event(struct r5conf *conf,
2708 test_bit(R5_UPTODATE, &dev->flags)) { 2739 test_bit(R5_UPTODATE, &dev->flags)) {
2709 /* We can return any write requests */ 2740 /* We can return any write requests */
2710 struct bio *wbi, *wbi2; 2741 struct bio *wbi, *wbi2;
2711 int bitmap_end = 0;
2712 pr_debug("Return write for disc %d\n", i); 2742 pr_debug("Return write for disc %d\n", i);
2713 spin_lock_irq(&conf->device_lock);
2714 wbi = dev->written; 2743 wbi = dev->written;
2715 dev->written = NULL; 2744 dev->written = NULL;
2716 while (wbi && wbi->bi_sector < 2745 while (wbi && wbi->bi_sector <
2717 dev->sector + STRIPE_SECTORS) { 2746 dev->sector + STRIPE_SECTORS) {
2718 wbi2 = r5_next_bio(wbi, dev->sector); 2747 wbi2 = r5_next_bio(wbi, dev->sector);
2719 if (!raid5_dec_bi_phys_segments(wbi)) { 2748 if (!raid5_dec_bi_active_stripes(wbi)) {
2720 md_write_end(conf->mddev); 2749 md_write_end(conf->mddev);
2721 wbi->bi_next = *return_bi; 2750 wbi->bi_next = *return_bi;
2722 *return_bi = wbi; 2751 *return_bi = wbi;
2723 } 2752 }
2724 wbi = wbi2; 2753 wbi = wbi2;
2725 } 2754 }
2726 if (dev->towrite == NULL) 2755 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2727 bitmap_end = 1; 2756 STRIPE_SECTORS,
2728 spin_unlock_irq(&conf->device_lock);
2729 if (bitmap_end)
2730 bitmap_endwrite(conf->mddev->bitmap,
2731 sh->sector,
2732 STRIPE_SECTORS,
2733 !test_bit(STRIPE_DEGRADED, &sh->state), 2757 !test_bit(STRIPE_DEGRADED, &sh->state),
2734 0); 2758 0);
2735 } 2759 }
2736 } 2760 }
2737 2761
@@ -3183,7 +3207,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3183 3207
3184 /* Now to look around and see what can be done */ 3208 /* Now to look around and see what can be done */
3185 rcu_read_lock(); 3209 rcu_read_lock();
3186 spin_lock_irq(&conf->device_lock);
3187 for (i=disks; i--; ) { 3210 for (i=disks; i--; ) {
3188 struct md_rdev *rdev; 3211 struct md_rdev *rdev;
3189 sector_t first_bad; 3212 sector_t first_bad;
@@ -3329,7 +3352,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3329 do_recovery = 1; 3352 do_recovery = 1;
3330 } 3353 }
3331 } 3354 }
3332 spin_unlock_irq(&conf->device_lock);
3333 if (test_bit(STRIPE_SYNCING, &sh->state)) { 3355 if (test_bit(STRIPE_SYNCING, &sh->state)) {
3334 /* If there is a failed device being replaced, 3356 /* If there is a failed device being replaced,
3335 * we must be recovering. 3357 * we must be recovering.
@@ -3792,7 +3814,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
3792 * this sets the active strip count to 1 and the processed 3814 * this sets the active strip count to 1 and the processed
3793 * strip count to zero (upper 8 bits) 3815 * strip count to zero (upper 8 bits)
3794 */ 3816 */
3795 bi->bi_phys_segments = 1; /* biased count of active stripes */ 3817 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
3796 } 3818 }
3797 3819
3798 return bi; 3820 return bi;
@@ -4170,7 +4192,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4170 finish_wait(&conf->wait_for_overlap, &w); 4192 finish_wait(&conf->wait_for_overlap, &w);
4171 set_bit(STRIPE_HANDLE, &sh->state); 4193 set_bit(STRIPE_HANDLE, &sh->state);
4172 clear_bit(STRIPE_DELAYED, &sh->state); 4194 clear_bit(STRIPE_DELAYED, &sh->state);
4173 if ((bi->bi_rw & REQ_SYNC) && 4195 if ((bi->bi_rw & REQ_NOIDLE) &&
4174 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4196 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4175 atomic_inc(&conf->preread_active_stripes); 4197 atomic_inc(&conf->preread_active_stripes);
4176 release_stripe_plug(mddev, sh); 4198 release_stripe_plug(mddev, sh);
@@ -4182,9 +4204,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4182 } 4204 }
4183 } 4205 }
4184 4206
4185 spin_lock_irq(&conf->device_lock); 4207 remaining = raid5_dec_bi_active_stripes(bi);
4186 remaining = raid5_dec_bi_phys_segments(bi);
4187 spin_unlock_irq(&conf->device_lock);
4188 if (remaining == 0) { 4208 if (remaining == 0) {
4189 4209
4190 if ( rw == WRITE ) 4210 if ( rw == WRITE )
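
The make_request() and remove_bio_from_retry() hunks rely on a biased reference count: the submitter starts the packed active-stripes count at 1, each stripe that takes the bio bumps it, and whoever brings it back to zero (normally the submitter dropping its own bias after the last stripe is queued) completes the bio. With the count now atomic, that final decrement no longer needs device_lock. A compressed userspace sketch, with the stripe completions faked inline:

#include <stdatomic.h>
#include <stdio.h>

struct fake_bio { atomic_uint active_stripes; };

static void bio_endio_sketch(struct fake_bio *b)
{
    printf("bio %p completed\n", (void *)b);
}

static void submit_sketch(struct fake_bio *b, int nr_stripes)
{
    atomic_store(&b->active_stripes, 1);          /* the bias */
    for (int i = 0; i < nr_stripes; i++)
        atomic_fetch_add(&b->active_stripes, 1);  /* add_stripe_bio() side */

    /* stripes would complete asynchronously, each dropping one reference;
     * here they are assumed to have finished already */
    for (int i = 0; i < nr_stripes; i++)
        atomic_fetch_sub(&b->active_stripes, 1);

    if (atomic_fetch_sub(&b->active_stripes, 1) == 1)
        bio_endio_sketch(b);                      /* dropping the bias was last */
}

int main(void)
{
    struct fake_bio b = { 0 };
    submit_sketch(&b, 3);
    return 0;
}
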
@@ -4540,7 +4560,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4540 sector += STRIPE_SECTORS, 4560 sector += STRIPE_SECTORS,
4541 scnt++) { 4561 scnt++) {
4542 4562
4543 if (scnt < raid5_bi_hw_segments(raid_bio)) 4563 if (scnt < raid5_bi_processed_stripes(raid_bio))
4544 /* already done this stripe */ 4564 /* already done this stripe */
4545 continue; 4565 continue;
4546 4566
@@ -4548,25 +4568,24 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4548 4568
4549 if (!sh) { 4569 if (!sh) {
4550 /* failed to get a stripe - must wait */ 4570 /* failed to get a stripe - must wait */
4551 raid5_set_bi_hw_segments(raid_bio, scnt); 4571 raid5_set_bi_processed_stripes(raid_bio, scnt);
4552 conf->retry_read_aligned = raid_bio; 4572 conf->retry_read_aligned = raid_bio;
4553 return handled; 4573 return handled;
4554 } 4574 }
4555 4575
4556 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4576 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4557 release_stripe(sh); 4577 release_stripe(sh);
4558 raid5_set_bi_hw_segments(raid_bio, scnt); 4578 raid5_set_bi_processed_stripes(raid_bio, scnt);
4559 conf->retry_read_aligned = raid_bio; 4579 conf->retry_read_aligned = raid_bio;
4560 return handled; 4580 return handled;
4561 } 4581 }
4562 4582
4583 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
4563 handle_stripe(sh); 4584 handle_stripe(sh);
4564 release_stripe(sh); 4585 release_stripe(sh);
4565 handled++; 4586 handled++;
4566 } 4587 }
4567 spin_lock_irq(&conf->device_lock); 4588 remaining = raid5_dec_bi_active_stripes(raid_bio);
4568 remaining = raid5_dec_bi_phys_segments(raid_bio);
4569 spin_unlock_irq(&conf->device_lock);
4570 if (remaining == 0) 4589 if (remaining == 0)
4571 bio_endio(raid_bio, 0); 4590 bio_endio(raid_bio, 0);
4572 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4591 if (atomic_dec_and_test(&conf->active_aligned_reads))
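
retry_aligned_read() above uses the processed-stripes half of the packed counter as a resume cursor: when no stripe_head can be obtained it records how far it got and bails out, and the next invocation skips the stripes that were already handled. A userspace sketch of that resumable loop, with the cursor kept in a plain variable instead of the bio and the busy stripe faked:

#include <stdbool.h>
#include <stdio.h>

static unsigned processed;          /* stands in for the bio's upper 16 bits */

static bool try_handle_stripe(unsigned scnt)
{
    static bool was_busy;
    if (scnt == 2 && !was_busy) {   /* pretend stripe #2 is busy exactly once */
        was_busy = true;
        return false;
    }
    return true;
}

static bool retry_aligned_read_sketch(unsigned nr_stripes)
{
    for (unsigned scnt = 0; scnt < nr_stripes; scnt++) {
        if (scnt < processed)
            continue;               /* already done this stripe */
        if (!try_handle_stripe(scnt)) {
            processed = scnt;       /* remember where to resume */
            return false;           /* caller will retry later */
        }
        processed = scnt + 1;
    }
    return true;
}

int main(void)
{
    if (!retry_aligned_read_sketch(4))
        puts("deferred, will resume");
    if (retry_aligned_read_sketch(4))
        puts("completed on retry");
    return 0;
}
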
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 9a7b36f0a425..a9fc24901eda 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -210,6 +210,7 @@ struct stripe_head {
210 int disks; /* disks in stripe */ 210 int disks; /* disks in stripe */
211 enum check_states check_state; 211 enum check_states check_state;
212 enum reconstruct_states reconstruct_state; 212 enum reconstruct_states reconstruct_state;
213 spinlock_t stripe_lock;
213 /** 214 /**
214 * struct stripe_operations 215 * struct stripe_operations
215 * @target - STRIPE_OP_COMPUTE_BLK target 216 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -273,6 +274,7 @@ enum r5dev_flags {
273 R5_Wantwrite, 274 R5_Wantwrite,
274 R5_Overlap, /* There is a pending overlapping request 275 R5_Overlap, /* There is a pending overlapping request
275 * on this block */ 276 * on this block */
277 R5_ReadNoMerge, /* prevent bio from merging in block-layer */
276 R5_ReadError, /* seen a read error here recently */ 278 R5_ReadError, /* seen a read error here recently */
277 R5_ReWrite, /* have tried to over-write the readerror */ 279 R5_ReWrite, /* have tried to over-write the readerror */
278 280