Diffstat (limited to 'drivers/md')
41 files changed, 1999 insertions, 1638 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 10f122a3a856..d949b781f6f8 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -260,15 +260,6 @@ config DM_DEBUG_BLOCK_STACK_TRACING | |||
260 | 260 | ||
261 | If unsure, say N. | 261 | If unsure, say N. |
262 | 262 | ||
263 | config DM_DEBUG_SPACE_MAPS | ||
264 | boolean "Extra validation for thin provisioning space maps" | ||
265 | depends on DM_THIN_PROVISIONING | ||
266 | ---help--- | ||
267 | Enable this for messages that may help debug problems with the | ||
268 | space maps used by thin provisioning. | ||
269 | |||
270 | If unsure, say N. | ||
271 | |||
272 | config DM_MIRROR | 263 | config DM_MIRROR |
273 | tristate "Mirror target" | 264 | tristate "Mirror target" |
274 | depends on BLK_DEV_DM | 265 | depends on BLK_DEV_DM |
@@ -277,13 +268,14 @@ config DM_MIRROR | |||
277 | needed for live data migration tools such as 'pvmove'. | 268 | needed for live data migration tools such as 'pvmove'. |
278 | 269 | ||
279 | config DM_RAID | 270 | config DM_RAID |
280 | tristate "RAID 1/4/5/6 target" | 271 | tristate "RAID 1/4/5/6/10 target" |
281 | depends on BLK_DEV_DM | 272 | depends on BLK_DEV_DM |
282 | select MD_RAID1 | 273 | select MD_RAID1 |
274 | select MD_RAID10 | ||
283 | select MD_RAID456 | 275 | select MD_RAID456 |
284 | select BLK_DEV_MD | 276 | select BLK_DEV_MD |
285 | ---help--- | 277 | ---help--- |
286 | A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings | 278 | A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings |
287 | 279 | ||
288 | A RAID-5 set of N drives with a capacity of C MB per drive provides | 280 | A RAID-5 set of N drives with a capacity of C MB per drive provides |
289 | the capacity of C * (N - 1) MB, and protects against a failure | 281 | the capacity of C * (N - 1) MB, and protects against a failure |
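The capacity rule quoted in this help text generalizes across the levels the target now covers, and the RAID10 case matches the sectors_per_dev arithmetic added to dm-raid.c further down in this diff. A minimal sketch of the relationships, illustrative only and not code from the patch:

/*
 * Illustrative only: usable capacity, in MB, of an array of N member
 * devices of C MB each.  RAID5/6 give up one/two devices' worth of
 * space to parity; the "near" RAID10 layout stores each block
 * 'copies' times.
 */
static unsigned long long raid_usable_mb(int level, unsigned long long n,
                                         unsigned long long c, unsigned copies)
{
        switch (level) {
        case 1:  return c;                      /* size of a single mirror leg */
        case 5:  return c * (n - 1);
        case 6:  return c * (n - 2);
        case 10: return (n * c) / copies;       /* e.g. 4 x 1000 MB, 2 copies -> 2000 MB */
        default: return 0;
        }
}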
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 15dbe03117e4..94e7f6ba2e11 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
@@ -1305,7 +1305,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
1305 | prepare_to_wait(&bitmap->overflow_wait, &__wait, | 1305 | prepare_to_wait(&bitmap->overflow_wait, &__wait, |
1306 | TASK_UNINTERRUPTIBLE); | 1306 | TASK_UNINTERRUPTIBLE); |
1307 | spin_unlock_irq(&bitmap->counts.lock); | 1307 | spin_unlock_irq(&bitmap->counts.lock); |
1308 | io_schedule(); | 1308 | schedule(); |
1309 | finish_wait(&bitmap->overflow_wait, &__wait); | 1309 | finish_wait(&bitmap->overflow_wait, &__wait); |
1310 | continue; | 1310 | continue; |
1311 | } | 1311 | } |
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3f06df59fd82..664743d6a6cd 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c | |||
@@ -42,21 +42,21 @@ struct convert_context { | |||
42 | unsigned int offset_out; | 42 | unsigned int offset_out; |
43 | unsigned int idx_in; | 43 | unsigned int idx_in; |
44 | unsigned int idx_out; | 44 | unsigned int idx_out; |
45 | sector_t sector; | 45 | sector_t cc_sector; |
46 | atomic_t pending; | 46 | atomic_t cc_pending; |
47 | }; | 47 | }; |
48 | 48 | ||
49 | /* | 49 | /* |
50 | * per bio private data | 50 | * per bio private data |
51 | */ | 51 | */ |
52 | struct dm_crypt_io { | 52 | struct dm_crypt_io { |
53 | struct dm_target *target; | 53 | struct crypt_config *cc; |
54 | struct bio *base_bio; | 54 | struct bio *base_bio; |
55 | struct work_struct work; | 55 | struct work_struct work; |
56 | 56 | ||
57 | struct convert_context ctx; | 57 | struct convert_context ctx; |
58 | 58 | ||
59 | atomic_t pending; | 59 | atomic_t io_pending; |
60 | int error; | 60 | int error; |
61 | sector_t sector; | 61 | sector_t sector; |
62 | struct dm_crypt_io *base_io; | 62 | struct dm_crypt_io *base_io; |
@@ -109,9 +109,6 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; | |||
109 | */ | 109 | */ |
110 | struct crypt_cpu { | 110 | struct crypt_cpu { |
111 | struct ablkcipher_request *req; | 111 | struct ablkcipher_request *req; |
112 | /* ESSIV: struct crypto_cipher *essiv_tfm */ | ||
113 | void *iv_private; | ||
114 | struct crypto_ablkcipher *tfms[0]; | ||
115 | }; | 112 | }; |
116 | 113 | ||
117 | /* | 114 | /* |
@@ -151,6 +148,10 @@ struct crypt_config { | |||
151 | * per_cpu_ptr() only. | 148 | * per_cpu_ptr() only. |
152 | */ | 149 | */ |
153 | struct crypt_cpu __percpu *cpu; | 150 | struct crypt_cpu __percpu *cpu; |
151 | |||
152 | /* ESSIV: struct crypto_cipher *essiv_tfm */ | ||
153 | void *iv_private; | ||
154 | struct crypto_ablkcipher **tfms; | ||
154 | unsigned tfms_count; | 155 | unsigned tfms_count; |
155 | 156 | ||
156 | /* | 157 | /* |
@@ -193,7 +194,7 @@ static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) | |||
193 | */ | 194 | */ |
194 | static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) | 195 | static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) |
195 | { | 196 | { |
196 | return __this_cpu_ptr(cc->cpu)->tfms[0]; | 197 | return cc->tfms[0]; |
197 | } | 198 | } |
198 | 199 | ||
199 | /* | 200 | /* |
@@ -258,7 +259,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) | |||
258 | struct hash_desc desc; | 259 | struct hash_desc desc; |
259 | struct scatterlist sg; | 260 | struct scatterlist sg; |
260 | struct crypto_cipher *essiv_tfm; | 261 | struct crypto_cipher *essiv_tfm; |
261 | int err, cpu; | 262 | int err; |
262 | 263 | ||
263 | sg_init_one(&sg, cc->key, cc->key_size); | 264 | sg_init_one(&sg, cc->key, cc->key_size); |
264 | desc.tfm = essiv->hash_tfm; | 265 | desc.tfm = essiv->hash_tfm; |
@@ -268,14 +269,12 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) | |||
268 | if (err) | 269 | if (err) |
269 | return err; | 270 | return err; |
270 | 271 | ||
271 | for_each_possible_cpu(cpu) { | 272 | essiv_tfm = cc->iv_private; |
272 | essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private, | ||
273 | 273 | ||
274 | err = crypto_cipher_setkey(essiv_tfm, essiv->salt, | 274 | err = crypto_cipher_setkey(essiv_tfm, essiv->salt, |
275 | crypto_hash_digestsize(essiv->hash_tfm)); | 275 | crypto_hash_digestsize(essiv->hash_tfm)); |
276 | if (err) | 276 | if (err) |
277 | return err; | 277 | return err; |
278 | } | ||
279 | 278 | ||
280 | return 0; | 279 | return 0; |
281 | } | 280 | } |
@@ -286,16 +285,14 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc) | |||
286 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | 285 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; |
287 | unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); | 286 | unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); |
288 | struct crypto_cipher *essiv_tfm; | 287 | struct crypto_cipher *essiv_tfm; |
289 | int cpu, r, err = 0; | 288 | int r, err = 0; |
290 | 289 | ||
291 | memset(essiv->salt, 0, salt_size); | 290 | memset(essiv->salt, 0, salt_size); |
292 | 291 | ||
293 | for_each_possible_cpu(cpu) { | 292 | essiv_tfm = cc->iv_private; |
294 | essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private; | 293 | r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); |
295 | r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); | 294 | if (r) |
296 | if (r) | 295 | err = r; |
297 | err = r; | ||
298 | } | ||
299 | 296 | ||
300 | return err; | 297 | return err; |
301 | } | 298 | } |
@@ -335,8 +332,6 @@ static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, | |||
335 | 332 | ||
336 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) | 333 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) |
337 | { | 334 | { |
338 | int cpu; | ||
339 | struct crypt_cpu *cpu_cc; | ||
340 | struct crypto_cipher *essiv_tfm; | 335 | struct crypto_cipher *essiv_tfm; |
341 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | 336 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; |
342 | 337 | ||
@@ -346,15 +341,12 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc) | |||
346 | kzfree(essiv->salt); | 341 | kzfree(essiv->salt); |
347 | essiv->salt = NULL; | 342 | essiv->salt = NULL; |
348 | 343 | ||
349 | for_each_possible_cpu(cpu) { | 344 | essiv_tfm = cc->iv_private; |
350 | cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
351 | essiv_tfm = cpu_cc->iv_private; | ||
352 | 345 | ||
353 | if (essiv_tfm) | 346 | if (essiv_tfm) |
354 | crypto_free_cipher(essiv_tfm); | 347 | crypto_free_cipher(essiv_tfm); |
355 | 348 | ||
356 | cpu_cc->iv_private = NULL; | 349 | cc->iv_private = NULL; |
357 | } | ||
358 | } | 350 | } |
359 | 351 | ||
360 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | 352 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, |
@@ -363,7 +355,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | |||
363 | struct crypto_cipher *essiv_tfm = NULL; | 355 | struct crypto_cipher *essiv_tfm = NULL; |
364 | struct crypto_hash *hash_tfm = NULL; | 356 | struct crypto_hash *hash_tfm = NULL; |
365 | u8 *salt = NULL; | 357 | u8 *salt = NULL; |
366 | int err, cpu; | 358 | int err; |
367 | 359 | ||
368 | if (!opts) { | 360 | if (!opts) { |
369 | ti->error = "Digest algorithm missing for ESSIV mode"; | 361 | ti->error = "Digest algorithm missing for ESSIV mode"; |
@@ -388,15 +380,13 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | |||
388 | cc->iv_gen_private.essiv.salt = salt; | 380 | cc->iv_gen_private.essiv.salt = salt; |
389 | cc->iv_gen_private.essiv.hash_tfm = hash_tfm; | 381 | cc->iv_gen_private.essiv.hash_tfm = hash_tfm; |
390 | 382 | ||
391 | for_each_possible_cpu(cpu) { | 383 | essiv_tfm = setup_essiv_cpu(cc, ti, salt, |
392 | essiv_tfm = setup_essiv_cpu(cc, ti, salt, | 384 | crypto_hash_digestsize(hash_tfm)); |
393 | crypto_hash_digestsize(hash_tfm)); | 385 | if (IS_ERR(essiv_tfm)) { |
394 | if (IS_ERR(essiv_tfm)) { | 386 | crypt_iv_essiv_dtr(cc); |
395 | crypt_iv_essiv_dtr(cc); | 387 | return PTR_ERR(essiv_tfm); |
396 | return PTR_ERR(essiv_tfm); | ||
397 | } | ||
398 | per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm; | ||
399 | } | 388 | } |
389 | cc->iv_private = essiv_tfm; | ||
400 | 390 | ||
401 | return 0; | 391 | return 0; |
402 | 392 | ||
@@ -410,7 +400,7 @@ bad: | |||
410 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, | 400 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, |
411 | struct dm_crypt_request *dmreq) | 401 | struct dm_crypt_request *dmreq) |
412 | { | 402 | { |
413 | struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; | 403 | struct crypto_cipher *essiv_tfm = cc->iv_private; |
414 | 404 | ||
415 | memset(iv, 0, cc->iv_size); | 405 | memset(iv, 0, cc->iv_size); |
416 | *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); | 406 | *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); |
@@ -664,7 +654,7 @@ static void crypt_convert_init(struct crypt_config *cc, | |||
664 | ctx->offset_out = 0; | 654 | ctx->offset_out = 0; |
665 | ctx->idx_in = bio_in ? bio_in->bi_idx : 0; | 655 | ctx->idx_in = bio_in ? bio_in->bi_idx : 0; |
666 | ctx->idx_out = bio_out ? bio_out->bi_idx : 0; | 656 | ctx->idx_out = bio_out ? bio_out->bi_idx : 0; |
667 | ctx->sector = sector + cc->iv_offset; | 657 | ctx->cc_sector = sector + cc->iv_offset; |
668 | init_completion(&ctx->restart); | 658 | init_completion(&ctx->restart); |
669 | } | 659 | } |
670 | 660 | ||
@@ -695,12 +685,12 @@ static int crypt_convert_block(struct crypt_config *cc, | |||
695 | struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); | 685 | struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); |
696 | struct dm_crypt_request *dmreq; | 686 | struct dm_crypt_request *dmreq; |
697 | u8 *iv; | 687 | u8 *iv; |
698 | int r = 0; | 688 | int r; |
699 | 689 | ||
700 | dmreq = dmreq_of_req(cc, req); | 690 | dmreq = dmreq_of_req(cc, req); |
701 | iv = iv_of_dmreq(cc, dmreq); | 691 | iv = iv_of_dmreq(cc, dmreq); |
702 | 692 | ||
703 | dmreq->iv_sector = ctx->sector; | 693 | dmreq->iv_sector = ctx->cc_sector; |
704 | dmreq->ctx = ctx; | 694 | dmreq->ctx = ctx; |
705 | sg_init_table(&dmreq->sg_in, 1); | 695 | sg_init_table(&dmreq->sg_in, 1); |
706 | sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, | 696 | sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, |
@@ -749,12 +739,12 @@ static void crypt_alloc_req(struct crypt_config *cc, | |||
749 | struct convert_context *ctx) | 739 | struct convert_context *ctx) |
750 | { | 740 | { |
751 | struct crypt_cpu *this_cc = this_crypt_config(cc); | 741 | struct crypt_cpu *this_cc = this_crypt_config(cc); |
752 | unsigned key_index = ctx->sector & (cc->tfms_count - 1); | 742 | unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1); |
753 | 743 | ||
754 | if (!this_cc->req) | 744 | if (!this_cc->req) |
755 | this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); | 745 | this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); |
756 | 746 | ||
757 | ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]); | 747 | ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]); |
758 | ablkcipher_request_set_callback(this_cc->req, | 748 | ablkcipher_request_set_callback(this_cc->req, |
759 | CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, | 749 | CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, |
760 | kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); | 750 | kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); |
@@ -769,14 +759,14 @@ static int crypt_convert(struct crypt_config *cc, | |||
769 | struct crypt_cpu *this_cc = this_crypt_config(cc); | 759 | struct crypt_cpu *this_cc = this_crypt_config(cc); |
770 | int r; | 760 | int r; |
771 | 761 | ||
772 | atomic_set(&ctx->pending, 1); | 762 | atomic_set(&ctx->cc_pending, 1); |
773 | 763 | ||
774 | while(ctx->idx_in < ctx->bio_in->bi_vcnt && | 764 | while(ctx->idx_in < ctx->bio_in->bi_vcnt && |
775 | ctx->idx_out < ctx->bio_out->bi_vcnt) { | 765 | ctx->idx_out < ctx->bio_out->bi_vcnt) { |
776 | 766 | ||
777 | crypt_alloc_req(cc, ctx); | 767 | crypt_alloc_req(cc, ctx); |
778 | 768 | ||
779 | atomic_inc(&ctx->pending); | 769 | atomic_inc(&ctx->cc_pending); |
780 | 770 | ||
781 | r = crypt_convert_block(cc, ctx, this_cc->req); | 771 | r = crypt_convert_block(cc, ctx, this_cc->req); |
782 | 772 | ||
@@ -788,19 +778,19 @@ static int crypt_convert(struct crypt_config *cc, | |||
788 | /* fall through*/ | 778 | /* fall through*/ |
789 | case -EINPROGRESS: | 779 | case -EINPROGRESS: |
790 | this_cc->req = NULL; | 780 | this_cc->req = NULL; |
791 | ctx->sector++; | 781 | ctx->cc_sector++; |
792 | continue; | 782 | continue; |
793 | 783 | ||
794 | /* sync */ | 784 | /* sync */ |
795 | case 0: | 785 | case 0: |
796 | atomic_dec(&ctx->pending); | 786 | atomic_dec(&ctx->cc_pending); |
797 | ctx->sector++; | 787 | ctx->cc_sector++; |
798 | cond_resched(); | 788 | cond_resched(); |
799 | continue; | 789 | continue; |
800 | 790 | ||
801 | /* error */ | 791 | /* error */ |
802 | default: | 792 | default: |
803 | atomic_dec(&ctx->pending); | 793 | atomic_dec(&ctx->cc_pending); |
804 | return r; | 794 | return r; |
805 | } | 795 | } |
806 | } | 796 | } |
@@ -811,7 +801,7 @@ static int crypt_convert(struct crypt_config *cc, | |||
811 | static void dm_crypt_bio_destructor(struct bio *bio) | 801 | static void dm_crypt_bio_destructor(struct bio *bio) |
812 | { | 802 | { |
813 | struct dm_crypt_io *io = bio->bi_private; | 803 | struct dm_crypt_io *io = bio->bi_private; |
814 | struct crypt_config *cc = io->target->private; | 804 | struct crypt_config *cc = io->cc; |
815 | 805 | ||
816 | bio_free(bio, cc->bs); | 806 | bio_free(bio, cc->bs); |
817 | } | 807 | } |
@@ -825,7 +815,7 @@ static void dm_crypt_bio_destructor(struct bio *bio) | |||
825 | static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, | 815 | static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, |
826 | unsigned *out_of_pages) | 816 | unsigned *out_of_pages) |
827 | { | 817 | { |
828 | struct crypt_config *cc = io->target->private; | 818 | struct crypt_config *cc = io->cc; |
829 | struct bio *clone; | 819 | struct bio *clone; |
830 | unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | 820 | unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
831 | gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; | 821 | gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; |
@@ -884,26 +874,25 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) | |||
884 | } | 874 | } |
885 | } | 875 | } |
886 | 876 | ||
887 | static struct dm_crypt_io *crypt_io_alloc(struct dm_target *ti, | 877 | static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc, |
888 | struct bio *bio, sector_t sector) | 878 | struct bio *bio, sector_t sector) |
889 | { | 879 | { |
890 | struct crypt_config *cc = ti->private; | ||
891 | struct dm_crypt_io *io; | 880 | struct dm_crypt_io *io; |
892 | 881 | ||
893 | io = mempool_alloc(cc->io_pool, GFP_NOIO); | 882 | io = mempool_alloc(cc->io_pool, GFP_NOIO); |
894 | io->target = ti; | 883 | io->cc = cc; |
895 | io->base_bio = bio; | 884 | io->base_bio = bio; |
896 | io->sector = sector; | 885 | io->sector = sector; |
897 | io->error = 0; | 886 | io->error = 0; |
898 | io->base_io = NULL; | 887 | io->base_io = NULL; |
899 | atomic_set(&io->pending, 0); | 888 | atomic_set(&io->io_pending, 0); |
900 | 889 | ||
901 | return io; | 890 | return io; |
902 | } | 891 | } |
903 | 892 | ||
904 | static void crypt_inc_pending(struct dm_crypt_io *io) | 893 | static void crypt_inc_pending(struct dm_crypt_io *io) |
905 | { | 894 | { |
906 | atomic_inc(&io->pending); | 895 | atomic_inc(&io->io_pending); |
907 | } | 896 | } |
908 | 897 | ||
909 | /* | 898 | /* |
@@ -913,12 +902,12 @@ static void crypt_inc_pending(struct dm_crypt_io *io) | |||
913 | */ | 902 | */ |
914 | static void crypt_dec_pending(struct dm_crypt_io *io) | 903 | static void crypt_dec_pending(struct dm_crypt_io *io) |
915 | { | 904 | { |
916 | struct crypt_config *cc = io->target->private; | 905 | struct crypt_config *cc = io->cc; |
917 | struct bio *base_bio = io->base_bio; | 906 | struct bio *base_bio = io->base_bio; |
918 | struct dm_crypt_io *base_io = io->base_io; | 907 | struct dm_crypt_io *base_io = io->base_io; |
919 | int error = io->error; | 908 | int error = io->error; |
920 | 909 | ||
921 | if (!atomic_dec_and_test(&io->pending)) | 910 | if (!atomic_dec_and_test(&io->io_pending)) |
922 | return; | 911 | return; |
923 | 912 | ||
924 | mempool_free(io, cc->io_pool); | 913 | mempool_free(io, cc->io_pool); |
@@ -952,7 +941,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io) | |||
952 | static void crypt_endio(struct bio *clone, int error) | 941 | static void crypt_endio(struct bio *clone, int error) |
953 | { | 942 | { |
954 | struct dm_crypt_io *io = clone->bi_private; | 943 | struct dm_crypt_io *io = clone->bi_private; |
955 | struct crypt_config *cc = io->target->private; | 944 | struct crypt_config *cc = io->cc; |
956 | unsigned rw = bio_data_dir(clone); | 945 | unsigned rw = bio_data_dir(clone); |
957 | 946 | ||
958 | if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) | 947 | if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) |
@@ -979,7 +968,7 @@ static void crypt_endio(struct bio *clone, int error) | |||
979 | 968 | ||
980 | static void clone_init(struct dm_crypt_io *io, struct bio *clone) | 969 | static void clone_init(struct dm_crypt_io *io, struct bio *clone) |
981 | { | 970 | { |
982 | struct crypt_config *cc = io->target->private; | 971 | struct crypt_config *cc = io->cc; |
983 | 972 | ||
984 | clone->bi_private = io; | 973 | clone->bi_private = io; |
985 | clone->bi_end_io = crypt_endio; | 974 | clone->bi_end_io = crypt_endio; |
@@ -990,7 +979,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) | |||
990 | 979 | ||
991 | static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) | 980 | static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) |
992 | { | 981 | { |
993 | struct crypt_config *cc = io->target->private; | 982 | struct crypt_config *cc = io->cc; |
994 | struct bio *base_bio = io->base_bio; | 983 | struct bio *base_bio = io->base_bio; |
995 | struct bio *clone; | 984 | struct bio *clone; |
996 | 985 | ||
@@ -1038,7 +1027,7 @@ static void kcryptd_io(struct work_struct *work) | |||
1038 | 1027 | ||
1039 | static void kcryptd_queue_io(struct dm_crypt_io *io) | 1028 | static void kcryptd_queue_io(struct dm_crypt_io *io) |
1040 | { | 1029 | { |
1041 | struct crypt_config *cc = io->target->private; | 1030 | struct crypt_config *cc = io->cc; |
1042 | 1031 | ||
1043 | INIT_WORK(&io->work, kcryptd_io); | 1032 | INIT_WORK(&io->work, kcryptd_io); |
1044 | queue_work(cc->io_queue, &io->work); | 1033 | queue_work(cc->io_queue, &io->work); |
@@ -1047,7 +1036,7 @@ static void kcryptd_queue_io(struct dm_crypt_io *io) | |||
1047 | static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) | 1036 | static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) |
1048 | { | 1037 | { |
1049 | struct bio *clone = io->ctx.bio_out; | 1038 | struct bio *clone = io->ctx.bio_out; |
1050 | struct crypt_config *cc = io->target->private; | 1039 | struct crypt_config *cc = io->cc; |
1051 | 1040 | ||
1052 | if (unlikely(io->error < 0)) { | 1041 | if (unlikely(io->error < 0)) { |
1053 | crypt_free_buffer_pages(cc, clone); | 1042 | crypt_free_buffer_pages(cc, clone); |
@@ -1069,7 +1058,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) | |||
1069 | 1058 | ||
1070 | static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) | 1059 | static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) |
1071 | { | 1060 | { |
1072 | struct crypt_config *cc = io->target->private; | 1061 | struct crypt_config *cc = io->cc; |
1073 | struct bio *clone; | 1062 | struct bio *clone; |
1074 | struct dm_crypt_io *new_io; | 1063 | struct dm_crypt_io *new_io; |
1075 | int crypt_finished; | 1064 | int crypt_finished; |
@@ -1107,7 +1096,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) | |||
1107 | if (r < 0) | 1096 | if (r < 0) |
1108 | io->error = -EIO; | 1097 | io->error = -EIO; |
1109 | 1098 | ||
1110 | crypt_finished = atomic_dec_and_test(&io->ctx.pending); | 1099 | crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); |
1111 | 1100 | ||
1112 | /* Encryption was already finished, submit io now */ | 1101 | /* Encryption was already finished, submit io now */ |
1113 | if (crypt_finished) { | 1102 | if (crypt_finished) { |
@@ -1135,7 +1124,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) | |||
1135 | * between fragments, so switch to a new dm_crypt_io structure. | 1124 | * between fragments, so switch to a new dm_crypt_io structure. |
1136 | */ | 1125 | */ |
1137 | if (unlikely(!crypt_finished && remaining)) { | 1126 | if (unlikely(!crypt_finished && remaining)) { |
1138 | new_io = crypt_io_alloc(io->target, io->base_bio, | 1127 | new_io = crypt_io_alloc(io->cc, io->base_bio, |
1139 | sector); | 1128 | sector); |
1140 | crypt_inc_pending(new_io); | 1129 | crypt_inc_pending(new_io); |
1141 | crypt_convert_init(cc, &new_io->ctx, NULL, | 1130 | crypt_convert_init(cc, &new_io->ctx, NULL, |
@@ -1169,7 +1158,7 @@ static void kcryptd_crypt_read_done(struct dm_crypt_io *io) | |||
1169 | 1158 | ||
1170 | static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) | 1159 | static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) |
1171 | { | 1160 | { |
1172 | struct crypt_config *cc = io->target->private; | 1161 | struct crypt_config *cc = io->cc; |
1173 | int r = 0; | 1162 | int r = 0; |
1174 | 1163 | ||
1175 | crypt_inc_pending(io); | 1164 | crypt_inc_pending(io); |
@@ -1181,7 +1170,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) | |||
1181 | if (r < 0) | 1170 | if (r < 0) |
1182 | io->error = -EIO; | 1171 | io->error = -EIO; |
1183 | 1172 | ||
1184 | if (atomic_dec_and_test(&io->ctx.pending)) | 1173 | if (atomic_dec_and_test(&io->ctx.cc_pending)) |
1185 | kcryptd_crypt_read_done(io); | 1174 | kcryptd_crypt_read_done(io); |
1186 | 1175 | ||
1187 | crypt_dec_pending(io); | 1176 | crypt_dec_pending(io); |
@@ -1193,7 +1182,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, | |||
1193 | struct dm_crypt_request *dmreq = async_req->data; | 1182 | struct dm_crypt_request *dmreq = async_req->data; |
1194 | struct convert_context *ctx = dmreq->ctx; | 1183 | struct convert_context *ctx = dmreq->ctx; |
1195 | struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); | 1184 | struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); |
1196 | struct crypt_config *cc = io->target->private; | 1185 | struct crypt_config *cc = io->cc; |
1197 | 1186 | ||
1198 | if (error == -EINPROGRESS) { | 1187 | if (error == -EINPROGRESS) { |
1199 | complete(&ctx->restart); | 1188 | complete(&ctx->restart); |
@@ -1208,7 +1197,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, | |||
1208 | 1197 | ||
1209 | mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); | 1198 | mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); |
1210 | 1199 | ||
1211 | if (!atomic_dec_and_test(&ctx->pending)) | 1200 | if (!atomic_dec_and_test(&ctx->cc_pending)) |
1212 | return; | 1201 | return; |
1213 | 1202 | ||
1214 | if (bio_data_dir(io->base_bio) == READ) | 1203 | if (bio_data_dir(io->base_bio) == READ) |
@@ -1229,7 +1218,7 @@ static void kcryptd_crypt(struct work_struct *work) | |||
1229 | 1218 | ||
1230 | static void kcryptd_queue_crypt(struct dm_crypt_io *io) | 1219 | static void kcryptd_queue_crypt(struct dm_crypt_io *io) |
1231 | { | 1220 | { |
1232 | struct crypt_config *cc = io->target->private; | 1221 | struct crypt_config *cc = io->cc; |
1233 | 1222 | ||
1234 | INIT_WORK(&io->work, kcryptd_crypt); | 1223 | INIT_WORK(&io->work, kcryptd_crypt); |
1235 | queue_work(cc->crypt_queue, &io->work); | 1224 | queue_work(cc->crypt_queue, &io->work); |
@@ -1241,7 +1230,6 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io) | |||
1241 | static int crypt_decode_key(u8 *key, char *hex, unsigned int size) | 1230 | static int crypt_decode_key(u8 *key, char *hex, unsigned int size) |
1242 | { | 1231 | { |
1243 | char buffer[3]; | 1232 | char buffer[3]; |
1244 | char *endp; | ||
1245 | unsigned int i; | 1233 | unsigned int i; |
1246 | 1234 | ||
1247 | buffer[2] = '\0'; | 1235 | buffer[2] = '\0'; |
@@ -1250,9 +1238,7 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size) | |||
1250 | buffer[0] = *hex++; | 1238 | buffer[0] = *hex++; |
1251 | buffer[1] = *hex++; | 1239 | buffer[1] = *hex++; |
1252 | 1240 | ||
1253 | key[i] = (u8)simple_strtoul(buffer, &endp, 16); | 1241 | if (kstrtou8(buffer, 16, &key[i])) |
1254 | |||
1255 | if (endp != &buffer[2]) | ||
1256 | return -EINVAL; | 1242 | return -EINVAL; |
1257 | } | 1243 | } |
1258 | 1244 | ||
@@ -1276,29 +1262,38 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) | |||
1276 | } | 1262 | } |
1277 | } | 1263 | } |
1278 | 1264 | ||
1279 | static void crypt_free_tfms(struct crypt_config *cc, int cpu) | 1265 | static void crypt_free_tfms(struct crypt_config *cc) |
1280 | { | 1266 | { |
1281 | struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
1282 | unsigned i; | 1267 | unsigned i; |
1283 | 1268 | ||
1269 | if (!cc->tfms) | ||
1270 | return; | ||
1271 | |||
1284 | for (i = 0; i < cc->tfms_count; i++) | 1272 | for (i = 0; i < cc->tfms_count; i++) |
1285 | if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) { | 1273 | if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) { |
1286 | crypto_free_ablkcipher(cpu_cc->tfms[i]); | 1274 | crypto_free_ablkcipher(cc->tfms[i]); |
1287 | cpu_cc->tfms[i] = NULL; | 1275 | cc->tfms[i] = NULL; |
1288 | } | 1276 | } |
1277 | |||
1278 | kfree(cc->tfms); | ||
1279 | cc->tfms = NULL; | ||
1289 | } | 1280 | } |
1290 | 1281 | ||
1291 | static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode) | 1282 | static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) |
1292 | { | 1283 | { |
1293 | struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
1294 | unsigned i; | 1284 | unsigned i; |
1295 | int err; | 1285 | int err; |
1296 | 1286 | ||
1287 | cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_ablkcipher *), | ||
1288 | GFP_KERNEL); | ||
1289 | if (!cc->tfms) | ||
1290 | return -ENOMEM; | ||
1291 | |||
1297 | for (i = 0; i < cc->tfms_count; i++) { | 1292 | for (i = 0; i < cc->tfms_count; i++) { |
1298 | cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); | 1293 | cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); |
1299 | if (IS_ERR(cpu_cc->tfms[i])) { | 1294 | if (IS_ERR(cc->tfms[i])) { |
1300 | err = PTR_ERR(cpu_cc->tfms[i]); | 1295 | err = PTR_ERR(cc->tfms[i]); |
1301 | crypt_free_tfms(cc, cpu); | 1296 | crypt_free_tfms(cc); |
1302 | return err; | 1297 | return err; |
1303 | } | 1298 | } |
1304 | } | 1299 | } |
@@ -1309,15 +1304,14 @@ static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode) | |||
1309 | static int crypt_setkey_allcpus(struct crypt_config *cc) | 1304 | static int crypt_setkey_allcpus(struct crypt_config *cc) |
1310 | { | 1305 | { |
1311 | unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); | 1306 | unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); |
1312 | int cpu, err = 0, i, r; | 1307 | int err = 0, i, r; |
1313 | 1308 | ||
1314 | for_each_possible_cpu(cpu) { | 1309 | for (i = 0; i < cc->tfms_count; i++) { |
1315 | for (i = 0; i < cc->tfms_count; i++) { | 1310 | r = crypto_ablkcipher_setkey(cc->tfms[i], |
1316 | r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i], | 1311 | cc->key + (i * subkey_size), |
1317 | cc->key + (i * subkey_size), subkey_size); | 1312 | subkey_size); |
1318 | if (r) | 1313 | if (r) |
1319 | err = r; | 1314 | err = r; |
1320 | } | ||
1321 | } | 1315 | } |
1322 | 1316 | ||
1323 | return err; | 1317 | return err; |
@@ -1379,9 +1373,10 @@ static void crypt_dtr(struct dm_target *ti) | |||
1379 | cpu_cc = per_cpu_ptr(cc->cpu, cpu); | 1373 | cpu_cc = per_cpu_ptr(cc->cpu, cpu); |
1380 | if (cpu_cc->req) | 1374 | if (cpu_cc->req) |
1381 | mempool_free(cpu_cc->req, cc->req_pool); | 1375 | mempool_free(cpu_cc->req, cc->req_pool); |
1382 | crypt_free_tfms(cc, cpu); | ||
1383 | } | 1376 | } |
1384 | 1377 | ||
1378 | crypt_free_tfms(cc); | ||
1379 | |||
1385 | if (cc->bs) | 1380 | if (cc->bs) |
1386 | bioset_free(cc->bs); | 1381 | bioset_free(cc->bs); |
1387 | 1382 | ||
@@ -1414,7 +1409,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1414 | struct crypt_config *cc = ti->private; | 1409 | struct crypt_config *cc = ti->private; |
1415 | char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; | 1410 | char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; |
1416 | char *cipher_api = NULL; | 1411 | char *cipher_api = NULL; |
1417 | int cpu, ret = -EINVAL; | 1412 | int ret = -EINVAL; |
1418 | char dummy; | 1413 | char dummy; |
1419 | 1414 | ||
1420 | /* Convert to crypto api definition? */ | 1415 | /* Convert to crypto api definition? */ |
@@ -1455,8 +1450,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1455 | if (tmp) | 1450 | if (tmp) |
1456 | DMWARN("Ignoring unexpected additional cipher options"); | 1451 | DMWARN("Ignoring unexpected additional cipher options"); |
1457 | 1452 | ||
1458 | cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) + | 1453 | cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)), |
1459 | cc->tfms_count * sizeof(*(cc->cpu->tfms)), | ||
1460 | __alignof__(struct crypt_cpu)); | 1454 | __alignof__(struct crypt_cpu)); |
1461 | if (!cc->cpu) { | 1455 | if (!cc->cpu) { |
1462 | ti->error = "Cannot allocate per cpu state"; | 1456 | ti->error = "Cannot allocate per cpu state"; |
@@ -1489,12 +1483,10 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1489 | } | 1483 | } |
1490 | 1484 | ||
1491 | /* Allocate cipher */ | 1485 | /* Allocate cipher */ |
1492 | for_each_possible_cpu(cpu) { | 1486 | ret = crypt_alloc_tfms(cc, cipher_api); |
1493 | ret = crypt_alloc_tfms(cc, cpu, cipher_api); | 1487 | if (ret < 0) { |
1494 | if (ret < 0) { | 1488 | ti->error = "Error allocating crypto tfm"; |
1495 | ti->error = "Error allocating crypto tfm"; | 1489 | goto bad; |
1496 | goto bad; | ||
1497 | } | ||
1498 | } | 1490 | } |
1499 | 1491 | ||
1500 | /* Initialize and set key */ | 1492 | /* Initialize and set key */ |
@@ -1702,7 +1694,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1702 | } | 1694 | } |
1703 | 1695 | ||
1704 | ti->num_flush_requests = 1; | 1696 | ti->num_flush_requests = 1; |
1705 | ti->discard_zeroes_data_unsupported = 1; | 1697 | ti->discard_zeroes_data_unsupported = true; |
1706 | 1698 | ||
1707 | return 0; | 1699 | return 0; |
1708 | 1700 | ||
@@ -1715,7 +1707,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, | |||
1715 | union map_info *map_context) | 1707 | union map_info *map_context) |
1716 | { | 1708 | { |
1717 | struct dm_crypt_io *io; | 1709 | struct dm_crypt_io *io; |
1718 | struct crypt_config *cc; | 1710 | struct crypt_config *cc = ti->private; |
1719 | 1711 | ||
1720 | /* | 1712 | /* |
1721 | * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues. | 1713 | * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues. |
@@ -1723,14 +1715,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, | |||
1723 | * - for REQ_DISCARD caller must use flush if IO ordering matters | 1715 | * - for REQ_DISCARD caller must use flush if IO ordering matters |
1724 | */ | 1716 | */ |
1725 | if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { | 1717 | if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { |
1726 | cc = ti->private; | ||
1727 | bio->bi_bdev = cc->dev->bdev; | 1718 | bio->bi_bdev = cc->dev->bdev; |
1728 | if (bio_sectors(bio)) | 1719 | if (bio_sectors(bio)) |
1729 | bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); | 1720 | bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); |
1730 | return DM_MAPIO_REMAPPED; | 1721 | return DM_MAPIO_REMAPPED; |
1731 | } | 1722 | } |
1732 | 1723 | ||
1733 | io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); | 1724 | io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_sector)); |
1734 | 1725 | ||
1735 | if (bio_data_dir(io->base_bio) == READ) { | 1726 | if (bio_data_dir(io->base_bio) == READ) { |
1736 | if (kcryptd_io_read(io, GFP_NOWAIT)) | 1727 | if (kcryptd_io_read(io, GFP_NOWAIT)) |
@@ -1742,7 +1733,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, | |||
1742 | } | 1733 | } |
1743 | 1734 | ||
1744 | static int crypt_status(struct dm_target *ti, status_type_t type, | 1735 | static int crypt_status(struct dm_target *ti, status_type_t type, |
1745 | char *result, unsigned int maxlen) | 1736 | unsigned status_flags, char *result, unsigned maxlen) |
1746 | { | 1737 | { |
1747 | struct crypt_config *cc = ti->private; | 1738 | struct crypt_config *cc = ti->private; |
1748 | unsigned int sz = 0; | 1739 | unsigned int sz = 0; |
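Beyond the mechanical io->target->private to io->cc conversions, the substantive change in this file is that the tfms[] array and the ESSIV iv_private pointer move from the per-CPU struct crypt_cpu into the shared crypt_config, so key setup no longer loops over possible CPUs. The per-sector key selection itself is unchanged; a rough sketch of it, using an illustrative helper name (dm-crypt does this inline in crypt_alloc_req):

/*
 * Sketch only, not a function in the driver: how a cipher handle is
 * chosen per sector now that cc->tfms[] is shared.  tfms_count must be
 * a power of two, so the low bits of the sector pick the tfm, and
 * crypt_setkey_allcpus() gives each tfm its own slice of the key
 * (subkey_size = key_size >> ilog2(tfms_count)).
 */
static struct crypto_ablkcipher *pick_tfm(struct crypt_config *cc,
                                          sector_t cc_sector)
{
        unsigned key_index = cc_sector & (cc->tfms_count - 1);

        return cc->tfms[key_index];
}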
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 2dc22dddb2ae..f53846f9ab50 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c | |||
@@ -295,7 +295,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio, | |||
295 | } | 295 | } |
296 | 296 | ||
297 | static int delay_status(struct dm_target *ti, status_type_t type, | 297 | static int delay_status(struct dm_target *ti, status_type_t type, |
298 | char *result, unsigned maxlen) | 298 | unsigned status_flags, char *result, unsigned maxlen) |
299 | { | 299 | { |
300 | struct delay_c *dc = ti->private; | 300 | struct delay_c *dc = ti->private; |
301 | int sz = 0; | 301 | int sz = 0; |
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index aa70f7d43a1a..ebaa4f803eec 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c | |||
@@ -142,24 +142,19 @@ EXPORT_SYMBOL(dm_exception_store_type_unregister); | |||
142 | static int set_chunk_size(struct dm_exception_store *store, | 142 | static int set_chunk_size(struct dm_exception_store *store, |
143 | const char *chunk_size_arg, char **error) | 143 | const char *chunk_size_arg, char **error) |
144 | { | 144 | { |
145 | unsigned long chunk_size_ulong; | 145 | unsigned chunk_size; |
146 | char *value; | ||
147 | 146 | ||
148 | chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10); | 147 | if (kstrtouint(chunk_size_arg, 10, &chunk_size)) { |
149 | if (*chunk_size_arg == '\0' || *value != '\0' || | ||
150 | chunk_size_ulong > UINT_MAX) { | ||
151 | *error = "Invalid chunk size"; | 148 | *error = "Invalid chunk size"; |
152 | return -EINVAL; | 149 | return -EINVAL; |
153 | } | 150 | } |
154 | 151 | ||
155 | if (!chunk_size_ulong) { | 152 | if (!chunk_size) { |
156 | store->chunk_size = store->chunk_mask = store->chunk_shift = 0; | 153 | store->chunk_size = store->chunk_mask = store->chunk_shift = 0; |
157 | return 0; | 154 | return 0; |
158 | } | 155 | } |
159 | 156 | ||
160 | return dm_exception_store_set_chunk_size(store, | 157 | return dm_exception_store_set_chunk_size(store, chunk_size, error); |
161 | (unsigned) chunk_size_ulong, | ||
162 | error); | ||
163 | } | 158 | } |
164 | 159 | ||
165 | int dm_exception_store_set_chunk_size(struct dm_exception_store *store, | 160 | int dm_exception_store_set_chunk_size(struct dm_exception_store *store, |
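The open-coded simple_strtoul() parse needed three manual checks (empty string, trailing characters, value above UINT_MAX); kstrtouint() performs all of them and returns -EINVAL or -ERANGE itself. A trivial sketch of the resulting pattern, with an illustrative function name:

/*
 * Illustration only: kstrtouint() returns 0 only for a well-formed
 * decimal value that fits in an unsigned int, so "", "512k" and
 * "5000000000" are all rejected without any extra checks here.
 */
static int parse_chunk_size_arg(const char *arg, unsigned *chunk_size)
{
        return kstrtouint(arg, 10, chunk_size);
}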
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index ac49c01f1a44..cc15543a6ad7 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c | |||
@@ -333,7 +333,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, | |||
333 | } | 333 | } |
334 | 334 | ||
335 | static int flakey_status(struct dm_target *ti, status_type_t type, | 335 | static int flakey_status(struct dm_target *ti, status_type_t type, |
336 | char *result, unsigned int maxlen) | 336 | unsigned status_flags, char *result, unsigned maxlen) |
337 | { | 337 | { |
338 | unsigned sz = 0; | 338 | unsigned sz = 0; |
339 | struct flakey_c *fc = ti->private; | 339 | struct flakey_c *fc = ti->private; |
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index a1a3e6df17b8..afd95986d099 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
@@ -1054,6 +1054,7 @@ static void retrieve_status(struct dm_table *table, | |||
1054 | char *outbuf, *outptr; | 1054 | char *outbuf, *outptr; |
1055 | status_type_t type; | 1055 | status_type_t type; |
1056 | size_t remaining, len, used = 0; | 1056 | size_t remaining, len, used = 0; |
1057 | unsigned status_flags = 0; | ||
1057 | 1058 | ||
1058 | outptr = outbuf = get_result_buffer(param, param_size, &len); | 1059 | outptr = outbuf = get_result_buffer(param, param_size, &len); |
1059 | 1060 | ||
@@ -1090,7 +1091,9 @@ static void retrieve_status(struct dm_table *table, | |||
1090 | 1091 | ||
1091 | /* Get the status/table string from the target driver */ | 1092 | /* Get the status/table string from the target driver */ |
1092 | if (ti->type->status) { | 1093 | if (ti->type->status) { |
1093 | if (ti->type->status(ti, type, outptr, remaining)) { | 1094 | if (param->flags & DM_NOFLUSH_FLAG) |
1095 | status_flags |= DM_STATUS_NOFLUSH_FLAG; | ||
1096 | if (ti->type->status(ti, type, status_flags, outptr, remaining)) { | ||
1094 | param->flags |= DM_BUFFER_FULL_FLAG; | 1097 | param->flags |= DM_BUFFER_FULL_FLAG; |
1095 | break; | 1098 | break; |
1096 | } | 1099 | } |
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 3639eeab6042..1bf19a93eef0 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c | |||
@@ -96,7 +96,7 @@ static int linear_map(struct dm_target *ti, struct bio *bio, | |||
96 | } | 96 | } |
97 | 97 | ||
98 | static int linear_status(struct dm_target *ti, status_type_t type, | 98 | static int linear_status(struct dm_target *ti, status_type_t type, |
99 | char *result, unsigned int maxlen) | 99 | unsigned status_flags, char *result, unsigned maxlen) |
100 | { | 100 | { |
101 | struct linear_c *lc = (struct linear_c *) ti->private; | 101 | struct linear_c *lc = (struct linear_c *) ti->private; |
102 | 102 | ||
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 65ebaebf502b..627d19186d5a 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c | |||
@@ -571,16 +571,6 @@ static void disk_dtr(struct dm_dirty_log *log) | |||
571 | destroy_log_context(lc); | 571 | destroy_log_context(lc); |
572 | } | 572 | } |
573 | 573 | ||
574 | static int count_bits32(uint32_t *addr, unsigned size) | ||
575 | { | ||
576 | int count = 0, i; | ||
577 | |||
578 | for (i = 0; i < size; i++) { | ||
579 | count += hweight32(*(addr+i)); | ||
580 | } | ||
581 | return count; | ||
582 | } | ||
583 | |||
584 | static void fail_log_device(struct log_c *lc) | 574 | static void fail_log_device(struct log_c *lc) |
585 | { | 575 | { |
586 | if (lc->log_dev_failed) | 576 | if (lc->log_dev_failed) |
@@ -629,7 +619,8 @@ static int disk_resume(struct dm_dirty_log *log) | |||
629 | 619 | ||
630 | /* copy clean across to sync */ | 620 | /* copy clean across to sync */ |
631 | memcpy(lc->sync_bits, lc->clean_bits, size); | 621 | memcpy(lc->sync_bits, lc->clean_bits, size); |
632 | lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); | 622 | lc->sync_count = memweight(lc->clean_bits, |
623 | lc->bitset_uint32_count * sizeof(uint32_t)); | ||
633 | lc->sync_search = 0; | 624 | lc->sync_search = 0; |
634 | 625 | ||
635 | /* set the correct number of regions in the header */ | 626 | /* set the correct number of regions in the header */ |
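count_bits32() summed hweight32() over the clean bitset one 32-bit word at a time; memweight() from lib/ computes the same population count over an arbitrary byte range, which is why disk_resume() now multiplies the word count by sizeof(uint32_t). A sketch of the equivalence, helper name illustrative:

/*
 * Illustration only: one memweight() call over n 32-bit words equals
 * the removed per-word hweight32() loop.
 */
static size_t count_set_bits(const uint32_t *words, unsigned n)
{
        return memweight(words, n * sizeof(uint32_t));
}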
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 638dae048b4f..d8abb90a6c2f 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
@@ -85,6 +85,7 @@ struct multipath { | |||
85 | unsigned queue_io:1; /* Must we queue all I/O? */ | 85 | unsigned queue_io:1; /* Must we queue all I/O? */ |
86 | unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ | 86 | unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ |
87 | unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ | 87 | unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ |
88 | unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ | ||
88 | 89 | ||
89 | unsigned pg_init_retries; /* Number of times to retry pg_init */ | 90 | unsigned pg_init_retries; /* Number of times to retry pg_init */ |
90 | unsigned pg_init_count; /* Number of times pg_init called */ | 91 | unsigned pg_init_count; /* Number of times pg_init called */ |
@@ -568,6 +569,8 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps | |||
568 | int r; | 569 | int r; |
569 | struct pgpath *p; | 570 | struct pgpath *p; |
570 | struct multipath *m = ti->private; | 571 | struct multipath *m = ti->private; |
572 | struct request_queue *q = NULL; | ||
573 | const char *attached_handler_name; | ||
571 | 574 | ||
572 | /* we need at least a path arg */ | 575 | /* we need at least a path arg */ |
573 | if (as->argc < 1) { | 576 | if (as->argc < 1) { |
@@ -586,13 +589,37 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps | |||
586 | goto bad; | 589 | goto bad; |
587 | } | 590 | } |
588 | 591 | ||
589 | if (m->hw_handler_name) { | 592 | if (m->retain_attached_hw_handler || m->hw_handler_name) |
590 | struct request_queue *q = bdev_get_queue(p->path.dev->bdev); | 593 | q = bdev_get_queue(p->path.dev->bdev); |
594 | |||
595 | if (m->retain_attached_hw_handler) { | ||
596 | attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL); | ||
597 | if (attached_handler_name) { | ||
598 | /* | ||
599 | * Reset hw_handler_name to match the attached handler | ||
600 | * and clear any hw_handler_params associated with the | ||
601 | * ignored handler. | ||
602 | * | ||
603 | * NB. This modifies the table line to show the actual | ||
604 | * handler instead of the original table passed in. | ||
605 | */ | ||
606 | kfree(m->hw_handler_name); | ||
607 | m->hw_handler_name = attached_handler_name; | ||
608 | |||
609 | kfree(m->hw_handler_params); | ||
610 | m->hw_handler_params = NULL; | ||
611 | } | ||
612 | } | ||
591 | 613 | ||
614 | if (m->hw_handler_name) { | ||
615 | /* | ||
616 | * Increments scsi_dh reference, even when using an | ||
617 | * already-attached handler. | ||
618 | */ | ||
592 | r = scsi_dh_attach(q, m->hw_handler_name); | 619 | r = scsi_dh_attach(q, m->hw_handler_name); |
593 | if (r == -EBUSY) { | 620 | if (r == -EBUSY) { |
594 | /* | 621 | /* |
595 | * Already attached to different hw_handler, | 622 | * Already attached to different hw_handler: |
596 | * try to reattach with correct one. | 623 | * try to reattach with correct one. |
597 | */ | 624 | */ |
598 | scsi_dh_detach(q); | 625 | scsi_dh_detach(q); |
@@ -760,7 +787,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) | |||
760 | const char *arg_name; | 787 | const char *arg_name; |
761 | 788 | ||
762 | static struct dm_arg _args[] = { | 789 | static struct dm_arg _args[] = { |
763 | {0, 5, "invalid number of feature args"}, | 790 | {0, 6, "invalid number of feature args"}, |
764 | {1, 50, "pg_init_retries must be between 1 and 50"}, | 791 | {1, 50, "pg_init_retries must be between 1 and 50"}, |
765 | {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, | 792 | {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, |
766 | }; | 793 | }; |
@@ -781,6 +808,11 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) | |||
781 | continue; | 808 | continue; |
782 | } | 809 | } |
783 | 810 | ||
811 | if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { | ||
812 | m->retain_attached_hw_handler = 1; | ||
813 | continue; | ||
814 | } | ||
815 | |||
784 | if (!strcasecmp(arg_name, "pg_init_retries") && | 816 | if (!strcasecmp(arg_name, "pg_init_retries") && |
785 | (argc >= 1)) { | 817 | (argc >= 1)) { |
786 | r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); | 818 | r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); |
@@ -1346,7 +1378,7 @@ static void multipath_resume(struct dm_target *ti) | |||
1346 | * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ | 1378 | * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ |
1347 | */ | 1379 | */ |
1348 | static int multipath_status(struct dm_target *ti, status_type_t type, | 1380 | static int multipath_status(struct dm_target *ti, status_type_t type, |
1349 | char *result, unsigned int maxlen) | 1381 | unsigned status_flags, char *result, unsigned maxlen) |
1350 | { | 1382 | { |
1351 | int sz = 0; | 1383 | int sz = 0; |
1352 | unsigned long flags; | 1384 | unsigned long flags; |
@@ -1364,13 +1396,16 @@ static int multipath_status(struct dm_target *ti, status_type_t type, | |||
1364 | else { | 1396 | else { |
1365 | DMEMIT("%u ", m->queue_if_no_path + | 1397 | DMEMIT("%u ", m->queue_if_no_path + |
1366 | (m->pg_init_retries > 0) * 2 + | 1398 | (m->pg_init_retries > 0) * 2 + |
1367 | (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2); | 1399 | (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 + |
1400 | m->retain_attached_hw_handler); | ||
1368 | if (m->queue_if_no_path) | 1401 | if (m->queue_if_no_path) |
1369 | DMEMIT("queue_if_no_path "); | 1402 | DMEMIT("queue_if_no_path "); |
1370 | if (m->pg_init_retries) | 1403 | if (m->pg_init_retries) |
1371 | DMEMIT("pg_init_retries %u ", m->pg_init_retries); | 1404 | DMEMIT("pg_init_retries %u ", m->pg_init_retries); |
1372 | if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) | 1405 | if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) |
1373 | DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); | 1406 | DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); |
1407 | if (m->retain_attached_hw_handler) | ||
1408 | DMEMIT("retain_attached_hw_handler "); | ||
1374 | } | 1409 | } |
1375 | 1410 | ||
1376 | if (!m->hw_handler_name || type == STATUSTYPE_INFO) | 1411 | if (!m->hw_handler_name || type == STATUSTYPE_INFO) |
@@ -1656,7 +1691,7 @@ out: | |||
1656 | *---------------------------------------------------------------*/ | 1691 | *---------------------------------------------------------------*/ |
1657 | static struct target_type multipath_target = { | 1692 | static struct target_type multipath_target = { |
1658 | .name = "multipath", | 1693 | .name = "multipath", |
1659 | .version = {1, 4, 0}, | 1694 | .version = {1, 5, 0}, |
1660 | .module = THIS_MODULE, | 1695 | .module = THIS_MODULE, |
1661 | .ctr = multipath_ctr, | 1696 | .ctr = multipath_ctr, |
1662 | .dtr = multipath_dtr, | 1697 | .dtr = multipath_dtr, |
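The new retain_attached_hw_handler keyword is a single word in the multipath table's feature list, which is why the permitted feature-argument count rises from 5 to 6 and the target version moves to 1.5.0. When set, parse_path() adopts whatever SCSI device handler is already attached to the path instead of forcing the handler named in the table. A worked status example, with illustrative values:

/*
 * Worked example (values illustrative): queue_if_no_path and
 * retain_attached_hw_handler each contribute one status word, while
 * pg_init_retries and pg_init_delay_msecs contribute a keyword plus a
 * value.  With queue_if_no_path set, pg_init_retries = 3, the default
 * delay and the new flag set, multipath_status() emits the feature
 * portion as:
 *
 *      4 queue_if_no_path pg_init_retries 3 retain_attached_hw_handler
 */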
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 017c34d78d61..982e3e390c45 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include "md.h" | 11 | #include "md.h" |
12 | #include "raid1.h" | 12 | #include "raid1.h" |
13 | #include "raid5.h" | 13 | #include "raid5.h" |
14 | #include "raid10.h" | ||
14 | #include "bitmap.h" | 15 | #include "bitmap.h" |
15 | 16 | ||
16 | #include <linux/device-mapper.h> | 17 | #include <linux/device-mapper.h> |
@@ -52,7 +53,10 @@ struct raid_dev { | |||
52 | #define DMPF_MAX_RECOVERY_RATE 0x20 | 53 | #define DMPF_MAX_RECOVERY_RATE 0x20 |
53 | #define DMPF_MAX_WRITE_BEHIND 0x40 | 54 | #define DMPF_MAX_WRITE_BEHIND 0x40 |
54 | #define DMPF_STRIPE_CACHE 0x80 | 55 | #define DMPF_STRIPE_CACHE 0x80 |
55 | #define DMPF_REGION_SIZE 0X100 | 56 | #define DMPF_REGION_SIZE 0x100 |
57 | #define DMPF_RAID10_COPIES 0x200 | ||
58 | #define DMPF_RAID10_FORMAT 0x400 | ||
59 | |||
56 | struct raid_set { | 60 | struct raid_set { |
57 | struct dm_target *ti; | 61 | struct dm_target *ti; |
58 | 62 | ||
@@ -76,6 +80,7 @@ static struct raid_type { | |||
76 | const unsigned algorithm; /* RAID algorithm. */ | 80 | const unsigned algorithm; /* RAID algorithm. */ |
77 | } raid_types[] = { | 81 | } raid_types[] = { |
78 | {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, | 82 | {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, |
83 | {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */}, | ||
79 | {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, | 84 | {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, |
80 | {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, | 85 | {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, |
81 | {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, | 86 | {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, |
@@ -86,6 +91,17 @@ static struct raid_type { | |||
86 | {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} | 91 | {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} |
87 | }; | 92 | }; |
88 | 93 | ||
94 | static unsigned raid10_md_layout_to_copies(int layout) | ||
95 | { | ||
96 | return layout & 0xFF; | ||
97 | } | ||
98 | |||
99 | static int raid10_format_to_md_layout(char *format, unsigned copies) | ||
100 | { | ||
101 | /* 1 "far" copy, and 'copies' "near" copies */ | ||
102 | return (1 << 8) | (copies & 0xFF); | ||
103 | } | ||
104 | |||
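/*
 * Worked example (illustrative, not part of the patch): for the
 * default of two "near" copies, raid10_format_to_md_layout("near", 2)
 * returns (1 << 8) | 2 == 0x102, and raid10_md_layout_to_copies(0x102)
 * recovers 0x102 & 0xFF == 2.
 */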
89 | static struct raid_type *get_raid_type(char *name) | 105 | static struct raid_type *get_raid_type(char *name) |
90 | { | 106 | { |
91 | int i; | 107 | int i; |
@@ -101,20 +117,12 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra | |||
101 | { | 117 | { |
102 | unsigned i; | 118 | unsigned i; |
103 | struct raid_set *rs; | 119 | struct raid_set *rs; |
104 | sector_t sectors_per_dev; | ||
105 | 120 | ||
106 | if (raid_devs <= raid_type->parity_devs) { | 121 | if (raid_devs <= raid_type->parity_devs) { |
107 | ti->error = "Insufficient number of devices"; | 122 | ti->error = "Insufficient number of devices"; |
108 | return ERR_PTR(-EINVAL); | 123 | return ERR_PTR(-EINVAL); |
109 | } | 124 | } |
110 | 125 | ||
111 | sectors_per_dev = ti->len; | ||
112 | if ((raid_type->level > 1) && | ||
113 | sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { | ||
114 | ti->error = "Target length not divisible by number of data devices"; | ||
115 | return ERR_PTR(-EINVAL); | ||
116 | } | ||
117 | |||
118 | rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); | 126 | rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); |
119 | if (!rs) { | 127 | if (!rs) { |
120 | ti->error = "Cannot allocate raid context"; | 128 | ti->error = "Cannot allocate raid context"; |
@@ -128,7 +136,6 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra | |||
128 | rs->md.raid_disks = raid_devs; | 136 | rs->md.raid_disks = raid_devs; |
129 | rs->md.level = raid_type->level; | 137 | rs->md.level = raid_type->level; |
130 | rs->md.new_level = rs->md.level; | 138 | rs->md.new_level = rs->md.level; |
131 | rs->md.dev_sectors = sectors_per_dev; | ||
132 | rs->md.layout = raid_type->algorithm; | 139 | rs->md.layout = raid_type->algorithm; |
133 | rs->md.new_layout = rs->md.layout; | 140 | rs->md.new_layout = rs->md.layout; |
134 | rs->md.delta_disks = 0; | 141 | rs->md.delta_disks = 0; |
@@ -143,6 +150,7 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra | |||
143 | * rs->md.external | 150 | * rs->md.external |
144 | * rs->md.chunk_sectors | 151 | * rs->md.chunk_sectors |
145 | * rs->md.new_chunk_sectors | 152 | * rs->md.new_chunk_sectors |
153 | * rs->md.dev_sectors | ||
146 | */ | 154 | */ |
147 | 155 | ||
148 | return rs; | 156 | return rs; |
@@ -347,12 +355,20 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) | |||
347 | * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) | 355 | * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) |
348 | * [stripe_cache <sectors>] Stripe cache size for higher RAIDs | 356 | * [stripe_cache <sectors>] Stripe cache size for higher RAIDs |
349 | * [region_size <sectors>] Defines granularity of bitmap | 357 | * [region_size <sectors>] Defines granularity of bitmap |
358 | * | ||
359 | * RAID10-only options: | ||
360 | * [raid10_copies <# copies>] Number of copies. (Default: 2) | ||
361 | * [raid10_format <near>] Layout algorithm. (Default: near) | ||
350 | */ | 362 | */ |
351 | static int parse_raid_params(struct raid_set *rs, char **argv, | 363 | static int parse_raid_params(struct raid_set *rs, char **argv, |
352 | unsigned num_raid_params) | 364 | unsigned num_raid_params) |
353 | { | 365 | { |
366 | char *raid10_format = "near"; | ||
367 | unsigned raid10_copies = 2; | ||
354 | unsigned i, rebuild_cnt = 0; | 368 | unsigned i, rebuild_cnt = 0; |
355 | unsigned long value, region_size = 0; | 369 | unsigned long value, region_size = 0; |
370 | sector_t sectors_per_dev = rs->ti->len; | ||
371 | sector_t max_io_len; | ||
356 | char *key; | 372 | char *key; |
357 | 373 | ||
358 | /* | 374 | /* |
@@ -422,20 +438,53 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
422 | } | 438 | } |
423 | 439 | ||
424 | key = argv[i++]; | 440 | key = argv[i++]; |
441 | |||
442 | /* Parameters that take a string value are checked here. */ | ||
443 | if (!strcasecmp(key, "raid10_format")) { | ||
444 | if (rs->raid_type->level != 10) { | ||
445 | rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; | ||
446 | return -EINVAL; | ||
447 | } | ||
448 | if (strcmp("near", argv[i])) { | ||
449 | rs->ti->error = "Invalid 'raid10_format' value given"; | ||
450 | return -EINVAL; | ||
451 | } | ||
452 | raid10_format = argv[i]; | ||
453 | rs->print_flags |= DMPF_RAID10_FORMAT; | ||
454 | continue; | ||
455 | } | ||
456 | |||
425 | if (strict_strtoul(argv[i], 10, &value) < 0) { | 457 | if (strict_strtoul(argv[i], 10, &value) < 0) { |
426 | rs->ti->error = "Bad numerical argument given in raid params"; | 458 | rs->ti->error = "Bad numerical argument given in raid params"; |
427 | return -EINVAL; | 459 | return -EINVAL; |
428 | } | 460 | } |
429 | 461 | ||
462 | /* Parameters that take a numeric value are checked here */ | ||
430 | if (!strcasecmp(key, "rebuild")) { | 463 | if (!strcasecmp(key, "rebuild")) { |
431 | rebuild_cnt++; | 464 | rebuild_cnt++; |
432 | if (((rs->raid_type->level != 1) && | 465 | |
433 | (rebuild_cnt > rs->raid_type->parity_devs)) || | 466 | switch (rs->raid_type->level) { |
434 | ((rs->raid_type->level == 1) && | 467 | case 1: |
435 | (rebuild_cnt > (rs->md.raid_disks - 1)))) { | 468 | if (rebuild_cnt >= rs->md.raid_disks) { |
436 | rs->ti->error = "Too many rebuild devices specified for given RAID type"; | 469 | rs->ti->error = "Too many rebuild devices specified"; |
470 | return -EINVAL; | ||
471 | } | ||
472 | break; | ||
473 | case 4: | ||
474 | case 5: | ||
475 | case 6: | ||
476 | if (rebuild_cnt > rs->raid_type->parity_devs) { | ||
477 | rs->ti->error = "Too many rebuild devices specified for given RAID type"; | ||
478 | return -EINVAL; | ||
479 | } | ||
480 | break; | ||
481 | case 10: | ||
482 | default: | ||
483 | DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); | ||
484 | rs->ti->error = "Rebuild not supported for this RAID type"; | ||
437 | return -EINVAL; | 485 | return -EINVAL; |
438 | } | 486 | } |
487 | |||
439 | if (value > rs->md.raid_disks) { | 488 | if (value > rs->md.raid_disks) { |
440 | rs->ti->error = "Invalid rebuild index given"; | 489 | rs->ti->error = "Invalid rebuild index given"; |
441 | return -EINVAL; | 490 | return -EINVAL; |
@@ -486,7 +535,8 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
486 | */ | 535 | */ |
487 | value /= 2; | 536 | value /= 2; |
488 | 537 | ||
489 | if (rs->raid_type->level < 5) { | 538 | if ((rs->raid_type->level != 5) && |
539 | (rs->raid_type->level != 6)) { | ||
490 | rs->ti->error = "Inappropriate argument: stripe_cache"; | 540 | rs->ti->error = "Inappropriate argument: stripe_cache"; |
491 | return -EINVAL; | 541 | return -EINVAL; |
492 | } | 542 | } |
@@ -511,6 +561,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
511 | } else if (!strcasecmp(key, "region_size")) { | 561 | } else if (!strcasecmp(key, "region_size")) { |
512 | rs->print_flags |= DMPF_REGION_SIZE; | 562 | rs->print_flags |= DMPF_REGION_SIZE; |
513 | region_size = value; | 563 | region_size = value; |
564 | } else if (!strcasecmp(key, "raid10_copies") && | ||
565 | (rs->raid_type->level == 10)) { | ||
566 | if ((value < 2) || (value > 0xFF)) { | ||
567 | rs->ti->error = "Bad value for 'raid10_copies'"; | ||
568 | return -EINVAL; | ||
569 | } | ||
570 | rs->print_flags |= DMPF_RAID10_COPIES; | ||
571 | raid10_copies = value; | ||
514 | } else { | 572 | } else { |
515 | DMERR("Unable to parse RAID parameter: %s", key); | 573 | DMERR("Unable to parse RAID parameter: %s", key); |
516 | rs->ti->error = "Unable to parse RAID parameters"; | 574 | rs->ti->error = "Unable to parse RAID parameters"; |
@@ -522,14 +580,33 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
522 | return -EINVAL; | 580 | return -EINVAL; |
523 | 581 | ||
524 | if (rs->md.chunk_sectors) | 582 | if (rs->md.chunk_sectors) |
525 | rs->ti->split_io = rs->md.chunk_sectors; | 583 | max_io_len = rs->md.chunk_sectors; |
526 | else | 584 | else |
527 | rs->ti->split_io = region_size; | 585 | max_io_len = region_size; |
528 | 586 | ||
529 | if (rs->md.chunk_sectors) | 587 | if (dm_set_target_max_io_len(rs->ti, max_io_len)) |
530 | rs->ti->split_io = rs->md.chunk_sectors; | 588 | return -EINVAL; |
531 | else | 589 | |
532 | rs->ti->split_io = region_size; | 590 | if (rs->raid_type->level == 10) { |
591 | if (raid10_copies > rs->md.raid_disks) { | ||
592 | rs->ti->error = "Not enough devices to satisfy specification"; | ||
593 | return -EINVAL; | ||
594 | } | ||
595 | |||
596 | /* (Len * #mirrors) / #devices */ | ||
597 | sectors_per_dev = rs->ti->len * raid10_copies; | ||
598 | sector_div(sectors_per_dev, rs->md.raid_disks); | ||
599 | |||
600 | rs->md.layout = raid10_format_to_md_layout(raid10_format, | ||
601 | raid10_copies); | ||
602 | rs->md.new_layout = rs->md.layout; | ||
603 | } else if ((rs->raid_type->level > 1) && | ||
604 | sector_div(sectors_per_dev, | ||
605 | (rs->md.raid_disks - rs->raid_type->parity_devs))) { | ||
606 | rs->ti->error = "Target length not divisible by number of data devices"; | ||
607 | return -EINVAL; | ||
608 | } | ||
609 | rs->md.dev_sectors = sectors_per_dev; | ||
533 | 610 | ||
534 | /* Assume there are no metadata devices until the drives are parsed */ | 611 | /* Assume there are no metadata devices until the drives are parsed */ |
535 | rs->md.persistent = 0; | 612 | rs->md.persistent = 0; |
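To make the "(Len * #mirrors) / #devices" arithmetic concrete: a raid10 target of 4194304 sectors with raid10_copies 2 over 4 members needs sectors_per_dev = 4194304 * 2 / 4 = 2097152 sectors on each member, and the resulting md array size of 2097152 * 4 / 2 = 4194304 sectors satisfies the "Array size does not match requested target length" check added later in this diff. raid10_format_to_md_layout() itself is outside this excerpt; a plausible sketch for the only accepted format, "near", assuming MD's raid10 layout encoding (near copies in the low byte, far copies in the next byte):

static int raid10_format_to_md_layout(char *format, unsigned copies)
{
	/* "near": 'copies' adjacent copies of each chunk, a single far copy */
	return (1 << 8) | (copies & 0xFF);
}

On that assumption raid10_md_layout_to_copies() would simply return layout & 0xFF, which matches how it is used in analyse_superblocks() and raid_status() below.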
@@ -552,6 +629,9 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) | |||
552 | if (rs->raid_type->level == 1) | 629 | if (rs->raid_type->level == 1) |
553 | return md_raid1_congested(&rs->md, bits); | 630 | return md_raid1_congested(&rs->md, bits); |
554 | 631 | ||
632 | if (rs->raid_type->level == 10) | ||
633 | return md_raid10_congested(&rs->md, bits); | ||
634 | |||
555 | return md_raid5_congested(&rs->md, bits); | 635 | return md_raid5_congested(&rs->md, bits); |
556 | } | 636 | } |
557 | 637 | ||
@@ -870,6 +950,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) | |||
870 | case 6: | 950 | case 6: |
871 | redundancy = rs->raid_type->parity_devs; | 951 | redundancy = rs->raid_type->parity_devs; |
872 | break; | 952 | break; |
953 | case 10: | ||
954 | redundancy = raid10_md_layout_to_copies(mddev->layout) - 1; | ||
955 | break; | ||
873 | default: | 956 | default: |
874 | ti->error = "Unknown RAID type"; | 957 | ti->error = "Unknown RAID type"; |
875 | return -EINVAL; | 958 | return -EINVAL; |
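For raid10 this treats the array as usable only while at most copies - 1 members are missing or freshly added, e.g. a single device with the default raid10_copies 2; the bound is per-array rather than per copy set, so it errs on the safe side.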
@@ -1035,12 +1118,19 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1035 | goto bad; | 1118 | goto bad; |
1036 | } | 1119 | } |
1037 | 1120 | ||
1121 | if (ti->len != rs->md.array_sectors) { | ||
1122 | ti->error = "Array size does not match requested target length"; | ||
1123 | ret = -EINVAL; | ||
1124 | goto size_mismatch; | ||
1125 | } | ||
1038 | rs->callbacks.congested_fn = raid_is_congested; | 1126 | rs->callbacks.congested_fn = raid_is_congested; |
1039 | dm_table_add_target_callbacks(ti->table, &rs->callbacks); | 1127 | dm_table_add_target_callbacks(ti->table, &rs->callbacks); |
1040 | 1128 | ||
1041 | mddev_suspend(&rs->md); | 1129 | mddev_suspend(&rs->md); |
1042 | return 0; | 1130 | return 0; |
1043 | 1131 | ||
1132 | size_mismatch: | ||
1133 | md_stop(&rs->md); | ||
1044 | bad: | 1134 | bad: |
1045 | context_free(rs); | 1135 | context_free(rs); |
1046 | 1136 | ||
@@ -1067,7 +1157,7 @@ static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_c | |||
1067 | } | 1157 | } |
1068 | 1158 | ||
1069 | static int raid_status(struct dm_target *ti, status_type_t type, | 1159 | static int raid_status(struct dm_target *ti, status_type_t type, |
1070 | char *result, unsigned maxlen) | 1160 | unsigned status_flags, char *result, unsigned maxlen) |
1071 | { | 1161 | { |
1072 | struct raid_set *rs = ti->private; | 1162 | struct raid_set *rs = ti->private; |
1073 | unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ | 1163 | unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ |
@@ -1189,6 +1279,13 @@ static int raid_status(struct dm_target *ti, status_type_t type, | |||
1189 | DMEMIT(" region_size %lu", | 1279 | DMEMIT(" region_size %lu", |
1190 | rs->md.bitmap_info.chunksize >> 9); | 1280 | rs->md.bitmap_info.chunksize >> 9); |
1191 | 1281 | ||
1282 | if (rs->print_flags & DMPF_RAID10_COPIES) | ||
1283 | DMEMIT(" raid10_copies %u", | ||
1284 | raid10_md_layout_to_copies(rs->md.layout)); | ||
1285 | |||
1286 | if (rs->print_flags & DMPF_RAID10_FORMAT) | ||
1287 | DMEMIT(" raid10_format near"); | ||
1288 | |||
1192 | DMEMIT(" %d", rs->md.raid_disks); | 1289 | DMEMIT(" %d", rs->md.raid_disks); |
1193 | for (i = 0; i < rs->md.raid_disks; i++) { | 1290 | for (i = 0; i < rs->md.raid_disks; i++) { |
1194 | if (rs->dev[i].meta_dev) | 1291 | if (rs->dev[i].meta_dev) |
@@ -1263,7 +1360,7 @@ static void raid_resume(struct dm_target *ti) | |||
1263 | 1360 | ||
1264 | static struct target_type raid_target = { | 1361 | static struct target_type raid_target = { |
1265 | .name = "raid", | 1362 | .name = "raid", |
1266 | .version = {1, 2, 0}, | 1363 | .version = {1, 3, 0}, |
1267 | .module = THIS_MODULE, | 1364 | .module = THIS_MODULE, |
1268 | .ctr = raid_ctr, | 1365 | .ctr = raid_ctr, |
1269 | .dtr = raid_dtr, | 1366 | .dtr = raid_dtr, |
@@ -1290,6 +1387,8 @@ module_init(dm_raid_init); | |||
1290 | module_exit(dm_raid_exit); | 1387 | module_exit(dm_raid_exit); |
1291 | 1388 | ||
1292 | MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); | 1389 | MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); |
1390 | MODULE_ALIAS("dm-raid1"); | ||
1391 | MODULE_ALIAS("dm-raid10"); | ||
1293 | MODULE_ALIAS("dm-raid4"); | 1392 | MODULE_ALIAS("dm-raid4"); |
1294 | MODULE_ALIAS("dm-raid5"); | 1393 | MODULE_ALIAS("dm-raid5"); |
1295 | MODULE_ALIAS("dm-raid6"); | 1394 | MODULE_ALIAS("dm-raid6"); |
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index d039de8322f0..bc5ddba8045b 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
@@ -1081,9 +1081,14 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1081 | } | 1081 | } |
1082 | 1082 | ||
1083 | ti->private = ms; | 1083 | ti->private = ms; |
1084 | ti->split_io = dm_rh_get_region_size(ms->rh); | 1084 | |
1085 | r = dm_set_target_max_io_len(ti, dm_rh_get_region_size(ms->rh)); | ||
1086 | if (r) | ||
1087 | goto err_free_context; | ||
1088 | |||
1085 | ti->num_flush_requests = 1; | 1089 | ti->num_flush_requests = 1; |
1086 | ti->num_discard_requests = 1; | 1090 | ti->num_discard_requests = 1; |
1091 | ti->discard_zeroes_data_unsupported = true; | ||
1087 | 1092 | ||
1088 | ms->kmirrord_wq = alloc_workqueue("kmirrord", | 1093 | ms->kmirrord_wq = alloc_workqueue("kmirrord", |
1089 | WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); | 1094 | WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); |
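Several constructors in this series replace the direct ti->split_io assignment with dm_set_target_max_io_len(), so an out-of-range split boundary now fails the constructor instead of being stored silently. The helper lives in dm core and is not part of this excerpt; a minimal sketch of the expected behaviour, assuming max_io_len is the new 32-bit per-target field:

int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
{
	/* Reject lengths that cannot be stored in the 32-bit field. */
	if (len > UINT_MAX) {
		ti->error = "Maximum size of target IO is too large";
		return -EINVAL;
	}

	ti->max_io_len = (uint32_t) len;
	return 0;
}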
@@ -1214,7 +1219,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, | |||
1214 | * We need to dec pending if this was a write. | 1219 | * We need to dec pending if this was a write. |
1215 | */ | 1220 | */ |
1216 | if (rw == WRITE) { | 1221 | if (rw == WRITE) { |
1217 | if (!(bio->bi_rw & REQ_FLUSH)) | 1222 | if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) |
1218 | dm_rh_dec(ms->rh, map_context->ll); | 1223 | dm_rh_dec(ms->rh, map_context->ll); |
1219 | return error; | 1224 | return error; |
1220 | } | 1225 | } |
@@ -1362,7 +1367,7 @@ static char device_status_char(struct mirror *m) | |||
1362 | 1367 | ||
1363 | 1368 | ||
1364 | static int mirror_status(struct dm_target *ti, status_type_t type, | 1369 | static int mirror_status(struct dm_target *ti, status_type_t type, |
1365 | char *result, unsigned int maxlen) | 1370 | unsigned status_flags, char *result, unsigned maxlen) |
1366 | { | 1371 | { |
1367 | unsigned int m, sz = 0; | 1372 | unsigned int m, sz = 0; |
1368 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1373 | struct mirror_set *ms = (struct mirror_set *) ti->private; |
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 7771ed212182..69732e03eb34 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c | |||
@@ -404,6 +404,9 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) | |||
404 | return; | 404 | return; |
405 | } | 405 | } |
406 | 406 | ||
407 | if (bio->bi_rw & REQ_DISCARD) | ||
408 | return; | ||
409 | |||
407 | /* We must inform the log that the sync count has changed. */ | 410 | /* We must inform the log that the sync count has changed. */ |
408 | log->type->set_region_sync(log, region, 0); | 411 | log->type->set_region_sync(log, region, 0); |
409 | 412 | ||
@@ -524,7 +527,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) | |||
524 | struct bio *bio; | 527 | struct bio *bio; |
525 | 528 | ||
526 | for (bio = bios->head; bio; bio = bio->bi_next) { | 529 | for (bio = bios->head; bio; bio = bio->bi_next) { |
527 | if (bio->bi_rw & REQ_FLUSH) | 530 | if (bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)) |
528 | continue; | 531 | continue; |
529 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); | 532 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); |
530 | } | 533 | } |
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 6f758870fc19..a143921feaf6 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
@@ -691,7 +691,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) | |||
691 | * Return a minimum chunk size of all snapshots that have the specified origin. | 691 | * Return a minimum chunk size of all snapshots that have the specified origin. |
692 | * Return zero if the origin has no snapshots. | 692 | * Return zero if the origin has no snapshots. |
693 | */ | 693 | */ |
694 | static sector_t __minimum_chunk_size(struct origin *o) | 694 | static uint32_t __minimum_chunk_size(struct origin *o) |
695 | { | 695 | { |
696 | struct dm_snapshot *snap; | 696 | struct dm_snapshot *snap; |
697 | unsigned chunk_size = 0; | 697 | unsigned chunk_size = 0; |
@@ -701,7 +701,7 @@ static sector_t __minimum_chunk_size(struct origin *o) | |||
701 | chunk_size = min_not_zero(chunk_size, | 701 | chunk_size = min_not_zero(chunk_size, |
702 | snap->store->chunk_size); | 702 | snap->store->chunk_size); |
703 | 703 | ||
704 | return chunk_size; | 704 | return (uint32_t) chunk_size; |
705 | } | 705 | } |
706 | 706 | ||
707 | /* | 707 | /* |
@@ -1172,7 +1172,10 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1172 | ti->error = "Chunk size not set"; | 1172 | ti->error = "Chunk size not set"; |
1173 | goto bad_read_metadata; | 1173 | goto bad_read_metadata; |
1174 | } | 1174 | } |
1175 | ti->split_io = s->store->chunk_size; | 1175 | |
1176 | r = dm_set_target_max_io_len(ti, s->store->chunk_size); | ||
1177 | if (r) | ||
1178 | goto bad_read_metadata; | ||
1176 | 1179 | ||
1177 | return 0; | 1180 | return 0; |
1178 | 1181 | ||
@@ -1239,7 +1242,7 @@ static void __handover_exceptions(struct dm_snapshot *snap_src, | |||
1239 | snap_dest->store->snap = snap_dest; | 1242 | snap_dest->store->snap = snap_dest; |
1240 | snap_src->store->snap = snap_src; | 1243 | snap_src->store->snap = snap_src; |
1241 | 1244 | ||
1242 | snap_dest->ti->split_io = snap_dest->store->chunk_size; | 1245 | snap_dest->ti->max_io_len = snap_dest->store->chunk_size; |
1243 | snap_dest->valid = snap_src->valid; | 1246 | snap_dest->valid = snap_src->valid; |
1244 | 1247 | ||
1245 | /* | 1248 | /* |
@@ -1817,9 +1820,9 @@ static void snapshot_resume(struct dm_target *ti) | |||
1817 | up_write(&s->lock); | 1820 | up_write(&s->lock); |
1818 | } | 1821 | } |
1819 | 1822 | ||
1820 | static sector_t get_origin_minimum_chunksize(struct block_device *bdev) | 1823 | static uint32_t get_origin_minimum_chunksize(struct block_device *bdev) |
1821 | { | 1824 | { |
1822 | sector_t min_chunksize; | 1825 | uint32_t min_chunksize; |
1823 | 1826 | ||
1824 | down_read(&_origins_lock); | 1827 | down_read(&_origins_lock); |
1825 | min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); | 1828 | min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); |
@@ -1838,15 +1841,15 @@ static void snapshot_merge_resume(struct dm_target *ti) | |||
1838 | snapshot_resume(ti); | 1841 | snapshot_resume(ti); |
1839 | 1842 | ||
1840 | /* | 1843 | /* |
1841 | * snapshot-merge acts as an origin, so set ti->split_io | 1844 | * snapshot-merge acts as an origin, so set ti->max_io_len |
1842 | */ | 1845 | */ |
1843 | ti->split_io = get_origin_minimum_chunksize(s->origin->bdev); | 1846 | ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev); |
1844 | 1847 | ||
1845 | start_merge(s); | 1848 | start_merge(s); |
1846 | } | 1849 | } |
1847 | 1850 | ||
1848 | static int snapshot_status(struct dm_target *ti, status_type_t type, | 1851 | static int snapshot_status(struct dm_target *ti, status_type_t type, |
1849 | char *result, unsigned int maxlen) | 1852 | unsigned status_flags, char *result, unsigned maxlen) |
1850 | { | 1853 | { |
1851 | unsigned sz = 0; | 1854 | unsigned sz = 0; |
1852 | struct dm_snapshot *snap = ti->private; | 1855 | struct dm_snapshot *snap = ti->private; |
@@ -2073,12 +2076,12 @@ static int origin_write_extent(struct dm_snapshot *merging_snap, | |||
2073 | struct origin *o; | 2076 | struct origin *o; |
2074 | 2077 | ||
2075 | /* | 2078 | /* |
2076 | * The origin's __minimum_chunk_size() got stored in split_io | 2079 | * The origin's __minimum_chunk_size() got stored in max_io_len |
2077 | * by snapshot_merge_resume(). | 2080 | * by snapshot_merge_resume(). |
2078 | */ | 2081 | */ |
2079 | down_read(&_origins_lock); | 2082 | down_read(&_origins_lock); |
2080 | o = __lookup_origin(merging_snap->origin->bdev); | 2083 | o = __lookup_origin(merging_snap->origin->bdev); |
2081 | for (n = 0; n < size; n += merging_snap->ti->split_io) | 2084 | for (n = 0; n < size; n += merging_snap->ti->max_io_len) |
2082 | if (__origin_write(&o->snapshots, sector + n, NULL) == | 2085 | if (__origin_write(&o->snapshots, sector + n, NULL) == |
2083 | DM_MAPIO_SUBMITTED) | 2086 | DM_MAPIO_SUBMITTED) |
2084 | must_wait = 1; | 2087 | must_wait = 1; |
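As a worked example of the loop above: with ti->max_io_len holding a minimum snapshot chunk size of 16 sectors and a 64-sector write extent on the origin, __origin_write() is queried at offsets 0, 16, 32 and 48 within the extent, and must_wait is set if any of those chunks still needs to be copied out first.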
@@ -2138,18 +2141,18 @@ static int origin_map(struct dm_target *ti, struct bio *bio, | |||
2138 | } | 2141 | } |
2139 | 2142 | ||
2140 | /* | 2143 | /* |
2141 | * Set the target "split_io" field to the minimum of all the snapshots' | 2144 | * Set the target "max_io_len" field to the minimum of all the snapshots' |
2142 | * chunk sizes. | 2145 | * chunk sizes. |
2143 | */ | 2146 | */ |
2144 | static void origin_resume(struct dm_target *ti) | 2147 | static void origin_resume(struct dm_target *ti) |
2145 | { | 2148 | { |
2146 | struct dm_dev *dev = ti->private; | 2149 | struct dm_dev *dev = ti->private; |
2147 | 2150 | ||
2148 | ti->split_io = get_origin_minimum_chunksize(dev->bdev); | 2151 | ti->max_io_len = get_origin_minimum_chunksize(dev->bdev); |
2149 | } | 2152 | } |
2150 | 2153 | ||
2151 | static int origin_status(struct dm_target *ti, status_type_t type, char *result, | 2154 | static int origin_status(struct dm_target *ti, status_type_t type, |
2152 | unsigned int maxlen) | 2155 | unsigned status_flags, char *result, unsigned maxlen) |
2153 | { | 2156 | { |
2154 | struct dm_dev *dev = ti->private; | 2157 | struct dm_dev *dev = ti->private; |
2155 | 2158 | ||
@@ -2176,7 +2179,6 @@ static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
2176 | return max_size; | 2179 | return max_size; |
2177 | 2180 | ||
2178 | bvm->bi_bdev = dev->bdev; | 2181 | bvm->bi_bdev = dev->bdev; |
2179 | bvm->bi_sector = bvm->bi_sector; | ||
2180 | 2182 | ||
2181 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 2183 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
2182 | } | 2184 | } |
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 35c94ff24ad5..a087bf2a8d66 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
@@ -26,14 +26,12 @@ struct stripe { | |||
26 | struct stripe_c { | 26 | struct stripe_c { |
27 | uint32_t stripes; | 27 | uint32_t stripes; |
28 | int stripes_shift; | 28 | int stripes_shift; |
29 | sector_t stripes_mask; | ||
30 | 29 | ||
31 | /* The size of this target / num. stripes */ | 30 | /* The size of this target / num. stripes */ |
32 | sector_t stripe_width; | 31 | sector_t stripe_width; |
33 | 32 | ||
34 | /* stripe chunk size */ | 33 | uint32_t chunk_size; |
35 | uint32_t chunk_shift; | 34 | int chunk_size_shift; |
36 | sector_t chunk_mask; | ||
37 | 35 | ||
38 | /* Needed for handling events */ | 36 | /* Needed for handling events */ |
39 | struct dm_target *ti; | 37 | struct dm_target *ti; |
@@ -91,7 +89,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, | |||
91 | 89 | ||
92 | /* | 90 | /* |
93 | * Construct a striped mapping. | 91 | * Construct a striped mapping. |
94 | * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+ | 92 | * <number of stripes> <chunk size> [<dev_path> <offset>]+ |
95 | */ | 93 | */ |
96 | static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | 94 | static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) |
97 | { | 95 | { |
@@ -99,7 +97,6 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
99 | sector_t width; | 97 | sector_t width; |
100 | uint32_t stripes; | 98 | uint32_t stripes; |
101 | uint32_t chunk_size; | 99 | uint32_t chunk_size; |
102 | char *end; | ||
103 | int r; | 100 | int r; |
104 | unsigned int i; | 101 | unsigned int i; |
105 | 102 | ||
@@ -108,34 +105,23 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
108 | return -EINVAL; | 105 | return -EINVAL; |
109 | } | 106 | } |
110 | 107 | ||
111 | stripes = simple_strtoul(argv[0], &end, 10); | 108 | if (kstrtouint(argv[0], 10, &stripes) || !stripes) { |
112 | if (!stripes || *end) { | ||
113 | ti->error = "Invalid stripe count"; | 109 | ti->error = "Invalid stripe count"; |
114 | return -EINVAL; | 110 | return -EINVAL; |
115 | } | 111 | } |
116 | 112 | ||
117 | chunk_size = simple_strtoul(argv[1], &end, 10); | 113 | if (kstrtouint(argv[1], 10, &chunk_size) || !chunk_size) { |
118 | if (*end) { | ||
119 | ti->error = "Invalid chunk_size"; | 114 | ti->error = "Invalid chunk_size"; |
120 | return -EINVAL; | 115 | return -EINVAL; |
121 | } | 116 | } |
122 | 117 | ||
123 | /* | 118 | width = ti->len; |
124 | * chunk_size is a power of two | 119 | if (sector_div(width, chunk_size)) { |
125 | */ | ||
126 | if (!is_power_of_2(chunk_size) || | ||
127 | (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) { | ||
128 | ti->error = "Invalid chunk size"; | ||
129 | return -EINVAL; | ||
130 | } | ||
131 | |||
132 | if (ti->len & (chunk_size - 1)) { | ||
133 | ti->error = "Target length not divisible by " | 120 | ti->error = "Target length not divisible by " |
134 | "chunk size"; | 121 | "chunk size"; |
135 | return -EINVAL; | 122 | return -EINVAL; |
136 | } | 123 | } |
137 | 124 | ||
138 | width = ti->len; | ||
139 | if (sector_div(width, stripes)) { | 125 | if (sector_div(width, stripes)) { |
140 | ti->error = "Target length not divisible by " | 126 | ti->error = "Target length not divisible by " |
141 | "number of stripes"; | 127 | "number of stripes"; |
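With the power-of-2 and PAGE_SIZE restrictions on the chunk size removed, any chunk size that divides the target length evenly is now accepted. A hypothetical table line (devices and sizes invented) using a 192-sector (96 KiB) chunk over two stripes:

    0 786432 striped 2 192 /dev/sdb 0 /dev/sdc 0

786432 is divisible by 192, and the resulting 4096 chunks split evenly across the 2 stripes, so both sector_div() checks above pass.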
@@ -167,17 +153,21 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
167 | 153 | ||
168 | if (stripes & (stripes - 1)) | 154 | if (stripes & (stripes - 1)) |
169 | sc->stripes_shift = -1; | 155 | sc->stripes_shift = -1; |
170 | else { | 156 | else |
171 | sc->stripes_shift = ffs(stripes) - 1; | 157 | sc->stripes_shift = __ffs(stripes); |
172 | sc->stripes_mask = ((sector_t) stripes) - 1; | 158 | |
173 | } | 159 | r = dm_set_target_max_io_len(ti, chunk_size); |
160 | if (r) | ||
161 | return r; | ||
174 | 162 | ||
175 | ti->split_io = chunk_size; | ||
176 | ti->num_flush_requests = stripes; | 163 | ti->num_flush_requests = stripes; |
177 | ti->num_discard_requests = stripes; | 164 | ti->num_discard_requests = stripes; |
178 | 165 | ||
179 | sc->chunk_shift = ffs(chunk_size) - 1; | 166 | sc->chunk_size = chunk_size; |
180 | sc->chunk_mask = ((sector_t) chunk_size) - 1; | 167 | if (chunk_size & (chunk_size - 1)) |
168 | sc->chunk_size_shift = -1; | ||
169 | else | ||
170 | sc->chunk_size_shift = __ffs(chunk_size); | ||
181 | 171 | ||
182 | /* | 172 | /* |
183 | * Get the stripe destinations. | 173 | * Get the stripe destinations. |
@@ -216,17 +206,29 @@ static void stripe_dtr(struct dm_target *ti) | |||
216 | static void stripe_map_sector(struct stripe_c *sc, sector_t sector, | 206 | static void stripe_map_sector(struct stripe_c *sc, sector_t sector, |
217 | uint32_t *stripe, sector_t *result) | 207 | uint32_t *stripe, sector_t *result) |
218 | { | 208 | { |
219 | sector_t offset = dm_target_offset(sc->ti, sector); | 209 | sector_t chunk = dm_target_offset(sc->ti, sector); |
220 | sector_t chunk = offset >> sc->chunk_shift; | 210 | sector_t chunk_offset; |
211 | |||
212 | if (sc->chunk_size_shift < 0) | ||
213 | chunk_offset = sector_div(chunk, sc->chunk_size); | ||
214 | else { | ||
215 | chunk_offset = chunk & (sc->chunk_size - 1); | ||
216 | chunk >>= sc->chunk_size_shift; | ||
217 | } | ||
221 | 218 | ||
222 | if (sc->stripes_shift < 0) | 219 | if (sc->stripes_shift < 0) |
223 | *stripe = sector_div(chunk, sc->stripes); | 220 | *stripe = sector_div(chunk, sc->stripes); |
224 | else { | 221 | else { |
225 | *stripe = chunk & sc->stripes_mask; | 222 | *stripe = chunk & (sc->stripes - 1); |
226 | chunk >>= sc->stripes_shift; | 223 | chunk >>= sc->stripes_shift; |
227 | } | 224 | } |
228 | 225 | ||
229 | *result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask); | 226 | if (sc->chunk_size_shift < 0) |
227 | chunk *= sc->chunk_size; | ||
228 | else | ||
229 | chunk <<= sc->chunk_size_shift; | ||
230 | |||
231 | *result = chunk + chunk_offset; | ||
230 | } | 232 | } |
231 | 233 | ||
232 | static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, | 234 | static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, |
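A worked example of the rewritten mapping with deliberately non-power-of-2 values, chunk_size = 192 and stripes = 3: for a target offset of 1000 sectors, the first sector_div() yields chunk = 5 and chunk_offset = 40 (1000 = 5 * 192 + 40), the second yields *stripe = 5 % 3 = 2 and chunk = 1, and the result is 1 * 192 + 40 = 232, i.e. sector 232 on stripe 2. The shift/mask branches compute the same values when the counts are powers of two.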
@@ -237,9 +239,16 @@ static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, | |||
237 | stripe_map_sector(sc, sector, &stripe, result); | 239 | stripe_map_sector(sc, sector, &stripe, result); |
238 | if (stripe == target_stripe) | 240 | if (stripe == target_stripe) |
239 | return; | 241 | return; |
240 | *result &= ~sc->chunk_mask; /* round down */ | 242 | |
243 | /* round down */ | ||
244 | sector = *result; | ||
245 | if (sc->chunk_size_shift < 0) | ||
246 | *result -= sector_div(sector, sc->chunk_size); | ||
247 | else | ||
248 | *result = sector & ~(sector_t)(sc->chunk_size - 1); | ||
249 | |||
241 | if (target_stripe < stripe) | 250 | if (target_stripe < stripe) |
242 | *result += sc->chunk_mask + 1; /* next chunk */ | 251 | *result += sc->chunk_size; /* next chunk */ |
243 | } | 252 | } |
244 | 253 | ||
245 | static int stripe_map_discard(struct stripe_c *sc, struct bio *bio, | 254 | static int stripe_map_discard(struct stripe_c *sc, struct bio *bio, |
@@ -302,8 +311,8 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, | |||
302 | * | 311 | * |
303 | */ | 312 | */ |
304 | 313 | ||
305 | static int stripe_status(struct dm_target *ti, | 314 | static int stripe_status(struct dm_target *ti, status_type_t type, |
306 | status_type_t type, char *result, unsigned int maxlen) | 315 | unsigned status_flags, char *result, unsigned maxlen) |
307 | { | 316 | { |
308 | struct stripe_c *sc = (struct stripe_c *) ti->private; | 317 | struct stripe_c *sc = (struct stripe_c *) ti->private; |
309 | char buffer[sc->stripes + 1]; | 318 | char buffer[sc->stripes + 1]; |
@@ -324,7 +333,7 @@ static int stripe_status(struct dm_target *ti, | |||
324 | 333 | ||
325 | case STATUSTYPE_TABLE: | 334 | case STATUSTYPE_TABLE: |
326 | DMEMIT("%d %llu", sc->stripes, | 335 | DMEMIT("%d %llu", sc->stripes, |
327 | (unsigned long long)sc->chunk_mask + 1); | 336 | (unsigned long long)sc->chunk_size); |
328 | for (i = 0; i < sc->stripes; i++) | 337 | for (i = 0; i < sc->stripes; i++) |
329 | DMEMIT(" %s %llu", sc->stripe[i].dev->name, | 338 | DMEMIT(" %s %llu", sc->stripe[i].dev->name, |
330 | (unsigned long long)sc->stripe[i].physical_start); | 339 | (unsigned long long)sc->stripe[i].physical_start); |
@@ -391,7 +400,7 @@ static void stripe_io_hints(struct dm_target *ti, | |||
391 | struct queue_limits *limits) | 400 | struct queue_limits *limits) |
392 | { | 401 | { |
393 | struct stripe_c *sc = ti->private; | 402 | struct stripe_c *sc = ti->private; |
394 | unsigned chunk_size = (sc->chunk_mask + 1) << 9; | 403 | unsigned chunk_size = sc->chunk_size << SECTOR_SHIFT; |
395 | 404 | ||
396 | blk_limits_io_min(limits, chunk_size); | 405 | blk_limits_io_min(limits, chunk_size); |
397 | blk_limits_io_opt(limits, chunk_size * sc->stripes); | 406 | blk_limits_io_opt(limits, chunk_size * sc->stripes); |
@@ -419,7 +428,7 @@ static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
419 | 428 | ||
420 | static struct target_type stripe_target = { | 429 | static struct target_type stripe_target = { |
421 | .name = "striped", | 430 | .name = "striped", |
422 | .version = {1, 4, 0}, | 431 | .version = {1, 5, 0}, |
423 | .module = THIS_MODULE, | 432 | .module = THIS_MODULE, |
424 | .ctr = stripe_ctr, | 433 | .ctr = stripe_ctr, |
425 | .dtr = stripe_dtr, | 434 | .dtr = stripe_dtr, |
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2e227fbf1622..f90069029aae 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -1319,6 +1319,9 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) | |||
1319 | if (!ti->num_flush_requests) | 1319 | if (!ti->num_flush_requests) |
1320 | continue; | 1320 | continue; |
1321 | 1321 | ||
1322 | if (ti->flush_supported) | ||
1323 | return 1; | ||
1324 | |||
1322 | if (ti->type->iterate_devices && | 1325 | if (ti->type->iterate_devices && |
1323 | ti->type->iterate_devices(ti, device_flush_capable, &flush)) | 1326 | ti->type->iterate_devices(ti, device_flush_capable, &flush)) |
1324 | return 1; | 1327 | return 1; |
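The new ti->flush_supported test lets a target declare flush support for itself in its constructor; when the flag is set, the table-level capability check succeeds without iterating the underlying devices, which may not advertise FLUSH on their own.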
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 3e2907f0bc46..693e149e9727 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2011 Red Hat, Inc. | 2 | * Copyright (C) 2011-2012 Red Hat, Inc. |
3 | * | 3 | * |
4 | * This file is released under the GPL. | 4 | * This file is released under the GPL. |
5 | */ | 5 | */ |
@@ -80,6 +80,12 @@ | |||
80 | #define THIN_METADATA_CACHE_SIZE 64 | 80 | #define THIN_METADATA_CACHE_SIZE 64 |
81 | #define SECTOR_TO_BLOCK_SHIFT 3 | 81 | #define SECTOR_TO_BLOCK_SHIFT 3 |
82 | 82 | ||
83 | /* | ||
84 | * 3 for btree insert + | ||
85 | * 2 for btree lookup used within space map | ||
86 | */ | ||
87 | #define THIN_MAX_CONCURRENT_LOCKS 5 | ||
88 | |||
83 | /* This should be plenty */ | 89 | /* This should be plenty */ |
84 | #define SPACE_MAP_ROOT_SIZE 128 | 90 | #define SPACE_MAP_ROOT_SIZE 128 |
85 | 91 | ||
@@ -172,13 +178,20 @@ struct dm_pool_metadata { | |||
172 | 178 | ||
173 | struct rw_semaphore root_lock; | 179 | struct rw_semaphore root_lock; |
174 | uint32_t time; | 180 | uint32_t time; |
175 | int need_commit; | ||
176 | dm_block_t root; | 181 | dm_block_t root; |
177 | dm_block_t details_root; | 182 | dm_block_t details_root; |
178 | struct list_head thin_devices; | 183 | struct list_head thin_devices; |
179 | uint64_t trans_id; | 184 | uint64_t trans_id; |
180 | unsigned long flags; | 185 | unsigned long flags; |
181 | sector_t data_block_size; | 186 | sector_t data_block_size; |
187 | bool read_only:1; | ||
188 | |||
189 | /* | ||
190 | * Set if a transaction has to be aborted but the attempt to roll back | ||
191 | * to the previous (good) transaction failed. The only pool metadata | ||
192 | * operation possible in this state is the closing of the device. | ||
193 | */ | ||
194 | bool fail_io:1; | ||
182 | }; | 195 | }; |
183 | 196 | ||
184 | struct dm_thin_device { | 197 | struct dm_thin_device { |
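The later hunks guard most of the public pool-metadata entry points with the fail_io flag. The idiom, sketched here with a hypothetical operation name (concrete instances such as dm_pool_create_thin() appear further down):

int dm_pool_some_operation(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	down_write(&pmd->root_lock);
	if (!pmd->fail_io)	/* only closing the device is allowed after a failed abort */
		r = __some_operation(pmd);
	up_write(&pmd->root_lock);

	return r;
}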
@@ -187,7 +200,8 @@ struct dm_thin_device { | |||
187 | dm_thin_id id; | 200 | dm_thin_id id; |
188 | 201 | ||
189 | int open_count; | 202 | int open_count; |
190 | int changed; | 203 | bool changed:1; |
204 | bool aborted_with_changes:1; | ||
191 | uint64_t mapped_blocks; | 205 | uint64_t mapped_blocks; |
192 | uint64_t transaction_id; | 206 | uint64_t transaction_id; |
193 | uint32_t creation_time; | 207 | uint32_t creation_time; |
@@ -338,7 +352,21 @@ static int subtree_equal(void *context, void *value1_le, void *value2_le) | |||
338 | 352 | ||
339 | /*----------------------------------------------------------------*/ | 353 | /*----------------------------------------------------------------*/ |
340 | 354 | ||
341 | static int superblock_all_zeroes(struct dm_block_manager *bm, int *result) | 355 | static int superblock_lock_zero(struct dm_pool_metadata *pmd, |
356 | struct dm_block **sblock) | ||
357 | { | ||
358 | return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION, | ||
359 | &sb_validator, sblock); | ||
360 | } | ||
361 | |||
362 | static int superblock_lock(struct dm_pool_metadata *pmd, | ||
363 | struct dm_block **sblock) | ||
364 | { | ||
365 | return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, | ||
366 | &sb_validator, sblock); | ||
367 | } | ||
368 | |||
369 | static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) | ||
342 | { | 370 | { |
343 | int r; | 371 | int r; |
344 | unsigned i; | 372 | unsigned i; |
@@ -365,72 +393,9 @@ static int superblock_all_zeroes(struct dm_block_manager *bm, int *result) | |||
365 | return dm_bm_unlock(b); | 393 | return dm_bm_unlock(b); |
366 | } | 394 | } |
367 | 395 | ||
368 | static int init_pmd(struct dm_pool_metadata *pmd, | 396 | static void __setup_btree_details(struct dm_pool_metadata *pmd) |
369 | struct dm_block_manager *bm, | ||
370 | dm_block_t nr_blocks, int create) | ||
371 | { | 397 | { |
372 | int r; | 398 | pmd->info.tm = pmd->tm; |
373 | struct dm_space_map *sm, *data_sm; | ||
374 | struct dm_transaction_manager *tm; | ||
375 | struct dm_block *sblock; | ||
376 | |||
377 | if (create) { | ||
378 | r = dm_tm_create_with_sm(bm, THIN_SUPERBLOCK_LOCATION, | ||
379 | &sb_validator, &tm, &sm, &sblock); | ||
380 | if (r < 0) { | ||
381 | DMERR("tm_create_with_sm failed"); | ||
382 | return r; | ||
383 | } | ||
384 | |||
385 | data_sm = dm_sm_disk_create(tm, nr_blocks); | ||
386 | if (IS_ERR(data_sm)) { | ||
387 | DMERR("sm_disk_create failed"); | ||
388 | dm_tm_unlock(tm, sblock); | ||
389 | r = PTR_ERR(data_sm); | ||
390 | goto bad; | ||
391 | } | ||
392 | } else { | ||
393 | struct thin_disk_superblock *disk_super = NULL; | ||
394 | size_t space_map_root_offset = | ||
395 | offsetof(struct thin_disk_superblock, metadata_space_map_root); | ||
396 | |||
397 | r = dm_tm_open_with_sm(bm, THIN_SUPERBLOCK_LOCATION, | ||
398 | &sb_validator, space_map_root_offset, | ||
399 | SPACE_MAP_ROOT_SIZE, &tm, &sm, &sblock); | ||
400 | if (r < 0) { | ||
401 | DMERR("tm_open_with_sm failed"); | ||
402 | return r; | ||
403 | } | ||
404 | |||
405 | disk_super = dm_block_data(sblock); | ||
406 | data_sm = dm_sm_disk_open(tm, disk_super->data_space_map_root, | ||
407 | sizeof(disk_super->data_space_map_root)); | ||
408 | if (IS_ERR(data_sm)) { | ||
409 | DMERR("sm_disk_open failed"); | ||
410 | r = PTR_ERR(data_sm); | ||
411 | goto bad; | ||
412 | } | ||
413 | } | ||
414 | |||
415 | |||
416 | r = dm_tm_unlock(tm, sblock); | ||
417 | if (r < 0) { | ||
418 | DMERR("couldn't unlock superblock"); | ||
419 | goto bad_data_sm; | ||
420 | } | ||
421 | |||
422 | pmd->bm = bm; | ||
423 | pmd->metadata_sm = sm; | ||
424 | pmd->data_sm = data_sm; | ||
425 | pmd->tm = tm; | ||
426 | pmd->nb_tm = dm_tm_create_non_blocking_clone(tm); | ||
427 | if (!pmd->nb_tm) { | ||
428 | DMERR("could not create clone tm"); | ||
429 | r = -ENOMEM; | ||
430 | goto bad_data_sm; | ||
431 | } | ||
432 | |||
433 | pmd->info.tm = tm; | ||
434 | pmd->info.levels = 2; | 399 | pmd->info.levels = 2; |
435 | pmd->info.value_type.context = pmd->data_sm; | 400 | pmd->info.value_type.context = pmd->data_sm; |
436 | pmd->info.value_type.size = sizeof(__le64); | 401 | pmd->info.value_type.size = sizeof(__le64); |
@@ -441,7 +406,7 @@ static int init_pmd(struct dm_pool_metadata *pmd, | |||
441 | memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info)); | 406 | memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info)); |
442 | pmd->nb_info.tm = pmd->nb_tm; | 407 | pmd->nb_info.tm = pmd->nb_tm; |
443 | 408 | ||
444 | pmd->tl_info.tm = tm; | 409 | pmd->tl_info.tm = pmd->tm; |
445 | pmd->tl_info.levels = 1; | 410 | pmd->tl_info.levels = 1; |
446 | pmd->tl_info.value_type.context = &pmd->info; | 411 | pmd->tl_info.value_type.context = &pmd->info; |
447 | pmd->tl_info.value_type.size = sizeof(__le64); | 412 | pmd->tl_info.value_type.size = sizeof(__le64); |
@@ -449,7 +414,7 @@ static int init_pmd(struct dm_pool_metadata *pmd, | |||
449 | pmd->tl_info.value_type.dec = subtree_dec; | 414 | pmd->tl_info.value_type.dec = subtree_dec; |
450 | pmd->tl_info.value_type.equal = subtree_equal; | 415 | pmd->tl_info.value_type.equal = subtree_equal; |
451 | 416 | ||
452 | pmd->bl_info.tm = tm; | 417 | pmd->bl_info.tm = pmd->tm; |
453 | pmd->bl_info.levels = 1; | 418 | pmd->bl_info.levels = 1; |
454 | pmd->bl_info.value_type.context = pmd->data_sm; | 419 | pmd->bl_info.value_type.context = pmd->data_sm; |
455 | pmd->bl_info.value_type.size = sizeof(__le64); | 420 | pmd->bl_info.value_type.size = sizeof(__le64); |
@@ -457,48 +422,266 @@ static int init_pmd(struct dm_pool_metadata *pmd, | |||
457 | pmd->bl_info.value_type.dec = data_block_dec; | 422 | pmd->bl_info.value_type.dec = data_block_dec; |
458 | pmd->bl_info.value_type.equal = data_block_equal; | 423 | pmd->bl_info.value_type.equal = data_block_equal; |
459 | 424 | ||
460 | pmd->details_info.tm = tm; | 425 | pmd->details_info.tm = pmd->tm; |
461 | pmd->details_info.levels = 1; | 426 | pmd->details_info.levels = 1; |
462 | pmd->details_info.value_type.context = NULL; | 427 | pmd->details_info.value_type.context = NULL; |
463 | pmd->details_info.value_type.size = sizeof(struct disk_device_details); | 428 | pmd->details_info.value_type.size = sizeof(struct disk_device_details); |
464 | pmd->details_info.value_type.inc = NULL; | 429 | pmd->details_info.value_type.inc = NULL; |
465 | pmd->details_info.value_type.dec = NULL; | 430 | pmd->details_info.value_type.dec = NULL; |
466 | pmd->details_info.value_type.equal = NULL; | 431 | pmd->details_info.value_type.equal = NULL; |
432 | } | ||
467 | 433 | ||
468 | pmd->root = 0; | 434 | static int __write_initial_superblock(struct dm_pool_metadata *pmd) |
435 | { | ||
436 | int r; | ||
437 | struct dm_block *sblock; | ||
438 | size_t metadata_len, data_len; | ||
439 | struct thin_disk_superblock *disk_super; | ||
440 | sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT; | ||
469 | 441 | ||
470 | init_rwsem(&pmd->root_lock); | 442 | if (bdev_size > THIN_METADATA_MAX_SECTORS) |
471 | pmd->time = 0; | 443 | bdev_size = THIN_METADATA_MAX_SECTORS; |
472 | pmd->need_commit = 0; | 444 | |
473 | pmd->details_root = 0; | 445 | r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); |
474 | pmd->trans_id = 0; | 446 | if (r < 0) |
475 | pmd->flags = 0; | 447 | return r; |
476 | INIT_LIST_HEAD(&pmd->thin_devices); | 448 | |
449 | r = dm_sm_root_size(pmd->data_sm, &data_len); | ||
450 | if (r < 0) | ||
451 | return r; | ||
452 | |||
453 | r = dm_sm_commit(pmd->data_sm); | ||
454 | if (r < 0) | ||
455 | return r; | ||
456 | |||
457 | r = dm_tm_pre_commit(pmd->tm); | ||
458 | if (r < 0) | ||
459 | return r; | ||
460 | |||
461 | r = superblock_lock_zero(pmd, &sblock); | ||
462 | if (r) | ||
463 | return r; | ||
464 | |||
465 | disk_super = dm_block_data(sblock); | ||
466 | disk_super->flags = 0; | ||
467 | memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); | ||
468 | disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); | ||
469 | disk_super->version = cpu_to_le32(THIN_VERSION); | ||
470 | disk_super->time = 0; | ||
471 | disk_super->trans_id = 0; | ||
472 | disk_super->held_root = 0; | ||
473 | |||
474 | r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, | ||
475 | metadata_len); | ||
476 | if (r < 0) | ||
477 | goto bad_locked; | ||
478 | |||
479 | r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root, | ||
480 | data_len); | ||
481 | if (r < 0) | ||
482 | goto bad_locked; | ||
483 | |||
484 | disk_super->data_mapping_root = cpu_to_le64(pmd->root); | ||
485 | disk_super->device_details_root = cpu_to_le64(pmd->details_root); | ||
486 | disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); | ||
487 | disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); | ||
488 | disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); | ||
489 | |||
490 | return dm_tm_commit(pmd->tm, sblock); | ||
491 | |||
492 | bad_locked: | ||
493 | dm_bm_unlock(sblock); | ||
494 | return r; | ||
495 | } | ||
496 | |||
497 | static int __format_metadata(struct dm_pool_metadata *pmd) | ||
498 | { | ||
499 | int r; | ||
500 | |||
501 | r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION, | ||
502 | &pmd->tm, &pmd->metadata_sm); | ||
503 | if (r < 0) { | ||
504 | DMERR("tm_create_with_sm failed"); | ||
505 | return r; | ||
506 | } | ||
507 | |||
508 | pmd->data_sm = dm_sm_disk_create(pmd->tm, 0); | ||
509 | if (IS_ERR(pmd->data_sm)) { | ||
510 | DMERR("sm_disk_create failed"); | ||
511 | r = PTR_ERR(pmd->data_sm); | ||
512 | goto bad_cleanup_tm; | ||
513 | } | ||
514 | |||
515 | pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm); | ||
516 | if (!pmd->nb_tm) { | ||
517 | DMERR("could not create non-blocking clone tm"); | ||
518 | r = -ENOMEM; | ||
519 | goto bad_cleanup_data_sm; | ||
520 | } | ||
521 | |||
522 | __setup_btree_details(pmd); | ||
523 | |||
524 | r = dm_btree_empty(&pmd->info, &pmd->root); | ||
525 | if (r < 0) | ||
526 | goto bad_cleanup_nb_tm; | ||
527 | |||
528 | r = dm_btree_empty(&pmd->details_info, &pmd->details_root); | ||
529 | if (r < 0) { | ||
530 | DMERR("couldn't create devices root"); | ||
531 | goto bad_cleanup_nb_tm; | ||
532 | } | ||
533 | |||
534 | r = __write_initial_superblock(pmd); | ||
535 | if (r) | ||
536 | goto bad_cleanup_nb_tm; | ||
477 | 537 | ||
478 | return 0; | 538 | return 0; |
479 | 539 | ||
480 | bad_data_sm: | 540 | bad_cleanup_nb_tm: |
481 | dm_sm_destroy(data_sm); | 541 | dm_tm_destroy(pmd->nb_tm); |
482 | bad: | 542 | bad_cleanup_data_sm: |
483 | dm_tm_destroy(tm); | 543 | dm_sm_destroy(pmd->data_sm); |
484 | dm_sm_destroy(sm); | 544 | bad_cleanup_tm: |
545 | dm_tm_destroy(pmd->tm); | ||
546 | dm_sm_destroy(pmd->metadata_sm); | ||
547 | |||
548 | return r; | ||
549 | } | ||
550 | |||
551 | static int __check_incompat_features(struct thin_disk_superblock *disk_super, | ||
552 | struct dm_pool_metadata *pmd) | ||
553 | { | ||
554 | uint32_t features; | ||
555 | |||
556 | features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP; | ||
557 | if (features) { | ||
558 | DMERR("could not access metadata due to unsupported optional features (%lx).", | ||
559 | (unsigned long)features); | ||
560 | return -EINVAL; | ||
561 | } | ||
562 | |||
563 | /* | ||
564 | * Check for read-only metadata to skip the following RDWR checks. | ||
565 | */ | ||
566 | if (get_disk_ro(pmd->bdev->bd_disk)) | ||
567 | return 0; | ||
568 | |||
569 | features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP; | ||
570 | if (features) { | ||
571 | DMERR("could not access metadata RDWR due to unsupported optional features (%lx).", | ||
572 | (unsigned long)features); | ||
573 | return -EINVAL; | ||
574 | } | ||
575 | |||
576 | return 0; | ||
577 | } | ||
578 | |||
579 | static int __open_metadata(struct dm_pool_metadata *pmd) | ||
580 | { | ||
581 | int r; | ||
582 | struct dm_block *sblock; | ||
583 | struct thin_disk_superblock *disk_super; | ||
584 | |||
585 | r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, | ||
586 | &sb_validator, &sblock); | ||
587 | if (r < 0) { | ||
588 | DMERR("couldn't read superblock"); | ||
589 | return r; | ||
590 | } | ||
591 | |||
592 | disk_super = dm_block_data(sblock); | ||
593 | |||
594 | r = __check_incompat_features(disk_super, pmd); | ||
595 | if (r < 0) | ||
596 | goto bad_unlock_sblock; | ||
597 | |||
598 | r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION, | ||
599 | disk_super->metadata_space_map_root, | ||
600 | sizeof(disk_super->metadata_space_map_root), | ||
601 | &pmd->tm, &pmd->metadata_sm); | ||
602 | if (r < 0) { | ||
603 | DMERR("tm_open_with_sm failed"); | ||
604 | goto bad_unlock_sblock; | ||
605 | } | ||
606 | |||
607 | pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root, | ||
608 | sizeof(disk_super->data_space_map_root)); | ||
609 | if (IS_ERR(pmd->data_sm)) { | ||
610 | DMERR("sm_disk_open failed"); | ||
611 | r = PTR_ERR(pmd->data_sm); | ||
612 | goto bad_cleanup_tm; | ||
613 | } | ||
614 | |||
615 | pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm); | ||
616 | if (!pmd->nb_tm) { | ||
617 | DMERR("could not create non-blocking clone tm"); | ||
618 | r = -ENOMEM; | ||
619 | goto bad_cleanup_data_sm; | ||
620 | } | ||
621 | |||
622 | __setup_btree_details(pmd); | ||
623 | return dm_bm_unlock(sblock); | ||
624 | |||
625 | bad_cleanup_data_sm: | ||
626 | dm_sm_destroy(pmd->data_sm); | ||
627 | bad_cleanup_tm: | ||
628 | dm_tm_destroy(pmd->tm); | ||
629 | dm_sm_destroy(pmd->metadata_sm); | ||
630 | bad_unlock_sblock: | ||
631 | dm_bm_unlock(sblock); | ||
632 | |||
633 | return r; | ||
634 | } | ||
635 | |||
636 | static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device) | ||
637 | { | ||
638 | int r, unformatted; | ||
639 | |||
640 | r = __superblock_all_zeroes(pmd->bm, &unformatted); | ||
641 | if (r) | ||
642 | return r; | ||
643 | |||
644 | if (unformatted) | ||
645 | return format_device ? __format_metadata(pmd) : -EPERM; | ||
646 | |||
647 | return __open_metadata(pmd); | ||
648 | } | ||
649 | |||
650 | static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device) | ||
651 | { | ||
652 | int r; | ||
653 | |||
654 | pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE, | ||
655 | THIN_METADATA_CACHE_SIZE, | ||
656 | THIN_MAX_CONCURRENT_LOCKS); | ||
657 | if (IS_ERR(pmd->bm)) { | ||
658 | DMERR("could not create block manager"); | ||
659 | return PTR_ERR(pmd->bm); | ||
660 | } | ||
661 | |||
662 | r = __open_or_format_metadata(pmd, format_device); | ||
663 | if (r) | ||
664 | dm_block_manager_destroy(pmd->bm); | ||
485 | 665 | ||
486 | return r; | 666 | return r; |
487 | } | 667 | } |
488 | 668 | ||
669 | static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd) | ||
670 | { | ||
671 | dm_sm_destroy(pmd->data_sm); | ||
672 | dm_sm_destroy(pmd->metadata_sm); | ||
673 | dm_tm_destroy(pmd->nb_tm); | ||
674 | dm_tm_destroy(pmd->tm); | ||
675 | dm_block_manager_destroy(pmd->bm); | ||
676 | } | ||
677 | |||
489 | static int __begin_transaction(struct dm_pool_metadata *pmd) | 678 | static int __begin_transaction(struct dm_pool_metadata *pmd) |
490 | { | 679 | { |
491 | int r; | 680 | int r; |
492 | u32 features; | ||
493 | struct thin_disk_superblock *disk_super; | 681 | struct thin_disk_superblock *disk_super; |
494 | struct dm_block *sblock; | 682 | struct dm_block *sblock; |
495 | 683 | ||
496 | /* | 684 | /* |
497 | * __maybe_commit_transaction() resets these | ||
498 | */ | ||
499 | WARN_ON(pmd->need_commit); | ||
500 | |||
501 | /* | ||
502 | * We re-read the superblock every time. Shouldn't need to do this | 685 | * We re-read the superblock every time. Shouldn't need to do this |
503 | * really. | 686 | * really. |
504 | */ | 687 | */ |
@@ -515,32 +698,8 @@ static int __begin_transaction(struct dm_pool_metadata *pmd) | |||
515 | pmd->flags = le32_to_cpu(disk_super->flags); | 698 | pmd->flags = le32_to_cpu(disk_super->flags); |
516 | pmd->data_block_size = le32_to_cpu(disk_super->data_block_size); | 699 | pmd->data_block_size = le32_to_cpu(disk_super->data_block_size); |
517 | 700 | ||
518 | features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP; | ||
519 | if (features) { | ||
520 | DMERR("could not access metadata due to " | ||
521 | "unsupported optional features (%lx).", | ||
522 | (unsigned long)features); | ||
523 | r = -EINVAL; | ||
524 | goto out; | ||
525 | } | ||
526 | |||
527 | /* | ||
528 | * Check for read-only metadata to skip the following RDWR checks. | ||
529 | */ | ||
530 | if (get_disk_ro(pmd->bdev->bd_disk)) | ||
531 | goto out; | ||
532 | |||
533 | features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP; | ||
534 | if (features) { | ||
535 | DMERR("could not access metadata RDWR due to " | ||
536 | "unsupported optional features (%lx).", | ||
537 | (unsigned long)features); | ||
538 | r = -EINVAL; | ||
539 | } | ||
540 | |||
541 | out: | ||
542 | dm_bm_unlock(sblock); | 701 | dm_bm_unlock(sblock); |
543 | return r; | 702 | return 0; |
544 | } | 703 | } |
545 | 704 | ||
546 | static int __write_changed_details(struct dm_pool_metadata *pmd) | 705 | static int __write_changed_details(struct dm_pool_metadata *pmd) |
@@ -573,8 +732,6 @@ static int __write_changed_details(struct dm_pool_metadata *pmd) | |||
573 | list_del(&td->list); | 732 | list_del(&td->list); |
574 | kfree(td); | 733 | kfree(td); |
575 | } | 734 | } |
576 | |||
577 | pmd->need_commit = 1; | ||
578 | } | 735 | } |
579 | 736 | ||
580 | return 0; | 737 | return 0; |
@@ -582,9 +739,6 @@ static int __write_changed_details(struct dm_pool_metadata *pmd) | |||
582 | 739 | ||
583 | static int __commit_transaction(struct dm_pool_metadata *pmd) | 740 | static int __commit_transaction(struct dm_pool_metadata *pmd) |
584 | { | 741 | { |
585 | /* | ||
586 | * FIXME: Associated pool should be made read-only on failure. | ||
587 | */ | ||
588 | int r; | 742 | int r; |
589 | size_t metadata_len, data_len; | 743 | size_t metadata_len, data_len; |
590 | struct thin_disk_superblock *disk_super; | 744 | struct thin_disk_superblock *disk_super; |
@@ -597,31 +751,27 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) | |||
597 | 751 | ||
598 | r = __write_changed_details(pmd); | 752 | r = __write_changed_details(pmd); |
599 | if (r < 0) | 753 | if (r < 0) |
600 | goto out; | 754 | return r; |
601 | |||
602 | if (!pmd->need_commit) | ||
603 | goto out; | ||
604 | 755 | ||
605 | r = dm_sm_commit(pmd->data_sm); | 756 | r = dm_sm_commit(pmd->data_sm); |
606 | if (r < 0) | 757 | if (r < 0) |
607 | goto out; | 758 | return r; |
608 | 759 | ||
609 | r = dm_tm_pre_commit(pmd->tm); | 760 | r = dm_tm_pre_commit(pmd->tm); |
610 | if (r < 0) | 761 | if (r < 0) |
611 | goto out; | 762 | return r; |
612 | 763 | ||
613 | r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); | 764 | r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); |
614 | if (r < 0) | 765 | if (r < 0) |
615 | goto out; | 766 | return r; |
616 | 767 | ||
617 | r = dm_sm_root_size(pmd->data_sm, &data_len); | 768 | r = dm_sm_root_size(pmd->data_sm, &data_len); |
618 | if (r < 0) | 769 | if (r < 0) |
619 | goto out; | 770 | return r; |
620 | 771 | ||
621 | r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, | 772 | r = superblock_lock(pmd, &sblock); |
622 | &sb_validator, &sblock); | ||
623 | if (r) | 773 | if (r) |
624 | goto out; | 774 | return r; |
625 | 775 | ||
626 | disk_super = dm_block_data(sblock); | 776 | disk_super = dm_block_data(sblock); |
627 | disk_super->time = cpu_to_le32(pmd->time); | 777 | disk_super->time = cpu_to_le32(pmd->time); |
@@ -640,12 +790,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) | |||
640 | if (r < 0) | 790 | if (r < 0) |
641 | goto out_locked; | 791 | goto out_locked; |
642 | 792 | ||
643 | r = dm_tm_commit(pmd->tm, sblock); | 793 | return dm_tm_commit(pmd->tm, sblock); |
644 | if (!r) | ||
645 | pmd->need_commit = 0; | ||
646 | |||
647 | out: | ||
648 | return r; | ||
649 | 794 | ||
650 | out_locked: | 795 | out_locked: |
651 | dm_bm_unlock(sblock); | 796 | dm_bm_unlock(sblock); |
@@ -653,15 +798,11 @@ out_locked: | |||
653 | } | 798 | } |
654 | 799 | ||
655 | struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, | 800 | struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, |
656 | sector_t data_block_size) | 801 | sector_t data_block_size, |
802 | bool format_device) | ||
657 | { | 803 | { |
658 | int r; | 804 | int r; |
659 | struct thin_disk_superblock *disk_super; | ||
660 | struct dm_pool_metadata *pmd; | 805 | struct dm_pool_metadata *pmd; |
661 | sector_t bdev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; | ||
662 | struct dm_block_manager *bm; | ||
663 | int create; | ||
664 | struct dm_block *sblock; | ||
665 | 806 | ||
666 | pmd = kmalloc(sizeof(*pmd), GFP_KERNEL); | 807 | pmd = kmalloc(sizeof(*pmd), GFP_KERNEL); |
667 | if (!pmd) { | 808 | if (!pmd) { |
@@ -669,90 +810,28 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, | |||
669 | return ERR_PTR(-ENOMEM); | 810 | return ERR_PTR(-ENOMEM); |
670 | } | 811 | } |
671 | 812 | ||
672 | /* | 813 | init_rwsem(&pmd->root_lock); |
673 | * Max hex locks: | 814 | pmd->time = 0; |
674 | * 3 for btree insert + | 815 | INIT_LIST_HEAD(&pmd->thin_devices); |
675 | * 2 for btree lookup used within space map | 816 | pmd->read_only = false; |
676 | */ | 817 | pmd->fail_io = false; |
677 | bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE, | 818 | pmd->bdev = bdev; |
678 | THIN_METADATA_CACHE_SIZE, 5); | 819 | pmd->data_block_size = data_block_size; |
679 | if (!bm) { | ||
680 | DMERR("could not create block manager"); | ||
681 | kfree(pmd); | ||
682 | return ERR_PTR(-ENOMEM); | ||
683 | } | ||
684 | |||
685 | r = superblock_all_zeroes(bm, &create); | ||
686 | if (r) { | ||
687 | dm_block_manager_destroy(bm); | ||
688 | kfree(pmd); | ||
689 | return ERR_PTR(r); | ||
690 | } | ||
691 | |||
692 | 820 | ||
693 | r = init_pmd(pmd, bm, 0, create); | 821 | r = __create_persistent_data_objects(pmd, format_device); |
694 | if (r) { | 822 | if (r) { |
695 | dm_block_manager_destroy(bm); | ||
696 | kfree(pmd); | 823 | kfree(pmd); |
697 | return ERR_PTR(r); | 824 | return ERR_PTR(r); |
698 | } | 825 | } |
699 | pmd->bdev = bdev; | ||
700 | |||
701 | if (!create) { | ||
702 | r = __begin_transaction(pmd); | ||
703 | if (r < 0) | ||
704 | goto bad; | ||
705 | return pmd; | ||
706 | } | ||
707 | |||
708 | /* | ||
709 | * Create. | ||
710 | */ | ||
711 | r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, | ||
712 | &sb_validator, &sblock); | ||
713 | if (r) | ||
714 | goto bad; | ||
715 | |||
716 | if (bdev_size > THIN_METADATA_MAX_SECTORS) | ||
717 | bdev_size = THIN_METADATA_MAX_SECTORS; | ||
718 | |||
719 | disk_super = dm_block_data(sblock); | ||
720 | disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); | ||
721 | disk_super->version = cpu_to_le32(THIN_VERSION); | ||
722 | disk_super->time = 0; | ||
723 | disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); | ||
724 | disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); | ||
725 | disk_super->data_block_size = cpu_to_le32(data_block_size); | ||
726 | |||
727 | r = dm_bm_unlock(sblock); | ||
728 | if (r < 0) | ||
729 | goto bad; | ||
730 | |||
731 | r = dm_btree_empty(&pmd->info, &pmd->root); | ||
732 | if (r < 0) | ||
733 | goto bad; | ||
734 | |||
735 | r = dm_btree_empty(&pmd->details_info, &pmd->details_root); | ||
736 | if (r < 0) { | ||
737 | DMERR("couldn't create devices root"); | ||
738 | goto bad; | ||
739 | } | ||
740 | 826 | ||
741 | pmd->flags = 0; | 827 | r = __begin_transaction(pmd); |
742 | pmd->need_commit = 1; | ||
743 | r = dm_pool_commit_metadata(pmd); | ||
744 | if (r < 0) { | 828 | if (r < 0) { |
745 | DMERR("%s: dm_pool_commit_metadata() failed, error = %d", | 829 | if (dm_pool_metadata_close(pmd) < 0) |
746 | __func__, r); | 830 | DMWARN("%s: dm_pool_metadata_close() failed.", __func__); |
747 | goto bad; | 831 | return ERR_PTR(r); |
748 | } | 832 | } |
749 | 833 | ||
750 | return pmd; | 834 | return pmd; |
751 | |||
752 | bad: | ||
753 | if (dm_pool_metadata_close(pmd) < 0) | ||
754 | DMWARN("%s: dm_pool_metadata_close() failed.", __func__); | ||
755 | return ERR_PTR(r); | ||
756 | } | 835 | } |
757 | 836 | ||
758 | int dm_pool_metadata_close(struct dm_pool_metadata *pmd) | 837 | int dm_pool_metadata_close(struct dm_pool_metadata *pmd) |
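The extra format_device argument pushes the open-versus-format decision up to the caller. The pool target itself is not part of this excerpt; a hedged sketch of the caller side, with metadata_dev and data_block_size standing in for whatever the pool constructor actually has in scope:

struct dm_pool_metadata *pmd;

pmd = dm_pool_metadata_open(metadata_dev->bdev, data_block_size,
			    true /* format_device: a zeroed superblock may be formatted */);
if (IS_ERR(pmd))
	return PTR_ERR(pmd);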
@@ -778,18 +857,17 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) | |||
778 | return -EBUSY; | 857 | return -EBUSY; |
779 | } | 858 | } |
780 | 859 | ||
781 | r = __commit_transaction(pmd); | 860 | if (!pmd->read_only && !pmd->fail_io) { |
782 | if (r < 0) | 861 | r = __commit_transaction(pmd); |
783 | DMWARN("%s: __commit_transaction() failed, error = %d", | 862 | if (r < 0) |
784 | __func__, r); | 863 | DMWARN("%s: __commit_transaction() failed, error = %d", |
864 | __func__, r); | ||
865 | } | ||
785 | 866 | ||
786 | dm_tm_destroy(pmd->tm); | 867 | if (!pmd->fail_io) |
787 | dm_tm_destroy(pmd->nb_tm); | 868 | __destroy_persistent_data_objects(pmd); |
788 | dm_block_manager_destroy(pmd->bm); | ||
789 | dm_sm_destroy(pmd->metadata_sm); | ||
790 | dm_sm_destroy(pmd->data_sm); | ||
791 | kfree(pmd); | ||
792 | 869 | ||
870 | kfree(pmd); | ||
793 | return 0; | 871 | return 0; |
794 | } | 872 | } |
795 | 873 | ||
@@ -850,6 +928,7 @@ static int __open_device(struct dm_pool_metadata *pmd, | |||
850 | (*td)->id = dev; | 928 | (*td)->id = dev; |
851 | (*td)->open_count = 1; | 929 | (*td)->open_count = 1; |
852 | (*td)->changed = changed; | 930 | (*td)->changed = changed; |
931 | (*td)->aborted_with_changes = false; | ||
853 | (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks); | 932 | (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks); |
854 | (*td)->transaction_id = le64_to_cpu(details_le.transaction_id); | 933 | (*td)->transaction_id = le64_to_cpu(details_le.transaction_id); |
855 | (*td)->creation_time = le32_to_cpu(details_le.creation_time); | 934 | (*td)->creation_time = le32_to_cpu(details_le.creation_time); |
@@ -911,10 +990,11 @@ static int __create_thin(struct dm_pool_metadata *pmd, | |||
911 | 990 | ||
912 | int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) | 991 | int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) |
913 | { | 992 | { |
914 | int r; | 993 | int r = -EINVAL; |
915 | 994 | ||
916 | down_write(&pmd->root_lock); | 995 | down_write(&pmd->root_lock); |
917 | r = __create_thin(pmd, dev); | 996 | if (!pmd->fail_io) |
997 | r = __create_thin(pmd, dev); | ||
918 | up_write(&pmd->root_lock); | 998 | up_write(&pmd->root_lock); |
919 | 999 | ||
920 | return r; | 1000 | return r; |
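
This hunk establishes the guard the rest of the patch repeats at every exported entry point: r defaults to -EINVAL, the root lock is taken, and the internal helper only runs while fail_io is clear. A user-space sketch of the pattern, with a pthread rwlock standing in for root_lock; every name here is invented for illustration:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

struct pool_metadata_example {
        pthread_rwlock_t root_lock;
        bool fail_io;            /* set once the metadata can no longer be trusted */
};

static int __create_thin_example(struct pool_metadata_example *pmd, unsigned long dev)
{
        (void)pmd; (void)dev;    /* stand-in for the real btree work */
        return 0;
}

/* every public call degrades to -EINVAL once fail_io has been set */
int create_thin_example(struct pool_metadata_example *pmd, unsigned long dev)
{
        int r = -EINVAL;

        pthread_rwlock_wrlock(&pmd->root_lock);
        if (!pmd->fail_io)
                r = __create_thin_example(pmd, dev);
        pthread_rwlock_unlock(&pmd->root_lock);

        return r;
}
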
@@ -1001,10 +1081,11 @@ int dm_pool_create_snap(struct dm_pool_metadata *pmd, | |||
1001 | dm_thin_id dev, | 1081 | dm_thin_id dev, |
1002 | dm_thin_id origin) | 1082 | dm_thin_id origin) |
1003 | { | 1083 | { |
1004 | int r; | 1084 | int r = -EINVAL; |
1005 | 1085 | ||
1006 | down_write(&pmd->root_lock); | 1086 | down_write(&pmd->root_lock); |
1007 | r = __create_snap(pmd, dev, origin); | 1087 | if (!pmd->fail_io) |
1088 | r = __create_snap(pmd, dev, origin); | ||
1008 | up_write(&pmd->root_lock); | 1089 | up_write(&pmd->root_lock); |
1009 | 1090 | ||
1010 | return r; | 1091 | return r; |
@@ -1037,18 +1118,17 @@ static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev) | |||
1037 | if (r) | 1118 | if (r) |
1038 | return r; | 1119 | return r; |
1039 | 1120 | ||
1040 | pmd->need_commit = 1; | ||
1041 | |||
1042 | return 0; | 1121 | return 0; |
1043 | } | 1122 | } |
1044 | 1123 | ||
1045 | int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, | 1124 | int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, |
1046 | dm_thin_id dev) | 1125 | dm_thin_id dev) |
1047 | { | 1126 | { |
1048 | int r; | 1127 | int r = -EINVAL; |
1049 | 1128 | ||
1050 | down_write(&pmd->root_lock); | 1129 | down_write(&pmd->root_lock); |
1051 | r = __delete_device(pmd, dev); | 1130 | if (!pmd->fail_io) |
1131 | r = __delete_device(pmd, dev); | ||
1052 | up_write(&pmd->root_lock); | 1132 | up_write(&pmd->root_lock); |
1053 | 1133 | ||
1054 | return r; | 1134 | return r; |
@@ -1058,28 +1138,40 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, | |||
1058 | uint64_t current_id, | 1138 | uint64_t current_id, |
1059 | uint64_t new_id) | 1139 | uint64_t new_id) |
1060 | { | 1140 | { |
1141 | int r = -EINVAL; | ||
1142 | |||
1061 | down_write(&pmd->root_lock); | 1143 | down_write(&pmd->root_lock); |
1144 | |||
1145 | if (pmd->fail_io) | ||
1146 | goto out; | ||
1147 | |||
1062 | if (pmd->trans_id != current_id) { | 1148 | if (pmd->trans_id != current_id) { |
1063 | up_write(&pmd->root_lock); | ||
1064 | DMERR("mismatched transaction id"); | 1149 | DMERR("mismatched transaction id"); |
1065 | return -EINVAL; | 1150 | goto out; |
1066 | } | 1151 | } |
1067 | 1152 | ||
1068 | pmd->trans_id = new_id; | 1153 | pmd->trans_id = new_id; |
1069 | pmd->need_commit = 1; | 1154 | r = 0; |
1155 | |||
1156 | out: | ||
1070 | up_write(&pmd->root_lock); | 1157 | up_write(&pmd->root_lock); |
1071 | 1158 | ||
1072 | return 0; | 1159 | return r; |
1073 | } | 1160 | } |
1074 | 1161 | ||
1075 | int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, | 1162 | int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, |
1076 | uint64_t *result) | 1163 | uint64_t *result) |
1077 | { | 1164 | { |
1165 | int r = -EINVAL; | ||
1166 | |||
1078 | down_read(&pmd->root_lock); | 1167 | down_read(&pmd->root_lock); |
1079 | *result = pmd->trans_id; | 1168 | if (!pmd->fail_io) { |
1169 | *result = pmd->trans_id; | ||
1170 | r = 0; | ||
1171 | } | ||
1080 | up_read(&pmd->root_lock); | 1172 | up_read(&pmd->root_lock); |
1081 | 1173 | ||
1082 | return 0; | 1174 | return r; |
1083 | } | 1175 | } |
1084 | 1176 | ||
1085 | static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) | 1177 | static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) |
@@ -1108,8 +1200,6 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) | |||
1108 | 1200 | ||
1109 | dm_tm_dec(pmd->tm, held_root); | 1201 | dm_tm_dec(pmd->tm, held_root); |
1110 | dm_tm_unlock(pmd->tm, copy); | 1202 | dm_tm_unlock(pmd->tm, copy); |
1111 | pmd->need_commit = 1; | ||
1112 | |||
1113 | return -EBUSY; | 1203 | return -EBUSY; |
1114 | } | 1204 | } |
1115 | 1205 | ||
@@ -1131,29 +1221,25 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) | |||
1131 | /* | 1221 | /* |
1132 | * Write the held root into the superblock. | 1222 | * Write the held root into the superblock. |
1133 | */ | 1223 | */ |
1134 | r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, | 1224 | r = superblock_lock(pmd, &sblock); |
1135 | &sb_validator, &sblock); | ||
1136 | if (r) { | 1225 | if (r) { |
1137 | dm_tm_dec(pmd->tm, held_root); | 1226 | dm_tm_dec(pmd->tm, held_root); |
1138 | pmd->need_commit = 1; | ||
1139 | return r; | 1227 | return r; |
1140 | } | 1228 | } |
1141 | 1229 | ||
1142 | disk_super = dm_block_data(sblock); | 1230 | disk_super = dm_block_data(sblock); |
1143 | disk_super->held_root = cpu_to_le64(held_root); | 1231 | disk_super->held_root = cpu_to_le64(held_root); |
1144 | dm_bm_unlock(sblock); | 1232 | dm_bm_unlock(sblock); |
1145 | |||
1146 | pmd->need_commit = 1; | ||
1147 | |||
1148 | return 0; | 1233 | return 0; |
1149 | } | 1234 | } |
1150 | 1235 | ||
1151 | int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd) | 1236 | int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd) |
1152 | { | 1237 | { |
1153 | int r; | 1238 | int r = -EINVAL; |
1154 | 1239 | ||
1155 | down_write(&pmd->root_lock); | 1240 | down_write(&pmd->root_lock); |
1156 | r = __reserve_metadata_snap(pmd); | 1241 | if (!pmd->fail_io) |
1242 | r = __reserve_metadata_snap(pmd); | ||
1157 | up_write(&pmd->root_lock); | 1243 | up_write(&pmd->root_lock); |
1158 | 1244 | ||
1159 | return r; | 1245 | return r; |
@@ -1166,15 +1252,13 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd) | |||
1166 | struct dm_block *sblock, *copy; | 1252 | struct dm_block *sblock, *copy; |
1167 | dm_block_t held_root; | 1253 | dm_block_t held_root; |
1168 | 1254 | ||
1169 | r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, | 1255 | r = superblock_lock(pmd, &sblock); |
1170 | &sb_validator, &sblock); | ||
1171 | if (r) | 1256 | if (r) |
1172 | return r; | 1257 | return r; |
1173 | 1258 | ||
1174 | disk_super = dm_block_data(sblock); | 1259 | disk_super = dm_block_data(sblock); |
1175 | held_root = le64_to_cpu(disk_super->held_root); | 1260 | held_root = le64_to_cpu(disk_super->held_root); |
1176 | disk_super->held_root = cpu_to_le64(0); | 1261 | disk_super->held_root = cpu_to_le64(0); |
1177 | pmd->need_commit = 1; | ||
1178 | 1262 | ||
1179 | dm_bm_unlock(sblock); | 1263 | dm_bm_unlock(sblock); |
1180 | 1264 | ||
@@ -1197,10 +1281,11 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd) | |||
1197 | 1281 | ||
1198 | int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd) | 1282 | int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd) |
1199 | { | 1283 | { |
1200 | int r; | 1284 | int r = -EINVAL; |
1201 | 1285 | ||
1202 | down_write(&pmd->root_lock); | 1286 | down_write(&pmd->root_lock); |
1203 | r = __release_metadata_snap(pmd); | 1287 | if (!pmd->fail_io) |
1288 | r = __release_metadata_snap(pmd); | ||
1204 | up_write(&pmd->root_lock); | 1289 | up_write(&pmd->root_lock); |
1205 | 1290 | ||
1206 | return r; | 1291 | return r; |
@@ -1227,10 +1312,11 @@ static int __get_metadata_snap(struct dm_pool_metadata *pmd, | |||
1227 | int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, | 1312 | int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, |
1228 | dm_block_t *result) | 1313 | dm_block_t *result) |
1229 | { | 1314 | { |
1230 | int r; | 1315 | int r = -EINVAL; |
1231 | 1316 | ||
1232 | down_read(&pmd->root_lock); | 1317 | down_read(&pmd->root_lock); |
1233 | r = __get_metadata_snap(pmd, result); | 1318 | if (!pmd->fail_io) |
1319 | r = __get_metadata_snap(pmd, result); | ||
1234 | up_read(&pmd->root_lock); | 1320 | up_read(&pmd->root_lock); |
1235 | 1321 | ||
1236 | return r; | 1322 | return r; |
@@ -1239,10 +1325,11 @@ int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, | |||
1239 | int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, | 1325 | int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, |
1240 | struct dm_thin_device **td) | 1326 | struct dm_thin_device **td) |
1241 | { | 1327 | { |
1242 | int r; | 1328 | int r = -EINVAL; |
1243 | 1329 | ||
1244 | down_write(&pmd->root_lock); | 1330 | down_write(&pmd->root_lock); |
1245 | r = __open_device(pmd, dev, 0, td); | 1331 | if (!pmd->fail_io) |
1332 | r = __open_device(pmd, dev, 0, td); | ||
1246 | up_write(&pmd->root_lock); | 1333 | up_write(&pmd->root_lock); |
1247 | 1334 | ||
1248 | return r; | 1335 | return r; |
@@ -1262,7 +1349,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td) | |||
1262 | return td->id; | 1349 | return td->id; |
1263 | } | 1350 | } |
1264 | 1351 | ||
1265 | static int __snapshotted_since(struct dm_thin_device *td, uint32_t time) | 1352 | static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) |
1266 | { | 1353 | { |
1267 | return td->snapshotted_time > time; | 1354 | return td->snapshotted_time > time; |
1268 | } | 1355 | } |
@@ -1270,28 +1357,31 @@ static int __snapshotted_since(struct dm_thin_device *td, uint32_t time) | |||
1270 | int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, | 1357 | int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, |
1271 | int can_block, struct dm_thin_lookup_result *result) | 1358 | int can_block, struct dm_thin_lookup_result *result) |
1272 | { | 1359 | { |
1273 | int r; | 1360 | int r = -EINVAL; |
1274 | uint64_t block_time = 0; | 1361 | uint64_t block_time = 0; |
1275 | __le64 value; | 1362 | __le64 value; |
1276 | struct dm_pool_metadata *pmd = td->pmd; | 1363 | struct dm_pool_metadata *pmd = td->pmd; |
1277 | dm_block_t keys[2] = { td->id, block }; | 1364 | dm_block_t keys[2] = { td->id, block }; |
1365 | struct dm_btree_info *info; | ||
1278 | 1366 | ||
1279 | if (can_block) { | 1367 | if (can_block) { |
1280 | down_read(&pmd->root_lock); | 1368 | down_read(&pmd->root_lock); |
1281 | r = dm_btree_lookup(&pmd->info, pmd->root, keys, &value); | 1369 | info = &pmd->info; |
1282 | if (!r) | 1370 | } else if (down_read_trylock(&pmd->root_lock)) |
1283 | block_time = le64_to_cpu(value); | 1371 | info = &pmd->nb_info; |
1284 | up_read(&pmd->root_lock); | 1372 | else |
1285 | |||
1286 | } else if (down_read_trylock(&pmd->root_lock)) { | ||
1287 | r = dm_btree_lookup(&pmd->nb_info, pmd->root, keys, &value); | ||
1288 | if (!r) | ||
1289 | block_time = le64_to_cpu(value); | ||
1290 | up_read(&pmd->root_lock); | ||
1291 | |||
1292 | } else | ||
1293 | return -EWOULDBLOCK; | 1373 | return -EWOULDBLOCK; |
1294 | 1374 | ||
1375 | if (pmd->fail_io) | ||
1376 | goto out; | ||
1377 | |||
1378 | r = dm_btree_lookup(info, pmd->root, keys, &value); | ||
1379 | if (!r) | ||
1380 | block_time = le64_to_cpu(value); | ||
1381 | |||
1382 | out: | ||
1383 | up_read(&pmd->root_lock); | ||
1384 | |||
1295 | if (!r) { | 1385 | if (!r) { |
1296 | dm_block_t exception_block; | 1386 | dm_block_t exception_block; |
1297 | uint32_t exception_time; | 1387 | uint32_t exception_time; |
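
The lookup rework above keeps a single lock/unlock site for both the blocking and non-blocking cases: pick pmd->info or pmd->nb_info under the lock, bail out with -EWOULDBLOCK if the trylock fails, and skip the btree walk entirely once fail_io is set. A user-space sketch of the same control flow, with a pthread rwlock and a stubbed lookup rather than the kernel API:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct metadata_example {
        pthread_rwlock_t root_lock;
        bool fail_io;
};

static int lookup_stub(struct metadata_example *md, uint64_t block, uint64_t *result)
{
        (void)md;
        *result = block + 1000;         /* pretend mapping */
        return 0;
}

int find_block_example(struct metadata_example *md, uint64_t block,
                       int can_block, uint64_t *result)
{
        int r = -EINVAL;

        if (can_block)
                pthread_rwlock_rdlock(&md->root_lock);
        else if (pthread_rwlock_tryrdlock(&md->root_lock))
                return -EWOULDBLOCK;    /* caller defers the bio to the worker */

        if (!md->fail_io)
                r = lookup_stub(md, block, result);

        pthread_rwlock_unlock(&md->root_lock);
        return r;
}
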
@@ -1312,7 +1402,6 @@ static int __insert(struct dm_thin_device *td, dm_block_t block, | |||
1312 | struct dm_pool_metadata *pmd = td->pmd; | 1402 | struct dm_pool_metadata *pmd = td->pmd; |
1313 | dm_block_t keys[2] = { td->id, block }; | 1403 | dm_block_t keys[2] = { td->id, block }; |
1314 | 1404 | ||
1315 | pmd->need_commit = 1; | ||
1316 | value = cpu_to_le64(pack_block_time(data_block, pmd->time)); | 1405 | value = cpu_to_le64(pack_block_time(data_block, pmd->time)); |
1317 | __dm_bless_for_disk(&value); | 1406 | __dm_bless_for_disk(&value); |
1318 | 1407 | ||
@@ -1321,10 +1410,9 @@ static int __insert(struct dm_thin_device *td, dm_block_t block, | |||
1321 | if (r) | 1410 | if (r) |
1322 | return r; | 1411 | return r; |
1323 | 1412 | ||
1324 | if (inserted) { | 1413 | td->changed = 1; |
1414 | if (inserted) | ||
1325 | td->mapped_blocks++; | 1415 | td->mapped_blocks++; |
1326 | td->changed = 1; | ||
1327 | } | ||
1328 | 1416 | ||
1329 | return 0; | 1417 | return 0; |
1330 | } | 1418 | } |
@@ -1332,10 +1420,11 @@ static int __insert(struct dm_thin_device *td, dm_block_t block, | |||
1332 | int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, | 1420 | int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, |
1333 | dm_block_t data_block) | 1421 | dm_block_t data_block) |
1334 | { | 1422 | { |
1335 | int r; | 1423 | int r = -EINVAL; |
1336 | 1424 | ||
1337 | down_write(&td->pmd->root_lock); | 1425 | down_write(&td->pmd->root_lock); |
1338 | r = __insert(td, block, data_block); | 1426 | if (!td->pmd->fail_io) |
1427 | r = __insert(td, block, data_block); | ||
1339 | up_write(&td->pmd->root_lock); | 1428 | up_write(&td->pmd->root_lock); |
1340 | 1429 | ||
1341 | return r; | 1430 | return r; |
@@ -1353,31 +1442,51 @@ static int __remove(struct dm_thin_device *td, dm_block_t block) | |||
1353 | 1442 | ||
1354 | td->mapped_blocks--; | 1443 | td->mapped_blocks--; |
1355 | td->changed = 1; | 1444 | td->changed = 1; |
1356 | pmd->need_commit = 1; | ||
1357 | 1445 | ||
1358 | return 0; | 1446 | return 0; |
1359 | } | 1447 | } |
1360 | 1448 | ||
1361 | int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) | 1449 | int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) |
1362 | { | 1450 | { |
1363 | int r; | 1451 | int r = -EINVAL; |
1364 | 1452 | ||
1365 | down_write(&td->pmd->root_lock); | 1453 | down_write(&td->pmd->root_lock); |
1366 | r = __remove(td, block); | 1454 | if (!td->pmd->fail_io) |
1455 | r = __remove(td, block); | ||
1367 | up_write(&td->pmd->root_lock); | 1456 | up_write(&td->pmd->root_lock); |
1368 | 1457 | ||
1369 | return r; | 1458 | return r; |
1370 | } | 1459 | } |
1371 | 1460 | ||
1372 | int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) | 1461 | bool dm_thin_changed_this_transaction(struct dm_thin_device *td) |
1373 | { | 1462 | { |
1374 | int r; | 1463 | int r; |
1375 | 1464 | ||
1376 | down_write(&pmd->root_lock); | 1465 | down_read(&td->pmd->root_lock); |
1466 | r = td->changed; | ||
1467 | up_read(&td->pmd->root_lock); | ||
1377 | 1468 | ||
1378 | r = dm_sm_new_block(pmd->data_sm, result); | 1469 | return r; |
1379 | pmd->need_commit = 1; | 1470 | } |
1471 | |||
1472 | bool dm_thin_aborted_changes(struct dm_thin_device *td) | ||
1473 | { | ||
1474 | bool r; | ||
1380 | 1475 | ||
1476 | down_read(&td->pmd->root_lock); | ||
1477 | r = td->aborted_with_changes; | ||
1478 | up_read(&td->pmd->root_lock); | ||
1479 | |||
1480 | return r; | ||
1481 | } | ||
1482 | |||
1483 | int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) | ||
1484 | { | ||
1485 | int r = -EINVAL; | ||
1486 | |||
1487 | down_write(&pmd->root_lock); | ||
1488 | if (!pmd->fail_io) | ||
1489 | r = dm_sm_new_block(pmd->data_sm, result); | ||
1381 | up_write(&pmd->root_lock); | 1490 | up_write(&pmd->root_lock); |
1382 | 1491 | ||
1383 | return r; | 1492 | return r; |
@@ -1385,9 +1494,11 @@ int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) | |||
1385 | 1494 | ||
1386 | int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) | 1495 | int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) |
1387 | { | 1496 | { |
1388 | int r; | 1497 | int r = -EINVAL; |
1389 | 1498 | ||
1390 | down_write(&pmd->root_lock); | 1499 | down_write(&pmd->root_lock); |
1500 | if (pmd->fail_io) | ||
1501 | goto out; | ||
1391 | 1502 | ||
1392 | r = __commit_transaction(pmd); | 1503 | r = __commit_transaction(pmd); |
1393 | if (r <= 0) | 1504 | if (r <= 0) |
@@ -1402,12 +1513,41 @@ out: | |||
1402 | return r; | 1513 | return r; |
1403 | } | 1514 | } |
1404 | 1515 | ||
1516 | static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd) | ||
1517 | { | ||
1518 | struct dm_thin_device *td; | ||
1519 | |||
1520 | list_for_each_entry(td, &pmd->thin_devices, list) | ||
1521 | td->aborted_with_changes = td->changed; | ||
1522 | } | ||
1523 | |||
1524 | int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) | ||
1525 | { | ||
1526 | int r = -EINVAL; | ||
1527 | |||
1528 | down_write(&pmd->root_lock); | ||
1529 | if (pmd->fail_io) | ||
1530 | goto out; | ||
1531 | |||
1532 | __set_abort_with_changes_flags(pmd); | ||
1533 | __destroy_persistent_data_objects(pmd); | ||
1534 | r = __create_persistent_data_objects(pmd, false); | ||
1535 | if (r) | ||
1536 | pmd->fail_io = true; | ||
1537 | |||
1538 | out: | ||
1539 | up_write(&pmd->root_lock); | ||
1540 | |||
1541 | return r; | ||
1542 | } | ||
1543 | |||
1405 | int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result) | 1544 | int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result) |
1406 | { | 1545 | { |
1407 | int r; | 1546 | int r = -EINVAL; |
1408 | 1547 | ||
1409 | down_read(&pmd->root_lock); | 1548 | down_read(&pmd->root_lock); |
1410 | r = dm_sm_get_nr_free(pmd->data_sm, result); | 1549 | if (!pmd->fail_io) |
1550 | r = dm_sm_get_nr_free(pmd->data_sm, result); | ||
1411 | up_read(&pmd->root_lock); | 1551 | up_read(&pmd->root_lock); |
1412 | 1552 | ||
1413 | return r; | 1553 | return r; |
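
dm_pool_abort_metadata() boils down to: remember which devices had uncommitted changes, throw away the in-core transaction state, and re-open the last committed superblock, parking the pmd in fail_io if that re-open fails. A compressed standalone sketch of the sequence; every *_example function is a stub invented for the sketch:

#include <stdbool.h>

struct pmd_example {
        bool fail_io;
        /* transaction manager, block manager and space maps elided */
};

static void set_aborted_flags_example(struct pmd_example *pmd) { (void)pmd; }
static void destroy_pdo_example(struct pmd_example *pmd)       { (void)pmd; }
static int  create_pdo_example(struct pmd_example *pmd, bool format)
{
        (void)pmd; (void)format;
        return 0;
}

/* roll back to the last committed transaction by re-reading the superblock */
int abort_metadata_example(struct pmd_example *pmd)
{
        int r;

        set_aborted_flags_example(pmd);         /* remember who had dirty changes */
        destroy_pdo_example(pmd);               /* drop in-core transaction state */
        r = create_pdo_example(pmd, false);     /* re-open, never re-format */
        if (r)
                pmd->fail_io = true;            /* everything now returns -EINVAL */

        return r;
}
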
@@ -1416,10 +1556,11 @@ int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *resul | |||
1416 | int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, | 1556 | int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, |
1417 | dm_block_t *result) | 1557 | dm_block_t *result) |
1418 | { | 1558 | { |
1419 | int r; | 1559 | int r = -EINVAL; |
1420 | 1560 | ||
1421 | down_read(&pmd->root_lock); | 1561 | down_read(&pmd->root_lock); |
1422 | r = dm_sm_get_nr_free(pmd->metadata_sm, result); | 1562 | if (!pmd->fail_io) |
1563 | r = dm_sm_get_nr_free(pmd->metadata_sm, result); | ||
1423 | up_read(&pmd->root_lock); | 1564 | up_read(&pmd->root_lock); |
1424 | 1565 | ||
1425 | return r; | 1566 | return r; |
@@ -1428,10 +1569,11 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, | |||
1428 | int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, | 1569 | int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, |
1429 | dm_block_t *result) | 1570 | dm_block_t *result) |
1430 | { | 1571 | { |
1431 | int r; | 1572 | int r = -EINVAL; |
1432 | 1573 | ||
1433 | down_read(&pmd->root_lock); | 1574 | down_read(&pmd->root_lock); |
1434 | r = dm_sm_get_nr_blocks(pmd->metadata_sm, result); | 1575 | if (!pmd->fail_io) |
1576 | r = dm_sm_get_nr_blocks(pmd->metadata_sm, result); | ||
1435 | up_read(&pmd->root_lock); | 1577 | up_read(&pmd->root_lock); |
1436 | 1578 | ||
1437 | return r; | 1579 | return r; |
@@ -1448,10 +1590,11 @@ int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result) | |||
1448 | 1590 | ||
1449 | int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) | 1591 | int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) |
1450 | { | 1592 | { |
1451 | int r; | 1593 | int r = -EINVAL; |
1452 | 1594 | ||
1453 | down_read(&pmd->root_lock); | 1595 | down_read(&pmd->root_lock); |
1454 | r = dm_sm_get_nr_blocks(pmd->data_sm, result); | 1596 | if (!pmd->fail_io) |
1597 | r = dm_sm_get_nr_blocks(pmd->data_sm, result); | ||
1455 | up_read(&pmd->root_lock); | 1598 | up_read(&pmd->root_lock); |
1456 | 1599 | ||
1457 | return r; | 1600 | return r; |
@@ -1459,13 +1602,17 @@ int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) | |||
1459 | 1602 | ||
1460 | int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result) | 1603 | int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result) |
1461 | { | 1604 | { |
1605 | int r = -EINVAL; | ||
1462 | struct dm_pool_metadata *pmd = td->pmd; | 1606 | struct dm_pool_metadata *pmd = td->pmd; |
1463 | 1607 | ||
1464 | down_read(&pmd->root_lock); | 1608 | down_read(&pmd->root_lock); |
1465 | *result = td->mapped_blocks; | 1609 | if (!pmd->fail_io) { |
1610 | *result = td->mapped_blocks; | ||
1611 | r = 0; | ||
1612 | } | ||
1466 | up_read(&pmd->root_lock); | 1613 | up_read(&pmd->root_lock); |
1467 | 1614 | ||
1468 | return 0; | 1615 | return r; |
1469 | } | 1616 | } |
1470 | 1617 | ||
1471 | static int __highest_block(struct dm_thin_device *td, dm_block_t *result) | 1618 | static int __highest_block(struct dm_thin_device *td, dm_block_t *result) |
@@ -1487,11 +1634,12 @@ static int __highest_block(struct dm_thin_device *td, dm_block_t *result) | |||
1487 | int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, | 1634 | int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, |
1488 | dm_block_t *result) | 1635 | dm_block_t *result) |
1489 | { | 1636 | { |
1490 | int r; | 1637 | int r = -EINVAL; |
1491 | struct dm_pool_metadata *pmd = td->pmd; | 1638 | struct dm_pool_metadata *pmd = td->pmd; |
1492 | 1639 | ||
1493 | down_read(&pmd->root_lock); | 1640 | down_read(&pmd->root_lock); |
1494 | r = __highest_block(td, result); | 1641 | if (!pmd->fail_io) |
1642 | r = __highest_block(td, result); | ||
1495 | up_read(&pmd->root_lock); | 1643 | up_read(&pmd->root_lock); |
1496 | 1644 | ||
1497 | return r; | 1645 | return r; |
@@ -1514,20 +1662,25 @@ static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) | |||
1514 | return -EINVAL; | 1662 | return -EINVAL; |
1515 | } | 1663 | } |
1516 | 1664 | ||
1517 | r = dm_sm_extend(pmd->data_sm, new_count - old_count); | 1665 | return dm_sm_extend(pmd->data_sm, new_count - old_count); |
1518 | if (!r) | ||
1519 | pmd->need_commit = 1; | ||
1520 | |||
1521 | return r; | ||
1522 | } | 1666 | } |
1523 | 1667 | ||
1524 | int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) | 1668 | int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) |
1525 | { | 1669 | { |
1526 | int r; | 1670 | int r = -EINVAL; |
1527 | 1671 | ||
1528 | down_write(&pmd->root_lock); | 1672 | down_write(&pmd->root_lock); |
1529 | r = __resize_data_dev(pmd, new_count); | 1673 | if (!pmd->fail_io) |
1674 | r = __resize_data_dev(pmd, new_count); | ||
1530 | up_write(&pmd->root_lock); | 1675 | up_write(&pmd->root_lock); |
1531 | 1676 | ||
1532 | return r; | 1677 | return r; |
1533 | } | 1678 | } |
1679 | |||
1680 | void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd) | ||
1681 | { | ||
1682 | down_write(&pmd->root_lock); | ||
1683 | pmd->read_only = true; | ||
1684 | dm_bm_set_read_only(pmd->bm); | ||
1685 | up_write(&pmd->root_lock); | ||
1686 | } | ||
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index b88918ccdaf6..0cecc3702885 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h | |||
@@ -38,7 +38,8 @@ typedef uint64_t dm_thin_id; | |||
38 | * Reopens or creates a new, empty metadata volume. | 38 | * Reopens or creates a new, empty metadata volume. |
39 | */ | 39 | */ |
40 | struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, | 40 | struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, |
41 | sector_t data_block_size); | 41 | sector_t data_block_size, |
42 | bool format_device); | ||
42 | 43 | ||
43 | int dm_pool_metadata_close(struct dm_pool_metadata *pmd); | 44 | int dm_pool_metadata_close(struct dm_pool_metadata *pmd); |
44 | 45 | ||
@@ -79,6 +80,16 @@ int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, | |||
79 | int dm_pool_commit_metadata(struct dm_pool_metadata *pmd); | 80 | int dm_pool_commit_metadata(struct dm_pool_metadata *pmd); |
80 | 81 | ||
81 | /* | 82 | /* |
83 | * Discards all uncommitted changes. Rereads the superblock, rolling back | ||
84 | * to the last good transaction. Thin devices remain open. | ||
85 | * dm_thin_aborted_changes() tells you if they had uncommitted changes. | ||
86 | * | ||
87 | * If this call fails it's only useful to call dm_pool_metadata_close(). | ||
88 | * All other methods will fail with -EINVAL. | ||
89 | */ | ||
90 | int dm_pool_abort_metadata(struct dm_pool_metadata *pmd); | ||
91 | |||
92 | /* | ||
82 | * Set/get userspace transaction id. | 93 | * Set/get userspace transaction id. |
83 | */ | 94 | */ |
84 | int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, | 95 | int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, |
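
From the caller's side the contract in the comment above works out to: try the abort; if the abort itself fails, the only remaining option is dm_pool_metadata_close(); otherwise the thin devices stay open and dm_thin_aborted_changes() reports which of them lost work. A small user-space sketch of that decision, with stubs rather than the kernel functions:

#include <stdbool.h>
#include <stdio.h>

struct pool_md_example { bool fail_io; };
struct thin_dev_example { bool aborted_with_changes; };

static int  abort_metadata_example(struct pool_md_example *p) { (void)p; return 0; }
static void close_metadata_example(struct pool_md_example *p) { (void)p; }

/* recovery flow mirroring the comment above; all names are illustrative */
static void recover_example(struct pool_md_example *pmd,
                            struct thin_dev_example *td)
{
        if (abort_metadata_example(pmd)) {
                /* abort failed: every other call will now return -EINVAL */
                close_metadata_example(pmd);
                return;
        }

        if (td->aborted_with_changes)
                printf("device had uncommitted changes that were rolled back\n");
}

int main(void)
{
        struct pool_md_example pmd = { false };
        struct thin_dev_example td = { true };

        recover_example(&pmd, &td);
        return 0;
}
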
@@ -119,7 +130,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td); | |||
119 | 130 | ||
120 | struct dm_thin_lookup_result { | 131 | struct dm_thin_lookup_result { |
121 | dm_block_t block; | 132 | dm_block_t block; |
122 | int shared; | 133 | unsigned shared:1; |
123 | }; | 134 | }; |
124 | 135 | ||
125 | /* | 136 | /* |
@@ -147,6 +158,10 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block); | |||
147 | /* | 158 | /* |
148 | * Queries. | 159 | * Queries. |
149 | */ | 160 | */ |
161 | bool dm_thin_changed_this_transaction(struct dm_thin_device *td); | ||
162 | |||
163 | bool dm_thin_aborted_changes(struct dm_thin_device *td); | ||
164 | |||
150 | int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, | 165 | int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, |
151 | dm_block_t *highest_mapped); | 166 | dm_block_t *highest_mapped); |
152 | 167 | ||
@@ -171,6 +186,12 @@ int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); | |||
171 | */ | 186 | */ |
172 | int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); | 187 | int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); |
173 | 188 | ||
189 | /* | ||
190 | * Flicks the underlying block manager into read only mode, so you know | ||
191 | * that nothing is changing. | ||
192 | */ | ||
193 | void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd); | ||
194 | |||
174 | /*----------------------------------------------------------------*/ | 195 | /*----------------------------------------------------------------*/ |
175 | 196 | ||
176 | #endif | 197 | #endif |
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 37fdaf81bd1f..af1fc3b2c2ad 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c | |||
@@ -1,10 +1,11 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2011 Red Hat UK. | 2 | * Copyright (C) 2011-2012 Red Hat UK. |
3 | * | 3 | * |
4 | * This file is released under the GPL. | 4 | * This file is released under the GPL. |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include "dm-thin-metadata.h" | 7 | #include "dm-thin-metadata.h" |
8 | #include "dm.h" | ||
8 | 9 | ||
9 | #include <linux/device-mapper.h> | 10 | #include <linux/device-mapper.h> |
10 | #include <linux/dm-io.h> | 11 | #include <linux/dm-io.h> |
@@ -19,7 +20,7 @@ | |||
19 | /* | 20 | /* |
20 | * Tunable constants | 21 | * Tunable constants |
21 | */ | 22 | */ |
22 | #define ENDIO_HOOK_POOL_SIZE 10240 | 23 | #define ENDIO_HOOK_POOL_SIZE 1024 |
23 | #define DEFERRED_SET_SIZE 64 | 24 | #define DEFERRED_SET_SIZE 64 |
24 | #define MAPPING_POOL_SIZE 1024 | 25 | #define MAPPING_POOL_SIZE 1024 |
25 | #define PRISON_CELLS 1024 | 26 | #define PRISON_CELLS 1024 |
@@ -496,12 +497,27 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, | |||
496 | */ | 497 | */ |
497 | struct dm_thin_new_mapping; | 498 | struct dm_thin_new_mapping; |
498 | 499 | ||
500 | /* | ||
501 | * The pool runs in 3 modes, ordered by increasing degradation so they can be compared. ||
502 | */ | ||
503 | enum pool_mode { | ||
504 | PM_WRITE, /* metadata may be changed */ | ||
505 | PM_READ_ONLY, /* metadata may not be changed */ | ||
506 | PM_FAIL, /* all I/O fails */ | ||
507 | }; | ||
508 | |||
499 | struct pool_features { | 509 | struct pool_features { |
510 | enum pool_mode mode; | ||
511 | |||
500 | unsigned zero_new_blocks:1; | 512 | unsigned zero_new_blocks:1; |
501 | unsigned discard_enabled:1; | 513 | unsigned discard_enabled:1; |
502 | unsigned discard_passdown:1; | 514 | unsigned discard_passdown:1; |
503 | }; | 515 | }; |
504 | 516 | ||
517 | struct thin_c; | ||
518 | typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); | ||
519 | typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); | ||
520 | |||
505 | struct pool { | 521 | struct pool { |
506 | struct list_head list; | 522 | struct list_head list; |
507 | struct dm_target *ti; /* Only set if a pool target is bound */ | 523 | struct dm_target *ti; /* Only set if a pool target is bound */ |
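
The mode enum plus the process_* pointers added to struct pool turn the hot paths into a small strategy table: do_worker() and process_deferred_bios() always call through the pointers, and set_pool_mode() (later in this patch) swaps the whole set at once. A standalone sketch of the idea with trivial handlers; everything here is illustrative:

#include <stdio.h>

enum mode_example { EX_WRITE, EX_READ_ONLY, EX_FAIL };

struct bio_example { int dummy; };
typedef void (*process_bio_fn_example)(struct bio_example *bio);

static void handle_bio_normal(struct bio_example *bio) { (void)bio; puts("serviced"); }
static void handle_bio_fail(struct bio_example *bio)   { (void)bio; puts("errored"); }

struct pool_example {
        enum mode_example mode;
        process_bio_fn_example process_bio;     /* swapped as a set on mode change */
};

static void set_mode_example(struct pool_example *pool, enum mode_example mode)
{
        pool->mode = mode;
        pool->process_bio = (mode == EX_FAIL) ? handle_bio_fail
                                              : handle_bio_normal;
}

int main(void)
{
        struct pool_example pool;
        struct bio_example bio;

        set_mode_example(&pool, EX_WRITE);
        pool.process_bio(&bio);                 /* "serviced" */

        set_mode_example(&pool, EX_FAIL);
        pool.process_bio(&bio);                 /* "errored" */
        return 0;
}
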
@@ -510,10 +526,9 @@ struct pool { | |||
510 | struct block_device *md_dev; | 526 | struct block_device *md_dev; |
511 | struct dm_pool_metadata *pmd; | 527 | struct dm_pool_metadata *pmd; |
512 | 528 | ||
513 | uint32_t sectors_per_block; | ||
514 | unsigned block_shift; | ||
515 | dm_block_t offset_mask; | ||
516 | dm_block_t low_water_blocks; | 529 | dm_block_t low_water_blocks; |
530 | uint32_t sectors_per_block; | ||
531 | int sectors_per_block_shift; | ||
517 | 532 | ||
518 | struct pool_features pf; | 533 | struct pool_features pf; |
519 | unsigned low_water_triggered:1; /* A dm event has been sent */ | 534 | unsigned low_water_triggered:1; /* A dm event has been sent */ |
@@ -526,8 +541,8 @@ struct pool { | |||
526 | struct work_struct worker; | 541 | struct work_struct worker; |
527 | struct delayed_work waker; | 542 | struct delayed_work waker; |
528 | 543 | ||
529 | unsigned ref_count; | ||
530 | unsigned long last_commit_jiffies; | 544 | unsigned long last_commit_jiffies; |
545 | unsigned ref_count; | ||
531 | 546 | ||
532 | spinlock_t lock; | 547 | spinlock_t lock; |
533 | struct bio_list deferred_bios; | 548 | struct bio_list deferred_bios; |
@@ -543,8 +558,17 @@ struct pool { | |||
543 | struct dm_thin_new_mapping *next_mapping; | 558 | struct dm_thin_new_mapping *next_mapping; |
544 | mempool_t *mapping_pool; | 559 | mempool_t *mapping_pool; |
545 | mempool_t *endio_hook_pool; | 560 | mempool_t *endio_hook_pool; |
561 | |||
562 | process_bio_fn process_bio; | ||
563 | process_bio_fn process_discard; | ||
564 | |||
565 | process_mapping_fn process_prepared_mapping; | ||
566 | process_mapping_fn process_prepared_discard; | ||
546 | }; | 567 | }; |
547 | 568 | ||
569 | static enum pool_mode get_pool_mode(struct pool *pool); | ||
570 | static void set_pool_mode(struct pool *pool, enum pool_mode mode); | ||
571 | |||
548 | /* | 572 | /* |
549 | * Target context for a pool. | 573 | * Target context for a pool. |
550 | */ | 574 | */ |
@@ -679,16 +703,28 @@ static void requeue_io(struct thin_c *tc) | |||
679 | 703 | ||
680 | static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) | 704 | static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) |
681 | { | 705 | { |
682 | return bio->bi_sector >> tc->pool->block_shift; | 706 | sector_t block_nr = bio->bi_sector; |
707 | |||
708 | if (tc->pool->sectors_per_block_shift < 0) | ||
709 | (void) sector_div(block_nr, tc->pool->sectors_per_block); | ||
710 | else | ||
711 | block_nr >>= tc->pool->sectors_per_block_shift; | ||
712 | |||
713 | return block_nr; | ||
683 | } | 714 | } |
684 | 715 | ||
685 | static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) | 716 | static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) |
686 | { | 717 | { |
687 | struct pool *pool = tc->pool; | 718 | struct pool *pool = tc->pool; |
719 | sector_t bi_sector = bio->bi_sector; | ||
688 | 720 | ||
689 | bio->bi_bdev = tc->pool_dev->bdev; | 721 | bio->bi_bdev = tc->pool_dev->bdev; |
690 | bio->bi_sector = (block << pool->block_shift) + | 722 | if (tc->pool->sectors_per_block_shift < 0) |
691 | (bio->bi_sector & pool->offset_mask); | 723 | bio->bi_sector = (block * pool->sectors_per_block) + |
724 | sector_div(bi_sector, pool->sectors_per_block); | ||
725 | else | ||
726 | bio->bi_sector = (block << pool->sectors_per_block_shift) | | ||
727 | (bi_sector & (pool->sectors_per_block - 1)); | ||
692 | } | 728 | } |
693 | 729 | ||
694 | static void remap_to_origin(struct thin_c *tc, struct bio *bio) | 730 | static void remap_to_origin(struct thin_c *tc, struct bio *bio) |
@@ -696,21 +732,39 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio) | |||
696 | bio->bi_bdev = tc->origin_dev->bdev; | 732 | bio->bi_bdev = tc->origin_dev->bdev; |
697 | } | 733 | } |
698 | 734 | ||
735 | static int bio_triggers_commit(struct thin_c *tc, struct bio *bio) | ||
736 | { | ||
737 | return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && | ||
738 | dm_thin_changed_this_transaction(tc->td); | ||
739 | } | ||
740 | |||
699 | static void issue(struct thin_c *tc, struct bio *bio) | 741 | static void issue(struct thin_c *tc, struct bio *bio) |
700 | { | 742 | { |
701 | struct pool *pool = tc->pool; | 743 | struct pool *pool = tc->pool; |
702 | unsigned long flags; | 744 | unsigned long flags; |
703 | 745 | ||
746 | if (!bio_triggers_commit(tc, bio)) { | ||
747 | generic_make_request(bio); | ||
748 | return; | ||
749 | } | ||
750 | |||
704 | /* | 751 | /* |
705 | * Batch together any FUA/FLUSH bios we find and then issue | 752 | * Complete bio with an error if earlier I/O caused changes to |
706 | * a single commit for them in process_deferred_bios(). | 753 | * the metadata that can't be committed e.g, due to I/O errors |
754 | * on the metadata device. | ||
707 | */ | 755 | */ |
708 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { | 756 | if (dm_thin_aborted_changes(tc->td)) { |
709 | spin_lock_irqsave(&pool->lock, flags); | 757 | bio_io_error(bio); |
710 | bio_list_add(&pool->deferred_flush_bios, bio); | 758 | return; |
711 | spin_unlock_irqrestore(&pool->lock, flags); | 759 | } |
712 | } else | 760 | |
713 | generic_make_request(bio); | 761 | /* |
762 | * Batch together any bios that trigger commits and then issue a | ||
763 | * single commit for them in process_deferred_bios(). | ||
764 | */ | ||
765 | spin_lock_irqsave(&pool->lock, flags); | ||
766 | bio_list_add(&pool->deferred_flush_bios, bio); | ||
767 | spin_unlock_irqrestore(&pool->lock, flags); | ||
714 | } | 768 | } |
715 | 769 | ||
716 | static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) | 770 | static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) |
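
issue() now only detours through the deferred_flush list when a commit is actually needed: the bio carries FLUSH or FUA and this transaction changed metadata, and if a previous abort threw those changes away the bio is failed on the spot. A standalone decision sketch of that triage; the booleans replace the bio flags and metadata queries, which appear here as names only:

#include <stdbool.h>

enum issue_action_example {
        ISSUE_NOW,              /* generic_make_request() straight away */
        ISSUE_AFTER_COMMIT,     /* park on deferred_flush_bios for the worker */
        ISSUE_ERROR,            /* changes were aborted; fail the bio */
};

/* mirrors issue()/bio_triggers_commit(); arguments are illustrative */
enum issue_action_example classify_issue(bool flush_or_fua,
                                         bool changed_this_txn,
                                         bool aborted_changes)
{
        if (!(flush_or_fua && changed_this_txn))
                return ISSUE_NOW;
        if (aborted_changes)
                return ISSUE_ERROR;
        return ISSUE_AFTER_COMMIT;
}
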
@@ -847,6 +901,14 @@ static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell | |||
847 | wake_worker(pool); | 901 | wake_worker(pool); |
848 | } | 902 | } |
849 | 903 | ||
904 | static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) | ||
905 | { | ||
906 | if (m->bio) | ||
907 | m->bio->bi_end_io = m->saved_bi_end_io; | ||
908 | cell_error(m->cell); | ||
909 | list_del(&m->list); | ||
910 | mempool_free(m, m->tc->pool->mapping_pool); | ||
911 | } | ||
850 | static void process_prepared_mapping(struct dm_thin_new_mapping *m) | 912 | static void process_prepared_mapping(struct dm_thin_new_mapping *m) |
851 | { | 913 | { |
852 | struct thin_c *tc = m->tc; | 914 | struct thin_c *tc = m->tc; |
@@ -859,7 +921,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) | |||
859 | 921 | ||
860 | if (m->err) { | 922 | if (m->err) { |
861 | cell_error(m->cell); | 923 | cell_error(m->cell); |
862 | return; | 924 | goto out; |
863 | } | 925 | } |
864 | 926 | ||
865 | /* | 927 | /* |
@@ -871,7 +933,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) | |||
871 | if (r) { | 933 | if (r) { |
872 | DMERR("dm_thin_insert_block() failed"); | 934 | DMERR("dm_thin_insert_block() failed"); |
873 | cell_error(m->cell); | 935 | cell_error(m->cell); |
874 | return; | 936 | goto out; |
875 | } | 937 | } |
876 | 938 | ||
877 | /* | 939 | /* |
@@ -886,22 +948,25 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) | |||
886 | } else | 948 | } else |
887 | cell_defer(tc, m->cell, m->data_block); | 949 | cell_defer(tc, m->cell, m->data_block); |
888 | 950 | ||
951 | out: | ||
889 | list_del(&m->list); | 952 | list_del(&m->list); |
890 | mempool_free(m, tc->pool->mapping_pool); | 953 | mempool_free(m, tc->pool->mapping_pool); |
891 | } | 954 | } |
892 | 955 | ||
893 | static void process_prepared_discard(struct dm_thin_new_mapping *m) | 956 | static void process_prepared_discard_fail(struct dm_thin_new_mapping *m) |
894 | { | 957 | { |
895 | int r; | ||
896 | struct thin_c *tc = m->tc; | 958 | struct thin_c *tc = m->tc; |
897 | 959 | ||
898 | r = dm_thin_remove_block(tc->td, m->virt_block); | 960 | bio_io_error(m->bio); |
899 | if (r) | 961 | cell_defer_except(tc, m->cell); |
900 | DMERR("dm_thin_remove_block() failed"); | 962 | cell_defer_except(tc, m->cell2); |
963 | mempool_free(m, tc->pool->mapping_pool); | ||
964 | } | ||
965 | |||
966 | static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) | ||
967 | { | ||
968 | struct thin_c *tc = m->tc; | ||
901 | 969 | ||
902 | /* | ||
903 | * Pass the discard down to the underlying device? | ||
904 | */ | ||
905 | if (m->pass_discard) | 970 | if (m->pass_discard) |
906 | remap_and_issue(tc, m->bio, m->data_block); | 971 | remap_and_issue(tc, m->bio, m->data_block); |
907 | else | 972 | else |
@@ -912,8 +977,20 @@ static void process_prepared_discard(struct dm_thin_new_mapping *m) | |||
912 | mempool_free(m, tc->pool->mapping_pool); | 977 | mempool_free(m, tc->pool->mapping_pool); |
913 | } | 978 | } |
914 | 979 | ||
980 | static void process_prepared_discard(struct dm_thin_new_mapping *m) | ||
981 | { | ||
982 | int r; | ||
983 | struct thin_c *tc = m->tc; | ||
984 | |||
985 | r = dm_thin_remove_block(tc->td, m->virt_block); | ||
986 | if (r) | ||
987 | DMERR("dm_thin_remove_block() failed"); | ||
988 | |||
989 | process_prepared_discard_passdown(m); | ||
990 | } | ||
991 | |||
915 | static void process_prepared(struct pool *pool, struct list_head *head, | 992 | static void process_prepared(struct pool *pool, struct list_head *head, |
916 | void (*fn)(struct dm_thin_new_mapping *)) | 993 | process_mapping_fn *fn) |
917 | { | 994 | { |
918 | unsigned long flags; | 995 | unsigned long flags; |
919 | struct list_head maps; | 996 | struct list_head maps; |
@@ -925,7 +1002,7 @@ static void process_prepared(struct pool *pool, struct list_head *head, | |||
925 | spin_unlock_irqrestore(&pool->lock, flags); | 1002 | spin_unlock_irqrestore(&pool->lock, flags); |
926 | 1003 | ||
927 | list_for_each_entry_safe(m, tmp, &maps, list) | 1004 | list_for_each_entry_safe(m, tmp, &maps, list) |
928 | fn(m); | 1005 | (*fn)(m); |
929 | } | 1006 | } |
930 | 1007 | ||
931 | /* | 1008 | /* |
@@ -933,9 +1010,7 @@ static void process_prepared(struct pool *pool, struct list_head *head, | |||
933 | */ | 1010 | */ |
934 | static int io_overlaps_block(struct pool *pool, struct bio *bio) | 1011 | static int io_overlaps_block(struct pool *pool, struct bio *bio) |
935 | { | 1012 | { |
936 | return !(bio->bi_sector & pool->offset_mask) && | 1013 | return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT); |
937 | (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); | ||
938 | |||
939 | } | 1014 | } |
940 | 1015 | ||
941 | static int io_overwrites_block(struct pool *pool, struct bio *bio) | 1016 | static int io_overwrites_block(struct pool *pool, struct bio *bio) |
@@ -1093,6 +1168,35 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, | |||
1093 | } | 1168 | } |
1094 | } | 1169 | } |
1095 | 1170 | ||
1171 | static int commit(struct pool *pool) | ||
1172 | { | ||
1173 | int r; | ||
1174 | |||
1175 | r = dm_pool_commit_metadata(pool->pmd); | ||
1176 | if (r) | ||
1177 | DMERR("commit failed, error = %d", r); | ||
1178 | |||
1179 | return r; | ||
1180 | } | ||
1181 | |||
1182 | /* | ||
1183 | * A non-zero return indicates read_only or fail_io mode. | ||
1184 | * Many callers don't care about the return value. | ||
1185 | */ | ||
1186 | static int commit_or_fallback(struct pool *pool) | ||
1187 | { | ||
1188 | int r; | ||
1189 | |||
1190 | if (get_pool_mode(pool) != PM_WRITE) | ||
1191 | return -EINVAL; | ||
1192 | |||
1193 | r = commit(pool); | ||
1194 | if (r) | ||
1195 | set_pool_mode(pool, PM_READ_ONLY); | ||
1196 | |||
1197 | return r; | ||
1198 | } | ||
1199 | |||
1096 | static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | 1200 | static int alloc_data_block(struct thin_c *tc, dm_block_t *result) |
1097 | { | 1201 | { |
1098 | int r; | 1202 | int r; |
@@ -1121,12 +1225,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | |||
1121 | * Try to commit to see if that will free up some | 1225 | * Try to commit to see if that will free up some |
1122 | * more space. | 1226 | * more space. |
1123 | */ | 1227 | */ |
1124 | r = dm_pool_commit_metadata(pool->pmd); | 1228 | (void) commit_or_fallback(pool); |
1125 | if (r) { | ||
1126 | DMERR("%s: dm_pool_commit_metadata() failed, error = %d", | ||
1127 | __func__, r); | ||
1128 | return r; | ||
1129 | } | ||
1130 | 1229 | ||
1131 | r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); | 1230 | r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); |
1132 | if (r) | 1231 | if (r) |
@@ -1218,7 +1317,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio) | |||
1218 | */ | 1317 | */ |
1219 | m = get_next_mapping(pool); | 1318 | m = get_next_mapping(pool); |
1220 | m->tc = tc; | 1319 | m->tc = tc; |
1221 | m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; | 1320 | m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown; |
1222 | m->virt_block = block; | 1321 | m->virt_block = block; |
1223 | m->data_block = lookup_result.block; | 1322 | m->data_block = lookup_result.block; |
1224 | m->cell = cell; | 1323 | m->cell = cell; |
@@ -1234,18 +1333,16 @@ static void process_discard(struct thin_c *tc, struct bio *bio) | |||
1234 | } | 1333 | } |
1235 | } else { | 1334 | } else { |
1236 | /* | 1335 | /* |
1237 | * This path is hit if people are ignoring | 1336 | * The DM core makes sure that the discard doesn't span |
1238 | * limits->discard_granularity. It ignores any | 1337 | * a block boundary. So we submit the discard of a |
1239 | * part of the discard that is in a subsequent | 1338 | * partial block appropriately. |
1240 | * block. | ||
1241 | */ | 1339 | */ |
1242 | sector_t offset = bio->bi_sector - (block << pool->block_shift); | ||
1243 | unsigned remaining = (pool->sectors_per_block - offset) << 9; | ||
1244 | bio->bi_size = min(bio->bi_size, remaining); | ||
1245 | |||
1246 | cell_release_singleton(cell, bio); | 1340 | cell_release_singleton(cell, bio); |
1247 | cell_release_singleton(cell2, bio); | 1341 | cell_release_singleton(cell2, bio); |
1248 | remap_and_issue(tc, bio, lookup_result.block); | 1342 | if ((!lookup_result.shared) && pool->pf.discard_passdown) |
1343 | remap_and_issue(tc, bio, lookup_result.block); | ||
1344 | else | ||
1345 | bio_endio(bio, 0); | ||
1249 | } | 1346 | } |
1250 | break; | 1347 | break; |
1251 | 1348 | ||
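
The rewritten partial-block branch above never unmaps anything: only a discard that covers a whole block gets a mapping removal queued, while a partial discard is either passed down (block unshared and passdown enabled) or simply completed. A standalone decision-table sketch; the enum and helper are invented for illustration:

#include <stdbool.h>

enum discard_action_example {
        DISCARD_UNMAP_AND_MAYBE_PASSDOWN,       /* whole block: remove the mapping */
        DISCARD_PASSDOWN_ONLY,                  /* partial, unshared: just pass down */
        DISCARD_COMPLETE,                       /* partial, shared or no passdown */
};

enum discard_action_example classify_discard(bool covers_whole_block,
                                             bool shared, bool passdown_enabled)
{
        if (covers_whole_block)
                return DISCARD_UNMAP_AND_MAYBE_PASSDOWN;
        if (!shared && passdown_enabled)
                return DISCARD_PASSDOWN_ONLY;
        return DISCARD_COMPLETE;
}
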
@@ -1307,7 +1404,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, | |||
1307 | if (bio_detain(pool->prison, &key, bio, &cell)) | 1404 | if (bio_detain(pool->prison, &key, bio, &cell)) |
1308 | return; | 1405 | return; |
1309 | 1406 | ||
1310 | if (bio_data_dir(bio) == WRITE) | 1407 | if (bio_data_dir(bio) == WRITE && bio->bi_size) |
1311 | break_sharing(tc, bio, block, &key, lookup_result, cell); | 1408 | break_sharing(tc, bio, block, &key, lookup_result, cell); |
1312 | else { | 1409 | else { |
1313 | struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; | 1410 | struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; |
@@ -1359,6 +1456,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block | |||
1359 | 1456 | ||
1360 | default: | 1457 | default: |
1361 | DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); | 1458 | DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); |
1459 | set_pool_mode(tc->pool, PM_READ_ONLY); | ||
1362 | cell_error(cell); | 1460 | cell_error(cell); |
1363 | break; | 1461 | break; |
1364 | } | 1462 | } |
@@ -1416,6 +1514,49 @@ static void process_bio(struct thin_c *tc, struct bio *bio) | |||
1416 | } | 1514 | } |
1417 | } | 1515 | } |
1418 | 1516 | ||
1517 | static void process_bio_read_only(struct thin_c *tc, struct bio *bio) | ||
1518 | { | ||
1519 | int r; | ||
1520 | int rw = bio_data_dir(bio); | ||
1521 | dm_block_t block = get_bio_block(tc, bio); | ||
1522 | struct dm_thin_lookup_result lookup_result; | ||
1523 | |||
1524 | r = dm_thin_find_block(tc->td, block, 1, &lookup_result); | ||
1525 | switch (r) { | ||
1526 | case 0: | ||
1527 | if (lookup_result.shared && (rw == WRITE) && bio->bi_size) | ||
1528 | bio_io_error(bio); | ||
1529 | else | ||
1530 | remap_and_issue(tc, bio, lookup_result.block); | ||
1531 | break; | ||
1532 | |||
1533 | case -ENODATA: | ||
1534 | if (rw != READ) { | ||
1535 | bio_io_error(bio); | ||
1536 | break; | ||
1537 | } | ||
1538 | |||
1539 | if (tc->origin_dev) { | ||
1540 | remap_to_origin_and_issue(tc, bio); | ||
1541 | break; | ||
1542 | } | ||
1543 | |||
1544 | zero_fill_bio(bio); | ||
1545 | bio_endio(bio, 0); | ||
1546 | break; | ||
1547 | |||
1548 | default: | ||
1549 | DMERR("dm_thin_find_block() failed, error = %d", r); | ||
1550 | bio_io_error(bio); | ||
1551 | break; | ||
1552 | } | ||
1553 | } | ||
1554 | |||
1555 | static void process_bio_fail(struct thin_c *tc, struct bio *bio) | ||
1556 | { | ||
1557 | bio_io_error(bio); | ||
1558 | } | ||
1559 | |||
1419 | static int need_commit_due_to_time(struct pool *pool) | 1560 | static int need_commit_due_to_time(struct pool *pool) |
1420 | { | 1561 | { |
1421 | return jiffies < pool->last_commit_jiffies || | 1562 | return jiffies < pool->last_commit_jiffies || |
@@ -1427,7 +1568,6 @@ static void process_deferred_bios(struct pool *pool) | |||
1427 | unsigned long flags; | 1568 | unsigned long flags; |
1428 | struct bio *bio; | 1569 | struct bio *bio; |
1429 | struct bio_list bios; | 1570 | struct bio_list bios; |
1430 | int r; | ||
1431 | 1571 | ||
1432 | bio_list_init(&bios); | 1572 | bio_list_init(&bios); |
1433 | 1573 | ||
@@ -1454,9 +1594,9 @@ static void process_deferred_bios(struct pool *pool) | |||
1454 | } | 1594 | } |
1455 | 1595 | ||
1456 | if (bio->bi_rw & REQ_DISCARD) | 1596 | if (bio->bi_rw & REQ_DISCARD) |
1457 | process_discard(tc, bio); | 1597 | pool->process_discard(tc, bio); |
1458 | else | 1598 | else |
1459 | process_bio(tc, bio); | 1599 | pool->process_bio(tc, bio); |
1460 | } | 1600 | } |
1461 | 1601 | ||
1462 | /* | 1602 | /* |
@@ -1472,10 +1612,7 @@ static void process_deferred_bios(struct pool *pool) | |||
1472 | if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) | 1612 | if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) |
1473 | return; | 1613 | return; |
1474 | 1614 | ||
1475 | r = dm_pool_commit_metadata(pool->pmd); | 1615 | if (commit_or_fallback(pool)) { |
1476 | if (r) { | ||
1477 | DMERR("%s: dm_pool_commit_metadata() failed, error = %d", | ||
1478 | __func__, r); | ||
1479 | while ((bio = bio_list_pop(&bios))) | 1616 | while ((bio = bio_list_pop(&bios))) |
1480 | bio_io_error(bio); | 1617 | bio_io_error(bio); |
1481 | return; | 1618 | return; |
@@ -1490,8 +1627,8 @@ static void do_worker(struct work_struct *ws) | |||
1490 | { | 1627 | { |
1491 | struct pool *pool = container_of(ws, struct pool, worker); | 1628 | struct pool *pool = container_of(ws, struct pool, worker); |
1492 | 1629 | ||
1493 | process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); | 1630 | process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); |
1494 | process_prepared(pool, &pool->prepared_discards, process_prepared_discard); | 1631 | process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); |
1495 | process_deferred_bios(pool); | 1632 | process_deferred_bios(pool); |
1496 | } | 1633 | } |
1497 | 1634 | ||
@@ -1508,6 +1645,52 @@ static void do_waker(struct work_struct *ws) | |||
1508 | 1645 | ||
1509 | /*----------------------------------------------------------------*/ | 1646 | /*----------------------------------------------------------------*/ |
1510 | 1647 | ||
1648 | static enum pool_mode get_pool_mode(struct pool *pool) | ||
1649 | { | ||
1650 | return pool->pf.mode; | ||
1651 | } | ||
1652 | |||
1653 | static void set_pool_mode(struct pool *pool, enum pool_mode mode) | ||
1654 | { | ||
1655 | int r; | ||
1656 | |||
1657 | pool->pf.mode = mode; | ||
1658 | |||
1659 | switch (mode) { | ||
1660 | case PM_FAIL: | ||
1661 | DMERR("switching pool to failure mode"); | ||
1662 | pool->process_bio = process_bio_fail; | ||
1663 | pool->process_discard = process_bio_fail; | ||
1664 | pool->process_prepared_mapping = process_prepared_mapping_fail; | ||
1665 | pool->process_prepared_discard = process_prepared_discard_fail; | ||
1666 | break; | ||
1667 | |||
1668 | case PM_READ_ONLY: | ||
1669 | DMERR("switching pool to read-only mode"); | ||
1670 | r = dm_pool_abort_metadata(pool->pmd); | ||
1671 | if (r) { | ||
1672 | DMERR("aborting transaction failed"); | ||
1673 | set_pool_mode(pool, PM_FAIL); | ||
1674 | } else { | ||
1675 | dm_pool_metadata_read_only(pool->pmd); | ||
1676 | pool->process_bio = process_bio_read_only; | ||
1677 | pool->process_discard = process_discard; | ||
1678 | pool->process_prepared_mapping = process_prepared_mapping_fail; | ||
1679 | pool->process_prepared_discard = process_prepared_discard_passdown; | ||
1680 | } | ||
1681 | break; | ||
1682 | |||
1683 | case PM_WRITE: | ||
1684 | pool->process_bio = process_bio; | ||
1685 | pool->process_discard = process_discard; | ||
1686 | pool->process_prepared_mapping = process_prepared_mapping; | ||
1687 | pool->process_prepared_discard = process_prepared_discard; | ||
1688 | break; | ||
1689 | } | ||
1690 | } | ||
1691 | |||
1692 | /*----------------------------------------------------------------*/ | ||
1693 | |||
1511 | /* | 1694 | /* |
1512 | * Mapping functions. | 1695 | * Mapping functions. |
1513 | */ | 1696 | */ |
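
set_pool_mode() is the one place the degradation ladder is walked: PM_WRITE wires in the normal handlers, PM_READ_ONLY first aborts the metadata transaction (escalating to PM_FAIL if even that fails) and then installs the read-only handlers, and PM_FAIL errors everything. A standalone sketch of that ladder with the metadata abort stubbed out; all names are illustrative:

#include <stdbool.h>
#include <stdio.h>

enum mode_example { EX_WRITE, EX_READ_ONLY, EX_FAIL };

/* stand-in for dm_pool_abort_metadata(); non-zero means the abort failed */
static int abort_metadata_stub(bool abort_will_fail)
{
        return abort_will_fail ? -1 : 0;
}

static enum mode_example degrade_example(enum mode_example target,
                                         bool abort_will_fail)
{
        switch (target) {
        case EX_READ_ONLY:
                /* roll back uncommitted changes before going read-only */
                if (abort_metadata_stub(abort_will_fail))
                        return degrade_example(EX_FAIL, abort_will_fail);
                return EX_READ_ONLY;

        case EX_FAIL:
                puts("pool now fails all I/O");
                return EX_FAIL;

        case EX_WRITE:
        default:
                return EX_WRITE;
        }
}

int main(void)
{
        printf("%d\n", degrade_example(EX_READ_ONLY, false));   /* stays read-only */
        printf("%d\n", degrade_example(EX_READ_ONLY, true));    /* escalates to fail */
        return 0;
}
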
@@ -1553,6 +1736,12 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio, | |||
1553 | struct dm_thin_lookup_result result; | 1736 | struct dm_thin_lookup_result result; |
1554 | 1737 | ||
1555 | map_context->ptr = thin_hook_bio(tc, bio); | 1738 | map_context->ptr = thin_hook_bio(tc, bio); |
1739 | |||
1740 | if (get_pool_mode(tc->pool) == PM_FAIL) { | ||
1741 | bio_io_error(bio); | ||
1742 | return DM_MAPIO_SUBMITTED; | ||
1743 | } | ||
1744 | |||
1556 | if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { | 1745 | if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { |
1557 | thin_defer_bio(tc, bio); | 1746 | thin_defer_bio(tc, bio); |
1558 | return DM_MAPIO_SUBMITTED; | 1747 | return DM_MAPIO_SUBMITTED; |
@@ -1589,14 +1778,35 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio, | |||
1589 | break; | 1778 | break; |
1590 | 1779 | ||
1591 | case -ENODATA: | 1780 | case -ENODATA: |
1781 | if (get_pool_mode(tc->pool) == PM_READ_ONLY) { | ||
1782 | /* | ||
1783 | * This block isn't provisioned, and we have no way | ||
1784 | * of doing so. Just error it. | ||
1785 | */ | ||
1786 | bio_io_error(bio); | ||
1787 | r = DM_MAPIO_SUBMITTED; | ||
1788 | break; | ||
1789 | } | ||
1790 | /* fall through */ | ||
1791 | |||
1792 | case -EWOULDBLOCK: | ||
1592 | /* | 1793 | /* |
1593 | * In future, the failed dm_thin_find_block above could | 1794 | * In future, the failed dm_thin_find_block above could |
1594 | * provide the hint to load the metadata into cache. | 1795 | * provide the hint to load the metadata into cache. |
1595 | */ | 1796 | */ |
1596 | case -EWOULDBLOCK: | ||
1597 | thin_defer_bio(tc, bio); | 1797 | thin_defer_bio(tc, bio); |
1598 | r = DM_MAPIO_SUBMITTED; | 1798 | r = DM_MAPIO_SUBMITTED; |
1599 | break; | 1799 | break; |
1800 | |||
1801 | default: | ||
1802 | /* | ||
1803 | * Must always call bio_io_error on failure. | ||
1804 | * dm_thin_find_block can fail with -EINVAL if the | ||
1805 | * pool is switched to fail-io mode. | ||
1806 | */ | ||
1807 | bio_io_error(bio); | ||
1808 | r = DM_MAPIO_SUBMITTED; | ||
1809 | break; | ||
1600 | } | 1810 | } |
1601 | 1811 | ||
1602 | return r; | 1812 | return r; |
@@ -1633,15 +1843,26 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) | |||
1633 | { | 1843 | { |
1634 | struct pool_c *pt = ti->private; | 1844 | struct pool_c *pt = ti->private; |
1635 | 1845 | ||
1846 | /* | ||
1847 | * We want to make sure that degraded pools are never upgraded. | ||
1848 | */ | ||
1849 | enum pool_mode old_mode = pool->pf.mode; | ||
1850 | enum pool_mode new_mode = pt->pf.mode; | ||
1851 | |||
1852 | if (old_mode > new_mode) | ||
1853 | new_mode = old_mode; | ||
1854 | |||
1636 | pool->ti = ti; | 1855 | pool->ti = ti; |
1637 | pool->low_water_blocks = pt->low_water_blocks; | 1856 | pool->low_water_blocks = pt->low_water_blocks; |
1638 | pool->pf = pt->pf; | 1857 | pool->pf = pt->pf; |
1858 | set_pool_mode(pool, new_mode); | ||
1639 | 1859 | ||
1640 | /* | 1860 | /* |
1641 | * If discard_passdown was enabled verify that the data device | 1861 | * If discard_passdown was enabled verify that the data device |
1642 | * supports discards. Disable discard_passdown if not; otherwise | 1862 | * supports discards. Disable discard_passdown if not; otherwise |
1643 | * -EOPNOTSUPP will be returned. | 1863 | * -EOPNOTSUPP will be returned. |
1644 | */ | 1864 | */ |
1865 | /* FIXME: pull this out into a sep fn. */ | ||
1645 | if (pt->pf.discard_passdown) { | 1866 | if (pt->pf.discard_passdown) { |
1646 | struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); | 1867 | struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); |
1647 | if (!q || !blk_queue_discard(q)) { | 1868 | if (!q || !blk_queue_discard(q)) { |
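
Because enum pool_mode is declared in order of increasing degradation, bind_control_target() keeps a pool from being silently healed by a table reload with one comparison: the requested mode is clamped to be at least as degraded as the current one. A tiny standalone sketch of that clamp:

#include <stdio.h>

enum mode_example { EX_WRITE, EX_READ_ONLY, EX_FAIL }; /* increasing degradation */

/* a reloaded table may ask for a healthier mode; never honour that */
static enum mode_example clamp_mode(enum mode_example old_mode,
                                    enum mode_example new_mode)
{
        return (old_mode > new_mode) ? old_mode : new_mode;
}

int main(void)
{
        /* a read-only pool reloaded with a writable table stays read-only */
        printf("%d\n", clamp_mode(EX_READ_ONLY, EX_WRITE));
        return 0;
}
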
@@ -1667,6 +1888,7 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti) | |||
1667 | /* Initialize pool features. */ | 1888 | /* Initialize pool features. */ |
1668 | static void pool_features_init(struct pool_features *pf) | 1889 | static void pool_features_init(struct pool_features *pf) |
1669 | { | 1890 | { |
1891 | pf->mode = PM_WRITE; | ||
1670 | pf->zero_new_blocks = 1; | 1892 | pf->zero_new_blocks = 1; |
1671 | pf->discard_enabled = 1; | 1893 | pf->discard_enabled = 1; |
1672 | pf->discard_passdown = 1; | 1894 | pf->discard_passdown = 1; |
@@ -1697,14 +1919,16 @@ static struct kmem_cache *_endio_hook_cache; | |||
1697 | 1919 | ||
1698 | static struct pool *pool_create(struct mapped_device *pool_md, | 1920 | static struct pool *pool_create(struct mapped_device *pool_md, |
1699 | struct block_device *metadata_dev, | 1921 | struct block_device *metadata_dev, |
1700 | unsigned long block_size, char **error) | 1922 | unsigned long block_size, |
1923 | int read_only, char **error) | ||
1701 | { | 1924 | { |
1702 | int r; | 1925 | int r; |
1703 | void *err_p; | 1926 | void *err_p; |
1704 | struct pool *pool; | 1927 | struct pool *pool; |
1705 | struct dm_pool_metadata *pmd; | 1928 | struct dm_pool_metadata *pmd; |
1929 | bool format_device = read_only ? false : true; | ||
1706 | 1930 | ||
1707 | pmd = dm_pool_metadata_open(metadata_dev, block_size); | 1931 | pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device); |
1708 | if (IS_ERR(pmd)) { | 1932 | if (IS_ERR(pmd)) { |
1709 | *error = "Error creating metadata object"; | 1933 | *error = "Error creating metadata object"; |
1710 | return (struct pool *)pmd; | 1934 | return (struct pool *)pmd; |
@@ -1719,8 +1943,10 @@ static struct pool *pool_create(struct mapped_device *pool_md, | |||
1719 | 1943 | ||
1720 | pool->pmd = pmd; | 1944 | pool->pmd = pmd; |
1721 | pool->sectors_per_block = block_size; | 1945 | pool->sectors_per_block = block_size; |
1722 | pool->block_shift = ffs(block_size) - 1; | 1946 | if (block_size & (block_size - 1)) |
1723 | pool->offset_mask = block_size - 1; | 1947 | pool->sectors_per_block_shift = -1; |
1948 | else | ||
1949 | pool->sectors_per_block_shift = __ffs(block_size); | ||
1724 | pool->low_water_blocks = 0; | 1950 | pool->low_water_blocks = 0; |
1725 | pool_features_init(&pool->pf); | 1951 | pool_features_init(&pool->pf); |
1726 | pool->prison = prison_create(PRISON_CELLS); | 1952 | pool->prison = prison_create(PRISON_CELLS); |
@@ -1819,25 +2045,29 @@ static void __pool_dec(struct pool *pool) | |||
1819 | 2045 | ||
1820 | static struct pool *__pool_find(struct mapped_device *pool_md, | 2046 | static struct pool *__pool_find(struct mapped_device *pool_md, |
1821 | struct block_device *metadata_dev, | 2047 | struct block_device *metadata_dev, |
1822 | unsigned long block_size, char **error, | 2048 | unsigned long block_size, int read_only, |
1823 | int *created) | 2049 | char **error, int *created) |
1824 | { | 2050 | { |
1825 | struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); | 2051 | struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); |
1826 | 2052 | ||
1827 | if (pool) { | 2053 | if (pool) { |
1828 | if (pool->pool_md != pool_md) | 2054 | if (pool->pool_md != pool_md) { |
2055 | *error = "metadata device already in use by a pool"; | ||
1829 | return ERR_PTR(-EBUSY); | 2056 | return ERR_PTR(-EBUSY); |
2057 | } | ||
1830 | __pool_inc(pool); | 2058 | __pool_inc(pool); |
1831 | 2059 | ||
1832 | } else { | 2060 | } else { |
1833 | pool = __pool_table_lookup(pool_md); | 2061 | pool = __pool_table_lookup(pool_md); |
1834 | if (pool) { | 2062 | if (pool) { |
1835 | if (pool->md_dev != metadata_dev) | 2063 | if (pool->md_dev != metadata_dev) { |
2064 | *error = "different pool cannot replace a pool"; | ||
1836 | return ERR_PTR(-EINVAL); | 2065 | return ERR_PTR(-EINVAL); |
2066 | } | ||
1837 | __pool_inc(pool); | 2067 | __pool_inc(pool); |
1838 | 2068 | ||
1839 | } else { | 2069 | } else { |
1840 | pool = pool_create(pool_md, metadata_dev, block_size, error); | 2070 | pool = pool_create(pool_md, metadata_dev, block_size, read_only, error); |
1841 | *created = 1; | 2071 | *created = 1; |
1842 | } | 2072 | } |
1843 | } | 2073 | } |
@@ -1888,19 +2118,23 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | |||
1888 | arg_name = dm_shift_arg(as); | 2118 | arg_name = dm_shift_arg(as); |
1889 | argc--; | 2119 | argc--; |
1890 | 2120 | ||
1891 | if (!strcasecmp(arg_name, "skip_block_zeroing")) { | 2121 | if (!strcasecmp(arg_name, "skip_block_zeroing")) |
1892 | pf->zero_new_blocks = 0; | 2122 | pf->zero_new_blocks = 0; |
1893 | continue; | 2123 | |
1894 | } else if (!strcasecmp(arg_name, "ignore_discard")) { | 2124 | else if (!strcasecmp(arg_name, "ignore_discard")) |
1895 | pf->discard_enabled = 0; | 2125 | pf->discard_enabled = 0; |
1896 | continue; | 2126 | |
1897 | } else if (!strcasecmp(arg_name, "no_discard_passdown")) { | 2127 | else if (!strcasecmp(arg_name, "no_discard_passdown")) |
1898 | pf->discard_passdown = 0; | 2128 | pf->discard_passdown = 0; |
1899 | continue; | ||
1900 | } | ||
1901 | 2129 | ||
1902 | ti->error = "Unrecognised pool feature requested"; | 2130 | else if (!strcasecmp(arg_name, "read_only")) |
1903 | r = -EINVAL; | 2131 | pf->mode = PM_READ_ONLY; |
2132 | |||
2133 | else { | ||
2134 | ti->error = "Unrecognised pool feature requested"; | ||
2135 | r = -EINVAL; | ||
2136 | break; | ||
2137 | } | ||
1904 | } | 2138 | } |
1905 | 2139 | ||
1906 | return r; | 2140 | return r; |
@@ -1964,7 +2198,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1964 | if (kstrtoul(argv[2], 10, &block_size) || !block_size || | 2198 | if (kstrtoul(argv[2], 10, &block_size) || !block_size || |
1965 | block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || | 2199 | block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || |
1966 | block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || | 2200 | block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || |
1967 | !is_power_of_2(block_size)) { | 2201 | block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { |
1968 | ti->error = "Invalid block size"; | 2202 | ti->error = "Invalid block size"; |
1969 | r = -EINVAL; | 2203 | r = -EINVAL; |
1970 | goto out; | 2204 | goto out; |
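The constructor no longer insists on a power-of-two block size; it only requires a multiple of the minimum, using the mask idiom x & (min - 1), which works because the minimum itself is a power of two (128 sectors in this version of dm-thin; treat that constant as an assumption here). A small standalone check:

#include <assert.h>

#define MIN_SECTORS 128UL   /* stand-in for DATA_DEV_BLOCK_SIZE_MIN_SECTORS */

static int block_size_ok(unsigned long bs)
{
	return bs && !(bs & (MIN_SECTORS - 1));
}

int main(void)
{
	assert(block_size_ok(128));    /* 64 KiB, power of two */
	assert(block_size_ok(384));    /* 192 KiB, not a power of two, still a multiple */
	assert(!block_size_ok(100));   /* not a multiple of 128 */
	return 0;
}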
@@ -1993,7 +2227,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1993 | } | 2227 | } |
1994 | 2228 | ||
1995 | pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, | 2229 | pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, |
1996 | block_size, &ti->error, &pool_created); | 2230 | block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created); |
1997 | if (IS_ERR(pool)) { | 2231 | if (IS_ERR(pool)) { |
1998 | r = PTR_ERR(pool); | 2232 | r = PTR_ERR(pool); |
1999 | goto out_free_pt; | 2233 | goto out_free_pt; |
@@ -2011,6 +2245,15 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2011 | goto out_flags_changed; | 2245 | goto out_flags_changed; |
2012 | } | 2246 | } |
2013 | 2247 | ||
2248 | /* | ||
2249 | * The block layer requires discard_granularity to be a power of 2. | ||
2250 | */ | ||
2251 | if (pf.discard_enabled && !is_power_of_2(block_size)) { | ||
2252 | ti->error = "Discard support must be disabled when the block size is not a power of 2"; | ||
2253 | r = -EINVAL; | ||
2254 | goto out_flags_changed; | ||
2255 | } | ||
2256 | |||
2014 | pt->pool = pool; | 2257 | pt->pool = pool; |
2015 | pt->ti = ti; | 2258 | pt->ti = ti; |
2016 | pt->metadata_dev = metadata_dev; | 2259 | pt->metadata_dev = metadata_dev; |
@@ -2030,7 +2273,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2030 | * stacking of discard limits (this keeps the pool and | 2273 | * stacking of discard limits (this keeps the pool and |
2031 | * thin devices' discard limits consistent). | 2274 | * thin devices' discard limits consistent). |
2032 | */ | 2275 | */ |
2033 | ti->discards_supported = 1; | 2276 | ti->discards_supported = true; |
2034 | } | 2277 | } |
2035 | ti->private = pt; | 2278 | ti->private = pt; |
2036 | 2279 | ||
@@ -2090,7 +2333,8 @@ static int pool_preresume(struct dm_target *ti) | |||
2090 | int r; | 2333 | int r; |
2091 | struct pool_c *pt = ti->private; | 2334 | struct pool_c *pt = ti->private; |
2092 | struct pool *pool = pt->pool; | 2335 | struct pool *pool = pt->pool; |
2093 | dm_block_t data_size, sb_data_size; | 2336 | sector_t data_size = ti->len; |
2337 | dm_block_t sb_data_size; | ||
2094 | 2338 | ||
2095 | /* | 2339 | /* |
2096 | * Take control of the pool object. | 2340 | * Take control of the pool object. |
@@ -2099,7 +2343,8 @@ static int pool_preresume(struct dm_target *ti) | |||
2099 | if (r) | 2343 | if (r) |
2100 | return r; | 2344 | return r; |
2101 | 2345 | ||
2102 | data_size = ti->len >> pool->block_shift; | 2346 | (void) sector_div(data_size, pool->sectors_per_block); |
2347 | |||
2103 | r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); | 2348 | r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); |
2104 | if (r) { | 2349 | if (r) { |
2105 | DMERR("failed to retrieve data device size"); | 2350 | DMERR("failed to retrieve data device size"); |
@@ -2108,22 +2353,19 @@ static int pool_preresume(struct dm_target *ti) | |||
2108 | 2353 | ||
2109 | if (data_size < sb_data_size) { | 2354 | if (data_size < sb_data_size) { |
2110 | DMERR("pool target too small, is %llu blocks (expected %llu)", | 2355 | DMERR("pool target too small, is %llu blocks (expected %llu)", |
2111 | data_size, sb_data_size); | 2356 | (unsigned long long)data_size, sb_data_size); |
2112 | return -EINVAL; | 2357 | return -EINVAL; |
2113 | 2358 | ||
2114 | } else if (data_size > sb_data_size) { | 2359 | } else if (data_size > sb_data_size) { |
2115 | r = dm_pool_resize_data_dev(pool->pmd, data_size); | 2360 | r = dm_pool_resize_data_dev(pool->pmd, data_size); |
2116 | if (r) { | 2361 | if (r) { |
2117 | DMERR("failed to resize data device"); | 2362 | DMERR("failed to resize data device"); |
2363 | /* FIXME Stricter than necessary: Rollback transaction instead here */ | ||
2364 | set_pool_mode(pool, PM_READ_ONLY); | ||
2118 | return r; | 2365 | return r; |
2119 | } | 2366 | } |
2120 | 2367 | ||
2121 | r = dm_pool_commit_metadata(pool->pmd); | 2368 | (void) commit_or_fallback(pool); |
2122 | if (r) { | ||
2123 | DMERR("%s: dm_pool_commit_metadata() failed, error = %d", | ||
2124 | __func__, r); | ||
2125 | return r; | ||
2126 | } | ||
2127 | } | 2369 | } |
2128 | 2370 | ||
2129 | return 0; | 2371 | return 0; |
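Because the block size may no longer be a power of two, the resume path switches from a shift to sector_div(), which divides its first argument in place and returns the remainder; the (void) cast marks that only the quotient is wanted. A userspace stand-in with the same contract:

#include <assert.h>
#include <stdint.h>

/* Mirrors the kernel macro: quotient written back, remainder returned. */
#define sector_div(a, b) ({ uint32_t _rem = (a) % (b); (a) /= (b); _rem; })

int main(void)
{
	uint64_t data_size = 1000;
	uint32_t rem = sector_div(data_size, 192);

	assert(data_size == 5);   /* quotient written back in place */
	assert(rem == 40);        /* remainder returned: 1000 - 5*192 */
	return 0;
}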
@@ -2146,19 +2388,12 @@ static void pool_resume(struct dm_target *ti) | |||
2146 | 2388 | ||
2147 | static void pool_postsuspend(struct dm_target *ti) | 2389 | static void pool_postsuspend(struct dm_target *ti) |
2148 | { | 2390 | { |
2149 | int r; | ||
2150 | struct pool_c *pt = ti->private; | 2391 | struct pool_c *pt = ti->private; |
2151 | struct pool *pool = pt->pool; | 2392 | struct pool *pool = pt->pool; |
2152 | 2393 | ||
2153 | cancel_delayed_work(&pool->waker); | 2394 | cancel_delayed_work(&pool->waker); |
2154 | flush_workqueue(pool->wq); | 2395 | flush_workqueue(pool->wq); |
2155 | 2396 | (void) commit_or_fallback(pool); | |
2156 | r = dm_pool_commit_metadata(pool->pmd); | ||
2157 | if (r < 0) { | ||
2158 | DMERR("%s: dm_pool_commit_metadata() failed, error = %d", | ||
2159 | __func__, r); | ||
2160 | /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/ | ||
2161 | } | ||
2162 | } | 2397 | } |
2163 | 2398 | ||
2164 | static int check_arg_count(unsigned argc, unsigned args_required) | 2399 | static int check_arg_count(unsigned argc, unsigned args_required) |
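commit_or_fallback() is introduced earlier in this patch and is not visible here; the call sites above, together with the set_pool_mode(pool, PM_READ_ONLY) on resize failure, suggest its contract: try to commit the metadata and demote the pool to read-only mode if that fails. A plausible shape, offered as an assumption rather than the patch's literal code:

static int commit_or_fallback(struct pool *pool)
{
	int r;

	if (get_pool_mode(pool) != PM_WRITE)
		return -EINVAL;

	/* On commit failure, stop writing rather than propagate the error. */
	r = dm_pool_commit_metadata(pool->pmd);
	if (r)
		set_pool_mode(pool, PM_READ_ONLY);

	return r;
}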
@@ -2292,6 +2527,8 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct | |||
2292 | if (r) | 2527 | if (r) |
2293 | return r; | 2528 | return r; |
2294 | 2529 | ||
2530 | (void) commit_or_fallback(pool); | ||
2531 | |||
2295 | r = dm_pool_reserve_metadata_snap(pool->pmd); | 2532 | r = dm_pool_reserve_metadata_snap(pool->pmd); |
2296 | if (r) | 2533 | if (r) |
2297 | DMWARN("reserve_metadata_snap message failed."); | 2534 | DMWARN("reserve_metadata_snap message failed."); |
@@ -2351,25 +2588,41 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) | |||
2351 | else | 2588 | else |
2352 | DMWARN("Unrecognised thin pool target message received: %s", argv[0]); | 2589 | DMWARN("Unrecognised thin pool target message received: %s", argv[0]); |
2353 | 2590 | ||
2354 | if (!r) { | 2591 | if (!r) |
2355 | r = dm_pool_commit_metadata(pool->pmd); | 2592 | (void) commit_or_fallback(pool); |
2356 | if (r) | ||
2357 | DMERR("%s message: dm_pool_commit_metadata() failed, error = %d", | ||
2358 | argv[0], r); | ||
2359 | } | ||
2360 | 2593 | ||
2361 | return r; | 2594 | return r; |
2362 | } | 2595 | } |
2363 | 2596 | ||
2597 | static void emit_flags(struct pool_features *pf, char *result, | ||
2598 | unsigned sz, unsigned maxlen) | ||
2599 | { | ||
2600 | unsigned count = !pf->zero_new_blocks + !pf->discard_enabled + | ||
2601 | !pf->discard_passdown + (pf->mode == PM_READ_ONLY); | ||
2602 | DMEMIT("%u ", count); | ||
2603 | |||
2604 | if (!pf->zero_new_blocks) | ||
2605 | DMEMIT("skip_block_zeroing "); | ||
2606 | |||
2607 | if (!pf->discard_enabled) | ||
2608 | DMEMIT("ignore_discard "); | ||
2609 | |||
2610 | if (!pf->discard_passdown) | ||
2611 | DMEMIT("no_discard_passdown "); | ||
2612 | |||
2613 | if (pf->mode == PM_READ_ONLY) | ||
2614 | DMEMIT("read_only "); | ||
2615 | } | ||
2616 | |||
2364 | /* | 2617 | /* |
2365 | * Status line is: | 2618 | * Status line is: |
2366 | * <transaction id> <used metadata sectors>/<total metadata sectors> | 2619 | * <transaction id> <used metadata sectors>/<total metadata sectors> |
2367 | * <used data sectors>/<total data sectors> <held metadata root> | 2620 | * <used data sectors>/<total data sectors> <held metadata root> |
2368 | */ | 2621 | */ |
2369 | static int pool_status(struct dm_target *ti, status_type_t type, | 2622 | static int pool_status(struct dm_target *ti, status_type_t type, |
2370 | char *result, unsigned maxlen) | 2623 | unsigned status_flags, char *result, unsigned maxlen) |
2371 | { | 2624 | { |
2372 | int r, count; | 2625 | int r; |
2373 | unsigned sz = 0; | 2626 | unsigned sz = 0; |
2374 | uint64_t transaction_id; | 2627 | uint64_t transaction_id; |
2375 | dm_block_t nr_free_blocks_data; | 2628 | dm_block_t nr_free_blocks_data; |
@@ -2384,6 +2637,15 @@ static int pool_status(struct dm_target *ti, status_type_t type, | |||
2384 | 2637 | ||
2385 | switch (type) { | 2638 | switch (type) { |
2386 | case STATUSTYPE_INFO: | 2639 | case STATUSTYPE_INFO: |
2640 | if (get_pool_mode(pool) == PM_FAIL) { | ||
2641 | DMEMIT("Fail"); | ||
2642 | break; | ||
2643 | } | ||
2644 | |||
2645 | /* Commit to ensure statistics aren't out-of-date */ | ||
2646 | if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) | ||
2647 | (void) commit_or_fallback(pool); | ||
2648 | |||
2387 | r = dm_pool_get_metadata_transaction_id(pool->pmd, | 2649 | r = dm_pool_get_metadata_transaction_id(pool->pmd, |
2388 | &transaction_id); | 2650 | &transaction_id); |
2389 | if (r) | 2651 | if (r) |
@@ -2419,9 +2681,19 @@ static int pool_status(struct dm_target *ti, status_type_t type, | |||
2419 | (unsigned long long)nr_blocks_data); | 2681 | (unsigned long long)nr_blocks_data); |
2420 | 2682 | ||
2421 | if (held_root) | 2683 | if (held_root) |
2422 | DMEMIT("%llu", held_root); | 2684 | DMEMIT("%llu ", held_root); |
2685 | else | ||
2686 | DMEMIT("- "); | ||
2687 | |||
2688 | if (pool->pf.mode == PM_READ_ONLY) | ||
2689 | DMEMIT("ro "); | ||
2690 | else | ||
2691 | DMEMIT("rw "); | ||
2692 | |||
2693 | if (pool->pf.discard_enabled && pool->pf.discard_passdown) | ||
2694 | DMEMIT("discard_passdown"); | ||
2423 | else | 2695 | else |
2424 | DMEMIT("-"); | 2696 | DMEMIT("no_discard_passdown"); |
2425 | 2697 | ||
2426 | break; | 2698 | break; |
2427 | 2699 | ||
@@ -2431,20 +2703,7 @@ static int pool_status(struct dm_target *ti, status_type_t type, | |||
2431 | format_dev_t(buf2, pt->data_dev->bdev->bd_dev), | 2703 | format_dev_t(buf2, pt->data_dev->bdev->bd_dev), |
2432 | (unsigned long)pool->sectors_per_block, | 2704 | (unsigned long)pool->sectors_per_block, |
2433 | (unsigned long long)pt->low_water_blocks); | 2705 | (unsigned long long)pt->low_water_blocks); |
2434 | 2706 | emit_flags(&pt->pf, result, sz, maxlen); | |
2435 | count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled + | ||
2436 | !pt->pf.discard_passdown; | ||
2437 | DMEMIT("%u ", count); | ||
2438 | |||
2439 | if (!pool->pf.zero_new_blocks) | ||
2440 | DMEMIT("skip_block_zeroing "); | ||
2441 | |||
2442 | if (!pool->pf.discard_enabled) | ||
2443 | DMEMIT("ignore_discard "); | ||
2444 | |||
2445 | if (!pt->pf.discard_passdown) | ||
2446 | DMEMIT("no_discard_passdown "); | ||
2447 | |||
2448 | break; | 2707 | break; |
2449 | } | 2708 | } |
2450 | 2709 | ||
@@ -2482,7 +2741,8 @@ static void set_discard_limits(struct pool *pool, struct queue_limits *limits) | |||
2482 | 2741 | ||
2483 | /* | 2742 | /* |
2484 | * This is just a hint, and not enforced. We have to cope with | 2743 | * This is just a hint, and not enforced. We have to cope with |
2485 | * bios that overlap 2 blocks. | 2744 | * bios that cover a block partially. A discard that spans a block |
2745 | * boundary is not sent to this target. | ||
2486 | */ | 2746 | */ |
2487 | limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; | 2747 | limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; |
2488 | limits->discard_zeroes_data = pool->pf.zero_new_blocks; | 2748 | limits->discard_zeroes_data = pool->pf.zero_new_blocks; |
@@ -2503,7 +2763,7 @@ static struct target_type pool_target = { | |||
2503 | .name = "thin-pool", | 2763 | .name = "thin-pool", |
2504 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | | 2764 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | |
2505 | DM_TARGET_IMMUTABLE, | 2765 | DM_TARGET_IMMUTABLE, |
2506 | .version = {1, 2, 0}, | 2766 | .version = {1, 3, 0}, |
2507 | .module = THIS_MODULE, | 2767 | .module = THIS_MODULE, |
2508 | .ctr = pool_ctr, | 2768 | .ctr = pool_ctr, |
2509 | .dtr = pool_dtr, | 2769 | .dtr = pool_dtr, |
@@ -2608,19 +2868,31 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2608 | } | 2868 | } |
2609 | __pool_inc(tc->pool); | 2869 | __pool_inc(tc->pool); |
2610 | 2870 | ||
2871 | if (get_pool_mode(tc->pool) == PM_FAIL) { | ||
2872 | ti->error = "Couldn't open thin device, Pool is in fail mode"; | ||
2873 | goto bad_thin_open; | ||
2874 | } | ||
2875 | |||
2611 | r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); | 2876 | r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); |
2612 | if (r) { | 2877 | if (r) { |
2613 | ti->error = "Couldn't open thin internal device"; | 2878 | ti->error = "Couldn't open thin internal device"; |
2614 | goto bad_thin_open; | 2879 | goto bad_thin_open; |
2615 | } | 2880 | } |
2616 | 2881 | ||
2617 | ti->split_io = tc->pool->sectors_per_block; | 2882 | r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); |
2883 | if (r) | ||
2884 | goto bad_thin_open; | ||
2885 | |||
2618 | ti->num_flush_requests = 1; | 2886 | ti->num_flush_requests = 1; |
2887 | ti->flush_supported = true; | ||
2619 | 2888 | ||
2620 | /* In case the pool supports discards, pass them on. */ | 2889 | /* In case the pool supports discards, pass them on. */ |
2621 | if (tc->pool->pf.discard_enabled) { | 2890 | if (tc->pool->pf.discard_enabled) { |
2622 | ti->discards_supported = 1; | 2891 | ti->discards_supported = true; |
2623 | ti->num_discard_requests = 1; | 2892 | ti->num_discard_requests = 1; |
2893 | ti->discard_zeroes_data_unsupported = true; | ||
2894 | /* Discard requests must be split on a block boundary */ | ||
2895 | ti->split_discard_requests = true; | ||
2624 | } | 2896 | } |
2625 | 2897 | ||
2626 | dm_put(pool_md); | 2898 | dm_put(pool_md); |
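thin_ctr() no longer sets ti->split_io directly; it registers the per-target IO boundary through dm_set_target_max_io_len(), which is added to dm.c later in this patch and rejects values above UINT_MAX. A sketch of the calling convention (kernel context; example_ctr() and the boundary value are illustrative):

static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	sector_t boundary = 128;	/* hypothetical per-target IO boundary */
	int r;

	r = dm_set_target_max_io_len(ti, boundary);
	if (r)
		return r;		/* ti->error already set by the helper */

	ti->num_flush_requests = 1;
	return 0;
}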
@@ -2701,7 +2973,7 @@ static void thin_postsuspend(struct dm_target *ti) | |||
2701 | * <nr mapped sectors> <highest mapped sector> | 2973 | * <nr mapped sectors> <highest mapped sector> |
2702 | */ | 2974 | */ |
2703 | static int thin_status(struct dm_target *ti, status_type_t type, | 2975 | static int thin_status(struct dm_target *ti, status_type_t type, |
2704 | char *result, unsigned maxlen) | 2976 | unsigned status_flags, char *result, unsigned maxlen) |
2705 | { | 2977 | { |
2706 | int r; | 2978 | int r; |
2707 | ssize_t sz = 0; | 2979 | ssize_t sz = 0; |
@@ -2709,6 +2981,11 @@ static int thin_status(struct dm_target *ti, status_type_t type, | |||
2709 | char buf[BDEVNAME_SIZE]; | 2981 | char buf[BDEVNAME_SIZE]; |
2710 | struct thin_c *tc = ti->private; | 2982 | struct thin_c *tc = ti->private; |
2711 | 2983 | ||
2984 | if (get_pool_mode(tc->pool) == PM_FAIL) { | ||
2985 | DMEMIT("Fail"); | ||
2986 | return 0; | ||
2987 | } | ||
2988 | |||
2712 | if (!tc->td) | 2989 | if (!tc->td) |
2713 | DMEMIT("-"); | 2990 | DMEMIT("-"); |
2714 | else { | 2991 | else { |
@@ -2746,19 +3023,21 @@ static int thin_status(struct dm_target *ti, status_type_t type, | |||
2746 | static int thin_iterate_devices(struct dm_target *ti, | 3023 | static int thin_iterate_devices(struct dm_target *ti, |
2747 | iterate_devices_callout_fn fn, void *data) | 3024 | iterate_devices_callout_fn fn, void *data) |
2748 | { | 3025 | { |
2749 | dm_block_t blocks; | 3026 | sector_t blocks; |
2750 | struct thin_c *tc = ti->private; | 3027 | struct thin_c *tc = ti->private; |
3028 | struct pool *pool = tc->pool; | ||
2751 | 3029 | ||
2752 | /* | 3030 | /* |
2753 | * We can't call dm_pool_get_data_dev_size() since that blocks. So | 3031 | * We can't call dm_pool_get_data_dev_size() since that blocks. So |
2754 | * we follow a more convoluted path through to the pool's target. | 3032 | * we follow a more convoluted path through to the pool's target. |
2755 | */ | 3033 | */ |
2756 | if (!tc->pool->ti) | 3034 | if (!pool->ti) |
2757 | return 0; /* nothing is bound */ | 3035 | return 0; /* nothing is bound */ |
2758 | 3036 | ||
2759 | blocks = tc->pool->ti->len >> tc->pool->block_shift; | 3037 | blocks = pool->ti->len; |
3038 | (void) sector_div(blocks, pool->sectors_per_block); | ||
2760 | if (blocks) | 3039 | if (blocks) |
2761 | return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data); | 3040 | return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data); |
2762 | 3041 | ||
2763 | return 0; | 3042 | return 0; |
2764 | } | 3043 | } |
@@ -2775,7 +3054,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) | |||
2775 | 3054 | ||
2776 | static struct target_type thin_target = { | 3055 | static struct target_type thin_target = { |
2777 | .name = "thin", | 3056 | .name = "thin", |
2778 | .version = {1, 1, 0}, | 3057 | .version = {1, 3, 0}, |
2779 | .module = THIS_MODULE, | 3058 | .module = THIS_MODULE, |
2780 | .ctr = thin_ctr, | 3059 | .ctr = thin_ctr, |
2781 | .dtr = thin_dtr, | 3060 | .dtr = thin_dtr, |
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index fa365d39b612..254d19268ad2 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c | |||
@@ -515,7 +515,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio, | |||
515 | * Status: V (valid) or C (corruption found) | 515 | * Status: V (valid) or C (corruption found) |
516 | */ | 516 | */ |
517 | static int verity_status(struct dm_target *ti, status_type_t type, | 517 | static int verity_status(struct dm_target *ti, status_type_t type, |
518 | char *result, unsigned maxlen) | 518 | unsigned status_flags, char *result, unsigned maxlen) |
519 | { | 519 | { |
520 | struct dm_verity *v = ti->private; | 520 | struct dm_verity *v = ti->private; |
521 | unsigned sz = 0; | 521 | unsigned sz = 0; |
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e24143cc2040..4e09b6ff5b49 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -968,22 +968,41 @@ static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti | |||
968 | static sector_t max_io_len(sector_t sector, struct dm_target *ti) | 968 | static sector_t max_io_len(sector_t sector, struct dm_target *ti) |
969 | { | 969 | { |
970 | sector_t len = max_io_len_target_boundary(sector, ti); | 970 | sector_t len = max_io_len_target_boundary(sector, ti); |
971 | sector_t offset, max_len; | ||
971 | 972 | ||
972 | /* | 973 | /* |
973 | * Does the target need to split even further ? | 974 | * Does the target need to split even further? |
974 | */ | 975 | */ |
975 | if (ti->split_io) { | 976 | if (ti->max_io_len) { |
976 | sector_t boundary; | 977 | offset = dm_target_offset(ti, sector); |
977 | sector_t offset = dm_target_offset(ti, sector); | 978 | if (unlikely(ti->max_io_len & (ti->max_io_len - 1))) |
978 | boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) | 979 | max_len = sector_div(offset, ti->max_io_len); |
979 | - offset; | 980 | else |
980 | if (len > boundary) | 981 | max_len = offset & (ti->max_io_len - 1); |
981 | len = boundary; | 982 | max_len = ti->max_io_len - max_len; |
983 | |||
984 | if (len > max_len) | ||
985 | len = max_len; | ||
982 | } | 986 | } |
983 | 987 | ||
984 | return len; | 988 | return len; |
985 | } | 989 | } |
986 | 990 | ||
991 | int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) | ||
992 | { | ||
993 | if (len > UINT_MAX) { | ||
994 | DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", | ||
995 | (unsigned long long)len, UINT_MAX); | ||
996 | ti->error = "Maximum size of target IO is too large"; | ||
997 | return -EINVAL; | ||
998 | } | ||
999 | |||
1000 | ti->max_io_len = (uint32_t) len; | ||
1001 | |||
1002 | return 0; | ||
1003 | } | ||
1004 | EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); | ||
1005 | |||
987 | static void __map_bio(struct dm_target *ti, struct bio *clone, | 1006 | static void __map_bio(struct dm_target *ti, struct bio *clone, |
988 | struct dm_target_io *tio) | 1007 | struct dm_target_io *tio) |
989 | { | 1008 | { |
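The reworked max_io_len() computes how many sectors remain before the next ti->max_io_len boundary, using a mask when the boundary is a power of two and sector_div() otherwise. A self-contained userspace rendering of that arithmetic (sectors_to_boundary() is an illustrative name):

#include <assert.h>
#include <stdint.h>

static uint64_t sectors_to_boundary(uint64_t offset, uint32_t max_io_len)
{
	uint32_t used;

	if (max_io_len & (max_io_len - 1))
		used = offset % max_io_len;          /* sector_div() path */
	else
		used = offset & (max_io_len - 1);    /* power-of-two path */

	return max_io_len - used;
}

int main(void)
{
	assert(sectors_to_boundary(1000, 128) == 128 - (1000 % 128)); /* 24  */
	assert(sectors_to_boundary(1000, 192) == 192 - (1000 % 192)); /* 152 */
	return 0;
}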
@@ -1196,7 +1215,10 @@ static int __clone_and_map_discard(struct clone_info *ci) | |||
1196 | if (!ti->num_discard_requests) | 1215 | if (!ti->num_discard_requests) |
1197 | return -EOPNOTSUPP; | 1216 | return -EOPNOTSUPP; |
1198 | 1217 | ||
1199 | len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); | 1218 | if (!ti->split_discard_requests) |
1219 | len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); | ||
1220 | else | ||
1221 | len = min(ci->sector_count, max_io_len(ci->sector, ti)); | ||
1200 | 1222 | ||
1201 | __issue_target_requests(ci, ti, ti->num_discard_requests, len); | 1223 | __issue_target_requests(ci, ti, ti->num_discard_requests, len); |
1202 | 1224 | ||
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index b7dacd59d8d7..52eef493d266 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
@@ -23,6 +23,11 @@ | |||
23 | #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) | 23 | #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) |
24 | 24 | ||
25 | /* | 25 | /* |
26 | * Status feature flags | ||
27 | */ | ||
28 | #define DM_STATUS_NOFLUSH_FLAG (1 << 0) | ||
29 | |||
30 | /* | ||
26 | * Type of table and mapped_device's mempool | 31 | * Type of table and mapped_device's mempool |
27 | */ | 32 | */ |
28 | #define DM_TYPE_NONE 0 | 33 | #define DM_TYPE_NONE 0 |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 1c2f9048e1ae..fcd098794d37 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -498,61 +498,13 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) | |||
498 | } | 498 | } |
499 | EXPORT_SYMBOL(md_flush_request); | 499 | EXPORT_SYMBOL(md_flush_request); |
500 | 500 | ||
501 | /* Support for plugging. | 501 | void md_unplug(struct blk_plug_cb *cb, bool from_schedule) |
502 | * This mirrors the plugging support in request_queue, but does not | ||
503 | * require having a whole queue or request structures. | ||
504 | * We allocate an md_plug_cb for each md device and each thread it gets | ||
505 | * plugged on. This links tot the private plug_handle structure in the | ||
506 | * personality data where we keep a count of the number of outstanding | ||
507 | * plugs so other code can see if a plug is active. | ||
508 | */ | ||
509 | struct md_plug_cb { | ||
510 | struct blk_plug_cb cb; | ||
511 | struct mddev *mddev; | ||
512 | }; | ||
513 | |||
514 | static void plugger_unplug(struct blk_plug_cb *cb) | ||
515 | { | ||
516 | struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb); | ||
517 | if (atomic_dec_and_test(&mdcb->mddev->plug_cnt)) | ||
518 | md_wakeup_thread(mdcb->mddev->thread); | ||
519 | kfree(mdcb); | ||
520 | } | ||
521 | |||
522 | /* Check that an unplug wakeup will come shortly. | ||
523 | * If not, wakeup the md thread immediately | ||
524 | */ | ||
525 | int mddev_check_plugged(struct mddev *mddev) | ||
526 | { | 502 | { |
527 | struct blk_plug *plug = current->plug; | 503 | struct mddev *mddev = cb->data; |
528 | struct md_plug_cb *mdcb; | 504 | md_wakeup_thread(mddev->thread); |
529 | 505 | kfree(cb); | |
530 | if (!plug) | ||
531 | return 0; | ||
532 | |||
533 | list_for_each_entry(mdcb, &plug->cb_list, cb.list) { | ||
534 | if (mdcb->cb.callback == plugger_unplug && | ||
535 | mdcb->mddev == mddev) { | ||
536 | /* Already on the list, move to top */ | ||
537 | if (mdcb != list_first_entry(&plug->cb_list, | ||
538 | struct md_plug_cb, | ||
539 | cb.list)) | ||
540 | list_move(&mdcb->cb.list, &plug->cb_list); | ||
541 | return 1; | ||
542 | } | ||
543 | } | ||
544 | /* Not currently on the callback list */ | ||
545 | mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC); | ||
546 | if (!mdcb) | ||
547 | return 0; | ||
548 | |||
549 | mdcb->mddev = mddev; | ||
550 | mdcb->cb.callback = plugger_unplug; | ||
551 | atomic_inc(&mddev->plug_cnt); | ||
552 | list_add(&mdcb->cb.list, &plug->cb_list); | ||
553 | return 1; | ||
554 | } | 506 | } |
555 | EXPORT_SYMBOL_GPL(mddev_check_plugged); | 507 | EXPORT_SYMBOL(md_unplug); |
556 | 508 | ||
557 | static inline struct mddev *mddev_get(struct mddev *mddev) | 509 | static inline struct mddev *mddev_get(struct mddev *mddev) |
558 | { | 510 | { |
@@ -602,7 +554,6 @@ void mddev_init(struct mddev *mddev) | |||
602 | atomic_set(&mddev->active, 1); | 554 | atomic_set(&mddev->active, 1); |
603 | atomic_set(&mddev->openers, 0); | 555 | atomic_set(&mddev->openers, 0); |
604 | atomic_set(&mddev->active_io, 0); | 556 | atomic_set(&mddev->active_io, 0); |
605 | atomic_set(&mddev->plug_cnt, 0); | ||
606 | spin_lock_init(&mddev->write_lock); | 557 | spin_lock_init(&mddev->write_lock); |
607 | atomic_set(&mddev->flush_pending, 0); | 558 | atomic_set(&mddev->flush_pending, 0); |
608 | init_waitqueue_head(&mddev->sb_wait); | 559 | init_waitqueue_head(&mddev->sb_wait); |
@@ -2931,6 +2882,7 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2931 | * can be sane */ | 2882 | * can be sane */ |
2932 | return -EBUSY; | 2883 | return -EBUSY; |
2933 | rdev->data_offset = offset; | 2884 | rdev->data_offset = offset; |
2885 | rdev->new_data_offset = offset; | ||
2934 | return len; | 2886 | return len; |
2935 | } | 2887 | } |
2936 | 2888 | ||
@@ -3926,8 +3878,8 @@ array_state_show(struct mddev *mddev, char *page) | |||
3926 | return sprintf(page, "%s\n", array_states[st]); | 3878 | return sprintf(page, "%s\n", array_states[st]); |
3927 | } | 3879 | } |
3928 | 3880 | ||
3929 | static int do_md_stop(struct mddev * mddev, int ro, int is_open); | 3881 | static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev); |
3930 | static int md_set_readonly(struct mddev * mddev, int is_open); | 3882 | static int md_set_readonly(struct mddev * mddev, struct block_device *bdev); |
3931 | static int do_md_run(struct mddev * mddev); | 3883 | static int do_md_run(struct mddev * mddev); |
3932 | static int restart_array(struct mddev *mddev); | 3884 | static int restart_array(struct mddev *mddev); |
3933 | 3885 | ||
@@ -3941,24 +3893,20 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) | |||
3941 | break; | 3893 | break; |
3942 | case clear: | 3894 | case clear: |
3943 | /* stopping an active array */ | 3895 | /* stopping an active array */ |
3944 | if (atomic_read(&mddev->openers) > 0) | 3896 | err = do_md_stop(mddev, 0, NULL); |
3945 | return -EBUSY; | ||
3946 | err = do_md_stop(mddev, 0, 0); | ||
3947 | break; | 3897 | break; |
3948 | case inactive: | 3898 | case inactive: |
3949 | /* stopping an active array */ | 3899 | /* stopping an active array */ |
3950 | if (mddev->pers) { | 3900 | if (mddev->pers) |
3951 | if (atomic_read(&mddev->openers) > 0) | 3901 | err = do_md_stop(mddev, 2, NULL); |
3952 | return -EBUSY; | 3902 | else |
3953 | err = do_md_stop(mddev, 2, 0); | ||
3954 | } else | ||
3955 | err = 0; /* already inactive */ | 3903 | err = 0; /* already inactive */ |
3956 | break; | 3904 | break; |
3957 | case suspended: | 3905 | case suspended: |
3958 | break; /* not supported yet */ | 3906 | break; /* not supported yet */ |
3959 | case readonly: | 3907 | case readonly: |
3960 | if (mddev->pers) | 3908 | if (mddev->pers) |
3961 | err = md_set_readonly(mddev, 0); | 3909 | err = md_set_readonly(mddev, NULL); |
3962 | else { | 3910 | else { |
3963 | mddev->ro = 1; | 3911 | mddev->ro = 1; |
3964 | set_disk_ro(mddev->gendisk, 1); | 3912 | set_disk_ro(mddev->gendisk, 1); |
@@ -3968,7 +3916,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) | |||
3968 | case read_auto: | 3916 | case read_auto: |
3969 | if (mddev->pers) { | 3917 | if (mddev->pers) { |
3970 | if (mddev->ro == 0) | 3918 | if (mddev->ro == 0) |
3971 | err = md_set_readonly(mddev, 0); | 3919 | err = md_set_readonly(mddev, NULL); |
3972 | else if (mddev->ro == 1) | 3920 | else if (mddev->ro == 1) |
3973 | err = restart_array(mddev); | 3921 | err = restart_array(mddev); |
3974 | if (err == 0) { | 3922 | if (err == 0) { |
@@ -5351,15 +5299,17 @@ void md_stop(struct mddev *mddev) | |||
5351 | } | 5299 | } |
5352 | EXPORT_SYMBOL_GPL(md_stop); | 5300 | EXPORT_SYMBOL_GPL(md_stop); |
5353 | 5301 | ||
5354 | static int md_set_readonly(struct mddev *mddev, int is_open) | 5302 | static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) |
5355 | { | 5303 | { |
5356 | int err = 0; | 5304 | int err = 0; |
5357 | mutex_lock(&mddev->open_mutex); | 5305 | mutex_lock(&mddev->open_mutex); |
5358 | if (atomic_read(&mddev->openers) > is_open) { | 5306 | if (atomic_read(&mddev->openers) > !!bdev) { |
5359 | printk("md: %s still in use.\n",mdname(mddev)); | 5307 | printk("md: %s still in use.\n",mdname(mddev)); |
5360 | err = -EBUSY; | 5308 | err = -EBUSY; |
5361 | goto out; | 5309 | goto out; |
5362 | } | 5310 | } |
5311 | if (bdev) | ||
5312 | sync_blockdev(bdev); | ||
5363 | if (mddev->pers) { | 5313 | if (mddev->pers) { |
5364 | __md_stop_writes(mddev); | 5314 | __md_stop_writes(mddev); |
5365 | 5315 | ||
@@ -5381,18 +5331,26 @@ out: | |||
5381 | * 0 - completely stop and dis-assemble array | 5331 | * 0 - completely stop and dis-assemble array |
5382 | * 2 - stop but do not disassemble array | 5332 | * 2 - stop but do not disassemble array |
5383 | */ | 5333 | */ |
5384 | static int do_md_stop(struct mddev * mddev, int mode, int is_open) | 5334 | static int do_md_stop(struct mddev * mddev, int mode, |
5335 | struct block_device *bdev) | ||
5385 | { | 5336 | { |
5386 | struct gendisk *disk = mddev->gendisk; | 5337 | struct gendisk *disk = mddev->gendisk; |
5387 | struct md_rdev *rdev; | 5338 | struct md_rdev *rdev; |
5388 | 5339 | ||
5389 | mutex_lock(&mddev->open_mutex); | 5340 | mutex_lock(&mddev->open_mutex); |
5390 | if (atomic_read(&mddev->openers) > is_open || | 5341 | if (atomic_read(&mddev->openers) > !!bdev || |
5391 | mddev->sysfs_active) { | 5342 | mddev->sysfs_active) { |
5392 | printk("md: %s still in use.\n",mdname(mddev)); | 5343 | printk("md: %s still in use.\n",mdname(mddev)); |
5393 | mutex_unlock(&mddev->open_mutex); | 5344 | mutex_unlock(&mddev->open_mutex); |
5394 | return -EBUSY; | 5345 | return -EBUSY; |
5395 | } | 5346 | } |
5347 | if (bdev) | ||
5348 | /* It is possible IO was issued on some other | ||
5349 | * open file which was closed before we took ->open_mutex. | ||
5350 | * As that was not the last close __blkdev_put will not | ||
5351 | * have called sync_blockdev, so we must. | ||
5352 | */ | ||
5353 | sync_blockdev(bdev); | ||
5396 | 5354 | ||
5397 | if (mddev->pers) { | 5355 | if (mddev->pers) { |
5398 | if (mddev->ro) | 5356 | if (mddev->ro) |
@@ -5466,7 +5424,7 @@ static void autorun_array(struct mddev *mddev) | |||
5466 | err = do_md_run(mddev); | 5424 | err = do_md_run(mddev); |
5467 | if (err) { | 5425 | if (err) { |
5468 | printk(KERN_WARNING "md: do_md_run() returned %d\n", err); | 5426 | printk(KERN_WARNING "md: do_md_run() returned %d\n", err); |
5469 | do_md_stop(mddev, 0, 0); | 5427 | do_md_stop(mddev, 0, NULL); |
5470 | } | 5428 | } |
5471 | } | 5429 | } |
5472 | 5430 | ||
@@ -5784,8 +5742,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) | |||
5784 | super_types[mddev->major_version]. | 5742 | super_types[mddev->major_version]. |
5785 | validate_super(mddev, rdev); | 5743 | validate_super(mddev, rdev); |
5786 | if ((info->state & (1<<MD_DISK_SYNC)) && | 5744 | if ((info->state & (1<<MD_DISK_SYNC)) && |
5787 | (!test_bit(In_sync, &rdev->flags) || | 5745 | rdev->raid_disk != info->raid_disk) { |
5788 | rdev->raid_disk != info->raid_disk)) { | ||
5789 | /* This was a hot-add request, but events doesn't | 5746 | /* This was a hot-add request, but events doesn't |
5790 | * match, so reject it. | 5747 | * match, so reject it. |
5791 | */ | 5748 | */ |
@@ -6482,11 +6439,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
6482 | goto done_unlock; | 6439 | goto done_unlock; |
6483 | 6440 | ||
6484 | case STOP_ARRAY: | 6441 | case STOP_ARRAY: |
6485 | err = do_md_stop(mddev, 0, 1); | 6442 | err = do_md_stop(mddev, 0, bdev); |
6486 | goto done_unlock; | 6443 | goto done_unlock; |
6487 | 6444 | ||
6488 | case STOP_ARRAY_RO: | 6445 | case STOP_ARRAY_RO: |
6489 | err = md_set_readonly(mddev, 1); | 6446 | err = md_set_readonly(mddev, bdev); |
6490 | goto done_unlock; | 6447 | goto done_unlock; |
6491 | 6448 | ||
6492 | case BLKROSET: | 6449 | case BLKROSET: |
@@ -6751,7 +6708,7 @@ struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev | |||
6751 | thread->tsk = kthread_run(md_thread, thread, | 6708 | thread->tsk = kthread_run(md_thread, thread, |
6752 | "%s_%s", | 6709 | "%s_%s", |
6753 | mdname(thread->mddev), | 6710 | mdname(thread->mddev), |
6754 | name ?: mddev->pers->name); | 6711 | name); |
6755 | if (IS_ERR(thread->tsk)) { | 6712 | if (IS_ERR(thread->tsk)) { |
6756 | kfree(thread); | 6713 | kfree(thread); |
6757 | return NULL; | 6714 | return NULL; |
@@ -7298,6 +7255,7 @@ void md_do_sync(struct mddev *mddev) | |||
7298 | int skipped = 0; | 7255 | int skipped = 0; |
7299 | struct md_rdev *rdev; | 7256 | struct md_rdev *rdev; |
7300 | char *desc; | 7257 | char *desc; |
7258 | struct blk_plug plug; | ||
7301 | 7259 | ||
7302 | /* just incase thread restarts... */ | 7260 | /* just incase thread restarts... */ |
7303 | if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) | 7261 | if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) |
@@ -7447,6 +7405,7 @@ void md_do_sync(struct mddev *mddev) | |||
7447 | } | 7405 | } |
7448 | mddev->curr_resync_completed = j; | 7406 | mddev->curr_resync_completed = j; |
7449 | 7407 | ||
7408 | blk_start_plug(&plug); | ||
7450 | while (j < max_sectors) { | 7409 | while (j < max_sectors) { |
7451 | sector_t sectors; | 7410 | sector_t sectors; |
7452 | 7411 | ||
@@ -7552,6 +7511,7 @@ void md_do_sync(struct mddev *mddev) | |||
7552 | * this also signals 'finished resyncing' to md_stop | 7511 | * this also signals 'finished resyncing' to md_stop |
7553 | */ | 7512 | */ |
7554 | out: | 7513 | out: |
7514 | blk_finish_plug(&plug); | ||
7555 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); | 7515 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); |
7556 | 7516 | ||
7557 | /* tell personality that we are finished */ | 7517 | /* tell personality that we are finished */ |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 7b4a3c318cae..f385b038589d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -266,9 +266,6 @@ struct mddev { | |||
266 | int new_chunk_sectors; | 266 | int new_chunk_sectors; |
267 | int reshape_backwards; | 267 | int reshape_backwards; |
268 | 268 | ||
269 | atomic_t plug_cnt; /* If device is expecting | ||
270 | * more bios soon. | ||
271 | */ | ||
272 | struct md_thread *thread; /* management thread */ | 269 | struct md_thread *thread; /* management thread */ |
273 | struct md_thread *sync_thread; /* doing resync or reconstruct */ | 270 | struct md_thread *sync_thread; /* doing resync or reconstruct */ |
274 | sector_t curr_resync; /* last block scheduled */ | 271 | sector_t curr_resync; /* last block scheduled */ |
@@ -630,6 +627,12 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, | |||
630 | struct mddev *mddev); | 627 | struct mddev *mddev); |
631 | extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, | 628 | extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, |
632 | struct mddev *mddev); | 629 | struct mddev *mddev); |
633 | extern int mddev_check_plugged(struct mddev *mddev); | ||
634 | extern void md_trim_bio(struct bio *bio, int offset, int size); | 630 | extern void md_trim_bio(struct bio *bio, int offset, int size); |
631 | |||
632 | extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); | ||
633 | static inline int mddev_check_plugged(struct mddev *mddev) | ||
634 | { | ||
635 | return !!blk_check_plugged(md_unplug, mddev, | ||
636 | sizeof(struct blk_plug_cb)); | ||
637 | } | ||
635 | #endif /* _MD_MD_H */ | 638 | #endif /* _MD_MD_H */ |
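With the open-coded md plugging removed, mddev_check_plugged() becomes a thin wrapper around the block layer's blk_check_plugged(), and md_unplug() wakes the md thread when the plug is released. The usual caller pattern in a personality's write path looks roughly like the sketch below (kernel context; queue_pending_write() is illustrative, the two md helpers are the real ones):

static void queue_pending_write(struct mddev *mddev /* , struct bio *bio, ... */)
{
	/* ... add the bio to the personality's pending list ... */

	/* Only wake the thread now if no unplug callback is pending. */
	if (!mddev_check_plugged(mddev))
		md_wakeup_thread(mddev->thread);
}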
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 9339e67fcc79..61a1833ebaf3 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -474,7 +474,8 @@ static int multipath_run (struct mddev *mddev) | |||
474 | } | 474 | } |
475 | 475 | ||
476 | { | 476 | { |
477 | mddev->thread = md_register_thread(multipathd, mddev, NULL); | 477 | mddev->thread = md_register_thread(multipathd, mddev, |
478 | "multipath"); | ||
478 | if (!mddev->thread) { | 479 | if (!mddev->thread) { |
479 | printk(KERN_ERR "multipath: couldn't allocate thread" | 480 | printk(KERN_ERR "multipath: couldn't allocate thread" |
480 | " for %s\n", mdname(mddev)); | 481 | " for %s\n", mdname(mddev)); |
diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile index cfa95f662230..d8e7cb767c1e 100644 --- a/drivers/md/persistent-data/Makefile +++ b/drivers/md/persistent-data/Makefile | |||
@@ -1,7 +1,6 @@ | |||
1 | obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o | 1 | obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o |
2 | dm-persistent-data-objs := \ | 2 | dm-persistent-data-objs := \ |
3 | dm-block-manager.o \ | 3 | dm-block-manager.o \ |
4 | dm-space-map-checker.o \ | ||
5 | dm-space-map-common.o \ | 4 | dm-space-map-common.o \ |
6 | dm-space-map-disk.o \ | 5 | dm-space-map-disk.o \ |
7 | dm-space-map-metadata.o \ | 6 | dm-space-map-metadata.o \ |
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 0317ecdc6e53..5ba277768d99 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c | |||
@@ -325,11 +325,6 @@ static struct dm_buffer *to_buffer(struct dm_block *b) | |||
325 | return (struct dm_buffer *) b; | 325 | return (struct dm_buffer *) b; |
326 | } | 326 | } |
327 | 327 | ||
328 | static struct dm_bufio_client *to_bufio(struct dm_block_manager *bm) | ||
329 | { | ||
330 | return (struct dm_bufio_client *) bm; | ||
331 | } | ||
332 | |||
333 | dm_block_t dm_block_location(struct dm_block *b) | 328 | dm_block_t dm_block_location(struct dm_block *b) |
334 | { | 329 | { |
335 | return dm_bufio_get_block_number(to_buffer(b)); | 330 | return dm_bufio_get_block_number(to_buffer(b)); |
@@ -367,34 +362,60 @@ static void dm_block_manager_write_callback(struct dm_buffer *buf) | |||
367 | /*---------------------------------------------------------------- | 362 | /*---------------------------------------------------------------- |
368 | * Public interface | 363 | * Public interface |
369 | *--------------------------------------------------------------*/ | 364 | *--------------------------------------------------------------*/ |
365 | struct dm_block_manager { | ||
366 | struct dm_bufio_client *bufio; | ||
367 | bool read_only:1; | ||
368 | }; | ||
369 | |||
370 | struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, | 370 | struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, |
371 | unsigned block_size, | 371 | unsigned block_size, |
372 | unsigned cache_size, | 372 | unsigned cache_size, |
373 | unsigned max_held_per_thread) | 373 | unsigned max_held_per_thread) |
374 | { | 374 | { |
375 | return (struct dm_block_manager *) | 375 | int r; |
376 | dm_bufio_client_create(bdev, block_size, max_held_per_thread, | 376 | struct dm_block_manager *bm; |
377 | sizeof(struct buffer_aux), | 377 | |
378 | dm_block_manager_alloc_callback, | 378 | bm = kmalloc(sizeof(*bm), GFP_KERNEL); |
379 | dm_block_manager_write_callback); | 379 | if (!bm) { |
380 | r = -ENOMEM; | ||
381 | goto bad; | ||
382 | } | ||
383 | |||
384 | bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread, | ||
385 | sizeof(struct buffer_aux), | ||
386 | dm_block_manager_alloc_callback, | ||
387 | dm_block_manager_write_callback); | ||
388 | if (IS_ERR(bm->bufio)) { | ||
389 | r = PTR_ERR(bm->bufio); | ||
390 | kfree(bm); | ||
391 | goto bad; | ||
392 | } | ||
393 | |||
394 | bm->read_only = false; | ||
395 | |||
396 | return bm; | ||
397 | |||
398 | bad: | ||
399 | return ERR_PTR(r); | ||
380 | } | 400 | } |
381 | EXPORT_SYMBOL_GPL(dm_block_manager_create); | 401 | EXPORT_SYMBOL_GPL(dm_block_manager_create); |
382 | 402 | ||
383 | void dm_block_manager_destroy(struct dm_block_manager *bm) | 403 | void dm_block_manager_destroy(struct dm_block_manager *bm) |
384 | { | 404 | { |
385 | return dm_bufio_client_destroy(to_bufio(bm)); | 405 | dm_bufio_client_destroy(bm->bufio); |
406 | kfree(bm); | ||
386 | } | 407 | } |
387 | EXPORT_SYMBOL_GPL(dm_block_manager_destroy); | 408 | EXPORT_SYMBOL_GPL(dm_block_manager_destroy); |
388 | 409 | ||
389 | unsigned dm_bm_block_size(struct dm_block_manager *bm) | 410 | unsigned dm_bm_block_size(struct dm_block_manager *bm) |
390 | { | 411 | { |
391 | return dm_bufio_get_block_size(to_bufio(bm)); | 412 | return dm_bufio_get_block_size(bm->bufio); |
392 | } | 413 | } |
393 | EXPORT_SYMBOL_GPL(dm_bm_block_size); | 414 | EXPORT_SYMBOL_GPL(dm_bm_block_size); |
394 | 415 | ||
395 | dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm) | 416 | dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm) |
396 | { | 417 | { |
397 | return dm_bufio_get_device_size(to_bufio(bm)); | 418 | return dm_bufio_get_device_size(bm->bufio); |
398 | } | 419 | } |
399 | 420 | ||
400 | static int dm_bm_validate_buffer(struct dm_block_manager *bm, | 421 | static int dm_bm_validate_buffer(struct dm_block_manager *bm, |
@@ -406,7 +427,7 @@ static int dm_bm_validate_buffer(struct dm_block_manager *bm, | |||
406 | int r; | 427 | int r; |
407 | if (!v) | 428 | if (!v) |
408 | return 0; | 429 | return 0; |
409 | r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(to_bufio(bm))); | 430 | r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio)); |
410 | if (unlikely(r)) | 431 | if (unlikely(r)) |
411 | return r; | 432 | return r; |
412 | aux->validator = v; | 433 | aux->validator = v; |
@@ -430,7 +451,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, | |||
430 | void *p; | 451 | void *p; |
431 | int r; | 452 | int r; |
432 | 453 | ||
433 | p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result); | 454 | p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); |
434 | if (unlikely(IS_ERR(p))) | 455 | if (unlikely(IS_ERR(p))) |
435 | return PTR_ERR(p); | 456 | return PTR_ERR(p); |
436 | 457 | ||
@@ -463,7 +484,10 @@ int dm_bm_write_lock(struct dm_block_manager *bm, | |||
463 | void *p; | 484 | void *p; |
464 | int r; | 485 | int r; |
465 | 486 | ||
466 | p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result); | 487 | if (bm->read_only) |
488 | return -EPERM; | ||
489 | |||
490 | p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); | ||
467 | if (unlikely(IS_ERR(p))) | 491 | if (unlikely(IS_ERR(p))) |
468 | return PTR_ERR(p); | 492 | return PTR_ERR(p); |
469 | 493 | ||
@@ -496,7 +520,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm, | |||
496 | void *p; | 520 | void *p; |
497 | int r; | 521 | int r; |
498 | 522 | ||
499 | p = dm_bufio_get(to_bufio(bm), b, (struct dm_buffer **) result); | 523 | p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result); |
500 | if (unlikely(IS_ERR(p))) | 524 | if (unlikely(IS_ERR(p))) |
501 | return PTR_ERR(p); | 525 | return PTR_ERR(p); |
502 | if (unlikely(!p)) | 526 | if (unlikely(!p)) |
@@ -529,7 +553,10 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, | |||
529 | struct buffer_aux *aux; | 553 | struct buffer_aux *aux; |
530 | void *p; | 554 | void *p; |
531 | 555 | ||
532 | p = dm_bufio_new(to_bufio(bm), b, (struct dm_buffer **) result); | 556 | if (bm->read_only) |
557 | return -EPERM; | ||
558 | |||
559 | p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result); | ||
533 | if (unlikely(IS_ERR(p))) | 560 | if (unlikely(IS_ERR(p))) |
534 | return PTR_ERR(p); | 561 | return PTR_ERR(p); |
535 | 562 | ||
@@ -547,6 +574,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, | |||
547 | 574 | ||
548 | return 0; | 575 | return 0; |
549 | } | 576 | } |
577 | EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero); | ||
550 | 578 | ||
551 | int dm_bm_unlock(struct dm_block *b) | 579 | int dm_bm_unlock(struct dm_block *b) |
552 | { | 580 | { |
@@ -565,45 +593,30 @@ int dm_bm_unlock(struct dm_block *b) | |||
565 | } | 593 | } |
566 | EXPORT_SYMBOL_GPL(dm_bm_unlock); | 594 | EXPORT_SYMBOL_GPL(dm_bm_unlock); |
567 | 595 | ||
568 | int dm_bm_unlock_move(struct dm_block *b, dm_block_t n) | ||
569 | { | ||
570 | struct buffer_aux *aux; | ||
571 | |||
572 | aux = dm_bufio_get_aux_data(to_buffer(b)); | ||
573 | |||
574 | if (aux->write_locked) { | ||
575 | dm_bufio_mark_buffer_dirty(to_buffer(b)); | ||
576 | bl_up_write(&aux->lock); | ||
577 | } else | ||
578 | bl_up_read(&aux->lock); | ||
579 | |||
580 | dm_bufio_release_move(to_buffer(b), n); | ||
581 | return 0; | ||
582 | } | ||
583 | |||
584 | int dm_bm_flush_and_unlock(struct dm_block_manager *bm, | 596 | int dm_bm_flush_and_unlock(struct dm_block_manager *bm, |
585 | struct dm_block *superblock) | 597 | struct dm_block *superblock) |
586 | { | 598 | { |
587 | int r; | 599 | int r; |
588 | 600 | ||
589 | r = dm_bufio_write_dirty_buffers(to_bufio(bm)); | 601 | if (bm->read_only) |
590 | if (unlikely(r)) | 602 | return -EPERM; |
591 | return r; | 603 | |
592 | r = dm_bufio_issue_flush(to_bufio(bm)); | 604 | r = dm_bufio_write_dirty_buffers(bm->bufio); |
593 | if (unlikely(r)) | 605 | if (unlikely(r)) { |
606 | dm_bm_unlock(superblock); | ||
594 | return r; | 607 | return r; |
608 | } | ||
595 | 609 | ||
596 | dm_bm_unlock(superblock); | 610 | dm_bm_unlock(superblock); |
597 | 611 | ||
598 | r = dm_bufio_write_dirty_buffers(to_bufio(bm)); | 612 | return dm_bufio_write_dirty_buffers(bm->bufio); |
599 | if (unlikely(r)) | 613 | } |
600 | return r; | ||
601 | r = dm_bufio_issue_flush(to_bufio(bm)); | ||
602 | if (unlikely(r)) | ||
603 | return r; | ||
604 | 614 | ||
605 | return 0; | 615 | void dm_bm_set_read_only(struct dm_block_manager *bm) |
616 | { | ||
617 | bm->read_only = true; | ||
606 | } | 618 | } |
619 | EXPORT_SYMBOL_GPL(dm_bm_set_read_only); | ||
607 | 620 | ||
608 | u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) | 621 | u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) |
609 | { | 622 | { |
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index 924833d2dfa6..be5bff61be28 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h | |||
@@ -97,14 +97,6 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b, | |||
97 | int dm_bm_unlock(struct dm_block *b); | 97 | int dm_bm_unlock(struct dm_block *b); |
98 | 98 | ||
99 | /* | 99 | /* |
100 | * An optimisation; we often want to copy a block's contents to a new | ||
101 | * block. eg, as part of the shadowing operation. It's far better for | ||
102 | * bufio to do this move behind the scenes than hold 2 locks and memcpy the | ||
103 | * data. | ||
104 | */ | ||
105 | int dm_bm_unlock_move(struct dm_block *b, dm_block_t n); | ||
106 | |||
107 | /* | ||
108 | * It's a common idiom to have a superblock that should be committed last. | 100 | * It's a common idiom to have a superblock that should be committed last. |
109 | * | 101 | * |
110 | * @superblock should be write-locked on entry. It will be unlocked during | 102 | * @superblock should be write-locked on entry. It will be unlocked during |
@@ -116,6 +108,19 @@ int dm_bm_unlock_move(struct dm_block *b, dm_block_t n); | |||
116 | int dm_bm_flush_and_unlock(struct dm_block_manager *bm, | 108 | int dm_bm_flush_and_unlock(struct dm_block_manager *bm, |
117 | struct dm_block *superblock); | 109 | struct dm_block *superblock); |
118 | 110 | ||
111 | /* | ||
112 | * Switches the bm to a read only mode. Once read-only mode | ||
113 | * has been entered the following functions will return -EPERM. | ||
114 | * | ||
115 | * dm_bm_write_lock | ||
116 | * dm_bm_write_lock_zero | ||
117 | * dm_bm_flush_and_unlock | ||
118 | * | ||
119 | * Additionally you should not use dm_bm_unlock_move, however no error will | ||
120 | * be returned if you do. | ||
121 | */ | ||
122 | void dm_bm_set_read_only(struct dm_block_manager *bm); | ||
123 | |||
119 | u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); | 124 | u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); |
120 | 125 | ||
121 | /*----------------------------------------------------------------*/ | 126 | /*----------------------------------------------------------------*/ |
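dm_bm_set_read_only() lets the metadata layer fence off further writes after an error: per the new header comment, dm_bm_write_lock(), dm_bm_write_lock_zero() and dm_bm_flush_and_unlock() then return -EPERM. A kernel-context sketch of that behaviour; the exact argument order of the write-lock call is assumed from the prototypes above:

static void metadata_operation_failed(struct dm_block_manager *bm)
{
	struct dm_block *b;

	dm_bm_set_read_only(bm);

	/* Any later attempt to dirty a block now fails cleanly. */
	WARN_ON(dm_bm_write_lock_zero(bm, 0, NULL, &b) != -EPERM);
}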
diff --git a/drivers/md/persistent-data/dm-space-map-checker.c b/drivers/md/persistent-data/dm-space-map-checker.c deleted file mode 100644 index 50ed53bf4aa2..000000000000 --- a/drivers/md/persistent-data/dm-space-map-checker.c +++ /dev/null | |||
@@ -1,438 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2011 Red Hat, Inc. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm-space-map-checker.h" | ||
8 | |||
9 | #include <linux/device-mapper.h> | ||
10 | #include <linux/export.h> | ||
11 | |||
12 | #ifdef CONFIG_DM_DEBUG_SPACE_MAPS | ||
13 | |||
14 | #define DM_MSG_PREFIX "space map checker" | ||
15 | |||
16 | /*----------------------------------------------------------------*/ | ||
17 | |||
18 | struct count_array { | ||
19 | dm_block_t nr; | ||
20 | dm_block_t nr_free; | ||
21 | |||
22 | uint32_t *counts; | ||
23 | }; | ||
24 | |||
25 | static int ca_get_count(struct count_array *ca, dm_block_t b, uint32_t *count) | ||
26 | { | ||
27 | if (b >= ca->nr) | ||
28 | return -EINVAL; | ||
29 | |||
30 | *count = ca->counts[b]; | ||
31 | return 0; | ||
32 | } | ||
33 | |||
34 | static int ca_count_more_than_one(struct count_array *ca, dm_block_t b, int *r) | ||
35 | { | ||
36 | if (b >= ca->nr) | ||
37 | return -EINVAL; | ||
38 | |||
39 | *r = ca->counts[b] > 1; | ||
40 | return 0; | ||
41 | } | ||
42 | |||
43 | static int ca_set_count(struct count_array *ca, dm_block_t b, uint32_t count) | ||
44 | { | ||
45 | uint32_t old_count; | ||
46 | |||
47 | if (b >= ca->nr) | ||
48 | return -EINVAL; | ||
49 | |||
50 | old_count = ca->counts[b]; | ||
51 | |||
52 | if (!count && old_count) | ||
53 | ca->nr_free++; | ||
54 | |||
55 | else if (count && !old_count) | ||
56 | ca->nr_free--; | ||
57 | |||
58 | ca->counts[b] = count; | ||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | static int ca_inc_block(struct count_array *ca, dm_block_t b) | ||
63 | { | ||
64 | if (b >= ca->nr) | ||
65 | return -EINVAL; | ||
66 | |||
67 | ca_set_count(ca, b, ca->counts[b] + 1); | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | static int ca_dec_block(struct count_array *ca, dm_block_t b) | ||
72 | { | ||
73 | if (b >= ca->nr) | ||
74 | return -EINVAL; | ||
75 | |||
76 | BUG_ON(ca->counts[b] == 0); | ||
77 | ca_set_count(ca, b, ca->counts[b] - 1); | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | static int ca_create(struct count_array *ca, struct dm_space_map *sm) | ||
82 | { | ||
83 | int r; | ||
84 | dm_block_t nr_blocks; | ||
85 | |||
86 | r = dm_sm_get_nr_blocks(sm, &nr_blocks); | ||
87 | if (r) | ||
88 | return r; | ||
89 | |||
90 | ca->nr = nr_blocks; | ||
91 | ca->nr_free = nr_blocks; | ||
92 | ca->counts = kzalloc(sizeof(*ca->counts) * nr_blocks, GFP_KERNEL); | ||
93 | if (!ca->counts) | ||
94 | return -ENOMEM; | ||
95 | |||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | static int ca_load(struct count_array *ca, struct dm_space_map *sm) | ||
100 | { | ||
101 | int r; | ||
102 | uint32_t count; | ||
103 | dm_block_t nr_blocks, i; | ||
104 | |||
105 | r = dm_sm_get_nr_blocks(sm, &nr_blocks); | ||
106 | if (r) | ||
107 | return r; | ||
108 | |||
109 | BUG_ON(ca->nr != nr_blocks); | ||
110 | |||
111 | DMWARN("Loading debug space map from disk. This may take some time"); | ||
112 | for (i = 0; i < nr_blocks; i++) { | ||
113 | r = dm_sm_get_count(sm, i, &count); | ||
114 | if (r) { | ||
115 | DMERR("load failed"); | ||
116 | return r; | ||
117 | } | ||
118 | |||
119 | ca_set_count(ca, i, count); | ||
120 | } | ||
121 | DMWARN("Load complete"); | ||
122 | |||
123 | return 0; | ||
124 | } | ||
125 | |||
126 | static int ca_extend(struct count_array *ca, dm_block_t extra_blocks) | ||
127 | { | ||
128 | dm_block_t nr_blocks = ca->nr + extra_blocks; | ||
129 | uint32_t *counts = kzalloc(sizeof(*counts) * nr_blocks, GFP_KERNEL); | ||
130 | if (!counts) | ||
131 | return -ENOMEM; | ||
132 | |||
133 | memcpy(counts, ca->counts, sizeof(*counts) * ca->nr); | ||
134 | kfree(ca->counts); | ||
135 | ca->nr = nr_blocks; | ||
136 | ca->nr_free += extra_blocks; | ||
137 | ca->counts = counts; | ||
138 | return 0; | ||
139 | } | ||
140 | |||
141 | static int ca_commit(struct count_array *old, struct count_array *new) | ||
142 | { | ||
143 | if (old->nr != new->nr) { | ||
144 | BUG_ON(old->nr > new->nr); | ||
145 | ca_extend(old, new->nr - old->nr); | ||
146 | } | ||
147 | |||
148 | BUG_ON(old->nr != new->nr); | ||
149 | old->nr_free = new->nr_free; | ||
150 | memcpy(old->counts, new->counts, sizeof(*old->counts) * old->nr); | ||
151 | return 0; | ||
152 | } | ||
153 | |||
154 | static void ca_destroy(struct count_array *ca) | ||
155 | { | ||
156 | kfree(ca->counts); | ||
157 | } | ||
158 | |||
159 | /*----------------------------------------------------------------*/ | ||
160 | |||
161 | struct sm_checker { | ||
162 | struct dm_space_map sm; | ||
163 | |||
164 | struct count_array old_counts; | ||
165 | struct count_array counts; | ||
166 | |||
167 | struct dm_space_map *real_sm; | ||
168 | }; | ||
169 | |||
170 | static void sm_checker_destroy(struct dm_space_map *sm) | ||
171 | { | ||
172 | struct sm_checker *smc = container_of(sm, struct sm_checker, sm); | ||
173 | |||
174 | dm_sm_destroy(smc->real_sm); | ||
175 | ca_destroy(&smc->old_counts); | ||
176 | ca_destroy(&smc->counts); | ||
177 | kfree(smc); | ||
178 | } | ||
179 | |||
180 | static int sm_checker_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) | ||
181 | { | ||
182 | struct sm_checker *smc = container_of(sm, struct sm_checker, sm); | ||
183 | int r = dm_sm_get_nr_blocks(smc->real_sm, count); | ||
184 | if (!r) | ||
185 | BUG_ON(smc->old_counts.nr != *count); | ||
186 | return r; | ||
187 | } | ||
188 | |||
189 | static int sm_checker_get_nr_free(struct dm_space_map *sm, dm_block_t *count) | ||
190 | { | ||
191 | struct sm_checker *smc = container_of(sm, struct sm_checker, sm); | ||
192 | int r = dm_sm_get_nr_free(smc->real_sm, count); | ||
193 | if (!r) { | ||
194 | /* | ||
195 | * Slow, but we know it's correct. | ||
196 | */ | ||
197 | dm_block_t b, n = 0; | ||
198 | for (b = 0; b < smc->old_counts.nr; b++) | ||
199 | if (smc->old_counts.counts[b] == 0 && | ||
200 | smc->counts.counts[b] == 0) | ||
201 | n++; | ||
202 | |||
203 | if (n != *count) | ||
204 | DMERR("free block counts differ, checker %u, sm-disk:%u", | ||
205 | (unsigned) n, (unsigned) *count); | ||
206 | } | ||
207 | return r; | ||
208 | } | ||
209 | |||
210 | static int sm_checker_new_block(struct dm_space_map *sm, dm_block_t *b) | ||
211 | { | ||
212 | struct sm_checker *smc = container_of(sm, struct sm_checker, sm); | ||
213 | int r = dm_sm_new_block(smc->real_sm, b); | ||
214 | |||
215 | if (!r) { | ||
216 | BUG_ON(*b >= smc->old_counts.nr); | ||
217 | BUG_ON(smc->old_counts.counts[*b] != 0); | ||
218 | BUG_ON(*b >= smc->counts.nr); | ||
219 | BUG_ON(smc->counts.counts[*b] != 0); | ||
220 | ca_set_count(&smc->counts, *b, 1); | ||
221 | } | ||
222 | |||
223 | return r; | ||
224 | } | ||
225 | |||
226 | static int sm_checker_inc_block(struct dm_space_map *sm, dm_block_t b) | ||
227 | { | ||
228 | struct sm_checker *smc = container_of(sm, struct sm_checker, sm); | ||
229 | int r = dm_sm_inc_block(smc->real_sm, b); | ||
230 | int r2 = ca_inc_block(&smc->counts, b); | ||
231 | BUG_ON(r != r2); | ||
232 | return r; | ||
233 | } | ||
234 | |||
235 | static int sm_checker_dec_block(struct dm_space_map *sm, dm_block_t b) | ||
236 | { | ||
237 | struct sm_checker *smc = container_of(sm, struct sm_checker, sm); | ||
238 | int r = dm_sm_dec_block(smc->real_sm, b); | ||
239 | int r2 = ca_dec_block(&smc->counts, b); | ||
240 | BUG_ON(r != r2); | ||
241 | return r; | ||
242 | } | ||
243 | |||
244 | static int sm_checker_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result) | ||
245 | { | ||
246 | struct sm_checker *smc = container_of(sm, struct sm_checker, sm); | ||
247 | uint32_t result2 = 0; | ||
248 | int r = dm_sm_get_count(smc->real_sm, b, result); | ||
249 | int r2 = ca_get_count(&smc->counts, b, &result2); | ||
250 | |||
251 | BUG_ON(r != r2); | ||
252 | if (!r) | ||
253 | BUG_ON(*result != result2); | ||
254 | return r; | ||
255 | } | ||
256 | |||
257 | static int sm_checker_count_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result) | ||
258 | { | ||
259 | struct sm_checker *smc = container_of(sm, struct sm_checker, sm); | ||
260 | int result2 = 0; | ||
261 | int r = dm_sm_count_is_more_than_one(smc->real_sm, b, result); | ||
262 | int r2 = ca_count_more_than_one(&smc->counts, b, &result2); | ||
263 | |||
264 | BUG_ON(r != r2); | ||
265 | if (!r) | ||
266 | BUG_ON(!(*result) && result2); | ||
267 | return r; | ||
268 | } | ||
269 | |||
270 | static int sm_checker_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) | ||
271 | { | ||
272 | struct sm_checker *smc = container_of(sm, struct sm_checker, sm); | ||
273 | uint32_t old_rc; | ||
274 | int r = dm_sm_set_count(smc->real_sm, b, count); | ||
275 | int r2; | ||
276 | |||
277 | BUG_ON(b >= smc->counts.nr); | ||
278 | old_rc = smc->counts.counts[b]; | ||
279 | r2 = ca_set_count(&smc->counts, b, count); | ||
280 | BUG_ON(r != r2); | ||
281 | |||
282 | return r; | ||
283 | } | ||
284 | |||
285 | static int sm_checker_commit(struct dm_space_map *sm) | ||
286 | { | ||
287 | struct sm_checker *smc = container_of(sm, struct sm_checker, sm); | ||
288 | int r; | ||
289 | |||
290 | r = dm_sm_commit(smc->real_sm); | ||
291 | if (r) | ||
292 | return r; | ||
293 | |||
294 | r = ca_commit(&smc->old_counts, &smc->counts); | ||
295 | if (r) | ||
296 | return r; | ||
297 | |||
298 | return 0; | ||
299 | } | ||
300 | |||
301 | static int sm_checker_extend(struct dm_space_map *sm, dm_block_t extra_blocks) | ||
302 | { | ||
303 | struct sm_checker *smc = container_of(sm, struct sm_checker, sm); | ||
304 | int r = dm_sm_extend(smc->real_sm, extra_blocks); | ||
305 | if (r) | ||
306 | return r; | ||
307 | |||
308 | return ca_extend(&smc->counts, extra_blocks); | ||
309 | } | ||
310 | |||
311 | static int sm_checker_root_size(struct dm_space_map *sm, size_t *result) | ||
312 | { | ||
313 | struct sm_checker *smc = container_of(sm, struct sm_checker, sm); | ||
314 | return dm_sm_root_size(smc->real_sm, result); | ||
315 | } | ||
316 | |||
317 | static int sm_checker_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len) | ||
318 | { | ||
319 | struct sm_checker *smc = container_of(sm, struct sm_checker, sm); | ||
320 | return dm_sm_copy_root(smc->real_sm, copy_to_here_le, len); | ||
321 | } | ||
322 | |||
323 | /*----------------------------------------------------------------*/ | ||
324 | |||
325 | static struct dm_space_map ops_ = { | ||
326 | .destroy = sm_checker_destroy, | ||
327 | .get_nr_blocks = sm_checker_get_nr_blocks, | ||
328 | .get_nr_free = sm_checker_get_nr_free, | ||
329 | .inc_block = sm_checker_inc_block, | ||
330 | .dec_block = sm_checker_dec_block, | ||
331 | .new_block = sm_checker_new_block, | ||
332 | .get_count = sm_checker_get_count, | ||
333 | .count_is_more_than_one = sm_checker_count_more_than_one, | ||
334 | .set_count = sm_checker_set_count, | ||
335 | .commit = sm_checker_commit, | ||
336 | .extend = sm_checker_extend, | ||
337 | .root_size = sm_checker_root_size, | ||
338 | .copy_root = sm_checker_copy_root | ||
339 | }; | ||
340 | |||
341 | struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) | ||
342 | { | ||
343 | int r; | ||
344 | struct sm_checker *smc; | ||
345 | |||
346 | if (!sm) | ||
347 | return NULL; | ||
348 | |||
349 | smc = kmalloc(sizeof(*smc), GFP_KERNEL); | ||
350 | if (!smc) | ||
351 | return NULL; | ||
352 | |||
353 | memcpy(&smc->sm, &ops_, sizeof(smc->sm)); | ||
354 | r = ca_create(&smc->old_counts, sm); | ||
355 | if (r) { | ||
356 | kfree(smc); | ||
357 | return NULL; | ||
358 | } | ||
359 | |||
360 | r = ca_create(&smc->counts, sm); | ||
361 | if (r) { | ||
362 | ca_destroy(&smc->old_counts); | ||
363 | kfree(smc); | ||
364 | return NULL; | ||
365 | } | ||
366 | |||
367 | smc->real_sm = sm; | ||
368 | |||
369 | r = ca_load(&smc->counts, sm); | ||
370 | if (r) { | ||
371 | ca_destroy(&smc->counts); | ||
372 | ca_destroy(&smc->old_counts); | ||
373 | kfree(smc); | ||
374 | return NULL; | ||
375 | } | ||
376 | |||
377 | r = ca_commit(&smc->old_counts, &smc->counts); | ||
378 | if (r) { | ||
379 | ca_destroy(&smc->counts); | ||
380 | ca_destroy(&smc->old_counts); | ||
381 | kfree(smc); | ||
382 | return NULL; | ||
383 | } | ||
384 | |||
385 | return &smc->sm; | ||
386 | } | ||
387 | EXPORT_SYMBOL_GPL(dm_sm_checker_create); | ||
388 | |||
389 | struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) | ||
390 | { | ||
391 | int r; | ||
392 | struct sm_checker *smc; | ||
393 | |||
394 | if (!sm) | ||
395 | return NULL; | ||
396 | |||
397 | smc = kmalloc(sizeof(*smc), GFP_KERNEL); | ||
398 | if (!smc) | ||
399 | return NULL; | ||
400 | |||
401 | memcpy(&smc->sm, &ops_, sizeof(smc->sm)); | ||
402 | r = ca_create(&smc->old_counts, sm); | ||
403 | if (r) { | ||
404 | kfree(smc); | ||
405 | return NULL; | ||
406 | } | ||
407 | |||
408 | r = ca_create(&smc->counts, sm); | ||
409 | if (r) { | ||
410 | ca_destroy(&smc->old_counts); | ||
411 | kfree(smc); | ||
412 | return NULL; | ||
413 | } | ||
414 | |||
415 | smc->real_sm = sm; | ||
416 | return &smc->sm; | ||
417 | } | ||
418 | EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh); | ||
419 | |||
420 | /*----------------------------------------------------------------*/ | ||
421 | |||
422 | #else | ||
423 | |||
424 | struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) | ||
425 | { | ||
426 | return sm; | ||
427 | } | ||
428 | EXPORT_SYMBOL_GPL(dm_sm_checker_create); | ||
429 | |||
430 | struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) | ||
431 | { | ||
432 | return sm; | ||
433 | } | ||
434 | EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh); | ||
435 | |||
436 | /*----------------------------------------------------------------*/ | ||
437 | |||
438 | #endif | ||
diff --git a/drivers/md/persistent-data/dm-space-map-checker.h b/drivers/md/persistent-data/dm-space-map-checker.h deleted file mode 100644 index 444dccf6688c..000000000000 --- a/drivers/md/persistent-data/dm-space-map-checker.h +++ /dev/null | |||
@@ -1,26 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2011 Red Hat, Inc. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef SNAPSHOTS_SPACE_MAP_CHECKER_H | ||
8 | #define SNAPSHOTS_SPACE_MAP_CHECKER_H | ||
9 | |||
10 | #include "dm-space-map.h" | ||
11 | |||
12 | /*----------------------------------------------------------------*/ | ||
13 | |||
14 | /* | ||
15 | * This space map wraps a real on-disk space map, and verifies all of its | ||
16 | * operations. It uses a lot of memory, so only use if you have a specific | ||
17 | * problem that you're debugging. | ||
18 | * | ||
19 | * Ownership of @sm passes. | ||
20 | */ | ||
21 | struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm); | ||
22 | struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm); | ||
23 | |||
24 | /*----------------------------------------------------------------*/ | ||
25 | |||
26 | #endif | ||
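The space-map checker being deleted above is a checking decorator: it embeds a struct dm_space_map whose ops table points at the sm_checker_* functions, recovers the wrapper with container_of(), forwards every call to real_sm, and cross-checks the result against an in-memory count_array. A minimal standalone C sketch of that wrapping pattern (userspace only; the struct and function names below are illustrative, not the kernel API):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Toy "space map" interface: a vtable of operations. */
struct space_map {
	int (*inc_block)(struct space_map *sm, uint64_t b);
	int (*get_count)(struct space_map *sm, uint64_t b, uint32_t *result);
};

/* A trivial "real" space map backed by an array of reference counts. */
struct array_sm {
	struct space_map sm;		/* must be the first member */
	uint32_t counts[16];
};

static int array_inc(struct space_map *sm, uint64_t b)
{
	struct array_sm *a = (struct array_sm *) sm;
	if (b >= 16)
		return -1;
	a->counts[b]++;
	return 0;
}

static int array_get(struct space_map *sm, uint64_t b, uint32_t *result)
{
	struct array_sm *a = (struct array_sm *) sm;
	if (b >= 16)
		return -1;
	*result = a->counts[b];
	return 0;
}

/* The checker wraps a real space map, mirrors every operation into its
 * own shadow counts, and verifies the two stay in step. */
struct checker_sm {
	struct space_map sm;		/* again first, so the casts below work */
	struct space_map *real;
	uint32_t shadow[16];
};

static int checker_inc(struct space_map *sm, uint64_t b)
{
	struct checker_sm *c = (struct checker_sm *) sm;
	int r = c->real->inc_block(c->real, b);
	if (!r)
		c->shadow[b]++;		/* mirror the successful operation */
	return r;
}

static int checker_get(struct space_map *sm, uint64_t b, uint32_t *result)
{
	struct checker_sm *c = (struct checker_sm *) sm;
	int r = c->real->get_count(c->real, b, result);
	if (!r)
		assert(*result == c->shadow[b]);	/* the actual check */
	return r;
}

int main(void)
{
	struct array_sm real = { .sm = { array_inc, array_get } };
	struct checker_sm chk = { .sm = { checker_inc, checker_get }, .real = &real.sm };
	uint32_t count;

	chk.sm.inc_block(&chk.sm, 3);
	chk.sm.inc_block(&chk.sm, 3);
	chk.sm.get_count(&chk.sm, 3, &count);
	printf("block 3 ref count = %u\n", (unsigned) count);	/* prints 2 */
	return 0;
}

The kernel version does the same thing with container_of() on the embedded struct dm_space_map and BUG_ON() in place of assert().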
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index ff3beed6ad2d..d77602d63c83 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c | |||
@@ -224,6 +224,7 @@ static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm) | |||
224 | ll->nr_blocks = 0; | 224 | ll->nr_blocks = 0; |
225 | ll->bitmap_root = 0; | 225 | ll->bitmap_root = 0; |
226 | ll->ref_count_root = 0; | 226 | ll->ref_count_root = 0; |
227 | ll->bitmap_index_changed = false; | ||
227 | 228 | ||
228 | return 0; | 229 | return 0; |
229 | } | 230 | } |
@@ -476,7 +477,15 @@ int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) | |||
476 | 477 | ||
477 | int sm_ll_commit(struct ll_disk *ll) | 478 | int sm_ll_commit(struct ll_disk *ll) |
478 | { | 479 | { |
479 | return ll->commit(ll); | 480 | int r = 0; |
481 | |||
482 | if (ll->bitmap_index_changed) { | ||
483 | r = ll->commit(ll); | ||
484 | if (!r) | ||
485 | ll->bitmap_index_changed = false; | ||
486 | } | ||
487 | |||
488 | return r; | ||
480 | } | 489 | } |
481 | 490 | ||
482 | /*----------------------------------------------------------------*/ | 491 | /*----------------------------------------------------------------*/ |
@@ -491,6 +500,7 @@ static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index, | |||
491 | static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index, | 500 | static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index, |
492 | struct disk_index_entry *ie) | 501 | struct disk_index_entry *ie) |
493 | { | 502 | { |
503 | ll->bitmap_index_changed = true; | ||
494 | memcpy(ll->mi_le.index + index, ie, sizeof(*ie)); | 504 | memcpy(ll->mi_le.index + index, ie, sizeof(*ie)); |
495 | return 0; | 505 | return 0; |
496 | } | 506 | } |
diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h index 8f220821a9a9..b3078d5eda0c 100644 --- a/drivers/md/persistent-data/dm-space-map-common.h +++ b/drivers/md/persistent-data/dm-space-map-common.h | |||
@@ -78,6 +78,7 @@ struct ll_disk { | |||
78 | open_index_fn open_index; | 78 | open_index_fn open_index; |
79 | max_index_entries_fn max_entries; | 79 | max_index_entries_fn max_entries; |
80 | commit_fn commit; | 80 | commit_fn commit; |
81 | bool bitmap_index_changed:1; | ||
81 | }; | 82 | }; |
82 | 83 | ||
83 | struct disk_sm_root { | 84 | struct disk_sm_root { |
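The bitmap_index_changed flag introduced above is a dirty flag: metadata_ll_save_ie() sets it whenever an index entry is rewritten, and sm_ll_commit() only performs (and pays for) a real commit when something actually changed, clearing the flag on success. A small self-contained sketch of that pattern (illustrative names, not the kernel structures):

#include <stdbool.h>
#include <stdio.h>

/* Simplified analogue of struct ll_disk: a commit callback plus a dirty flag. */
struct ll {
	int (*commit)(struct ll *ll);	/* the expensive on-disk commit */
	bool index_changed;
};

static void save_index_entry(struct ll *ll /* , entry ... */)
{
	/* ... update the in-core index ... */
	ll->index_changed = true;	/* remember that a commit is now needed */
}

static int ll_commit(struct ll *ll)
{
	int r = 0;

	if (ll->index_changed) {	/* skip the work if nothing changed */
		r = ll->commit(ll);
		if (!r)
			ll->index_changed = false;	/* only clear on success */
	}
	return r;
}

static int fake_commit(struct ll *ll)
{
	(void) ll;
	printf("committing\n");
	return 0;
}

int main(void)
{
	struct ll ll = { .commit = fake_commit, .index_changed = false };

	ll_commit(&ll);		/* nothing dirty: no commit */
	save_index_entry(&ll);
	ll_commit(&ll);		/* prints "committing" */
	ll_commit(&ll);		/* clean again: no commit */
	return 0;
}

Clearing the flag only when the commit succeeds means a failed commit is naturally retried on the next sm_ll_commit() call.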
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index fc469ba9f627..f6d29e614ab7 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c | |||
@@ -4,7 +4,6 @@ | |||
4 | * This file is released under the GPL. | 4 | * This file is released under the GPL. |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include "dm-space-map-checker.h" | ||
8 | #include "dm-space-map-common.h" | 7 | #include "dm-space-map-common.h" |
9 | #include "dm-space-map-disk.h" | 8 | #include "dm-space-map-disk.h" |
10 | #include "dm-space-map.h" | 9 | #include "dm-space-map.h" |
@@ -252,9 +251,8 @@ static struct dm_space_map ops = { | |||
252 | .copy_root = sm_disk_copy_root | 251 | .copy_root = sm_disk_copy_root |
253 | }; | 252 | }; |
254 | 253 | ||
255 | static struct dm_space_map *dm_sm_disk_create_real( | 254 | struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, |
256 | struct dm_transaction_manager *tm, | 255 | dm_block_t nr_blocks) |
257 | dm_block_t nr_blocks) | ||
258 | { | 256 | { |
259 | int r; | 257 | int r; |
260 | struct sm_disk *smd; | 258 | struct sm_disk *smd; |
@@ -285,18 +283,10 @@ bad: | |||
285 | kfree(smd); | 283 | kfree(smd); |
286 | return ERR_PTR(r); | 284 | return ERR_PTR(r); |
287 | } | 285 | } |
288 | |||
289 | struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, | ||
290 | dm_block_t nr_blocks) | ||
291 | { | ||
292 | struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks); | ||
293 | return dm_sm_checker_create_fresh(sm); | ||
294 | } | ||
295 | EXPORT_SYMBOL_GPL(dm_sm_disk_create); | 286 | EXPORT_SYMBOL_GPL(dm_sm_disk_create); |
296 | 287 | ||
297 | static struct dm_space_map *dm_sm_disk_open_real( | 288 | struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, |
298 | struct dm_transaction_manager *tm, | 289 | void *root_le, size_t len) |
299 | void *root_le, size_t len) | ||
300 | { | 290 | { |
301 | int r; | 291 | int r; |
302 | struct sm_disk *smd; | 292 | struct sm_disk *smd; |
@@ -323,13 +313,6 @@ bad: | |||
323 | kfree(smd); | 313 | kfree(smd); |
324 | return ERR_PTR(r); | 314 | return ERR_PTR(r); |
325 | } | 315 | } |
326 | |||
327 | struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, | ||
328 | void *root_le, size_t len) | ||
329 | { | ||
330 | return dm_sm_checker_create( | ||
331 | dm_sm_disk_open_real(tm, root_le, len)); | ||
332 | } | ||
333 | EXPORT_SYMBOL_GPL(dm_sm_disk_open); | 316 | EXPORT_SYMBOL_GPL(dm_sm_disk_open); |
334 | 317 | ||
335 | /*----------------------------------------------------------------*/ | 318 | /*----------------------------------------------------------------*/ |
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index 400fe144c0cd..d247a35da3c6 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c | |||
@@ -5,7 +5,6 @@ | |||
5 | */ | 5 | */ |
6 | #include "dm-transaction-manager.h" | 6 | #include "dm-transaction-manager.h" |
7 | #include "dm-space-map.h" | 7 | #include "dm-space-map.h" |
8 | #include "dm-space-map-checker.h" | ||
9 | #include "dm-space-map-disk.h" | 8 | #include "dm-space-map-disk.h" |
10 | #include "dm-space-map-metadata.h" | 9 | #include "dm-space-map-metadata.h" |
11 | #include "dm-persistent-data-internal.h" | 10 | #include "dm-persistent-data-internal.h" |
@@ -138,6 +137,9 @@ EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone); | |||
138 | 137 | ||
139 | void dm_tm_destroy(struct dm_transaction_manager *tm) | 138 | void dm_tm_destroy(struct dm_transaction_manager *tm) |
140 | { | 139 | { |
140 | if (!tm->is_clone) | ||
141 | wipe_shadow_table(tm); | ||
142 | |||
141 | kfree(tm); | 143 | kfree(tm); |
142 | } | 144 | } |
143 | EXPORT_SYMBOL_GPL(dm_tm_destroy); | 145 | EXPORT_SYMBOL_GPL(dm_tm_destroy); |
@@ -217,13 +219,24 @@ static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, | |||
217 | if (r < 0) | 219 | if (r < 0) |
218 | return r; | 220 | return r; |
219 | 221 | ||
220 | r = dm_bm_unlock_move(orig_block, new); | 222 | /* |
221 | if (r < 0) { | 223 | * It would be tempting to use dm_bm_unlock_move here, but some |
224 | * code, such as the space maps, keeps using the old data structures | ||
225 | * secure in the knowledge they won't be changed until the next | ||
226 | * transaction. Using unlock_move would force a synchronous read | ||
227 | * since the old block would no longer be in the cache. | ||
228 | */ | ||
229 | r = dm_bm_write_lock_zero(tm->bm, new, v, result); | ||
230 | if (r) { | ||
222 | dm_bm_unlock(orig_block); | 231 | dm_bm_unlock(orig_block); |
223 | return r; | 232 | return r; |
224 | } | 233 | } |
225 | 234 | ||
226 | return dm_bm_write_lock(tm->bm, new, v, result); | 235 | memcpy(dm_block_data(*result), dm_block_data(orig_block), |
236 | dm_bm_block_size(tm->bm)); | ||
237 | |||
238 | dm_bm_unlock(orig_block); | ||
239 | return r; | ||
227 | } | 240 | } |
228 | 241 | ||
229 | int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, | 242 | int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, |
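The rewritten __shadow_block() deliberately copies rather than moves: it write-locks a zeroed buffer for the new location, memcpy()s the old block's data into it, and only then unlocks the original, so code such as the space maps that still holds the old data structures keeps reading valid data until the next transaction. A simplified userspace sketch of that copy-instead-of-move choice (toy types; not the dm-block-manager API):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 8

/* Toy "block manager": an array of fixed-size blocks. */
struct block_manager {
	uint8_t blocks[4][BLOCK_SIZE];
};

/*
 * Shadow block 'orig' into block 'new_idx'.  The original is copied, not
 * moved, so anything still reading 'orig' during this transaction keeps
 * seeing the old contents.
 */
static uint8_t *shadow_block(struct block_manager *bm, unsigned orig, unsigned new_idx)
{
	memset(bm->blocks[new_idx], 0, BLOCK_SIZE);		/* write_lock_zero */
	memcpy(bm->blocks[new_idx], bm->blocks[orig], BLOCK_SIZE); /* copy old data */
	return bm->blocks[new_idx];				/* caller edits the shadow */
}

int main(void)
{
	struct block_manager bm = { 0 };
	uint8_t *shadow;

	memcpy(bm.blocks[0], "old data", BLOCK_SIZE);
	shadow = shadow_block(&bm, 0, 1);
	shadow[0] = 'N';	/* modify only the shadow */

	printf("orig:   %.8s\n", bm.blocks[0]);	/* still "old data" */
	printf("shadow: %.8s\n", shadow);	/* "Nld data" */
	return 0;
}

As the comment in the hunk notes, the trade-off is one extra memcpy per shadowed block in exchange for avoiding a synchronous re-read of the old block from disk.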
@@ -308,94 +321,61 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm) | |||
308 | 321 | ||
309 | static int dm_tm_create_internal(struct dm_block_manager *bm, | 322 | static int dm_tm_create_internal(struct dm_block_manager *bm, |
310 | dm_block_t sb_location, | 323 | dm_block_t sb_location, |
311 | struct dm_block_validator *sb_validator, | ||
312 | size_t root_offset, size_t root_max_len, | ||
313 | struct dm_transaction_manager **tm, | 324 | struct dm_transaction_manager **tm, |
314 | struct dm_space_map **sm, | 325 | struct dm_space_map **sm, |
315 | struct dm_block **sblock, | 326 | int create, |
316 | int create) | 327 | void *sm_root, size_t sm_len) |
317 | { | 328 | { |
318 | int r; | 329 | int r; |
319 | struct dm_space_map *inner; | ||
320 | 330 | ||
321 | inner = dm_sm_metadata_init(); | 331 | *sm = dm_sm_metadata_init(); |
322 | if (IS_ERR(inner)) | 332 | if (IS_ERR(*sm)) |
323 | return PTR_ERR(inner); | 333 | return PTR_ERR(*sm); |
324 | 334 | ||
325 | *tm = dm_tm_create(bm, inner); | 335 | *tm = dm_tm_create(bm, *sm); |
326 | if (IS_ERR(*tm)) { | 336 | if (IS_ERR(*tm)) { |
327 | dm_sm_destroy(inner); | 337 | dm_sm_destroy(*sm); |
328 | return PTR_ERR(*tm); | 338 | return PTR_ERR(*tm); |
329 | } | 339 | } |
330 | 340 | ||
331 | if (create) { | 341 | if (create) { |
332 | r = dm_bm_write_lock_zero(dm_tm_get_bm(*tm), sb_location, | 342 | r = dm_sm_metadata_create(*sm, *tm, dm_bm_nr_blocks(bm), |
333 | sb_validator, sblock); | ||
334 | if (r < 0) { | ||
335 | DMERR("couldn't lock superblock"); | ||
336 | goto bad1; | ||
337 | } | ||
338 | |||
339 | r = dm_sm_metadata_create(inner, *tm, dm_bm_nr_blocks(bm), | ||
340 | sb_location); | 343 | sb_location); |
341 | if (r) { | 344 | if (r) { |
342 | DMERR("couldn't create metadata space map"); | 345 | DMERR("couldn't create metadata space map"); |
343 | goto bad2; | 346 | goto bad; |
344 | } | 347 | } |
345 | 348 | ||
346 | *sm = dm_sm_checker_create(inner); | ||
347 | if (!*sm) | ||
348 | goto bad2; | ||
349 | |||
350 | } else { | 349 | } else { |
351 | r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location, | 350 | r = dm_sm_metadata_open(*sm, *tm, sm_root, sm_len); |
352 | sb_validator, sblock); | ||
353 | if (r < 0) { | ||
354 | DMERR("couldn't lock superblock"); | ||
355 | goto bad1; | ||
356 | } | ||
357 | |||
358 | r = dm_sm_metadata_open(inner, *tm, | ||
359 | dm_block_data(*sblock) + root_offset, | ||
360 | root_max_len); | ||
361 | if (r) { | 351 | if (r) { |
362 | DMERR("couldn't open metadata space map"); | 352 | DMERR("couldn't open metadata space map"); |
363 | goto bad2; | 353 | goto bad; |
364 | } | 354 | } |
365 | |||
366 | *sm = dm_sm_checker_create(inner); | ||
367 | if (!*sm) | ||
368 | goto bad2; | ||
369 | } | 355 | } |
370 | 356 | ||
371 | return 0; | 357 | return 0; |
372 | 358 | ||
373 | bad2: | 359 | bad: |
374 | dm_tm_unlock(*tm, *sblock); | ||
375 | bad1: | ||
376 | dm_tm_destroy(*tm); | 360 | dm_tm_destroy(*tm); |
377 | dm_sm_destroy(inner); | 361 | dm_sm_destroy(*sm); |
378 | return r; | 362 | return r; |
379 | } | 363 | } |
380 | 364 | ||
381 | int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, | 365 | int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, |
382 | struct dm_block_validator *sb_validator, | ||
383 | struct dm_transaction_manager **tm, | 366 | struct dm_transaction_manager **tm, |
384 | struct dm_space_map **sm, struct dm_block **sblock) | 367 | struct dm_space_map **sm) |
385 | { | 368 | { |
386 | return dm_tm_create_internal(bm, sb_location, sb_validator, | 369 | return dm_tm_create_internal(bm, sb_location, tm, sm, 1, NULL, 0); |
387 | 0, 0, tm, sm, sblock, 1); | ||
388 | } | 370 | } |
389 | EXPORT_SYMBOL_GPL(dm_tm_create_with_sm); | 371 | EXPORT_SYMBOL_GPL(dm_tm_create_with_sm); |
390 | 372 | ||
391 | int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, | 373 | int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, |
392 | struct dm_block_validator *sb_validator, | 374 | void *sm_root, size_t root_len, |
393 | size_t root_offset, size_t root_max_len, | ||
394 | struct dm_transaction_manager **tm, | 375 | struct dm_transaction_manager **tm, |
395 | struct dm_space_map **sm, struct dm_block **sblock) | 376 | struct dm_space_map **sm) |
396 | { | 377 | { |
397 | return dm_tm_create_internal(bm, sb_location, sb_validator, root_offset, | 378 | return dm_tm_create_internal(bm, sb_location, tm, sm, 0, sm_root, root_len); |
398 | root_max_len, tm, sm, sblock, 0); | ||
399 | } | 379 | } |
400 | EXPORT_SYMBOL_GPL(dm_tm_open_with_sm); | 380 | EXPORT_SYMBOL_GPL(dm_tm_open_with_sm); |
401 | 381 | ||
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h index 6da784871db4..b5b139076ca5 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.h +++ b/drivers/md/persistent-data/dm-transaction-manager.h | |||
@@ -115,16 +115,17 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm); | |||
115 | * | 115 | * |
116 | * Returns a tm that has an open transaction to write the new disk sm. | 116 | * Returns a tm that has an open transaction to write the new disk sm. |
117 | * Caller should store the new sm root and commit. | 117 | * Caller should store the new sm root and commit. |
118 | * | ||
119 | * The superblock location is passed so the metadata space map knows it | ||
120 | * shouldn't be used. | ||
118 | */ | 121 | */ |
119 | int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, | 122 | int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, |
120 | struct dm_block_validator *sb_validator, | ||
121 | struct dm_transaction_manager **tm, | 123 | struct dm_transaction_manager **tm, |
122 | struct dm_space_map **sm, struct dm_block **sblock); | 124 | struct dm_space_map **sm); |
123 | 125 | ||
124 | int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, | 126 | int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, |
125 | struct dm_block_validator *sb_validator, | 127 | void *sm_root, size_t root_len, |
126 | size_t root_offset, size_t root_max_len, | ||
127 | struct dm_transaction_manager **tm, | 128 | struct dm_transaction_manager **tm, |
128 | struct dm_space_map **sm, struct dm_block **sblock); | 129 | struct dm_space_map **sm); |
129 | 130 | ||
130 | #endif /* _LINUX_DM_TRANSACTION_MANAGER_H */ | 131 | #endif /* _LINUX_DM_TRANSACTION_MANAGER_H */ |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a9c7981ddd24..611b5f797618 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -46,6 +46,20 @@ | |||
46 | */ | 46 | */ |
47 | #define NR_RAID1_BIOS 256 | 47 | #define NR_RAID1_BIOS 256 |
48 | 48 | ||
49 | /* when we get a read error on a read-only array, we redirect to another | ||
50 | * device without failing the first device, or trying to over-write to | ||
51 | * correct the read error. To keep track of bad blocks on a per-bio | ||
52 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | ||
53 | */ | ||
54 | #define IO_BLOCKED ((struct bio *)1) | ||
55 | /* When we successfully write to a known bad-block, we need to remove the | ||
56 | * bad-block marking which must be done from process context. So we record | ||
57 | * the success by setting devs[n].bio to IO_MADE_GOOD | ||
58 | */ | ||
59 | #define IO_MADE_GOOD ((struct bio *)2) | ||
60 | |||
61 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | ||
62 | |||
49 | /* When there are this many requests queue to be written by | 63 | /* When there are this many requests queue to be written by |
50 | * the raid1 thread, we become 'congested' to provide back-pressure | 64 | * the raid1 thread, we become 'congested' to provide back-pressure |
51 | * for writeback. | 65 | * for writeback. |
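IO_BLOCKED and IO_MADE_GOOD rely on the fact that no valid struct bio ever lives at address 1 or 2, so those small integers can be stored in the bios[]/devs[n].bio slots as markers, and BIO_SPECIAL() distinguishes markers (and NULL) from real bio pointers. A standalone sketch of that sentinel-pointer encoding (illustrative only):

#include <stdio.h>

struct bio { int dummy; };

/* Sentinel values stored where a struct bio * is expected. */
#define IO_BLOCKED   ((struct bio *)1)	/* skip this device for this request */
#define IO_MADE_GOOD ((struct bio *)2)	/* a write fixed a known bad block   */

/* True for both sentinels (and for NULL). */
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

static void inspect(struct bio *bio)
{
	if (bio == IO_BLOCKED)
		printf("blocked marker\n");
	else if (bio == IO_MADE_GOOD)
		printf("made-good marker\n");
	else if (BIO_SPECIAL(bio))
		printf("NULL / unused slot\n");
	else
		printf("real bio at %p\n", (void *) bio);
}

int main(void)
{
	struct bio real;
	struct bio *slots[3];

	slots[0] = IO_BLOCKED;
	slots[1] = IO_MADE_GOOD;
	slots[2] = &real;

	for (int i = 0; i < 3; i++)
		inspect(slots[i]);
	return 0;
}

Moving these definitions from raid1.h into raid1.c (and duplicating them in raid10.c further down) keeps the sentinels private to the code that interprets them.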
@@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
483 | const sector_t this_sector = r1_bio->sector; | 497 | const sector_t this_sector = r1_bio->sector; |
484 | int sectors; | 498 | int sectors; |
485 | int best_good_sectors; | 499 | int best_good_sectors; |
486 | int start_disk; | 500 | int best_disk, best_dist_disk, best_pending_disk; |
487 | int best_disk; | 501 | int has_nonrot_disk; |
488 | int i; | 502 | int disk; |
489 | sector_t best_dist; | 503 | sector_t best_dist; |
504 | unsigned int min_pending; | ||
490 | struct md_rdev *rdev; | 505 | struct md_rdev *rdev; |
491 | int choose_first; | 506 | int choose_first; |
507 | int choose_next_idle; | ||
492 | 508 | ||
493 | rcu_read_lock(); | 509 | rcu_read_lock(); |
494 | /* | 510 | /* |
@@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
499 | retry: | 515 | retry: |
500 | sectors = r1_bio->sectors; | 516 | sectors = r1_bio->sectors; |
501 | best_disk = -1; | 517 | best_disk = -1; |
518 | best_dist_disk = -1; | ||
502 | best_dist = MaxSector; | 519 | best_dist = MaxSector; |
520 | best_pending_disk = -1; | ||
521 | min_pending = UINT_MAX; | ||
503 | best_good_sectors = 0; | 522 | best_good_sectors = 0; |
523 | has_nonrot_disk = 0; | ||
524 | choose_next_idle = 0; | ||
504 | 525 | ||
505 | if (conf->mddev->recovery_cp < MaxSector && | 526 | if (conf->mddev->recovery_cp < MaxSector && |
506 | (this_sector + sectors >= conf->next_resync)) { | 527 | (this_sector + sectors >= conf->next_resync)) |
507 | choose_first = 1; | 528 | choose_first = 1; |
508 | start_disk = 0; | 529 | else |
509 | } else { | ||
510 | choose_first = 0; | 530 | choose_first = 0; |
511 | start_disk = conf->last_used; | ||
512 | } | ||
513 | 531 | ||
514 | for (i = 0 ; i < conf->raid_disks * 2 ; i++) { | 532 | for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { |
515 | sector_t dist; | 533 | sector_t dist; |
516 | sector_t first_bad; | 534 | sector_t first_bad; |
517 | int bad_sectors; | 535 | int bad_sectors; |
518 | 536 | unsigned int pending; | |
519 | int disk = start_disk + i; | 537 | bool nonrot; |
520 | if (disk >= conf->raid_disks) | ||
521 | disk -= conf->raid_disks; | ||
522 | 538 | ||
523 | rdev = rcu_dereference(conf->mirrors[disk].rdev); | 539 | rdev = rcu_dereference(conf->mirrors[disk].rdev); |
524 | if (r1_bio->bios[disk] == IO_BLOCKED | 540 | if (r1_bio->bios[disk] == IO_BLOCKED |
@@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
577 | } else | 593 | } else |
578 | best_good_sectors = sectors; | 594 | best_good_sectors = sectors; |
579 | 595 | ||
596 | nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); | ||
597 | has_nonrot_disk |= nonrot; | ||
598 | pending = atomic_read(&rdev->nr_pending); | ||
580 | dist = abs(this_sector - conf->mirrors[disk].head_position); | 599 | dist = abs(this_sector - conf->mirrors[disk].head_position); |
581 | if (choose_first | 600 | if (choose_first) { |
582 | /* Don't change to another disk for sequential reads */ | ||
583 | || conf->next_seq_sect == this_sector | ||
584 | || dist == 0 | ||
585 | /* If device is idle, use it */ | ||
586 | || atomic_read(&rdev->nr_pending) == 0) { | ||
587 | best_disk = disk; | 601 | best_disk = disk; |
588 | break; | 602 | break; |
589 | } | 603 | } |
604 | /* Don't change to another disk for sequential reads */ | ||
605 | if (conf->mirrors[disk].next_seq_sect == this_sector | ||
606 | || dist == 0) { | ||
607 | int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; | ||
608 | struct raid1_info *mirror = &conf->mirrors[disk]; | ||
609 | |||
610 | best_disk = disk; | ||
611 | /* | ||
612 | * If buffered sequential IO size exceeds optimal | ||
613 | * iosize, check if there is idle disk. If yes, choose | ||
614 | * the idle disk. read_balance could already choose an | ||
615 | * idle disk before noticing it's a sequential IO in | ||
616 | * this disk. This doesn't matter because this disk | ||
617 | * will idle, next time it will be utilized after the | ||
618 | * first disk's IO size exceeds optimal iosize. In | ||
619 | * this way, iosize of the first disk will be optimal | ||
620 | * iosize at least. iosize of the second disk might be | ||
621 | * small, but not a big deal since when the second disk | ||
622 | * starts IO, the first disk is likely still busy. | ||
623 | */ | ||
624 | if (nonrot && opt_iosize > 0 && | ||
625 | mirror->seq_start != MaxSector && | ||
626 | mirror->next_seq_sect > opt_iosize && | ||
627 | mirror->next_seq_sect - opt_iosize >= | ||
628 | mirror->seq_start) { | ||
629 | choose_next_idle = 1; | ||
630 | continue; | ||
631 | } | ||
632 | break; | ||
633 | } | ||
634 | /* If device is idle, use it */ | ||
635 | if (pending == 0) { | ||
636 | best_disk = disk; | ||
637 | break; | ||
638 | } | ||
639 | |||
640 | if (choose_next_idle) | ||
641 | continue; | ||
642 | |||
643 | if (min_pending > pending) { | ||
644 | min_pending = pending; | ||
645 | best_pending_disk = disk; | ||
646 | } | ||
647 | |||
590 | if (dist < best_dist) { | 648 | if (dist < best_dist) { |
591 | best_dist = dist; | 649 | best_dist = dist; |
592 | best_disk = disk; | 650 | best_dist_disk = disk; |
593 | } | 651 | } |
594 | } | 652 | } |
595 | 653 | ||
654 | /* | ||
655 | * If all disks are rotational, choose the closest disk. If any disk is | ||
656 | * non-rotational, choose the disk with the fewest pending requests, even | ||
657 | * if that disk is rotational. This might or might not be optimal for raids | ||
658 | * with mixed rotational/non-rotational disks depending on workload. | ||
659 | */ | ||
660 | if (best_disk == -1) { | ||
661 | if (has_nonrot_disk) | ||
662 | best_disk = best_pending_disk; | ||
663 | else | ||
664 | best_disk = best_dist_disk; | ||
665 | } | ||
666 | |||
596 | if (best_disk >= 0) { | 667 | if (best_disk >= 0) { |
597 | rdev = rcu_dereference(conf->mirrors[best_disk].rdev); | 668 | rdev = rcu_dereference(conf->mirrors[best_disk].rdev); |
598 | if (!rdev) | 669 | if (!rdev) |
@@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
606 | goto retry; | 677 | goto retry; |
607 | } | 678 | } |
608 | sectors = best_good_sectors; | 679 | sectors = best_good_sectors; |
609 | conf->next_seq_sect = this_sector + sectors; | 680 | |
610 | conf->last_used = best_disk; | 681 | if (conf->mirrors[best_disk].next_seq_sect != this_sector) |
682 | conf->mirrors[best_disk].seq_start = this_sector; | ||
683 | |||
684 | conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; | ||
611 | } | 685 | } |
612 | rcu_read_unlock(); | 686 | rcu_read_unlock(); |
613 | *max_sectors = sectors; | 687 | *max_sectors = sectors; |
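The reworked read_balance() above keeps sequential-read state per mirror and, in a single pass, remembers both the closest disk (best_dist_disk) and the least-loaded disk (best_pending_disk); a sequential or idle disk wins immediately, and otherwise the fallback depends on whether any non-rotational member is present. A much-simplified standalone sketch of that selection order (hypothetical types; it omits the bad-block checks, the optimal-iosize hand-off and the choose_first path shown in the diff):

#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct mirror {
	uint64_t head_position;	/* where the disk head (notionally) is         */
	uint64_t next_seq_sect;	/* next sector of an ongoing sequential stream */
	unsigned pending;	/* in-flight requests                          */
	bool nonrot;		/* SSD?                                        */
};

/* Pick a mirror to read 'sector' from; returns an index or -1. */
static int read_balance(struct mirror *m, int nr, uint64_t sector)
{
	int best_dist_disk = -1, best_pending_disk = -1;
	uint64_t best_dist = UINT64_MAX;
	unsigned min_pending = UINT_MAX;
	bool has_nonrot = false;

	for (int i = 0; i < nr; i++) {
		uint64_t dist = sector > m[i].head_position ?
			sector - m[i].head_position : m[i].head_position - sector;

		has_nonrot |= m[i].nonrot;

		/* Sequential reads stay on the same device. */
		if (m[i].next_seq_sect == sector || dist == 0)
			return i;

		/* An idle device is always a good choice. */
		if (m[i].pending == 0)
			return i;

		if (m[i].pending < min_pending) {
			min_pending = m[i].pending;
			best_pending_disk = i;
		}
		if (dist < best_dist) {
			best_dist = dist;
			best_dist_disk = i;
		}
	}

	/*
	 * All devices are busy and none is sequential: with an SSD in the
	 * array, seek distance is meaningless, so prefer the least-loaded
	 * device; otherwise prefer the shortest seek.
	 */
	return has_nonrot ? best_pending_disk : best_dist_disk;
}

int main(void)
{
	struct mirror m[2] = {
		{ .head_position = 100,  .next_seq_sect = 0, .pending = 4, .nonrot = false },
		{ .head_position = 5000, .next_seq_sect = 0, .pending = 1, .nonrot = true  },
	};

	printf("chose mirror %d\n", read_balance(m, 2, 104));	/* picks the less-loaded SSD */
	return 0;
}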
@@ -870,10 +944,48 @@ do_sync_io: | |||
870 | pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); | 944 | pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); |
871 | } | 945 | } |
872 | 946 | ||
947 | struct raid1_plug_cb { | ||
948 | struct blk_plug_cb cb; | ||
949 | struct bio_list pending; | ||
950 | int pending_cnt; | ||
951 | }; | ||
952 | |||
953 | static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) | ||
954 | { | ||
955 | struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, | ||
956 | cb); | ||
957 | struct mddev *mddev = plug->cb.data; | ||
958 | struct r1conf *conf = mddev->private; | ||
959 | struct bio *bio; | ||
960 | |||
961 | if (from_schedule) { | ||
962 | spin_lock_irq(&conf->device_lock); | ||
963 | bio_list_merge(&conf->pending_bio_list, &plug->pending); | ||
964 | conf->pending_count += plug->pending_cnt; | ||
965 | spin_unlock_irq(&conf->device_lock); | ||
966 | md_wakeup_thread(mddev->thread); | ||
967 | kfree(plug); | ||
968 | return; | ||
969 | } | ||
970 | |||
971 | /* we aren't scheduling, so we can do the write-out directly. */ | ||
972 | bio = bio_list_get(&plug->pending); | ||
973 | bitmap_unplug(mddev->bitmap); | ||
974 | wake_up(&conf->wait_barrier); | ||
975 | |||
976 | while (bio) { /* submit pending writes */ | ||
977 | struct bio *next = bio->bi_next; | ||
978 | bio->bi_next = NULL; | ||
979 | generic_make_request(bio); | ||
980 | bio = next; | ||
981 | } | ||
982 | kfree(plug); | ||
983 | } | ||
984 | |||
873 | static void make_request(struct mddev *mddev, struct bio * bio) | 985 | static void make_request(struct mddev *mddev, struct bio * bio) |
874 | { | 986 | { |
875 | struct r1conf *conf = mddev->private; | 987 | struct r1conf *conf = mddev->private; |
876 | struct mirror_info *mirror; | 988 | struct raid1_info *mirror; |
877 | struct r1bio *r1_bio; | 989 | struct r1bio *r1_bio; |
878 | struct bio *read_bio; | 990 | struct bio *read_bio; |
879 | int i, disks; | 991 | int i, disks; |
@@ -883,7 +995,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
883 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); | 995 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); |
884 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); | 996 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); |
885 | struct md_rdev *blocked_rdev; | 997 | struct md_rdev *blocked_rdev; |
886 | int plugged; | 998 | struct blk_plug_cb *cb; |
999 | struct raid1_plug_cb *plug = NULL; | ||
887 | int first_clone; | 1000 | int first_clone; |
888 | int sectors_handled; | 1001 | int sectors_handled; |
889 | int max_sectors; | 1002 | int max_sectors; |
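raid1_unplug() is the callback registered through blk_check_plugged(): while a submitter holds a block plug, writes are collected on a per-plug bio list and are only merged into conf->pending_bio_list (waking raid1d) or issued directly when the plug is flushed, so the md thread is woken once per batch instead of once per bio. A rough standalone sketch of that batch-then-flush idea (plain C, not the block-layer API):

#include <stdio.h>

#define MAX_BATCH 16

/* Per-plug batch of "requests" gathered while the submitter holds a plug. */
struct plug {
	int pending[MAX_BATCH];
	int pending_cnt;
};

/* Global queue normally serviced by a background thread. */
static int global_queue[64];
static int global_cnt;

static void submit(int req)
{
	printf("submitting request %d\n", req);
}

static void plug_add(struct plug *plug, int req)
{
	plug->pending[plug->pending_cnt++] = req;	/* defer, don't submit yet */
}

/*
 * Flush the plug.  From scheduler context we cannot do the I/O ourselves,
 * so hand the batch to the background queue and wake its thread once;
 * otherwise issue the whole batch directly.
 */
static void plug_flush(struct plug *plug, int from_schedule)
{
	if (from_schedule) {
		for (int i = 0; i < plug->pending_cnt; i++)
			global_queue[global_cnt++] = plug->pending[i];
		printf("queued %d requests for the worker thread\n", plug->pending_cnt);
	} else {
		for (int i = 0; i < plug->pending_cnt; i++)
			submit(plug->pending[i]);
	}
	plug->pending_cnt = 0;
}

int main(void)
{
	struct plug plug = { .pending_cnt = 0 };

	plug_add(&plug, 1);
	plug_add(&plug, 2);
	plug_add(&plug, 3);
	plug_flush(&plug, 0);	/* submits 1, 2, 3 in one go */
	return 0;
}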
@@ -1034,7 +1147,6 @@ read_again: | |||
1034 | * the bad blocks. Each set of writes gets it's own r1bio | 1147 | * the bad blocks. Each set of writes gets it's own r1bio |
1035 | * with a set of bios attached. | 1148 | * with a set of bios attached. |
1036 | */ | 1149 | */ |
1037 | plugged = mddev_check_plugged(mddev); | ||
1038 | 1150 | ||
1039 | disks = conf->raid_disks * 2; | 1151 | disks = conf->raid_disks * 2; |
1040 | retry_write: | 1152 | retry_write: |
@@ -1187,10 +1299,23 @@ read_again: | |||
1187 | mbio->bi_private = r1_bio; | 1299 | mbio->bi_private = r1_bio; |
1188 | 1300 | ||
1189 | atomic_inc(&r1_bio->remaining); | 1301 | atomic_inc(&r1_bio->remaining); |
1302 | |||
1303 | cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); | ||
1304 | if (cb) | ||
1305 | plug = container_of(cb, struct raid1_plug_cb, cb); | ||
1306 | else | ||
1307 | plug = NULL; | ||
1190 | spin_lock_irqsave(&conf->device_lock, flags); | 1308 | spin_lock_irqsave(&conf->device_lock, flags); |
1191 | bio_list_add(&conf->pending_bio_list, mbio); | 1309 | if (plug) { |
1192 | conf->pending_count++; | 1310 | bio_list_add(&plug->pending, mbio); |
1311 | plug->pending_cnt++; | ||
1312 | } else { | ||
1313 | bio_list_add(&conf->pending_bio_list, mbio); | ||
1314 | conf->pending_count++; | ||
1315 | } | ||
1193 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1316 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1317 | if (!plug) | ||
1318 | md_wakeup_thread(mddev->thread); | ||
1194 | } | 1319 | } |
1195 | /* Mustn't call r1_bio_write_done before this next test, | 1320 | /* Mustn't call r1_bio_write_done before this next test, |
1196 | * as it could result in the bio being freed. | 1321 | * as it could result in the bio being freed. |
@@ -1213,9 +1338,6 @@ read_again: | |||
1213 | 1338 | ||
1214 | /* In case raid1d snuck in to freeze_array */ | 1339 | /* In case raid1d snuck in to freeze_array */ |
1215 | wake_up(&conf->wait_barrier); | 1340 | wake_up(&conf->wait_barrier); |
1216 | |||
1217 | if (do_sync || !bitmap || !plugged) | ||
1218 | md_wakeup_thread(mddev->thread); | ||
1219 | } | 1341 | } |
1220 | 1342 | ||
1221 | static void status(struct seq_file *seq, struct mddev *mddev) | 1343 | static void status(struct seq_file *seq, struct mddev *mddev) |
@@ -1367,7 +1489,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1367 | struct r1conf *conf = mddev->private; | 1489 | struct r1conf *conf = mddev->private; |
1368 | int err = -EEXIST; | 1490 | int err = -EEXIST; |
1369 | int mirror = 0; | 1491 | int mirror = 0; |
1370 | struct mirror_info *p; | 1492 | struct raid1_info *p; |
1371 | int first = 0; | 1493 | int first = 0; |
1372 | int last = conf->raid_disks - 1; | 1494 | int last = conf->raid_disks - 1; |
1373 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 1495 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
@@ -1436,7 +1558,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1436 | struct r1conf *conf = mddev->private; | 1558 | struct r1conf *conf = mddev->private; |
1437 | int err = 0; | 1559 | int err = 0; |
1438 | int number = rdev->raid_disk; | 1560 | int number = rdev->raid_disk; |
1439 | struct mirror_info *p = conf->mirrors+ number; | 1561 | struct raid1_info *p = conf->mirrors + number; |
1440 | 1562 | ||
1441 | if (rdev != p->rdev) | 1563 | if (rdev != p->rdev) |
1442 | p = conf->mirrors + conf->raid_disks + number; | 1564 | p = conf->mirrors + conf->raid_disks + number; |
@@ -1821,8 +1943,14 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) | |||
1821 | 1943 | ||
1822 | if (atomic_dec_and_test(&r1_bio->remaining)) { | 1944 | if (atomic_dec_and_test(&r1_bio->remaining)) { |
1823 | /* if we're here, all write(s) have completed, so clean up */ | 1945 | /* if we're here, all write(s) have completed, so clean up */ |
1824 | md_done_sync(mddev, r1_bio->sectors, 1); | 1946 | int s = r1_bio->sectors; |
1825 | put_buf(r1_bio); | 1947 | if (test_bit(R1BIO_MadeGood, &r1_bio->state) || |
1948 | test_bit(R1BIO_WriteError, &r1_bio->state)) | ||
1949 | reschedule_retry(r1_bio); | ||
1950 | else { | ||
1951 | put_buf(r1_bio); | ||
1952 | md_done_sync(mddev, s, 1); | ||
1953 | } | ||
1826 | } | 1954 | } |
1827 | } | 1955 | } |
1828 | 1956 | ||
@@ -2170,8 +2298,7 @@ static void raid1d(struct mddev *mddev) | |||
2170 | blk_start_plug(&plug); | 2298 | blk_start_plug(&plug); |
2171 | for (;;) { | 2299 | for (;;) { |
2172 | 2300 | ||
2173 | if (atomic_read(&mddev->plug_cnt) == 0) | 2301 | flush_pending_writes(conf); |
2174 | flush_pending_writes(conf); | ||
2175 | 2302 | ||
2176 | spin_lock_irqsave(&conf->device_lock, flags); | 2303 | spin_lock_irqsave(&conf->device_lock, flags); |
2177 | if (list_empty(head)) { | 2304 | if (list_empty(head)) { |
@@ -2368,6 +2495,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp | |||
2368 | bio->bi_rw = READ; | 2495 | bio->bi_rw = READ; |
2369 | bio->bi_end_io = end_sync_read; | 2496 | bio->bi_end_io = end_sync_read; |
2370 | read_targets++; | 2497 | read_targets++; |
2498 | } else if (!test_bit(WriteErrorSeen, &rdev->flags) && | ||
2499 | test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && | ||
2500 | !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { | ||
2501 | /* | ||
2502 | * The device is suitable for reading (InSync), | ||
2503 | * but has bad block(s) here. Let's try to correct them, | ||
2504 | * if we are doing resync or repair. Otherwise, leave | ||
2505 | * this device alone for this sync request. | ||
2506 | */ | ||
2507 | bio->bi_rw = WRITE; | ||
2508 | bio->bi_end_io = end_sync_write; | ||
2509 | write_targets++; | ||
2371 | } | 2510 | } |
2372 | } | 2511 | } |
2373 | if (bio->bi_end_io) { | 2512 | if (bio->bi_end_io) { |
@@ -2425,7 +2564,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp | |||
2425 | /* There is nowhere to write, so all non-sync | 2564 | /* There is nowhere to write, so all non-sync |
2426 | * drives must be failed - so we are finished | 2565 | * drives must be failed - so we are finished |
2427 | */ | 2566 | */ |
2428 | sector_t rv = max_sector - sector_nr; | 2567 | sector_t rv; |
2568 | if (min_bad > 0) | ||
2569 | max_sector = sector_nr + min_bad; | ||
2570 | rv = max_sector - sector_nr; | ||
2429 | *skipped = 1; | 2571 | *skipped = 1; |
2430 | put_buf(r1_bio); | 2572 | put_buf(r1_bio); |
2431 | return rv; | 2573 | return rv; |
@@ -2488,9 +2630,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp | |||
2488 | */ | 2630 | */ |
2489 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | 2631 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { |
2490 | atomic_set(&r1_bio->remaining, read_targets); | 2632 | atomic_set(&r1_bio->remaining, read_targets); |
2491 | for (i = 0; i < conf->raid_disks * 2; i++) { | 2633 | for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) { |
2492 | bio = r1_bio->bios[i]; | 2634 | bio = r1_bio->bios[i]; |
2493 | if (bio->bi_end_io == end_sync_read) { | 2635 | if (bio->bi_end_io == end_sync_read) { |
2636 | read_targets--; | ||
2494 | md_sync_acct(bio->bi_bdev, nr_sectors); | 2637 | md_sync_acct(bio->bi_bdev, nr_sectors); |
2495 | generic_make_request(bio); | 2638 | generic_make_request(bio); |
2496 | } | 2639 | } |
@@ -2517,7 +2660,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2517 | { | 2660 | { |
2518 | struct r1conf *conf; | 2661 | struct r1conf *conf; |
2519 | int i; | 2662 | int i; |
2520 | struct mirror_info *disk; | 2663 | struct raid1_info *disk; |
2521 | struct md_rdev *rdev; | 2664 | struct md_rdev *rdev; |
2522 | int err = -ENOMEM; | 2665 | int err = -ENOMEM; |
2523 | 2666 | ||
@@ -2525,7 +2668,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2525 | if (!conf) | 2668 | if (!conf) |
2526 | goto abort; | 2669 | goto abort; |
2527 | 2670 | ||
2528 | conf->mirrors = kzalloc(sizeof(struct mirror_info) | 2671 | conf->mirrors = kzalloc(sizeof(struct raid1_info) |
2529 | * mddev->raid_disks * 2, | 2672 | * mddev->raid_disks * 2, |
2530 | GFP_KERNEL); | 2673 | GFP_KERNEL); |
2531 | if (!conf->mirrors) | 2674 | if (!conf->mirrors) |
@@ -2568,6 +2711,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2568 | mddev->merge_check_needed = 1; | 2711 | mddev->merge_check_needed = 1; |
2569 | 2712 | ||
2570 | disk->head_position = 0; | 2713 | disk->head_position = 0; |
2714 | disk->seq_start = MaxSector; | ||
2571 | } | 2715 | } |
2572 | conf->raid_disks = mddev->raid_disks; | 2716 | conf->raid_disks = mddev->raid_disks; |
2573 | conf->mddev = mddev; | 2717 | conf->mddev = mddev; |
@@ -2581,7 +2725,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2581 | conf->recovery_disabled = mddev->recovery_disabled - 1; | 2725 | conf->recovery_disabled = mddev->recovery_disabled - 1; |
2582 | 2726 | ||
2583 | err = -EIO; | 2727 | err = -EIO; |
2584 | conf->last_used = -1; | ||
2585 | for (i = 0; i < conf->raid_disks * 2; i++) { | 2728 | for (i = 0; i < conf->raid_disks * 2; i++) { |
2586 | 2729 | ||
2587 | disk = conf->mirrors + i; | 2730 | disk = conf->mirrors + i; |
@@ -2607,21 +2750,11 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2607 | if (disk->rdev && | 2750 | if (disk->rdev && |
2608 | (disk->rdev->saved_raid_disk < 0)) | 2751 | (disk->rdev->saved_raid_disk < 0)) |
2609 | conf->fullsync = 1; | 2752 | conf->fullsync = 1; |
2610 | } else if (conf->last_used < 0) | 2753 | } |
2611 | /* | ||
2612 | * The first working device is used as a | ||
2613 | * starting point to read balancing. | ||
2614 | */ | ||
2615 | conf->last_used = i; | ||
2616 | } | 2754 | } |
2617 | 2755 | ||
2618 | if (conf->last_used < 0) { | ||
2619 | printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", | ||
2620 | mdname(mddev)); | ||
2621 | goto abort; | ||
2622 | } | ||
2623 | err = -ENOMEM; | 2756 | err = -ENOMEM; |
2624 | conf->thread = md_register_thread(raid1d, mddev, NULL); | 2757 | conf->thread = md_register_thread(raid1d, mddev, "raid1"); |
2625 | if (!conf->thread) { | 2758 | if (!conf->thread) { |
2626 | printk(KERN_ERR | 2759 | printk(KERN_ERR |
2627 | "md/raid1:%s: couldn't allocate thread\n", | 2760 | "md/raid1:%s: couldn't allocate thread\n", |
@@ -2794,7 +2927,7 @@ static int raid1_reshape(struct mddev *mddev) | |||
2794 | */ | 2927 | */ |
2795 | mempool_t *newpool, *oldpool; | 2928 | mempool_t *newpool, *oldpool; |
2796 | struct pool_info *newpoolinfo; | 2929 | struct pool_info *newpoolinfo; |
2797 | struct mirror_info *newmirrors; | 2930 | struct raid1_info *newmirrors; |
2798 | struct r1conf *conf = mddev->private; | 2931 | struct r1conf *conf = mddev->private; |
2799 | int cnt, raid_disks; | 2932 | int cnt, raid_disks; |
2800 | unsigned long flags; | 2933 | unsigned long flags; |
@@ -2837,7 +2970,7 @@ static int raid1_reshape(struct mddev *mddev) | |||
2837 | kfree(newpoolinfo); | 2970 | kfree(newpoolinfo); |
2838 | return -ENOMEM; | 2971 | return -ENOMEM; |
2839 | } | 2972 | } |
2840 | newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2, | 2973 | newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2, |
2841 | GFP_KERNEL); | 2974 | GFP_KERNEL); |
2842 | if (!newmirrors) { | 2975 | if (!newmirrors) { |
2843 | kfree(newpoolinfo); | 2976 | kfree(newpoolinfo); |
@@ -2876,7 +3009,6 @@ static int raid1_reshape(struct mddev *mddev) | |||
2876 | conf->raid_disks = mddev->raid_disks = raid_disks; | 3009 | conf->raid_disks = mddev->raid_disks = raid_disks; |
2877 | mddev->delta_disks = 0; | 3010 | mddev->delta_disks = 0; |
2878 | 3011 | ||
2879 | conf->last_used = 0; /* just make sure it is in-range */ | ||
2880 | lower_barrier(conf); | 3012 | lower_barrier(conf); |
2881 | 3013 | ||
2882 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 3014 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 80ded139314c..0ff3715fb7eb 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
@@ -1,9 +1,15 @@ | |||
1 | #ifndef _RAID1_H | 1 | #ifndef _RAID1_H |
2 | #define _RAID1_H | 2 | #define _RAID1_H |
3 | 3 | ||
4 | struct mirror_info { | 4 | struct raid1_info { |
5 | struct md_rdev *rdev; | 5 | struct md_rdev *rdev; |
6 | sector_t head_position; | 6 | sector_t head_position; |
7 | |||
8 | /* When choose the best device for a read (read_balance()) | ||
9 | * we try to keep sequential reads one the same device | ||
10 | */ | ||
11 | sector_t next_seq_sect; | ||
12 | sector_t seq_start; | ||
7 | }; | 13 | }; |
8 | 14 | ||
9 | /* | 15 | /* |
@@ -24,17 +30,11 @@ struct pool_info { | |||
24 | 30 | ||
25 | struct r1conf { | 31 | struct r1conf { |
26 | struct mddev *mddev; | 32 | struct mddev *mddev; |
27 | struct mirror_info *mirrors; /* twice 'raid_disks' to | 33 | struct raid1_info *mirrors; /* twice 'raid_disks' to |
28 | * allow for replacements. | 34 | * allow for replacements. |
29 | */ | 35 | */ |
30 | int raid_disks; | 36 | int raid_disks; |
31 | 37 | ||
32 | /* When choose the best device for a read (read_balance()) | ||
33 | * we try to keep sequential reads one the same device | ||
34 | * using 'last_used' and 'next_seq_sect' | ||
35 | */ | ||
36 | int last_used; | ||
37 | sector_t next_seq_sect; | ||
38 | /* During resync, read_balancing is only allowed on the part | 38 | /* During resync, read_balancing is only allowed on the part |
39 | * of the array that has been resynced. 'next_resync' tells us | 39 | * of the array that has been resynced. 'next_resync' tells us |
40 | * where that is. | 40 | * where that is. |
@@ -135,20 +135,6 @@ struct r1bio { | |||
135 | /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ | 135 | /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ |
136 | }; | 136 | }; |
137 | 137 | ||
138 | /* when we get a read error on a read-only array, we redirect to another | ||
139 | * device without failing the first device, or trying to over-write to | ||
140 | * correct the read error. To keep track of bad blocks on a per-bio | ||
141 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | ||
142 | */ | ||
143 | #define IO_BLOCKED ((struct bio *)1) | ||
144 | /* When we successfully write to a known bad-block, we need to remove the | ||
145 | * bad-block marking which must be done from process context. So we record | ||
146 | * the success by setting bios[n] to IO_MADE_GOOD | ||
147 | */ | ||
148 | #define IO_MADE_GOOD ((struct bio *)2) | ||
149 | |||
150 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | ||
151 | |||
152 | /* bits for r1bio.state */ | 138 | /* bits for r1bio.state */ |
153 | #define R1BIO_Uptodate 0 | 139 | #define R1BIO_Uptodate 0 |
154 | #define R1BIO_IsSync 1 | 140 | #define R1BIO_IsSync 1 |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 99ae6068e456..de5ed6fd8806 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -60,7 +60,21 @@ | |||
60 | */ | 60 | */ |
61 | #define NR_RAID10_BIOS 256 | 61 | #define NR_RAID10_BIOS 256 |
62 | 62 | ||
63 | /* When there are this many requests queue to be written by | 63 | /* when we get a read error on a read-only array, we redirect to another |
64 | * device without failing the first device, or trying to over-write to | ||
65 | * correct the read error. To keep track of bad blocks on a per-bio | ||
66 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | ||
67 | */ | ||
68 | #define IO_BLOCKED ((struct bio *)1) | ||
69 | /* When we successfully write to a known bad-block, we need to remove the | ||
70 | * bad-block marking which must be done from process context. So we record | ||
71 | * the success by setting devs[n].bio to IO_MADE_GOOD | ||
72 | */ | ||
73 | #define IO_MADE_GOOD ((struct bio *)2) | ||
74 | |||
75 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | ||
76 | |||
77 | /* When there are this many requests queued to be written by | ||
64 | * the raid10 thread, we become 'congested' to provide back-pressure | 78 | * the raid10 thread, we become 'congested' to provide back-pressure |
65 | * for writeback. | 79 | * for writeback. |
66 | */ | 80 | */ |
@@ -717,7 +731,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, | |||
717 | int sectors = r10_bio->sectors; | 731 | int sectors = r10_bio->sectors; |
718 | int best_good_sectors; | 732 | int best_good_sectors; |
719 | sector_t new_distance, best_dist; | 733 | sector_t new_distance, best_dist; |
720 | struct md_rdev *rdev, *best_rdev; | 734 | struct md_rdev *best_rdev, *rdev = NULL; |
721 | int do_balance; | 735 | int do_balance; |
722 | int best_slot; | 736 | int best_slot; |
723 | struct geom *geo = &conf->geo; | 737 | struct geom *geo = &conf->geo; |
@@ -839,9 +853,8 @@ retry: | |||
839 | return rdev; | 853 | return rdev; |
840 | } | 854 | } |
841 | 855 | ||
842 | static int raid10_congested(void *data, int bits) | 856 | int md_raid10_congested(struct mddev *mddev, int bits) |
843 | { | 857 | { |
844 | struct mddev *mddev = data; | ||
845 | struct r10conf *conf = mddev->private; | 858 | struct r10conf *conf = mddev->private; |
846 | int i, ret = 0; | 859 | int i, ret = 0; |
847 | 860 | ||
@@ -849,8 +862,6 @@ static int raid10_congested(void *data, int bits) | |||
849 | conf->pending_count >= max_queued_requests) | 862 | conf->pending_count >= max_queued_requests) |
850 | return 1; | 863 | return 1; |
851 | 864 | ||
852 | if (mddev_congested(mddev, bits)) | ||
853 | return 1; | ||
854 | rcu_read_lock(); | 865 | rcu_read_lock(); |
855 | for (i = 0; | 866 | for (i = 0; |
856 | (i < conf->geo.raid_disks || i < conf->prev.raid_disks) | 867 | (i < conf->geo.raid_disks || i < conf->prev.raid_disks) |
@@ -866,6 +877,15 @@ static int raid10_congested(void *data, int bits) | |||
866 | rcu_read_unlock(); | 877 | rcu_read_unlock(); |
867 | return ret; | 878 | return ret; |
868 | } | 879 | } |
880 | EXPORT_SYMBOL_GPL(md_raid10_congested); | ||
881 | |||
882 | static int raid10_congested(void *data, int bits) | ||
883 | { | ||
884 | struct mddev *mddev = data; | ||
885 | |||
886 | return mddev_congested(mddev, bits) || | ||
887 | md_raid10_congested(mddev, bits); | ||
888 | } | ||
869 | 889 | ||
870 | static void flush_pending_writes(struct r10conf *conf) | 890 | static void flush_pending_writes(struct r10conf *conf) |
871 | { | 891 | { |
@@ -1039,7 +1059,6 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1039 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); | 1059 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); |
1040 | unsigned long flags; | 1060 | unsigned long flags; |
1041 | struct md_rdev *blocked_rdev; | 1061 | struct md_rdev *blocked_rdev; |
1042 | int plugged; | ||
1043 | int sectors_handled; | 1062 | int sectors_handled; |
1044 | int max_sectors; | 1063 | int max_sectors; |
1045 | int sectors; | 1064 | int sectors; |
@@ -1239,7 +1258,6 @@ read_again: | |||
1239 | * of r10_bios is recorded in bio->bi_phys_segments just as with | 1258 | * of r10_bios is recorded in bio->bi_phys_segments just as with |
1240 | * the read case. | 1259 | * the read case. |
1241 | */ | 1260 | */ |
1242 | plugged = mddev_check_plugged(mddev); | ||
1243 | 1261 | ||
1244 | r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ | 1262 | r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ |
1245 | raid10_find_phys(conf, r10_bio); | 1263 | raid10_find_phys(conf, r10_bio); |
@@ -1396,6 +1414,8 @@ retry_write: | |||
1396 | bio_list_add(&conf->pending_bio_list, mbio); | 1414 | bio_list_add(&conf->pending_bio_list, mbio); |
1397 | conf->pending_count++; | 1415 | conf->pending_count++; |
1398 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1416 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1417 | if (!mddev_check_plugged(mddev)) | ||
1418 | md_wakeup_thread(mddev->thread); | ||
1399 | 1419 | ||
1400 | if (!r10_bio->devs[i].repl_bio) | 1420 | if (!r10_bio->devs[i].repl_bio) |
1401 | continue; | 1421 | continue; |
@@ -1423,6 +1443,8 @@ retry_write: | |||
1423 | bio_list_add(&conf->pending_bio_list, mbio); | 1443 | bio_list_add(&conf->pending_bio_list, mbio); |
1424 | conf->pending_count++; | 1444 | conf->pending_count++; |
1425 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1445 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1446 | if (!mddev_check_plugged(mddev)) | ||
1447 | md_wakeup_thread(mddev->thread); | ||
1426 | } | 1448 | } |
1427 | 1449 | ||
1428 | /* Don't remove the bias on 'remaining' (one_write_done) until | 1450 | /* Don't remove the bias on 'remaining' (one_write_done) until |
@@ -1448,9 +1470,6 @@ retry_write: | |||
1448 | 1470 | ||
1449 | /* In case raid10d snuck in to freeze_array */ | 1471 | /* In case raid10d snuck in to freeze_array */ |
1450 | wake_up(&conf->wait_barrier); | 1472 | wake_up(&conf->wait_barrier); |
1451 | |||
1452 | if (do_sync || !mddev->bitmap || !plugged) | ||
1453 | md_wakeup_thread(mddev->thread); | ||
1454 | } | 1473 | } |
1455 | 1474 | ||
1456 | static void status(struct seq_file *seq, struct mddev *mddev) | 1475 | static void status(struct seq_file *seq, struct mddev *mddev) |
@@ -1547,7 +1566,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) | |||
1547 | static void print_conf(struct r10conf *conf) | 1566 | static void print_conf(struct r10conf *conf) |
1548 | { | 1567 | { |
1549 | int i; | 1568 | int i; |
1550 | struct mirror_info *tmp; | 1569 | struct raid10_info *tmp; |
1551 | 1570 | ||
1552 | printk(KERN_DEBUG "RAID10 conf printout:\n"); | 1571 | printk(KERN_DEBUG "RAID10 conf printout:\n"); |
1553 | if (!conf) { | 1572 | if (!conf) { |
@@ -1581,7 +1600,7 @@ static int raid10_spare_active(struct mddev *mddev) | |||
1581 | { | 1600 | { |
1582 | int i; | 1601 | int i; |
1583 | struct r10conf *conf = mddev->private; | 1602 | struct r10conf *conf = mddev->private; |
1584 | struct mirror_info *tmp; | 1603 | struct raid10_info *tmp; |
1585 | int count = 0; | 1604 | int count = 0; |
1586 | unsigned long flags; | 1605 | unsigned long flags; |
1587 | 1606 | ||
@@ -1656,7 +1675,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1656 | else | 1675 | else |
1657 | mirror = first; | 1676 | mirror = first; |
1658 | for ( ; mirror <= last ; mirror++) { | 1677 | for ( ; mirror <= last ; mirror++) { |
1659 | struct mirror_info *p = &conf->mirrors[mirror]; | 1678 | struct raid10_info *p = &conf->mirrors[mirror]; |
1660 | if (p->recovery_disabled == mddev->recovery_disabled) | 1679 | if (p->recovery_disabled == mddev->recovery_disabled) |
1661 | continue; | 1680 | continue; |
1662 | if (p->rdev) { | 1681 | if (p->rdev) { |
@@ -1710,7 +1729,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1710 | int err = 0; | 1729 | int err = 0; |
1711 | int number = rdev->raid_disk; | 1730 | int number = rdev->raid_disk; |
1712 | struct md_rdev **rdevp; | 1731 | struct md_rdev **rdevp; |
1713 | struct mirror_info *p = conf->mirrors + number; | 1732 | struct raid10_info *p = conf->mirrors + number; |
1714 | 1733 | ||
1715 | print_conf(conf); | 1734 | print_conf(conf); |
1716 | if (rdev == p->rdev) | 1735 | if (rdev == p->rdev) |
@@ -2310,7 +2329,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2310 | if (r10_sync_page_io(rdev, | 2329 | if (r10_sync_page_io(rdev, |
2311 | r10_bio->devs[sl].addr + | 2330 | r10_bio->devs[sl].addr + |
2312 | sect, | 2331 | sect, |
2313 | s<<9, conf->tmppage, WRITE) | 2332 | s, conf->tmppage, WRITE) |
2314 | == 0) { | 2333 | == 0) { |
2315 | /* Well, this device is dead */ | 2334 | /* Well, this device is dead */ |
2316 | printk(KERN_NOTICE | 2335 | printk(KERN_NOTICE |
@@ -2349,7 +2368,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2349 | switch (r10_sync_page_io(rdev, | 2368 | switch (r10_sync_page_io(rdev, |
2350 | r10_bio->devs[sl].addr + | 2369 | r10_bio->devs[sl].addr + |
2351 | sect, | 2370 | sect, |
2352 | s<<9, conf->tmppage, | 2371 | s, conf->tmppage, |
2353 | READ)) { | 2372 | READ)) { |
2354 | case 0: | 2373 | case 0: |
2355 | /* Well, this device is dead */ | 2374 | /* Well, this device is dead */ |
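
Both hunks above change the length argument from `s<<9` to plain `s`, which suggests r10_sync_page_io() wants a sector count here rather than bytes; shifting by 9 is the usual 512-byte-sector-to-bytes conversion. A trivial standalone check of that arithmetic:

#include <stdio.h>

int main(void)
{
    unsigned int s = 8;     /* a transfer length of 8 sectors */

    /* one sector is 512 bytes, so sectors -> bytes is a shift by 9 */
    printf("%u sectors = %u bytes\n", s, s << 9);   /* 8 * 512 = 4096 */
    return 0;
}
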
@@ -2512,7 +2531,7 @@ read_more: | |||
2512 | slot = r10_bio->read_slot; | 2531 | slot = r10_bio->read_slot; |
2513 | printk_ratelimited( | 2532 | printk_ratelimited( |
2514 | KERN_ERR | 2533 | KERN_ERR |
2515 | "md/raid10:%s: %s: redirecting" | 2534 | "md/raid10:%s: %s: redirecting " |
2516 | "sector %llu to another mirror\n", | 2535 | "sector %llu to another mirror\n", |
2517 | mdname(mddev), | 2536 | mdname(mddev), |
2518 | bdevname(rdev->bdev, b), | 2537 | bdevname(rdev->bdev, b), |
@@ -2876,7 +2895,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2876 | sector_t sect; | 2895 | sector_t sect; |
2877 | int must_sync; | 2896 | int must_sync; |
2878 | int any_working; | 2897 | int any_working; |
2879 | struct mirror_info *mirror = &conf->mirrors[i]; | 2898 | struct raid10_info *mirror = &conf->mirrors[i]; |
2880 | 2899 | ||
2881 | if ((mirror->rdev == NULL || | 2900 | if ((mirror->rdev == NULL || |
2882 | test_bit(In_sync, &mirror->rdev->flags)) | 2901 | test_bit(In_sync, &mirror->rdev->flags)) |
@@ -2890,6 +2909,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2890 | /* want to reconstruct this device */ | 2909 | /* want to reconstruct this device */ |
2891 | rb2 = r10_bio; | 2910 | rb2 = r10_bio; |
2892 | sect = raid10_find_virt(conf, sector_nr, i); | 2911 | sect = raid10_find_virt(conf, sector_nr, i); |
2912 | if (sect >= mddev->resync_max_sectors) { | ||
2913 | /* last stripe is not complete - don't | ||
2914 | * try to recover this sector. | ||
2915 | */ | ||
2916 | continue; | ||
2917 | } | ||
2893 | /* Unless we are doing a full sync, or a replacement | 2918 | /* Unless we are doing a full sync, or a replacement |
2894 | * we only need to recover the block if it is set in | 2919 | * we only need to recover the block if it is set in |
2895 | * the bitmap | 2920 | * the bitmap |
@@ -3382,7 +3407,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
3382 | goto out; | 3407 | goto out; |
3383 | 3408 | ||
3384 | /* FIXME calc properly */ | 3409 | /* FIXME calc properly */ |
3385 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks + | 3410 | conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks + |
3386 | max(0,mddev->delta_disks)), | 3411 | max(0,mddev->delta_disks)), |
3387 | GFP_KERNEL); | 3412 | GFP_KERNEL); |
3388 | if (!conf->mirrors) | 3413 | if (!conf->mirrors) |
@@ -3421,7 +3446,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
3421 | spin_lock_init(&conf->resync_lock); | 3446 | spin_lock_init(&conf->resync_lock); |
3422 | init_waitqueue_head(&conf->wait_barrier); | 3447 | init_waitqueue_head(&conf->wait_barrier); |
3423 | 3448 | ||
3424 | conf->thread = md_register_thread(raid10d, mddev, NULL); | 3449 | conf->thread = md_register_thread(raid10d, mddev, "raid10"); |
3425 | if (!conf->thread) | 3450 | if (!conf->thread) |
3426 | goto out; | 3451 | goto out; |
3427 | 3452 | ||
@@ -3446,7 +3471,7 @@ static int run(struct mddev *mddev) | |||
3446 | { | 3471 | { |
3447 | struct r10conf *conf; | 3472 | struct r10conf *conf; |
3448 | int i, disk_idx, chunk_size; | 3473 | int i, disk_idx, chunk_size; |
3449 | struct mirror_info *disk; | 3474 | struct raid10_info *disk; |
3450 | struct md_rdev *rdev; | 3475 | struct md_rdev *rdev; |
3451 | sector_t size; | 3476 | sector_t size; |
3452 | sector_t min_offset_diff = 0; | 3477 | sector_t min_offset_diff = 0; |
@@ -3466,12 +3491,14 @@ static int run(struct mddev *mddev) | |||
3466 | conf->thread = NULL; | 3491 | conf->thread = NULL; |
3467 | 3492 | ||
3468 | chunk_size = mddev->chunk_sectors << 9; | 3493 | chunk_size = mddev->chunk_sectors << 9; |
3469 | blk_queue_io_min(mddev->queue, chunk_size); | 3494 | if (mddev->queue) { |
3470 | if (conf->geo.raid_disks % conf->geo.near_copies) | 3495 | blk_queue_io_min(mddev->queue, chunk_size); |
3471 | blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); | 3496 | if (conf->geo.raid_disks % conf->geo.near_copies) |
3472 | else | 3497 | blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); |
3473 | blk_queue_io_opt(mddev->queue, chunk_size * | 3498 | else |
3474 | (conf->geo.raid_disks / conf->geo.near_copies)); | 3499 | blk_queue_io_opt(mddev->queue, chunk_size * |
3500 | (conf->geo.raid_disks / conf->geo.near_copies)); | ||
3501 | } | ||
3475 | 3502 | ||
3476 | rdev_for_each(rdev, mddev) { | 3503 | rdev_for_each(rdev, mddev) { |
3477 | long long diff; | 3504 | long long diff; |
@@ -3505,8 +3532,9 @@ static int run(struct mddev *mddev) | |||
3505 | if (first || diff < min_offset_diff) | 3532 | if (first || diff < min_offset_diff) |
3506 | min_offset_diff = diff; | 3533 | min_offset_diff = diff; |
3507 | 3534 | ||
3508 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 3535 | if (mddev->gendisk) |
3509 | rdev->data_offset << 9); | 3536 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
3537 | rdev->data_offset << 9); | ||
3510 | 3538 | ||
3511 | disk->head_position = 0; | 3539 | disk->head_position = 0; |
3512 | } | 3540 | } |
@@ -3569,22 +3597,22 @@ static int run(struct mddev *mddev) | |||
3569 | md_set_array_sectors(mddev, size); | 3597 | md_set_array_sectors(mddev, size); |
3570 | mddev->resync_max_sectors = size; | 3598 | mddev->resync_max_sectors = size; |
3571 | 3599 | ||
3572 | mddev->queue->backing_dev_info.congested_fn = raid10_congested; | 3600 | if (mddev->queue) { |
3573 | mddev->queue->backing_dev_info.congested_data = mddev; | ||
3574 | |||
3575 | /* Calculate max read-ahead size. | ||
3576 | * We need to readahead at least twice a whole stripe.... | ||
3577 | * maybe... | ||
3578 | */ | ||
3579 | { | ||
3580 | int stripe = conf->geo.raid_disks * | 3601 | int stripe = conf->geo.raid_disks * |
3581 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); | 3602 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); |
3603 | mddev->queue->backing_dev_info.congested_fn = raid10_congested; | ||
3604 | mddev->queue->backing_dev_info.congested_data = mddev; | ||
3605 | |||
3606 | /* Calculate max read-ahead size. | ||
3607 | * We need to readahead at least twice a whole stripe.... | ||
3608 | * maybe... | ||
3609 | */ | ||
3582 | stripe /= conf->geo.near_copies; | 3610 | stripe /= conf->geo.near_copies; |
3583 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | 3611 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) |
3584 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | 3612 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; |
3613 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); | ||
3585 | } | 3614 | } |
3586 | 3615 | ||
3587 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); | ||
3588 | 3616 | ||
3589 | if (md_integrity_register(mddev)) | 3617 | if (md_integrity_register(mddev)) |
3590 | goto out_free_conf; | 3618 | goto out_free_conf; |
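
The relocated read-ahead calculation sizes ra_pages to at least two full stripes' worth of pages, and now runs only when a request queue exists. The same arithmetic as a standalone program, with example numbers (6 disks, 512 KiB chunks, near_copies = 2, 4 KiB pages) chosen purely for illustration:

#include <stdio.h>

int main(void)
{
    unsigned long page_size = 4096;        /* assumed PAGE_SIZE */
    unsigned long chunk_sectors = 1024;    /* 512 KiB chunk */
    unsigned long raid_disks = 6, near_copies = 2;
    unsigned long ra_pages = 32;           /* current readahead setting */

    /* pages covered by one chunk on every device, folded by the copies */
    unsigned long stripe = raid_disks * ((chunk_sectors << 9) / page_size);
    stripe /= near_copies;
    if (ra_pages < 2 * stripe)
        ra_pages = 2 * stripe;

    /* 6 * (524288 / 4096) / 2 = 384 pages, so ra_pages becomes 768 */
    printf("stripe = %lu pages, ra_pages = %lu\n", stripe, ra_pages);
    return 0;
}
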
@@ -3635,7 +3663,10 @@ static int stop(struct mddev *mddev) | |||
3635 | lower_barrier(conf); | 3663 | lower_barrier(conf); |
3636 | 3664 | ||
3637 | md_unregister_thread(&mddev->thread); | 3665 | md_unregister_thread(&mddev->thread); |
3638 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 3666 | if (mddev->queue) |
3667 | /* the unplug fn references 'conf'*/ | ||
3668 | blk_sync_queue(mddev->queue); | ||
3669 | |||
3639 | if (conf->r10bio_pool) | 3670 | if (conf->r10bio_pool) |
3640 | mempool_destroy(conf->r10bio_pool); | 3671 | mempool_destroy(conf->r10bio_pool); |
3641 | kfree(conf->mirrors); | 3672 | kfree(conf->mirrors); |
@@ -3799,7 +3830,7 @@ static int raid10_check_reshape(struct mddev *mddev) | |||
3799 | if (mddev->delta_disks > 0) { | 3830 | if (mddev->delta_disks > 0) { |
3800 | /* allocate new 'mirrors' list */ | 3831 | /* allocate new 'mirrors' list */ |
3801 | conf->mirrors_new = kzalloc( | 3832 | conf->mirrors_new = kzalloc( |
3802 | sizeof(struct mirror_info) | 3833 | sizeof(struct raid10_info) |
3803 | *(mddev->raid_disks + | 3834 | *(mddev->raid_disks + |
3804 | mddev->delta_disks), | 3835 | mddev->delta_disks), |
3805 | GFP_KERNEL); | 3836 | GFP_KERNEL); |
@@ -3924,7 +3955,7 @@ static int raid10_start_reshape(struct mddev *mddev) | |||
3924 | spin_lock_irq(&conf->device_lock); | 3955 | spin_lock_irq(&conf->device_lock); |
3925 | if (conf->mirrors_new) { | 3956 | if (conf->mirrors_new) { |
3926 | memcpy(conf->mirrors_new, conf->mirrors, | 3957 | memcpy(conf->mirrors_new, conf->mirrors, |
3927 | sizeof(struct mirror_info)*conf->prev.raid_disks); | 3958 | sizeof(struct raid10_info)*conf->prev.raid_disks); |
3928 | smp_mb(); | 3959 | smp_mb(); |
3929 | kfree(conf->mirrors_old); /* FIXME and elsewhere */ | 3960 | kfree(conf->mirrors_old); /* FIXME and elsewhere */ |
3930 | conf->mirrors_old = conf->mirrors; | 3961 | conf->mirrors_old = conf->mirrors; |
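
raid10_start_reshape() copies the old mirrors array into the preallocated replacement, orders the copy against publication with smp_mb(), and parks the previous array in mirrors_old rather than freeing it immediately. Below is a rough userspace model of that grow-and-publish pattern, using C11 release semantics in place of the barrier; struct info, grow_mirrors() and the payload are invented for the sketch:

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct info { int slot; };              /* invented per-device payload */

struct conf {
    _Atomic(struct info *) mirrors;     /* what readers dereference */
    struct info *mirrors_old;           /* kept around, freed next time */
    int raid_disks;
};

static int grow_mirrors(struct conf *c, int new_disks)
{
    struct info *fresh = calloc(new_disks, sizeof(*fresh));

    if (!fresh)
        return -1;
    memcpy(fresh, atomic_load(&c->mirrors),
           sizeof(*fresh) * c->raid_disks);
    free(c->mirrors_old);               /* generation before last */
    c->mirrors_old = atomic_load(&c->mirrors);
    /* release ordering stands in for the smp_mb() before publishing */
    atomic_store_explicit(&c->mirrors, fresh, memory_order_release);
    c->raid_disks = new_disks;
    return 0;
}

int main(void)
{
    struct conf c = { .raid_disks = 2 };

    atomic_init(&c.mirrors, calloc(2, sizeof(struct info)));
    return grow_mirrors(&c, 4);
}
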
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 135b1b0a1554..007c2c68dd83 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
@@ -1,7 +1,7 @@ | |||
1 | #ifndef _RAID10_H | 1 | #ifndef _RAID10_H |
2 | #define _RAID10_H | 2 | #define _RAID10_H |
3 | 3 | ||
4 | struct mirror_info { | 4 | struct raid10_info { |
5 | struct md_rdev *rdev, *replacement; | 5 | struct md_rdev *rdev, *replacement; |
6 | sector_t head_position; | 6 | sector_t head_position; |
7 | int recovery_disabled; /* matches | 7 | int recovery_disabled; /* matches |
@@ -13,8 +13,8 @@ struct mirror_info { | |||
13 | 13 | ||
14 | struct r10conf { | 14 | struct r10conf { |
15 | struct mddev *mddev; | 15 | struct mddev *mddev; |
16 | struct mirror_info *mirrors; | 16 | struct raid10_info *mirrors; |
17 | struct mirror_info *mirrors_new, *mirrors_old; | 17 | struct raid10_info *mirrors_new, *mirrors_old; |
18 | spinlock_t device_lock; | 18 | spinlock_t device_lock; |
19 | 19 | ||
20 | /* geometry */ | 20 | /* geometry */ |
@@ -123,20 +123,6 @@ struct r10bio { | |||
123 | } devs[0]; | 123 | } devs[0]; |
124 | }; | 124 | }; |
125 | 125 | ||
126 | /* when we get a read error on a read-only array, we redirect to another | ||
127 | * device without failing the first device, or trying to over-write to | ||
128 | * correct the read error. To keep track of bad blocks on a per-bio | ||
129 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | ||
130 | */ | ||
131 | #define IO_BLOCKED ((struct bio*)1) | ||
132 | /* When we successfully write to a known bad-block, we need to remove the | ||
133 | * bad-block marking which must be done from process context. So we record | ||
134 | * the success by setting devs[n].bio to IO_MADE_GOOD | ||
135 | */ | ||
136 | #define IO_MADE_GOOD ((struct bio *)2) | ||
137 | |||
138 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | ||
139 | |||
140 | /* bits for r10bio.state */ | 126 | /* bits for r10bio.state */ |
141 | enum r10bio_state { | 127 | enum r10bio_state { |
142 | R10BIO_Uptodate, | 128 | R10BIO_Uptodate, |
@@ -159,4 +145,7 @@ enum r10bio_state { | |||
159 | */ | 145 | */ |
160 | R10BIO_Previous, | 146 | R10BIO_Previous, |
161 | }; | 147 | }; |
148 | |||
149 | extern int md_raid10_congested(struct mddev *mddev, int bits); | ||
150 | |||
162 | #endif | 151 | #endif |
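
The header loses the IO_BLOCKED/IO_MADE_GOOD/BIO_SPECIAL definitions (this hunk does not show where they move to), which makes the underlying idiom worth spelling out: small integer constants cast to a pointer type act as sentinel values in slots that normally hold a real struct bio pointer. A self-contained illustration with a stand-in bio type:

#include <stdio.h>

struct bio { int dummy; };              /* stand-in for the real struct bio */

/* Sentinels stored where a real bio pointer normally lives. */
#define IO_BLOCKED    ((struct bio *)1)
#define IO_MADE_GOOD  ((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

int main(void)
{
    struct bio real;
    struct bio *slots[] = { NULL, IO_BLOCKED, IO_MADE_GOOD, &real };

    for (int i = 0; i < 4; i++)
        printf("slot %d holds a %s\n", i,
               BIO_SPECIAL(slots[i]) ? "sentinel (or nothing)" : "real bio");
    return 0;
}
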
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d26767246d26..adda94df5eb2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -99,34 +99,40 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) | |||
99 | * We maintain a biased count of active stripes in the bottom 16 bits of | 99 | * We maintain a biased count of active stripes in the bottom 16 bits of |
100 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits | 100 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits |
101 | */ | 101 | */ |
102 | static inline int raid5_bi_phys_segments(struct bio *bio) | 102 | static inline int raid5_bi_processed_stripes(struct bio *bio) |
103 | { | 103 | { |
104 | return bio->bi_phys_segments & 0xffff; | 104 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; |
105 | return (atomic_read(segments) >> 16) & 0xffff; | ||
105 | } | 106 | } |
106 | 107 | ||
107 | static inline int raid5_bi_hw_segments(struct bio *bio) | 108 | static inline int raid5_dec_bi_active_stripes(struct bio *bio) |
108 | { | 109 | { |
109 | return (bio->bi_phys_segments >> 16) & 0xffff; | 110 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; |
111 | return atomic_sub_return(1, segments) & 0xffff; | ||
110 | } | 112 | } |
111 | 113 | ||
112 | static inline int raid5_dec_bi_phys_segments(struct bio *bio) | 114 | static inline void raid5_inc_bi_active_stripes(struct bio *bio) |
113 | { | 115 | { |
114 | --bio->bi_phys_segments; | 116 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; |
115 | return raid5_bi_phys_segments(bio); | 117 | atomic_inc(segments); |
116 | } | 118 | } |
117 | 119 | ||
118 | static inline int raid5_dec_bi_hw_segments(struct bio *bio) | 120 | static inline void raid5_set_bi_processed_stripes(struct bio *bio, |
121 | unsigned int cnt) | ||
119 | { | 122 | { |
120 | unsigned short val = raid5_bi_hw_segments(bio); | 123 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; |
124 | int old, new; | ||
121 | 125 | ||
122 | --val; | 126 | do { |
123 | bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); | 127 | old = atomic_read(segments); |
124 | return val; | 128 | new = (old & 0xffff) | (cnt << 16); |
129 | } while (atomic_cmpxchg(segments, old, new) != old); | ||
125 | } | 130 | } |
126 | 131 | ||
127 | static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) | 132 | static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) |
128 | { | 133 | { |
129 | bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); | 134 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; |
135 | atomic_set(segments, cnt); | ||
130 | } | 136 | } |
131 | 137 | ||
132 | /* Find first data disk in a raid6 stripe */ | 138 | /* Find first data disk in a raid6 stripe */ |
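
The rewritten helpers treat bio->bi_phys_segments as a single 32-bit atomic holding an active-stripes count in the low 16 bits and a processed-stripes count in the high 16 bits, so neither half needs a spinlock to update. A userspace model of the same packing with C11 atomics (names adapted, atomic_uint in place of the kernel's atomic_t):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint segments;    /* stands in for bio->bi_phys_segments */

static unsigned processed_stripes(void)
{
    return (atomic_load(&segments) >> 16) & 0xffff;
}

static unsigned dec_active_stripes(void)
{
    /* remaining active count, like atomic_sub_return(1, ...) & 0xffff */
    return (atomic_fetch_sub(&segments, 1) - 1) & 0xffff;
}

static void inc_active_stripes(void)
{
    atomic_fetch_add(&segments, 1);
}

static void set_processed_stripes(unsigned cnt)
{
    unsigned old = atomic_load(&segments), new;

    do {    /* rewrite only the high half, like the atomic_cmpxchg() loop */
        new = (old & 0xffff) | (cnt << 16);
    } while (!atomic_compare_exchange_weak(&segments, &old, new));
}

int main(void)
{
    atomic_store(&segments, 1);     /* biased count of active stripes */
    inc_active_stripes();
    set_processed_stripes(5);
    printf("processed=%u\n", processed_stripes());      /* 5 */
    printf("active left=%u\n", dec_active_stripes());   /* 1 */
    return 0;
}
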
@@ -190,47 +196,56 @@ static int stripe_operations_active(struct stripe_head *sh) | |||
190 | test_bit(STRIPE_COMPUTE_RUN, &sh->state); | 196 | test_bit(STRIPE_COMPUTE_RUN, &sh->state); |
191 | } | 197 | } |
192 | 198 | ||
193 | static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) | 199 | static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) |
194 | { | 200 | { |
195 | if (atomic_dec_and_test(&sh->count)) { | 201 | BUG_ON(!list_empty(&sh->lru)); |
196 | BUG_ON(!list_empty(&sh->lru)); | 202 | BUG_ON(atomic_read(&conf->active_stripes)==0); |
197 | BUG_ON(atomic_read(&conf->active_stripes)==0); | 203 | if (test_bit(STRIPE_HANDLE, &sh->state)) { |
198 | if (test_bit(STRIPE_HANDLE, &sh->state)) { | 204 | if (test_bit(STRIPE_DELAYED, &sh->state) && |
199 | if (test_bit(STRIPE_DELAYED, &sh->state)) | 205 | !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
200 | list_add_tail(&sh->lru, &conf->delayed_list); | 206 | list_add_tail(&sh->lru, &conf->delayed_list); |
201 | else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && | 207 | else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && |
202 | sh->bm_seq - conf->seq_write > 0) | 208 | sh->bm_seq - conf->seq_write > 0) |
203 | list_add_tail(&sh->lru, &conf->bitmap_list); | 209 | list_add_tail(&sh->lru, &conf->bitmap_list); |
204 | else { | 210 | else { |
205 | clear_bit(STRIPE_BIT_DELAY, &sh->state); | 211 | clear_bit(STRIPE_DELAYED, &sh->state); |
206 | list_add_tail(&sh->lru, &conf->handle_list); | 212 | clear_bit(STRIPE_BIT_DELAY, &sh->state); |
207 | } | 213 | list_add_tail(&sh->lru, &conf->handle_list); |
208 | md_wakeup_thread(conf->mddev->thread); | 214 | } |
209 | } else { | 215 | md_wakeup_thread(conf->mddev->thread); |
210 | BUG_ON(stripe_operations_active(sh)); | 216 | } else { |
211 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 217 | BUG_ON(stripe_operations_active(sh)); |
212 | if (atomic_dec_return(&conf->preread_active_stripes) | 218 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
213 | < IO_THRESHOLD) | 219 | if (atomic_dec_return(&conf->preread_active_stripes) |
214 | md_wakeup_thread(conf->mddev->thread); | 220 | < IO_THRESHOLD) |
215 | atomic_dec(&conf->active_stripes); | 221 | md_wakeup_thread(conf->mddev->thread); |
216 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) { | 222 | atomic_dec(&conf->active_stripes); |
217 | list_add_tail(&sh->lru, &conf->inactive_list); | 223 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) { |
218 | wake_up(&conf->wait_for_stripe); | 224 | list_add_tail(&sh->lru, &conf->inactive_list); |
219 | if (conf->retry_read_aligned) | 225 | wake_up(&conf->wait_for_stripe); |
220 | md_wakeup_thread(conf->mddev->thread); | 226 | if (conf->retry_read_aligned) |
221 | } | 227 | md_wakeup_thread(conf->mddev->thread); |
222 | } | 228 | } |
223 | } | 229 | } |
224 | } | 230 | } |
225 | 231 | ||
232 | static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) | ||
233 | { | ||
234 | if (atomic_dec_and_test(&sh->count)) | ||
235 | do_release_stripe(conf, sh); | ||
236 | } | ||
237 | |||
226 | static void release_stripe(struct stripe_head *sh) | 238 | static void release_stripe(struct stripe_head *sh) |
227 | { | 239 | { |
228 | struct r5conf *conf = sh->raid_conf; | 240 | struct r5conf *conf = sh->raid_conf; |
229 | unsigned long flags; | 241 | unsigned long flags; |
230 | 242 | ||
231 | spin_lock_irqsave(&conf->device_lock, flags); | 243 | local_irq_save(flags); |
232 | __release_stripe(conf, sh); | 244 | if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { |
233 | spin_unlock_irqrestore(&conf->device_lock, flags); | 245 | do_release_stripe(conf, sh); |
246 | spin_unlock(&conf->device_lock); | ||
247 | } | ||
248 | local_irq_restore(flags); | ||
234 | } | 249 | } |
235 | 250 | ||
236 | static inline void remove_hash(struct stripe_head *sh) | 251 | static inline void remove_hash(struct stripe_head *sh) |
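
release_stripe() now goes through atomic_dec_and_lock(), so device_lock is only taken when the reference count actually reaches zero. The kernel provides that primitive in lib/dec_and_lock.c; the userspace approximation below just shows the fast-path/slow-path idea with C11 atomics and a pthread mutex:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Decrement *cnt; only if it reaches zero, return true with *lock held. */
static bool dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
    int old = atomic_load(cnt);

    /* Fast path: while more than one reference remains, a plain
     * decrement is enough and the lock is never touched. */
    while (old > 1) {
        if (atomic_compare_exchange_weak(cnt, &old, old - 1))
            return false;
    }

    /* Slow path: we may be dropping the last reference, so take the
     * lock first and re-check under it. */
    pthread_mutex_lock(lock);
    if (atomic_fetch_sub(cnt, 1) == 1)
        return true;                /* count hit zero, lock stays held */
    pthread_mutex_unlock(lock);
    return false;
}

int main(void)
{
    atomic_int count = 2;
    pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    printf("first drop took the lock: %d\n", dec_and_lock(&count, &lock));
    if (dec_and_lock(&count, &lock)) {      /* last drop: lock is held */
        printf("released last reference\n");
        pthread_mutex_unlock(&lock);
    }
    return 0;
}
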
@@ -469,7 +484,8 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
469 | } else { | 484 | } else { |
470 | if (atomic_read(&sh->count)) { | 485 | if (atomic_read(&sh->count)) { |
471 | BUG_ON(!list_empty(&sh->lru) | 486 | BUG_ON(!list_empty(&sh->lru) |
472 | && !test_bit(STRIPE_EXPANDING, &sh->state)); | 487 | && !test_bit(STRIPE_EXPANDING, &sh->state) |
488 | && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)); | ||
473 | } else { | 489 | } else { |
474 | if (!test_bit(STRIPE_HANDLE, &sh->state)) | 490 | if (!test_bit(STRIPE_HANDLE, &sh->state)) |
475 | atomic_inc(&conf->active_stripes); | 491 | atomic_inc(&conf->active_stripes); |
@@ -606,6 +622,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
606 | * a chance*/ | 622 | * a chance*/ |
607 | md_check_recovery(conf->mddev); | 623 | md_check_recovery(conf->mddev); |
608 | } | 624 | } |
625 | /* | ||
626 | * Because md_wait_for_blocked_rdev | ||
627 | * will dec nr_pending, we must | ||
628 | * increment it first. | ||
629 | */ | ||
630 | atomic_inc(&rdev->nr_pending); | ||
609 | md_wait_for_blocked_rdev(rdev, conf->mddev); | 631 | md_wait_for_blocked_rdev(rdev, conf->mddev); |
610 | } else { | 632 | } else { |
611 | /* Acknowledged bad block - skip the write */ | 633 | /* Acknowledged bad block - skip the write */ |
@@ -632,6 +654,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
632 | else | 654 | else |
633 | bi->bi_sector = (sh->sector | 655 | bi->bi_sector = (sh->sector |
634 | + rdev->data_offset); | 656 | + rdev->data_offset); |
657 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | ||
658 | bi->bi_rw |= REQ_FLUSH; | ||
659 | |||
635 | bi->bi_flags = 1 << BIO_UPTODATE; | 660 | bi->bi_flags = 1 << BIO_UPTODATE; |
636 | bi->bi_idx = 0; | 661 | bi->bi_idx = 0; |
637 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 662 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
@@ -741,14 +766,12 @@ static void ops_complete_biofill(void *stripe_head_ref) | |||
741 | { | 766 | { |
742 | struct stripe_head *sh = stripe_head_ref; | 767 | struct stripe_head *sh = stripe_head_ref; |
743 | struct bio *return_bi = NULL; | 768 | struct bio *return_bi = NULL; |
744 | struct r5conf *conf = sh->raid_conf; | ||
745 | int i; | 769 | int i; |
746 | 770 | ||
747 | pr_debug("%s: stripe %llu\n", __func__, | 771 | pr_debug("%s: stripe %llu\n", __func__, |
748 | (unsigned long long)sh->sector); | 772 | (unsigned long long)sh->sector); |
749 | 773 | ||
750 | /* clear completed biofills */ | 774 | /* clear completed biofills */ |
751 | spin_lock_irq(&conf->device_lock); | ||
752 | for (i = sh->disks; i--; ) { | 775 | for (i = sh->disks; i--; ) { |
753 | struct r5dev *dev = &sh->dev[i]; | 776 | struct r5dev *dev = &sh->dev[i]; |
754 | 777 | ||
@@ -766,7 +789,7 @@ static void ops_complete_biofill(void *stripe_head_ref) | |||
766 | while (rbi && rbi->bi_sector < | 789 | while (rbi && rbi->bi_sector < |
767 | dev->sector + STRIPE_SECTORS) { | 790 | dev->sector + STRIPE_SECTORS) { |
768 | rbi2 = r5_next_bio(rbi, dev->sector); | 791 | rbi2 = r5_next_bio(rbi, dev->sector); |
769 | if (!raid5_dec_bi_phys_segments(rbi)) { | 792 | if (!raid5_dec_bi_active_stripes(rbi)) { |
770 | rbi->bi_next = return_bi; | 793 | rbi->bi_next = return_bi; |
771 | return_bi = rbi; | 794 | return_bi = rbi; |
772 | } | 795 | } |
@@ -774,7 +797,6 @@ static void ops_complete_biofill(void *stripe_head_ref) | |||
774 | } | 797 | } |
775 | } | 798 | } |
776 | } | 799 | } |
777 | spin_unlock_irq(&conf->device_lock); | ||
778 | clear_bit(STRIPE_BIOFILL_RUN, &sh->state); | 800 | clear_bit(STRIPE_BIOFILL_RUN, &sh->state); |
779 | 801 | ||
780 | return_io(return_bi); | 802 | return_io(return_bi); |
@@ -786,7 +808,6 @@ static void ops_complete_biofill(void *stripe_head_ref) | |||
786 | static void ops_run_biofill(struct stripe_head *sh) | 808 | static void ops_run_biofill(struct stripe_head *sh) |
787 | { | 809 | { |
788 | struct dma_async_tx_descriptor *tx = NULL; | 810 | struct dma_async_tx_descriptor *tx = NULL; |
789 | struct r5conf *conf = sh->raid_conf; | ||
790 | struct async_submit_ctl submit; | 811 | struct async_submit_ctl submit; |
791 | int i; | 812 | int i; |
792 | 813 | ||
@@ -797,10 +818,10 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
797 | struct r5dev *dev = &sh->dev[i]; | 818 | struct r5dev *dev = &sh->dev[i]; |
798 | if (test_bit(R5_Wantfill, &dev->flags)) { | 819 | if (test_bit(R5_Wantfill, &dev->flags)) { |
799 | struct bio *rbi; | 820 | struct bio *rbi; |
800 | spin_lock_irq(&conf->device_lock); | 821 | spin_lock_irq(&sh->stripe_lock); |
801 | dev->read = rbi = dev->toread; | 822 | dev->read = rbi = dev->toread; |
802 | dev->toread = NULL; | 823 | dev->toread = NULL; |
803 | spin_unlock_irq(&conf->device_lock); | 824 | spin_unlock_irq(&sh->stripe_lock); |
804 | while (rbi && rbi->bi_sector < | 825 | while (rbi && rbi->bi_sector < |
805 | dev->sector + STRIPE_SECTORS) { | 826 | dev->sector + STRIPE_SECTORS) { |
806 | tx = async_copy_data(0, rbi, dev->page, | 827 | tx = async_copy_data(0, rbi, dev->page, |
@@ -1136,12 +1157,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
1136 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { | 1157 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { |
1137 | struct bio *wbi; | 1158 | struct bio *wbi; |
1138 | 1159 | ||
1139 | spin_lock_irq(&sh->raid_conf->device_lock); | 1160 | spin_lock_irq(&sh->stripe_lock); |
1140 | chosen = dev->towrite; | 1161 | chosen = dev->towrite; |
1141 | dev->towrite = NULL; | 1162 | dev->towrite = NULL; |
1142 | BUG_ON(dev->written); | 1163 | BUG_ON(dev->written); |
1143 | wbi = dev->written = chosen; | 1164 | wbi = dev->written = chosen; |
1144 | spin_unlock_irq(&sh->raid_conf->device_lock); | 1165 | spin_unlock_irq(&sh->stripe_lock); |
1145 | 1166 | ||
1146 | while (wbi && wbi->bi_sector < | 1167 | while (wbi && wbi->bi_sector < |
1147 | dev->sector + STRIPE_SECTORS) { | 1168 | dev->sector + STRIPE_SECTORS) { |
@@ -1446,6 +1467,8 @@ static int grow_one_stripe(struct r5conf *conf) | |||
1446 | init_waitqueue_head(&sh->ops.wait_for_ops); | 1467 | init_waitqueue_head(&sh->ops.wait_for_ops); |
1447 | #endif | 1468 | #endif |
1448 | 1469 | ||
1470 | spin_lock_init(&sh->stripe_lock); | ||
1471 | |||
1449 | if (grow_buffers(sh)) { | 1472 | if (grow_buffers(sh)) { |
1450 | shrink_buffers(sh); | 1473 | shrink_buffers(sh); |
1451 | kmem_cache_free(conf->slab_cache, sh); | 1474 | kmem_cache_free(conf->slab_cache, sh); |
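
grow_one_stripe() now initialises a spinlock embedded in every stripe_head, and later hunks move towrite/toread manipulation from the array-wide conf->device_lock onto it. The plain pthreads sketch below shows that coarse-to-fine locking shift; struct item and its helpers are invented for the example:

#include <pthread.h>

struct item {
    pthread_mutex_t lock;       /* per-object lock, like sh->stripe_lock */
    void *towrite;
    void *toread;
};

/* One lock per item: threads touching different items never contend,
 * unlike a single array-wide lock serialising every update. */
static void item_init(struct item *it)
{
    pthread_mutex_init(&it->lock, NULL);
    it->towrite = it->toread = NULL;
}

static void item_queue_write(struct item *it, void *bio)
{
    pthread_mutex_lock(&it->lock);
    it->towrite = bio;          /* only this item's state is protected */
    pthread_mutex_unlock(&it->lock);
}

int main(void)
{
    struct item a, b;
    int x;

    item_init(&a);
    item_init(&b);
    item_queue_write(&a, &x);   /* would not block a writer on 'b' */
    return 0;
}
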
@@ -1731,12 +1754,15 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1731 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); | 1754 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); |
1732 | clear_bit(R5_ReadError, &sh->dev[i].flags); | 1755 | clear_bit(R5_ReadError, &sh->dev[i].flags); |
1733 | clear_bit(R5_ReWrite, &sh->dev[i].flags); | 1756 | clear_bit(R5_ReWrite, &sh->dev[i].flags); |
1734 | } | 1757 | } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) |
1758 | clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); | ||
1759 | |||
1735 | if (atomic_read(&rdev->read_errors)) | 1760 | if (atomic_read(&rdev->read_errors)) |
1736 | atomic_set(&rdev->read_errors, 0); | 1761 | atomic_set(&rdev->read_errors, 0); |
1737 | } else { | 1762 | } else { |
1738 | const char *bdn = bdevname(rdev->bdev, b); | 1763 | const char *bdn = bdevname(rdev->bdev, b); |
1739 | int retry = 0; | 1764 | int retry = 0; |
1765 | int set_bad = 0; | ||
1740 | 1766 | ||
1741 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | 1767 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); |
1742 | atomic_inc(&rdev->read_errors); | 1768 | atomic_inc(&rdev->read_errors); |
@@ -1748,7 +1774,8 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1748 | mdname(conf->mddev), | 1774 | mdname(conf->mddev), |
1749 | (unsigned long long)s, | 1775 | (unsigned long long)s, |
1750 | bdn); | 1776 | bdn); |
1751 | else if (conf->mddev->degraded >= conf->max_degraded) | 1777 | else if (conf->mddev->degraded >= conf->max_degraded) { |
1778 | set_bad = 1; | ||
1752 | printk_ratelimited( | 1779 | printk_ratelimited( |
1753 | KERN_WARNING | 1780 | KERN_WARNING |
1754 | "md/raid:%s: read error not correctable " | 1781 | "md/raid:%s: read error not correctable " |
@@ -1756,8 +1783,9 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1756 | mdname(conf->mddev), | 1783 | mdname(conf->mddev), |
1757 | (unsigned long long)s, | 1784 | (unsigned long long)s, |
1758 | bdn); | 1785 | bdn); |
1759 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) | 1786 | } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { |
1760 | /* Oh, no!!! */ | 1787 | /* Oh, no!!! */ |
1788 | set_bad = 1; | ||
1761 | printk_ratelimited( | 1789 | printk_ratelimited( |
1762 | KERN_WARNING | 1790 | KERN_WARNING |
1763 | "md/raid:%s: read error NOT corrected!! " | 1791 | "md/raid:%s: read error NOT corrected!! " |
@@ -1765,7 +1793,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1765 | mdname(conf->mddev), | 1793 | mdname(conf->mddev), |
1766 | (unsigned long long)s, | 1794 | (unsigned long long)s, |
1767 | bdn); | 1795 | bdn); |
1768 | else if (atomic_read(&rdev->read_errors) | 1796 | } else if (atomic_read(&rdev->read_errors) |
1769 | > conf->max_nr_stripes) | 1797 | > conf->max_nr_stripes) |
1770 | printk(KERN_WARNING | 1798 | printk(KERN_WARNING |
1771 | "md/raid:%s: Too many read errors, failing device %s.\n", | 1799 | "md/raid:%s: Too many read errors, failing device %s.\n", |
@@ -1773,11 +1801,19 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1773 | else | 1801 | else |
1774 | retry = 1; | 1802 | retry = 1; |
1775 | if (retry) | 1803 | if (retry) |
1776 | set_bit(R5_ReadError, &sh->dev[i].flags); | 1804 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { |
1805 | set_bit(R5_ReadError, &sh->dev[i].flags); | ||
1806 | clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); | ||
1807 | } else | ||
1808 | set_bit(R5_ReadNoMerge, &sh->dev[i].flags); | ||
1777 | else { | 1809 | else { |
1778 | clear_bit(R5_ReadError, &sh->dev[i].flags); | 1810 | clear_bit(R5_ReadError, &sh->dev[i].flags); |
1779 | clear_bit(R5_ReWrite, &sh->dev[i].flags); | 1811 | clear_bit(R5_ReWrite, &sh->dev[i].flags); |
1780 | md_error(conf->mddev, rdev); | 1812 | if (!(set_bad |
1813 | && test_bit(In_sync, &rdev->flags) | ||
1814 | && rdev_set_badblocks( | ||
1815 | rdev, sh->sector, STRIPE_SECTORS, 0))) | ||
1816 | md_error(conf->mddev, rdev); | ||
1781 | } | 1817 | } |
1782 | } | 1818 | } |
1783 | rdev_dec_pending(rdev, conf->mddev); | 1819 | rdev_dec_pending(rdev, conf->mddev); |
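
The retry branch now escalates in two steps - first set R5_ReadNoMerge and retry with merging disabled, only then mark R5_ReadError - and the give-up branch prefers recording a bad block over failing the whole device with md_error(). A deliberately loose toy state machine of that escalation; every name below is made up for the illustration:

#include <stdbool.h>
#include <stdio.h>

enum dev_state {
    DEV_OK,
    DEV_RETRY_NOMERGE,      /* retry the read with merging disabled */
    DEV_READ_ERROR,         /* hand over to normal read-error handling */
    DEV_BADBLOCK,           /* sector recorded as bad, device kept */
    DEV_FAILED              /* last resort: fail the device */
};

static enum dev_state on_read_failure(enum dev_state s, bool retryable,
                                      bool badblock_recorded)
{
    if (retryable)
        return s == DEV_RETRY_NOMERGE ? DEV_READ_ERROR : DEV_RETRY_NOMERGE;
    return badblock_recorded ? DEV_BADBLOCK : DEV_FAILED;
}

int main(void)
{
    enum dev_state s = DEV_OK;

    s = on_read_failure(s, true, false);    /* first failure: retry unmerged */
    s = on_read_failure(s, true, false);    /* second failure: read error */
    printf("state: %d\n", s);               /* prints 2 (DEV_READ_ERROR) */
    return 0;
}
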
@@ -2325,11 +2361,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2325 | (unsigned long long)bi->bi_sector, | 2361 | (unsigned long long)bi->bi_sector, |
2326 | (unsigned long long)sh->sector); | 2362 | (unsigned long long)sh->sector); |
2327 | 2363 | ||
2328 | 2364 | /* | |
2329 | spin_lock_irq(&conf->device_lock); | 2365 | * If several bios share a stripe, the bio bi_phys_segments acts as a |
2366 | * reference count to avoid races. The reference count should already be | ||
2367 | * increased before this function is called (for example, in | ||
2368 | * make_request()), so other bios sharing this stripe will not free the | ||
2369 | * stripe. If a stripe is accessed by only one bio, the stripe lock will | ||
2370 | * protect it. | ||
2371 | */ | ||
2372 | spin_lock_irq(&sh->stripe_lock); | ||
2330 | if (forwrite) { | 2373 | if (forwrite) { |
2331 | bip = &sh->dev[dd_idx].towrite; | 2374 | bip = &sh->dev[dd_idx].towrite; |
2332 | if (*bip == NULL && sh->dev[dd_idx].written == NULL) | 2375 | if (*bip == NULL) |
2333 | firstwrite = 1; | 2376 | firstwrite = 1; |
2334 | } else | 2377 | } else |
2335 | bip = &sh->dev[dd_idx].toread; | 2378 | bip = &sh->dev[dd_idx].toread; |
@@ -2345,7 +2388,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2345 | if (*bip) | 2388 | if (*bip) |
2346 | bi->bi_next = *bip; | 2389 | bi->bi_next = *bip; |
2347 | *bip = bi; | 2390 | *bip = bi; |
2348 | bi->bi_phys_segments++; | 2391 | raid5_inc_bi_active_stripes(bi); |
2349 | 2392 | ||
2350 | if (forwrite) { | 2393 | if (forwrite) { |
2351 | /* check if page is covered */ | 2394 | /* check if page is covered */ |
@@ -2360,7 +2403,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2360 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) | 2403 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) |
2361 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | 2404 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); |
2362 | } | 2405 | } |
2363 | spin_unlock_irq(&conf->device_lock); | 2406 | spin_unlock_irq(&sh->stripe_lock); |
2364 | 2407 | ||
2365 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | 2408 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", |
2366 | (unsigned long long)(*bip)->bi_sector, | 2409 | (unsigned long long)(*bip)->bi_sector, |
@@ -2376,7 +2419,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2376 | 2419 | ||
2377 | overlap: | 2420 | overlap: |
2378 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); | 2421 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); |
2379 | spin_unlock_irq(&conf->device_lock); | 2422 | spin_unlock_irq(&sh->stripe_lock); |
2380 | return 0; | 2423 | return 0; |
2381 | } | 2424 | } |
2382 | 2425 | ||
@@ -2426,10 +2469,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
2426 | rdev_dec_pending(rdev, conf->mddev); | 2469 | rdev_dec_pending(rdev, conf->mddev); |
2427 | } | 2470 | } |
2428 | } | 2471 | } |
2429 | spin_lock_irq(&conf->device_lock); | 2472 | spin_lock_irq(&sh->stripe_lock); |
2430 | /* fail all writes first */ | 2473 | /* fail all writes first */ |
2431 | bi = sh->dev[i].towrite; | 2474 | bi = sh->dev[i].towrite; |
2432 | sh->dev[i].towrite = NULL; | 2475 | sh->dev[i].towrite = NULL; |
2476 | spin_unlock_irq(&sh->stripe_lock); | ||
2433 | if (bi) { | 2477 | if (bi) { |
2434 | s->to_write--; | 2478 | s->to_write--; |
2435 | bitmap_end = 1; | 2479 | bitmap_end = 1; |
@@ -2442,13 +2486,17 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
2442 | sh->dev[i].sector + STRIPE_SECTORS) { | 2486 | sh->dev[i].sector + STRIPE_SECTORS) { |
2443 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | 2487 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); |
2444 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | 2488 | clear_bit(BIO_UPTODATE, &bi->bi_flags); |
2445 | if (!raid5_dec_bi_phys_segments(bi)) { | 2489 | if (!raid5_dec_bi_active_stripes(bi)) { |
2446 | md_write_end(conf->mddev); | 2490 | md_write_end(conf->mddev); |
2447 | bi->bi_next = *return_bi; | 2491 | bi->bi_next = *return_bi; |
2448 | *return_bi = bi; | 2492 | *return_bi = bi; |
2449 | } | 2493 | } |
2450 | bi = nextbi; | 2494 | bi = nextbi; |
2451 | } | 2495 | } |
2496 | if (bitmap_end) | ||
2497 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
2498 | STRIPE_SECTORS, 0, 0); | ||
2499 | bitmap_end = 0; | ||
2452 | /* and fail all 'written' */ | 2500 | /* and fail all 'written' */ |
2453 | bi = sh->dev[i].written; | 2501 | bi = sh->dev[i].written; |
2454 | sh->dev[i].written = NULL; | 2502 | sh->dev[i].written = NULL; |
@@ -2457,7 +2505,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
2457 | sh->dev[i].sector + STRIPE_SECTORS) { | 2505 | sh->dev[i].sector + STRIPE_SECTORS) { |
2458 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); | 2506 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); |
2459 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | 2507 | clear_bit(BIO_UPTODATE, &bi->bi_flags); |
2460 | if (!raid5_dec_bi_phys_segments(bi)) { | 2508 | if (!raid5_dec_bi_active_stripes(bi)) { |
2461 | md_write_end(conf->mddev); | 2509 | md_write_end(conf->mddev); |
2462 | bi->bi_next = *return_bi; | 2510 | bi->bi_next = *return_bi; |
2463 | *return_bi = bi; | 2511 | *return_bi = bi; |
@@ -2481,14 +2529,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
2481 | struct bio *nextbi = | 2529 | struct bio *nextbi = |
2482 | r5_next_bio(bi, sh->dev[i].sector); | 2530 | r5_next_bio(bi, sh->dev[i].sector); |
2483 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | 2531 | clear_bit(BIO_UPTODATE, &bi->bi_flags); |
2484 | if (!raid5_dec_bi_phys_segments(bi)) { | 2532 | if (!raid5_dec_bi_active_stripes(bi)) { |
2485 | bi->bi_next = *return_bi; | 2533 | bi->bi_next = *return_bi; |
2486 | *return_bi = bi; | 2534 | *return_bi = bi; |
2487 | } | 2535 | } |
2488 | bi = nextbi; | 2536 | bi = nextbi; |
2489 | } | 2537 | } |
2490 | } | 2538 | } |
2491 | spin_unlock_irq(&conf->device_lock); | ||
2492 | if (bitmap_end) | 2539 | if (bitmap_end) |
2493 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | 2540 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, |
2494 | STRIPE_SECTORS, 0, 0); | 2541 | STRIPE_SECTORS, 0, 0); |
@@ -2692,30 +2739,23 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
2692 | test_bit(R5_UPTODATE, &dev->flags)) { | 2739 | test_bit(R5_UPTODATE, &dev->flags)) { |
2693 | /* We can return any write requests */ | 2740 | /* We can return any write requests */ |
2694 | struct bio *wbi, *wbi2; | 2741 | struct bio *wbi, *wbi2; |
2695 | int bitmap_end = 0; | ||
2696 | pr_debug("Return write for disc %d\n", i); | 2742 | pr_debug("Return write for disc %d\n", i); |
2697 | spin_lock_irq(&conf->device_lock); | ||
2698 | wbi = dev->written; | 2743 | wbi = dev->written; |
2699 | dev->written = NULL; | 2744 | dev->written = NULL; |
2700 | while (wbi && wbi->bi_sector < | 2745 | while (wbi && wbi->bi_sector < |
2701 | dev->sector + STRIPE_SECTORS) { | 2746 | dev->sector + STRIPE_SECTORS) { |
2702 | wbi2 = r5_next_bio(wbi, dev->sector); | 2747 | wbi2 = r5_next_bio(wbi, dev->sector); |
2703 | if (!raid5_dec_bi_phys_segments(wbi)) { | 2748 | if (!raid5_dec_bi_active_stripes(wbi)) { |
2704 | md_write_end(conf->mddev); | 2749 | md_write_end(conf->mddev); |
2705 | wbi->bi_next = *return_bi; | 2750 | wbi->bi_next = *return_bi; |
2706 | *return_bi = wbi; | 2751 | *return_bi = wbi; |
2707 | } | 2752 | } |
2708 | wbi = wbi2; | 2753 | wbi = wbi2; |
2709 | } | 2754 | } |
2710 | if (dev->towrite == NULL) | 2755 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, |
2711 | bitmap_end = 1; | 2756 | STRIPE_SECTORS, |
2712 | spin_unlock_irq(&conf->device_lock); | ||
2713 | if (bitmap_end) | ||
2714 | bitmap_endwrite(conf->mddev->bitmap, | ||
2715 | sh->sector, | ||
2716 | STRIPE_SECTORS, | ||
2717 | !test_bit(STRIPE_DEGRADED, &sh->state), | 2757 | !test_bit(STRIPE_DEGRADED, &sh->state), |
2718 | 0); | 2758 | 0); |
2719 | } | 2759 | } |
2720 | } | 2760 | } |
2721 | 2761 | ||
@@ -3167,7 +3207,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3167 | 3207 | ||
3168 | /* Now to look around and see what can be done */ | 3208 | /* Now to look around and see what can be done */ |
3169 | rcu_read_lock(); | 3209 | rcu_read_lock(); |
3170 | spin_lock_irq(&conf->device_lock); | ||
3171 | for (i=disks; i--; ) { | 3210 | for (i=disks; i--; ) { |
3172 | struct md_rdev *rdev; | 3211 | struct md_rdev *rdev; |
3173 | sector_t first_bad; | 3212 | sector_t first_bad; |
@@ -3313,7 +3352,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3313 | do_recovery = 1; | 3352 | do_recovery = 1; |
3314 | } | 3353 | } |
3315 | } | 3354 | } |
3316 | spin_unlock_irq(&conf->device_lock); | ||
3317 | if (test_bit(STRIPE_SYNCING, &sh->state)) { | 3355 | if (test_bit(STRIPE_SYNCING, &sh->state)) { |
3318 | /* If there is a failed device being replaced, | 3356 | /* If there is a failed device being replaced, |
3319 | * we must be recovering. | 3357 | * we must be recovering. |
@@ -3582,8 +3620,18 @@ static void handle_stripe(struct stripe_head *sh) | |||
3582 | 3620 | ||
3583 | finish: | 3621 | finish: |
3584 | /* wait for this device to become unblocked */ | 3622 | /* wait for this device to become unblocked */ |
3585 | if (conf->mddev->external && unlikely(s.blocked_rdev)) | 3623 | if (unlikely(s.blocked_rdev)) { |
3586 | md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); | 3624 | if (conf->mddev->external) |
3625 | md_wait_for_blocked_rdev(s.blocked_rdev, | ||
3626 | conf->mddev); | ||
3627 | else | ||
3628 | /* Internal metadata will immediately | ||
3629 | * be written by raid5d, so we don't | ||
3630 | * need to wait here. | ||
3631 | */ | ||
3632 | rdev_dec_pending(s.blocked_rdev, | ||
3633 | conf->mddev); | ||
3634 | } | ||
3587 | 3635 | ||
3588 | if (s.handle_bad_blocks) | 3636 | if (s.handle_bad_blocks) |
3589 | for (i = disks; i--; ) { | 3637 | for (i = disks; i--; ) { |
@@ -3766,7 +3814,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) | |||
3766 | * this sets the active stripe count to 1 and the processed | 3814 | * this sets the active stripe count to 1 and the processed |
3767 | * stripe count to zero (upper 16 bits) | 3815 | * stripe count to zero (upper 16 bits) |
3768 | */ | 3816 | */ |
3769 | bi->bi_phys_segments = 1; /* biased count of active stripes */ | 3817 | raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ |
3770 | } | 3818 | } |
3771 | 3819 | ||
3772 | return bi; | 3820 | return bi; |
@@ -3881,8 +3929,6 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) | |||
3881 | raid_bio->bi_next = (void*)rdev; | 3929 | raid_bio->bi_next = (void*)rdev; |
3882 | align_bi->bi_bdev = rdev->bdev; | 3930 | align_bi->bi_bdev = rdev->bdev; |
3883 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); | 3931 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); |
3884 | /* No reshape active, so we can trust rdev->data_offset */ | ||
3885 | align_bi->bi_sector += rdev->data_offset; | ||
3886 | 3932 | ||
3887 | if (!bio_fits_rdev(align_bi) || | 3933 | if (!bio_fits_rdev(align_bi) || |
3888 | is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, | 3934 | is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, |
@@ -3893,6 +3939,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) | |||
3893 | return 0; | 3939 | return 0; |
3894 | } | 3940 | } |
3895 | 3941 | ||
3942 | /* No reshape active, so we can trust rdev->data_offset */ | ||
3943 | align_bi->bi_sector += rdev->data_offset; | ||
3944 | |||
3896 | spin_lock_irq(&conf->device_lock); | 3945 | spin_lock_irq(&conf->device_lock); |
3897 | wait_event_lock_irq(conf->wait_for_stripe, | 3946 | wait_event_lock_irq(conf->wait_for_stripe, |
3898 | conf->quiesce == 0, | 3947 | conf->quiesce == 0, |
@@ -3962,6 +4011,62 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf) | |||
3962 | return sh; | 4011 | return sh; |
3963 | } | 4012 | } |
3964 | 4013 | ||
4014 | struct raid5_plug_cb { | ||
4015 | struct blk_plug_cb cb; | ||
4016 | struct list_head list; | ||
4017 | }; | ||
4018 | |||
4019 | static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) | ||
4020 | { | ||
4021 | struct raid5_plug_cb *cb = container_of( | ||
4022 | blk_cb, struct raid5_plug_cb, cb); | ||
4023 | struct stripe_head *sh; | ||
4024 | struct mddev *mddev = cb->cb.data; | ||
4025 | struct r5conf *conf = mddev->private; | ||
4026 | |||
4027 | if (cb->list.next && !list_empty(&cb->list)) { | ||
4028 | spin_lock_irq(&conf->device_lock); | ||
4029 | while (!list_empty(&cb->list)) { | ||
4030 | sh = list_first_entry(&cb->list, struct stripe_head, lru); | ||
4031 | list_del_init(&sh->lru); | ||
4032 | /* | ||
4033 | * avoid the race where release_stripe_plug() sees | ||
4034 | * STRIPE_ON_UNPLUG_LIST clear but the stripe | ||
4035 | * is still on our list | ||
4036 | */ | ||
4037 | smp_mb__before_clear_bit(); | ||
4038 | clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); | ||
4039 | __release_stripe(conf, sh); | ||
4040 | } | ||
4041 | spin_unlock_irq(&conf->device_lock); | ||
4042 | } | ||
4043 | kfree(cb); | ||
4044 | } | ||
4045 | |||
4046 | static void release_stripe_plug(struct mddev *mddev, | ||
4047 | struct stripe_head *sh) | ||
4048 | { | ||
4049 | struct blk_plug_cb *blk_cb = blk_check_plugged( | ||
4050 | raid5_unplug, mddev, | ||
4051 | sizeof(struct raid5_plug_cb)); | ||
4052 | struct raid5_plug_cb *cb; | ||
4053 | |||
4054 | if (!blk_cb) { | ||
4055 | release_stripe(sh); | ||
4056 | return; | ||
4057 | } | ||
4058 | |||
4059 | cb = container_of(blk_cb, struct raid5_plug_cb, cb); | ||
4060 | |||
4061 | if (cb->list.next == NULL) | ||
4062 | INIT_LIST_HEAD(&cb->list); | ||
4063 | |||
4064 | if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) | ||
4065 | list_add_tail(&sh->lru, &cb->list); | ||
4066 | else | ||
4067 | release_stripe(sh); | ||
4068 | } | ||
4069 | |||
3965 | static void make_request(struct mddev *mddev, struct bio * bi) | 4070 | static void make_request(struct mddev *mddev, struct bio * bi) |
3966 | { | 4071 | { |
3967 | struct r5conf *conf = mddev->private; | 4072 | struct r5conf *conf = mddev->private; |
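
The new raid5_plug_cb machinery parks released stripes on a per-plug list, guarded by STRIPE_ON_UNPLUG_LIST, and hands them back in one batch from the unplug callback. Below is a single-threaded userspace model of that park-and-flush behaviour; the stripe type, plug struct and print statements are stand-ins, and the real code additionally takes device_lock around the flush:

#include <stdbool.h>
#include <stdio.h>

struct stripe {
    struct stripe *next;
    bool on_plug_list;      /* plays the role of STRIPE_ON_UNPLUG_LIST */
    int nr;
};

struct plug { struct stripe *head; };

static void release_stripe_now(struct stripe *sh)
{
    printf("release stripe %d immediately\n", sh->nr);
}

static void release_stripe_plug(struct plug *plug, struct stripe *sh)
{
    if (!plug) {                        /* no plug active: old behaviour */
        release_stripe_now(sh);
        return;
    }
    if (!sh->on_plug_list) {            /* park it on the plug exactly once */
        sh->on_plug_list = true;
        sh->next = plug->head;
        plug->head = sh;
    } else {
        release_stripe_now(sh);
    }
}

static void unplug(struct plug *plug)
{
    for (struct stripe *sh = plug->head; sh; sh = sh->next) {
        sh->on_plug_list = false;
        printf("batched release of stripe %d\n", sh->nr);
    }
    plug->head = NULL;
}

int main(void)
{
    struct plug plug = { 0 };
    struct stripe a = { .nr = 1 }, b = { .nr = 2 };

    release_stripe_plug(&plug, &a);
    release_stripe_plug(&plug, &b);
    unplug(&plug);                      /* both stripes released together */
    return 0;
}
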
@@ -3971,7 +4076,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
3971 | struct stripe_head *sh; | 4076 | struct stripe_head *sh; |
3972 | const int rw = bio_data_dir(bi); | 4077 | const int rw = bio_data_dir(bi); |
3973 | int remaining; | 4078 | int remaining; |
3974 | int plugged; | ||
3975 | 4079 | ||
3976 | if (unlikely(bi->bi_rw & REQ_FLUSH)) { | 4080 | if (unlikely(bi->bi_rw & REQ_FLUSH)) { |
3977 | md_flush_request(mddev, bi); | 4081 | md_flush_request(mddev, bi); |
@@ -3990,7 +4094,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
3990 | bi->bi_next = NULL; | 4094 | bi->bi_next = NULL; |
3991 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ | 4095 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ |
3992 | 4096 | ||
3993 | plugged = mddev_check_plugged(mddev); | ||
3994 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { | 4097 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { |
3995 | DEFINE_WAIT(w); | 4098 | DEFINE_WAIT(w); |
3996 | int previous; | 4099 | int previous; |
@@ -4089,24 +4192,19 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
4089 | finish_wait(&conf->wait_for_overlap, &w); | 4192 | finish_wait(&conf->wait_for_overlap, &w); |
4090 | set_bit(STRIPE_HANDLE, &sh->state); | 4193 | set_bit(STRIPE_HANDLE, &sh->state); |
4091 | clear_bit(STRIPE_DELAYED, &sh->state); | 4194 | clear_bit(STRIPE_DELAYED, &sh->state); |
4092 | if ((bi->bi_rw & REQ_SYNC) && | 4195 | if ((bi->bi_rw & REQ_NOIDLE) && |
4093 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 4196 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
4094 | atomic_inc(&conf->preread_active_stripes); | 4197 | atomic_inc(&conf->preread_active_stripes); |
4095 | release_stripe(sh); | 4198 | release_stripe_plug(mddev, sh); |
4096 | } else { | 4199 | } else { |
4097 | /* cannot get stripe for read-ahead, just give-up */ | 4200 | /* cannot get stripe for read-ahead, just give-up */ |
4098 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | 4201 | clear_bit(BIO_UPTODATE, &bi->bi_flags); |
4099 | finish_wait(&conf->wait_for_overlap, &w); | 4202 | finish_wait(&conf->wait_for_overlap, &w); |
4100 | break; | 4203 | break; |
4101 | } | 4204 | } |
4102 | |||
4103 | } | 4205 | } |
4104 | if (!plugged) | ||
4105 | md_wakeup_thread(mddev->thread); | ||
4106 | 4206 | ||
4107 | spin_lock_irq(&conf->device_lock); | 4207 | remaining = raid5_dec_bi_active_stripes(bi); |
4108 | remaining = raid5_dec_bi_phys_segments(bi); | ||
4109 | spin_unlock_irq(&conf->device_lock); | ||
4110 | if (remaining == 0) { | 4208 | if (remaining == 0) { |
4111 | 4209 | ||
4112 | if ( rw == WRITE ) | 4210 | if ( rw == WRITE ) |
@@ -4462,7 +4560,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
4462 | sector += STRIPE_SECTORS, | 4560 | sector += STRIPE_SECTORS, |
4463 | scnt++) { | 4561 | scnt++) { |
4464 | 4562 | ||
4465 | if (scnt < raid5_bi_hw_segments(raid_bio)) | 4563 | if (scnt < raid5_bi_processed_stripes(raid_bio)) |
4466 | /* already done this stripe */ | 4564 | /* already done this stripe */ |
4467 | continue; | 4565 | continue; |
4468 | 4566 | ||
@@ -4470,25 +4568,24 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
4470 | 4568 | ||
4471 | if (!sh) { | 4569 | if (!sh) { |
4472 | /* failed to get a stripe - must wait */ | 4570 | /* failed to get a stripe - must wait */ |
4473 | raid5_set_bi_hw_segments(raid_bio, scnt); | 4571 | raid5_set_bi_processed_stripes(raid_bio, scnt); |
4474 | conf->retry_read_aligned = raid_bio; | 4572 | conf->retry_read_aligned = raid_bio; |
4475 | return handled; | 4573 | return handled; |
4476 | } | 4574 | } |
4477 | 4575 | ||
4478 | if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { | 4576 | if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { |
4479 | release_stripe(sh); | 4577 | release_stripe(sh); |
4480 | raid5_set_bi_hw_segments(raid_bio, scnt); | 4578 | raid5_set_bi_processed_stripes(raid_bio, scnt); |
4481 | conf->retry_read_aligned = raid_bio; | 4579 | conf->retry_read_aligned = raid_bio; |
4482 | return handled; | 4580 | return handled; |
4483 | } | 4581 | } |
4484 | 4582 | ||
4583 | set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); | ||
4485 | handle_stripe(sh); | 4584 | handle_stripe(sh); |
4486 | release_stripe(sh); | 4585 | release_stripe(sh); |
4487 | handled++; | 4586 | handled++; |
4488 | } | 4587 | } |
4489 | spin_lock_irq(&conf->device_lock); | 4588 | remaining = raid5_dec_bi_active_stripes(raid_bio); |
4490 | remaining = raid5_dec_bi_phys_segments(raid_bio); | ||
4491 | spin_unlock_irq(&conf->device_lock); | ||
4492 | if (remaining == 0) | 4589 | if (remaining == 0) |
4493 | bio_endio(raid_bio, 0); | 4590 | bio_endio(raid_bio, 0); |
4494 | if (atomic_dec_and_test(&conf->active_aligned_reads)) | 4591 | if (atomic_dec_and_test(&conf->active_aligned_reads)) |
@@ -4496,6 +4593,30 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
4496 | return handled; | 4593 | return handled; |
4497 | } | 4594 | } |
4498 | 4595 | ||
4596 | #define MAX_STRIPE_BATCH 8 | ||
4597 | static int handle_active_stripes(struct r5conf *conf) | ||
4598 | { | ||
4599 | struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; | ||
4600 | int i, batch_size = 0; | ||
4601 | |||
4602 | while (batch_size < MAX_STRIPE_BATCH && | ||
4603 | (sh = __get_priority_stripe(conf)) != NULL) | ||
4604 | batch[batch_size++] = sh; | ||
4605 | |||
4606 | if (batch_size == 0) | ||
4607 | return batch_size; | ||
4608 | spin_unlock_irq(&conf->device_lock); | ||
4609 | |||
4610 | for (i = 0; i < batch_size; i++) | ||
4611 | handle_stripe(batch[i]); | ||
4612 | |||
4613 | cond_resched(); | ||
4614 | |||
4615 | spin_lock_irq(&conf->device_lock); | ||
4616 | for (i = 0; i < batch_size; i++) | ||
4617 | __release_stripe(conf, batch[i]); | ||
4618 | return batch_size; | ||
4619 | } | ||
4499 | 4620 | ||
4500 | /* | 4621 | /* |
4501 | * This is our raid5 kernel thread. | 4622 | * This is our raid5 kernel thread. |
@@ -4506,7 +4627,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
4506 | */ | 4627 | */ |
4507 | static void raid5d(struct mddev *mddev) | 4628 | static void raid5d(struct mddev *mddev) |
4508 | { | 4629 | { |
4509 | struct stripe_head *sh; | ||
4510 | struct r5conf *conf = mddev->private; | 4630 | struct r5conf *conf = mddev->private; |
4511 | int handled; | 4631 | int handled; |
4512 | struct blk_plug plug; | 4632 | struct blk_plug plug; |
@@ -4520,8 +4640,9 @@ static void raid5d(struct mddev *mddev) | |||
4520 | spin_lock_irq(&conf->device_lock); | 4640 | spin_lock_irq(&conf->device_lock); |
4521 | while (1) { | 4641 | while (1) { |
4522 | struct bio *bio; | 4642 | struct bio *bio; |
4643 | int batch_size; | ||
4523 | 4644 | ||
4524 | if (atomic_read(&mddev->plug_cnt) == 0 && | 4645 | if ( |
4525 | !list_empty(&conf->bitmap_list)) { | 4646 | !list_empty(&conf->bitmap_list)) { |
4526 | /* Now is a good time to flush some bitmap updates */ | 4647 | /* Now is a good time to flush some bitmap updates */ |
4527 | conf->seq_flush++; | 4648 | conf->seq_flush++; |
@@ -4531,8 +4652,7 @@ static void raid5d(struct mddev *mddev) | |||
4531 | conf->seq_write = conf->seq_flush; | 4652 | conf->seq_write = conf->seq_flush; |
4532 | activate_bit_delay(conf); | 4653 | activate_bit_delay(conf); |
4533 | } | 4654 | } |
4534 | if (atomic_read(&mddev->plug_cnt) == 0) | 4655 | raid5_activate_delayed(conf); |
4535 | raid5_activate_delayed(conf); | ||
4536 | 4656 | ||
4537 | while ((bio = remove_bio_from_retry(conf))) { | 4657 | while ((bio = remove_bio_from_retry(conf))) { |
4538 | int ok; | 4658 | int ok; |
@@ -4544,21 +4664,16 @@ static void raid5d(struct mddev *mddev) | |||
4544 | handled++; | 4664 | handled++; |
4545 | } | 4665 | } |
4546 | 4666 | ||
4547 | sh = __get_priority_stripe(conf); | 4667 | batch_size = handle_active_stripes(conf); |
4548 | 4668 | if (!batch_size) | |
4549 | if (!sh) | ||
4550 | break; | 4669 | break; |
4551 | spin_unlock_irq(&conf->device_lock); | 4670 | handled += batch_size; |
4552 | |||
4553 | handled++; | ||
4554 | handle_stripe(sh); | ||
4555 | release_stripe(sh); | ||
4556 | cond_resched(); | ||
4557 | 4671 | ||
4558 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | 4672 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { |
4673 | spin_unlock_irq(&conf->device_lock); | ||
4559 | md_check_recovery(mddev); | 4674 | md_check_recovery(mddev); |
4560 | 4675 | spin_lock_irq(&conf->device_lock); | |
4561 | spin_lock_irq(&conf->device_lock); | 4676 | } |
4562 | } | 4677 | } |
4563 | pr_debug("%d stripes handled\n", handled); | 4678 | pr_debug("%d stripes handled\n", handled); |
4564 | 4679 | ||
@@ -4823,6 +4938,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
4823 | int raid_disk, memory, max_disks; | 4938 | int raid_disk, memory, max_disks; |
4824 | struct md_rdev *rdev; | 4939 | struct md_rdev *rdev; |
4825 | struct disk_info *disk; | 4940 | struct disk_info *disk; |
4941 | char pers_name[6]; | ||
4826 | 4942 | ||
4827 | if (mddev->new_level != 5 | 4943 | if (mddev->new_level != 5 |
4828 | && mddev->new_level != 4 | 4944 | && mddev->new_level != 4 |
@@ -4946,7 +5062,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
4946 | printk(KERN_INFO "md/raid:%s: allocated %dkB\n", | 5062 | printk(KERN_INFO "md/raid:%s: allocated %dkB\n", |
4947 | mdname(mddev), memory); | 5063 | mdname(mddev), memory); |
4948 | 5064 | ||
4949 | conf->thread = md_register_thread(raid5d, mddev, NULL); | 5065 | sprintf(pers_name, "raid%d", mddev->new_level); |
5066 | conf->thread = md_register_thread(raid5d, mddev, pers_name); | ||
4950 | if (!conf->thread) { | 5067 | if (!conf->thread) { |
4951 | printk(KERN_ERR | 5068 | printk(KERN_ERR |
4952 | "md/raid:%s: couldn't allocate thread.\n", | 5069 | "md/raid:%s: couldn't allocate thread.\n", |
@@ -5465,10 +5582,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
5465 | if (rdev->saved_raid_disk >= 0 && | 5582 | if (rdev->saved_raid_disk >= 0 && |
5466 | rdev->saved_raid_disk >= first && | 5583 | rdev->saved_raid_disk >= first && |
5467 | conf->disks[rdev->saved_raid_disk].rdev == NULL) | 5584 | conf->disks[rdev->saved_raid_disk].rdev == NULL) |
5468 | disk = rdev->saved_raid_disk; | 5585 | first = rdev->saved_raid_disk; |
5469 | else | 5586 | |
5470 | disk = first; | 5587 | for (disk = first; disk <= last; disk++) { |
5471 | for ( ; disk <= last ; disk++) { | ||
5472 | p = conf->disks + disk; | 5588 | p = conf->disks + disk; |
5473 | if (p->rdev == NULL) { | 5589 | if (p->rdev == NULL) { |
5474 | clear_bit(In_sync, &rdev->flags); | 5590 | clear_bit(In_sync, &rdev->flags); |
@@ -5477,8 +5593,11 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
5477 | if (rdev->saved_raid_disk != disk) | 5593 | if (rdev->saved_raid_disk != disk) |
5478 | conf->fullsync = 1; | 5594 | conf->fullsync = 1; |
5479 | rcu_assign_pointer(p->rdev, rdev); | 5595 | rcu_assign_pointer(p->rdev, rdev); |
5480 | break; | 5596 | goto out; |
5481 | } | 5597 | } |
5598 | } | ||
5599 | for (disk = first; disk <= last; disk++) { | ||
5600 | p = conf->disks + disk; | ||
5482 | if (test_bit(WantReplacement, &p->rdev->flags) && | 5601 | if (test_bit(WantReplacement, &p->rdev->flags) && |
5483 | p->replacement == NULL) { | 5602 | p->replacement == NULL) { |
5484 | clear_bit(In_sync, &rdev->flags); | 5603 | clear_bit(In_sync, &rdev->flags); |
@@ -5490,6 +5609,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
5490 | break; | 5609 | break; |
5491 | } | 5610 | } |
5492 | } | 5611 | } |
5612 | out: | ||
5493 | print_raid5_conf(conf); | 5613 | print_raid5_conf(conf); |
5494 | return err; | 5614 | return err; |
5495 | } | 5615 | } |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 2164021f3b5f..a9fc24901eda 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -210,6 +210,7 @@ struct stripe_head { | |||
210 | int disks; /* disks in stripe */ | 210 | int disks; /* disks in stripe */ |
211 | enum check_states check_state; | 211 | enum check_states check_state; |
212 | enum reconstruct_states reconstruct_state; | 212 | enum reconstruct_states reconstruct_state; |
213 | spinlock_t stripe_lock; | ||
213 | /** | 214 | /** |
214 | * struct stripe_operations | 215 | * struct stripe_operations |
215 | * @target - STRIPE_OP_COMPUTE_BLK target | 216 | * @target - STRIPE_OP_COMPUTE_BLK target |
@@ -273,6 +274,7 @@ enum r5dev_flags { | |||
273 | R5_Wantwrite, | 274 | R5_Wantwrite, |
274 | R5_Overlap, /* There is a pending overlapping request | 275 | R5_Overlap, /* There is a pending overlapping request |
275 | * on this block */ | 276 | * on this block */ |
277 | R5_ReadNoMerge, /* prevent bio from merging in block-layer */ | ||
276 | R5_ReadError, /* seen a read error here recently */ | 278 | R5_ReadError, /* seen a read error here recently */ |
277 | R5_ReWrite, /* have tried to over-write the readerror */ | 279 | R5_ReWrite, /* have tried to over-write the readerror */ |
278 | 280 | ||
@@ -319,6 +321,7 @@ enum { | |||
319 | STRIPE_BIOFILL_RUN, | 321 | STRIPE_BIOFILL_RUN, |
320 | STRIPE_COMPUTE_RUN, | 322 | STRIPE_COMPUTE_RUN, |
321 | STRIPE_OPS_REQ_PENDING, | 323 | STRIPE_OPS_REQ_PENDING, |
324 | STRIPE_ON_UNPLUG_LIST, | ||
322 | }; | 325 | }; |
323 | 326 | ||
324 | /* | 327 | /* |