Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                               |  28
-rw-r--r--  drivers/md/Makefile                              |   1
-rw-r--r--  drivers/md/dm-bufio.c                            | 108
-rw-r--r--  drivers/md/dm-bufio.h                            |   8
-rw-r--r--  drivers/md/dm-crypt.c                            |  46
-rw-r--r--  drivers/md/dm-delay.c                            |   9
-rw-r--r--  drivers/md/dm-exception-store.c                  |   2
-rw-r--r--  drivers/md/dm-flakey.c                           |   3
-rw-r--r--  drivers/md/dm-ioctl.c                            |   5
-rw-r--r--  drivers/md/dm-linear.c                           |   3
-rw-r--r--  drivers/md/dm-log.c                              |   3
-rw-r--r--  drivers/md/dm-mpath.c                            |  52
-rw-r--r--  drivers/md/dm-queue-length.c                     |   3
-rw-r--r--  drivers/md/dm-raid.c                             |  53
-rw-r--r--  drivers/md/dm-raid1.c                            |  12
-rw-r--r--  drivers/md/dm-round-robin.c                      |   3
-rw-r--r--  drivers/md/dm-service-time.c                     |   5
-rw-r--r--  drivers/md/dm-stripe.c                           |   3
-rw-r--r--  drivers/md/dm-table.c                            |   9
-rw-r--r--  drivers/md/dm-thin-metadata.c                    |   5
-rw-r--r--  drivers/md/dm-thin-metadata.h                    |  13
-rw-r--r--  drivers/md/dm-thin.c                             | 680
-rw-r--r--  drivers/md/dm-verity.c                           | 913
-rw-r--r--  drivers/md/dm.c                                  |   1
-rw-r--r--  drivers/md/linear.c                              |   9
-rw-r--r--  drivers/md/persistent-data/dm-btree-internal.h   |   7
-rw-r--r--  drivers/md/persistent-data/dm-btree-remove.c     | 202
-rw-r--r--  drivers/md/persistent-data/dm-btree.c            |  27
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c |   3
-rw-r--r--  drivers/md/raid0.c                               |  27
-rw-r--r--  drivers/md/raid1.c                               |  13
-rw-r--r--  drivers/md/raid10.c                              |   2
-rw-r--r--  drivers/md/raid5.c                               |  59
33 files changed, 1906 insertions(+), 411 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index faa4741df6d3..10f122a3a856 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -277,8 +277,8 @@ config DM_MIRROR
 	  needed for live data migration tools such as 'pvmove'.
 
 config DM_RAID
-	tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	tristate "RAID 1/4/5/6 target"
+	depends on BLK_DEV_DM
 	select MD_RAID1
 	select MD_RAID456
 	select BLK_DEV_MD
@@ -359,8 +359,8 @@ config DM_DELAY
 	  If unsure, say N.
 
 config DM_UEVENT
-	bool "DM uevents (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	bool "DM uevents"
+	depends on BLK_DEV_DM
 	---help---
 	Generate udev events for DM events.
 
@@ -370,4 +370,24 @@ config DM_FLAKEY
 	---help---
 	  A target that intermittently fails I/O for debugging purposes.
 
+config DM_VERITY
+	tristate "Verity target support (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	select CRYPTO
+	select CRYPTO_HASH
+	select DM_BUFIO
+	---help---
+	  This device-mapper target creates a read-only device that
+	  transparently validates the data on one underlying device against
+	  a pre-generated tree of cryptographic checksums stored on a second
+	  device.
+
+	  You'll need to activate the digests you're going to use in the
+	  cryptoapi configuration.
+
+	  To compile this code as a module, choose M here: the module will
+	  be called dm-verity.
+
+	  If unsure, say N.
+
 endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 046860c7a166..8b2e0dffe82e 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 obj-$(CONFIG_DM_RAID)		+= dm-raid.o
 obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
+obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index b6e58c7b6df5..cc06a1e52423 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -578,7 +578,7 @@ static void write_endio(struct bio *bio, int error)
 	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 
 	b->write_error = error;
-	if (error) {
+	if (unlikely(error)) {
 		struct dm_bufio_client *c = b->c;
 		(void)cmpxchg(&c->async_write_error, 0, error);
 	}
@@ -697,13 +697,20 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c)
 	dm_bufio_lock(c);
 }
 
+enum new_flag {
+	NF_FRESH = 0,
+	NF_READ = 1,
+	NF_GET = 2,
+	NF_PREFETCH = 3
+};
+
 /*
  * Allocate a new buffer. If the allocation is not possible, wait until
  * some other thread frees a buffer.
  *
  * May drop the lock and regain it.
 */
-static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c)
+static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
 {
 	struct dm_buffer *b;
 
@@ -726,6 +733,9 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
 		return b;
 	}
 
+	if (nf == NF_PREFETCH)
+		return NULL;
+
 	if (!list_empty(&c->reserved_buffers)) {
 		b = list_entry(c->reserved_buffers.next,
 			       struct dm_buffer, lru_list);
@@ -743,9 +753,12 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
 	}
 }
 
-static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c)
+static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
 {
-	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c);
+	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
+
+	if (!b)
+		return NULL;
 
 	if (c->alloc_callback)
 		c->alloc_callback(b);
@@ -865,32 +878,23 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
  * Getting a buffer
  *--------------------------------------------------------------*/
 
-enum new_flag {
-	NF_FRESH = 0,
-	NF_READ = 1,
-	NF_GET = 2
-};
-
 static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
-				     enum new_flag nf, struct dm_buffer **bp,
-				     int *need_submit)
+				     enum new_flag nf, int *need_submit)
 {
 	struct dm_buffer *b, *new_b = NULL;
 
 	*need_submit = 0;
 
 	b = __find(c, block);
-	if (b) {
-		b->hold_count++;
-		__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
-			     test_bit(B_WRITING, &b->state));
-		return b;
-	}
+	if (b)
+		goto found_buffer;
 
 	if (nf == NF_GET)
 		return NULL;
 
-	new_b = __alloc_buffer_wait(c);
+	new_b = __alloc_buffer_wait(c, nf);
+	if (!new_b)
+		return NULL;
 
 	/*
 	 * We've had a period where the mutex was unlocked, so need to
@@ -899,10 +903,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
 	b = __find(c, block);
 	if (b) {
 		__free_buffer_wake(new_b);
-		b->hold_count++;
-		__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
-			     test_bit(B_WRITING, &b->state));
-		return b;
+		goto found_buffer;
 	}
 
 	__check_watermark(c);
@@ -922,6 +923,24 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
 	*need_submit = 1;
 
 	return b;
+
+found_buffer:
+	if (nf == NF_PREFETCH)
+		return NULL;
+	/*
+	 * Note: it is essential that we don't wait for the buffer to be
+	 * read if dm_bufio_get function is used. Both dm_bufio_get and
+	 * dm_bufio_prefetch can be used in the driver request routine.
+	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
+	 * the same buffer, it would deadlock if we waited.
+	 */
+	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
+		return NULL;
+
+	b->hold_count++;
+	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
+		     test_bit(B_WRITING, &b->state));
+	return b;
 }
 
 /*
@@ -956,10 +975,10 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
 	struct dm_buffer *b;
 
 	dm_bufio_lock(c);
-	b = __bufio_new(c, block, nf, bp, &need_submit);
+	b = __bufio_new(c, block, nf, &need_submit);
 	dm_bufio_unlock(c);
 
-	if (!b || IS_ERR(b))
+	if (!b)
 		return b;
 
 	if (need_submit)
@@ -1005,13 +1024,47 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
 }
 EXPORT_SYMBOL_GPL(dm_bufio_new);
 
+void dm_bufio_prefetch(struct dm_bufio_client *c,
+		       sector_t block, unsigned n_blocks)
+{
+	struct blk_plug plug;
+
+	blk_start_plug(&plug);
+	dm_bufio_lock(c);
+
+	for (; n_blocks--; block++) {
+		int need_submit;
+		struct dm_buffer *b;
+		b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
+		if (unlikely(b != NULL)) {
+			dm_bufio_unlock(c);
+
+			if (need_submit)
+				submit_io(b, READ, b->block, read_endio);
+			dm_bufio_release(b);
+
+			dm_bufio_cond_resched();
+
+			if (!n_blocks)
+				goto flush_plug;
+			dm_bufio_lock(c);
+		}
+
+	}
+
+	dm_bufio_unlock(c);
+
+flush_plug:
+	blk_finish_plug(&plug);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
+
 void dm_bufio_release(struct dm_buffer *b)
 {
 	struct dm_bufio_client *c = b->c;
 
 	dm_bufio_lock(c);
 
-	BUG_ON(test_bit(B_READING, &b->state));
 	BUG_ON(!b->hold_count);
 
 	b->hold_count--;
@@ -1024,6 +1077,7 @@ void dm_bufio_release(struct dm_buffer *b)
 	 * invalid buffer.
 	 */
 	if ((b->read_error || b->write_error) &&
+	    !test_bit(B_READING, &b->state) &&
 	    !test_bit(B_WRITING, &b->state) &&
 	    !test_bit(B_DIRTY, &b->state)) {
 		__unlink_buffer(b);
@@ -1041,6 +1095,8 @@ void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
 
 	dm_bufio_lock(c);
 
+	BUG_ON(test_bit(B_READING, &b->state));
+
 	if (!test_and_set_bit(B_DIRTY, &b->state))
 		__relink_lru(b, LIST_DIRTY);
 
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index 5c4c3a04e381..b142946a9e32 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -63,6 +63,14 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
 		   struct dm_buffer **bp);
 
 /*
+ * Prefetch the specified blocks to the cache.
+ * The function starts to read the blocks and returns without waiting for
+ * I/O to finish.
+ */
+void dm_bufio_prefetch(struct dm_bufio_client *c,
+		       sector_t block, unsigned n_blocks);
+
+/*
  * Release a reference obtained with dm_bufio_{read,get,new}. The data
  * pointer and dm_buffer pointer is no longer valid after this call.
 */
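
For reference, a minimal usage sketch of the prefetch interface added above (not part of the patch; it assumes an already-created dm_bufio_client and a caller-supplied process_block() callback): the reads for the whole range are issued up front, and dm_bufio_read() then only sleeps for blocks that have not completed yet.

#include "dm-bufio.h"

static int read_range(struct dm_bufio_client *c, sector_t block,
		      unsigned n_blocks, void (*process_block)(void *data))
{
	unsigned i;

	/* Issue all reads without waiting; blocks already cached are skipped. */
	dm_bufio_prefetch(c, block, n_blocks);

	for (i = 0; i < n_blocks; i++) {
		struct dm_buffer *buf;
		void *data = dm_bufio_read(c, block + i, &buf);

		if (IS_ERR(data))
			return PTR_ERR(data);	/* read error reported by dm-bufio */

		process_block(data);
		dm_bufio_release(buf);
	}

	return 0;
}
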
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index db6b51639cee..3f06df59fd82 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -176,7 +176,6 @@ struct crypt_config {
 
 #define MIN_IOS        16
 #define MIN_POOL_PAGES 32
-#define MIN_BIO_PAGES  8
 
 static struct kmem_cache *_crypt_io_pool;
 
@@ -848,12 +847,11 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
 		}
 
 		/*
-		 * if additional pages cannot be allocated without waiting,
-		 * return a partially allocated bio, the caller will then try
-		 * to allocate additional bios while submitting this partial bio
+		 * If additional pages cannot be allocated without waiting,
+		 * return a partially-allocated bio. The caller will then try
+		 * to allocate more bios while submitting this partial bio.
 		 */
-		if (i == (MIN_BIO_PAGES - 1))
-			gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
+		gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
 
 		len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
 
@@ -1046,16 +1044,14 @@ static void kcryptd_queue_io(struct dm_crypt_io *io)
 	queue_work(cc->io_queue, &io->work);
 }
 
-static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io,
-					  int error, int async)
+static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 {
 	struct bio *clone = io->ctx.bio_out;
 	struct crypt_config *cc = io->target->private;
 
-	if (unlikely(error < 0)) {
+	if (unlikely(io->error < 0)) {
 		crypt_free_buffer_pages(cc, clone);
 		bio_put(clone);
-		io->error = -EIO;
 		crypt_dec_pending(io);
 		return;
 	}
@@ -1106,12 +1102,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		sector += bio_sectors(clone);
 
 		crypt_inc_pending(io);
+
 		r = crypt_convert(cc, &io->ctx);
+		if (r < 0)
+			io->error = -EIO;
+
 		crypt_finished = atomic_dec_and_test(&io->ctx.pending);
 
 		/* Encryption was already finished, submit io now */
 		if (crypt_finished) {
-			kcryptd_crypt_write_io_submit(io, r, 0);
+			kcryptd_crypt_write_io_submit(io, 0);
 
 			/*
 			 * If there was an error, do not try next fragments.
@@ -1162,11 +1162,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 	crypt_dec_pending(io);
 }
 
-static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error)
+static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
 {
-	if (unlikely(error < 0))
-		io->error = -EIO;
-
 	crypt_dec_pending(io);
 }
 
@@ -1181,9 +1178,11 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 			   io->sector);
 
 	r = crypt_convert(cc, &io->ctx);
+	if (r < 0)
+		io->error = -EIO;
 
 	if (atomic_dec_and_test(&io->ctx.pending))
-		kcryptd_crypt_read_done(io, r);
+		kcryptd_crypt_read_done(io);
 
 	crypt_dec_pending(io);
 }
@@ -1204,15 +1203,18 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
 		error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
 
+	if (error < 0)
+		io->error = -EIO;
+
 	mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
 
 	if (!atomic_dec_and_test(&ctx->pending))
 		return;
 
 	if (bio_data_dir(io->base_bio) == READ)
-		kcryptd_crypt_read_done(io, error);
+		kcryptd_crypt_read_done(io);
 	else
-		kcryptd_crypt_write_io_submit(io, error, 1);
+		kcryptd_crypt_write_io_submit(io, 1);
 }
 
 static void kcryptd_crypt(struct work_struct *work)
@@ -1413,6 +1415,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
 	char *cipher_api = NULL;
 	int cpu, ret = -EINVAL;
+	char dummy;
 
 	/* Convert to crypto api definition? */
 	if (strchr(cipher_in, '(')) {
@@ -1434,7 +1437,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 
 	if (!keycount)
 		cc->tfms_count = 1;
-	else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 ||
+	else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 ||
 		 !is_power_of_2(cc->tfms_count)) {
 		ti->error = "Bad cipher key count specification";
 		return -EINVAL;
@@ -1579,6 +1582,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	int ret;
 	struct dm_arg_set as;
 	const char *opt_string;
+	char dummy;
 
 	static struct dm_arg _args[] = {
 		{0, 1, "Invalid number of feature args"},
@@ -1636,7 +1640,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ret = -EINVAL;
-	if (sscanf(argv[2], "%llu", &tmpll) != 1) {
+	if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
 		ti->error = "Invalid iv_offset sector";
 		goto bad;
 	}
@@ -1647,7 +1651,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
-	if (sscanf(argv[4], "%llu", &tmpll) != 1) {
+	if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
 		ti->error = "Invalid device sector";
 		goto bad;
 	}
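
The "%llu%c"/"%u%c" pattern with a throwaway dummy character, introduced here and repeated in the constructors below, is what makes these sscanf checks reject trailing garbage: with a bare "%llu", an argument such as "1024abc" would silently parse as 1024, whereas the extra %c only matches when something follows the number, so any return value other than 1 indicates a malformed argument. A standalone sketch of the idiom (ordinary userspace C, not kernel code):

#include <stdio.h>

static int parse_sector(const char *arg, unsigned long long *out)
{
	unsigned long long tmpll;
	char dummy;

	/* Return value 1 means "one number and nothing else"; 0 or 2 means reject. */
	if (sscanf(arg, "%llu%c", &tmpll, &dummy) != 1)
		return -1;

	*out = tmpll;
	return 0;
}

int main(void)
{
	unsigned long long s;

	printf("\"1024\"    -> %d\n", parse_sector("1024", &s));	/* accepted */
	printf("\"1024abc\" -> %d\n", parse_sector("1024abc", &s));	/* rejected */
	printf("\"abc\"     -> %d\n", parse_sector("abc", &s));	/* rejected */
	return 0;
}
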
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index f18375dcedd9..2dc22dddb2ae 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -131,6 +131,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct delay_c *dc;
 	unsigned long long tmpll;
+	char dummy;
 
 	if (argc != 3 && argc != 6) {
 		ti->error = "requires exactly 3 or 6 arguments";
@@ -145,13 +146,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	dc->reads = dc->writes = 0;
 
-	if (sscanf(argv[1], "%llu", &tmpll) != 1) {
+	if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) {
 		ti->error = "Invalid device sector";
 		goto bad;
 	}
 	dc->start_read = tmpll;
 
-	if (sscanf(argv[2], "%u", &dc->read_delay) != 1) {
+	if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) {
 		ti->error = "Invalid delay";
 		goto bad;
 	}
@@ -166,13 +167,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	if (argc == 3)
 		goto out;
 
-	if (sscanf(argv[4], "%llu", &tmpll) != 1) {
+	if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
 		ti->error = "Invalid write device sector";
 		goto bad_dev_read;
 	}
 	dc->start_write = tmpll;
 
-	if (sscanf(argv[5], "%u", &dc->write_delay) != 1) {
+	if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) {
 		ti->error = "Invalid write delay";
 		goto bad_dev_read;
 	}
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 042e71996569..aa70f7d43a1a 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -283,7 +283,7 @@ int dm_exception_store_init(void)
 	return 0;
 
 persistent_fail:
-	dm_persistent_snapshot_exit();
+	dm_transient_snapshot_exit();
 transient_fail:
 	return r;
 }
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index b280c433e4a0..ac49c01f1a44 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -160,6 +160,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	unsigned long long tmpll;
 	struct dm_arg_set as;
 	const char *devname;
+	char dummy;
 
 	as.argc = argc;
 	as.argv = argv;
@@ -178,7 +179,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	devname = dm_shift_arg(&as);
 
-	if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) {
+	if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) {
 		ti->error = "Invalid device sector";
 		goto bad;
 	}
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 1ce84ed0b765..a1a3e6df17b8 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -880,6 +880,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
 	struct hd_geometry geometry;
 	unsigned long indata[4];
 	char *geostr = (char *) param + param->data_start;
+	char dummy;
 
 	md = find_device(param);
 	if (!md)
@@ -891,8 +892,8 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
 		goto out;
 	}
 
-	x = sscanf(geostr, "%lu %lu %lu %lu", indata,
-		   indata + 1, indata + 2, indata + 3);
+	x = sscanf(geostr, "%lu %lu %lu %lu%c", indata,
+		   indata + 1, indata + 2, indata + 3, &dummy);
 
 	if (x != 4) {
 		DMWARN("Unable to interpret geometry settings.");
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9728839f844a..3639eeab6042 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -29,6 +29,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct linear_c *lc;
 	unsigned long long tmp;
+	char dummy;
 
 	if (argc != 2) {
 		ti->error = "Invalid argument count";
@@ -41,7 +42,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		return -ENOMEM;
 	}
 
-	if (sscanf(argv[1], "%llu", &tmp) != 1) {
+	if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) {
 		ti->error = "dm-linear: Invalid device sector";
 		goto bad;
 	}
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 3b52bb72bd1f..65ebaebf502b 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -369,6 +369,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 	unsigned int region_count;
 	size_t bitset_size, buf_size;
 	int r;
+	char dummy;
 
 	if (argc < 1 || argc > 2) {
 		DMWARN("wrong number of arguments to dirty region log");
@@ -387,7 +388,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		}
 	}
 
-	if (sscanf(argv[0], "%u", &region_size) != 1 ||
+	if (sscanf(argv[0], "%u%c", &region_size, &dummy) != 1 ||
 	    !_check_region_size(ti, region_size)) {
 		DMWARN("invalid region size %s", argv[0]);
 		return -EINVAL;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 801d92d237cf..922a3385eead 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -226,6 +226,27 @@ static void free_multipath(struct multipath *m)
 	kfree(m);
 }
 
+static int set_mapinfo(struct multipath *m, union map_info *info)
+{
+	struct dm_mpath_io *mpio;
+
+	mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
+	if (!mpio)
+		return -ENOMEM;
+
+	memset(mpio, 0, sizeof(*mpio));
+	info->ptr = mpio;
+
+	return 0;
+}
+
+static void clear_mapinfo(struct multipath *m, union map_info *info)
+{
+	struct dm_mpath_io *mpio = info->ptr;
+
+	info->ptr = NULL;
+	mempool_free(mpio, m->mpio_pool);
+}
 
 /*-----------------------------------------------
  * Path selection
@@ -341,13 +362,14 @@ static int __must_push_back(struct multipath *m)
 }
 
 static int map_io(struct multipath *m, struct request *clone,
-		  struct dm_mpath_io *mpio, unsigned was_queued)
+		  union map_info *map_context, unsigned was_queued)
 {
 	int r = DM_MAPIO_REMAPPED;
 	size_t nr_bytes = blk_rq_bytes(clone);
 	unsigned long flags;
 	struct pgpath *pgpath;
 	struct block_device *bdev;
+	struct dm_mpath_io *mpio = map_context->ptr;
 
 	spin_lock_irqsave(&m->lock, flags);
 
@@ -423,7 +445,6 @@ static void dispatch_queued_ios(struct multipath *m)
 {
 	int r;
 	unsigned long flags;
-	struct dm_mpath_io *mpio;
 	union map_info *info;
 	struct request *clone, *n;
 	LIST_HEAD(cl);
@@ -436,16 +457,15 @@ static void dispatch_queued_ios(struct multipath *m)
 		list_del_init(&clone->queuelist);
 
 		info = dm_get_rq_mapinfo(clone);
-		mpio = info->ptr;
 
-		r = map_io(m, clone, mpio, 1);
+		r = map_io(m, clone, info, 1);
 		if (r < 0) {
-			mempool_free(mpio, m->mpio_pool);
+			clear_mapinfo(m, info);
 			dm_kill_unmapped_request(clone, r);
 		} else if (r == DM_MAPIO_REMAPPED)
 			dm_dispatch_request(clone);
 		else if (r == DM_MAPIO_REQUEUE) {
-			mempool_free(mpio, m->mpio_pool);
+			clear_mapinfo(m, info);
 			dm_requeue_unmapped_request(clone);
 		}
 	}
@@ -908,20 +928,16 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
 			 union map_info *map_context)
 {
 	int r;
-	struct dm_mpath_io *mpio;
 	struct multipath *m = (struct multipath *) ti->private;
 
-	mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
-	if (!mpio)
+	if (set_mapinfo(m, map_context) < 0)
 		/* ENOMEM, requeue */
 		return DM_MAPIO_REQUEUE;
-	memset(mpio, 0, sizeof(*mpio));
 
-	map_context->ptr = mpio;
 	clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-	r = map_io(m, clone, mpio, 0);
+	r = map_io(m, clone, map_context, 0);
 	if (r < 0 || r == DM_MAPIO_REQUEUE)
-		mempool_free(mpio, m->mpio_pool);
+		clear_mapinfo(m, map_context);
 
 	return r;
 }
@@ -1054,8 +1070,9 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
 	struct priority_group *pg;
 	unsigned pgnum;
 	unsigned long flags;
+	char dummy;
 
-	if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
+	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
 	    (pgnum > m->nr_priority_groups)) {
 		DMWARN("invalid PG number supplied to switch_pg_num");
 		return -EINVAL;
@@ -1085,8 +1102,9 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
 {
 	struct priority_group *pg;
 	unsigned pgnum;
+	char dummy;
 
-	if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
+	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
 	    (pgnum > m->nr_priority_groups)) {
 		DMWARN("invalid PG number supplied to bypass_pg");
 		return -EINVAL;
@@ -1261,13 +1279,15 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
 	struct path_selector *ps;
 	int r;
 
+	BUG_ON(!mpio);
+
 	r = do_end_io(m, clone, error, mpio);
 	if (pgpath) {
 		ps = &pgpath->pg->ps;
 		if (ps->type->end_io)
 			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
 	}
-	mempool_free(mpio, m->mpio_pool);
+	clear_mapinfo(m, map_context);
 
 	return r;
 }
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 03a837aa5ce6..3941fae0de9f 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -112,6 +112,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
 	struct selector *s = ps->context;
 	struct path_info *pi;
 	unsigned repeat_count = QL_MIN_IO;
+	char dummy;
 
 	/*
 	 * Arguments: [<repeat_count>]
@@ -123,7 +124,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
 		return -EINVAL;
 	}
 
-	if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+	if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
 		*error = "queue-length ps: invalid repeat count";
 		return -EINVAL;
 	}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index c5a875d7b882..b0ba52459ed7 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -604,7 +604,9 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
 		return 0;
 
 	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
-		DMERR("Failed to read device superblock");
+		DMERR("Failed to read superblock of device at position %d",
+		      rdev->raid_disk);
+		set_bit(Faulty, &rdev->flags);
 		return -EINVAL;
 	}
 
@@ -855,9 +857,25 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
 static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 {
 	int ret;
+	unsigned redundancy = 0;
+	struct raid_dev *dev;
 	struct md_rdev *rdev, *freshest;
 	struct mddev *mddev = &rs->md;
 
+	switch (rs->raid_type->level) {
+	case 1:
+		redundancy = rs->md.raid_disks - 1;
+		break;
+	case 4:
+	case 5:
+	case 6:
+		redundancy = rs->raid_type->parity_devs;
+		break;
+	default:
+		ti->error = "Unknown RAID type";
+		return -EINVAL;
+	}
+
 	freshest = NULL;
 	rdev_for_each(rdev, mddev) {
 		if (!rdev->meta_bdev)
@@ -872,6 +890,37 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		case 0:
 			break;
 		default:
+			dev = container_of(rdev, struct raid_dev, rdev);
+			if (redundancy--) {
+				if (dev->meta_dev)
+					dm_put_device(ti, dev->meta_dev);
+
+				dev->meta_dev = NULL;
+				rdev->meta_bdev = NULL;
+
+				if (rdev->sb_page)
+					put_page(rdev->sb_page);
+
+				rdev->sb_page = NULL;
+
+				rdev->sb_loaded = 0;
+
+				/*
+				 * We might be able to salvage the data device
+				 * even though the meta device has failed. For
+				 * now, we behave as though '- -' had been
+				 * set for this device in the table.
+				 */
+				if (dev->data_dev)
+					dm_put_device(ti, dev->data_dev);
+
+				dev->data_dev = NULL;
+				rdev->bdev = NULL;
+
+				list_del(&rdev->same_set);
+
+				continue;
+			}
 			ti->error = "Failed to load superblock";
 			return ret;
 		}
@@ -1214,7 +1263,7 @@ static void raid_resume(struct dm_target *ti)
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 1, 0},
+	.version = {1, 2, 0},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9bfd057be686..d039de8322f0 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -924,8 +924,9 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
 		      unsigned int mirror, char **argv)
 {
 	unsigned long long offset;
+	char dummy;
 
-	if (sscanf(argv[1], "%llu", &offset) != 1) {
+	if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) {
 		ti->error = "Invalid offset";
 		return -EINVAL;
 	}
@@ -953,13 +954,14 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
 {
 	unsigned param_count;
 	struct dm_dirty_log *dl;
+	char dummy;
 
 	if (argc < 2) {
 		ti->error = "Insufficient mirror log arguments";
 		return NULL;
 	}
 
-	if (sscanf(argv[1], "%u", &param_count) != 1) {
+	if (sscanf(argv[1], "%u%c", &param_count, &dummy) != 1) {
 		ti->error = "Invalid mirror log argument count";
 		return NULL;
 	}
@@ -986,13 +988,14 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
 {
 	unsigned num_features;
 	struct dm_target *ti = ms->ti;
+	char dummy;
 
 	*args_used = 0;
 
 	if (!argc)
 		return 0;
 
-	if (sscanf(argv[0], "%u", &num_features) != 1) {
+	if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) {
 		ti->error = "Invalid number of features";
 		return -EINVAL;
 	}
@@ -1036,6 +1039,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	unsigned int nr_mirrors, m, args_used;
 	struct mirror_set *ms;
 	struct dm_dirty_log *dl;
+	char dummy;
 
 	dl = create_dirty_log(ti, argc, argv, &args_used);
 	if (!dl)
@@ -1044,7 +1048,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	argv += args_used;
 	argc -= args_used;
 
-	if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
+	if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 ||
 	    nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) {
 		ti->error = "Invalid number of mirrors";
 		dm_dirty_log_destroy(dl);
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 27f1d423b76c..6ab1192cdd5f 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -114,6 +114,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
 	struct selector *s = (struct selector *) ps->context;
 	struct path_info *pi;
 	unsigned repeat_count = RR_MIN_IO;
+	char dummy;
 
 	if (argc > 1) {
 		*error = "round-robin ps: incorrect number of arguments";
@@ -121,7 +122,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
 	}
 
 	/* First path argument is number of I/Os before switching path */
-	if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+	if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
 		*error = "round-robin ps: invalid repeat count";
 		return -EINVAL;
 	}
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index 59883bd78214..9df8f6bd6418 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -110,6 +110,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
 	struct path_info *pi;
 	unsigned repeat_count = ST_MIN_IO;
 	unsigned relative_throughput = 1;
+	char dummy;
 
 	/*
 	 * Arguments: [<repeat_count> [<relative_throughput>]]
@@ -128,13 +129,13 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
 		return -EINVAL;
 	}
 
-	if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+	if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
 		*error = "service-time ps: invalid repeat count";
 		return -EINVAL;
 	}
 
 	if ((argc == 2) &&
-	    (sscanf(argv[1], "%u", &relative_throughput) != 1 ||
+	    (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
 	     relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
 		*error = "service-time ps: invalid relative_throughput value";
 		return -EINVAL;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 3d80cf0c152d..35c94ff24ad5 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -75,8 +75,9 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
 		      unsigned int stripe, char **argv)
 {
 	unsigned long long start;
+	char dummy;
 
-	if (sscanf(argv[1], "%llu", &start) != 1)
+	if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1)
 		return -EINVAL;
 
 	if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 63cc54289aff..2e227fbf1622 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -268,8 +268,7 @@ void dm_table_destroy(struct dm_table *t)
 	vfree(t->highs);
 
 	/* free the device list */
-	if (t->devices.next != &t->devices)
-		free_devices(&t->devices);
+	free_devices(&t->devices);
 
 	dm_free_md_mempools(t->mempools);
 
@@ -464,10 +463,11 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
 	struct dm_dev_internal *dd;
 	unsigned int major, minor;
 	struct dm_table *t = ti->table;
+	char dummy;
 
 	BUG_ON(!t);
 
-	if (sscanf(path, "%u:%u", &major, &minor) == 2) {
+	if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
 		/* Extract the major/minor numbers */
 		dev = MKDEV(major, minor);
 		if (MAJOR(dev) != major || MINOR(dev) != minor)
@@ -842,9 +842,10 @@ static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
 			     unsigned *value, char **error, unsigned grouped)
 {
 	const char *arg_str = dm_shift_arg(arg_set);
+	char dummy;
 
 	if (!arg_str ||
-	    (sscanf(arg_str, "%u", value) != 1) ||
+	    (sscanf(arg_str, "%u%c", value, &dummy) != 1) ||
 	    (*value < arg->min) ||
 	    (*value > arg->max) ||
 	    (grouped && arg_set->argc < *value)) {
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 237571af77fd..737d38865b69 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -614,7 +614,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
 	if (r < 0)
 		goto out;
 
-	r = dm_sm_root_size(pmd->metadata_sm, &data_len);
+	r = dm_sm_root_size(pmd->data_sm, &data_len);
 	if (r < 0)
 		goto out;
 
@@ -713,6 +713,9 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
 	if (r)
 		goto bad;
 
+	if (bdev_size > THIN_METADATA_MAX_SECTORS)
+		bdev_size = THIN_METADATA_MAX_SECTORS;
+
 	disk_super = dm_block_data(sblock);
 	disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
 	disk_super->version = cpu_to_le32(THIN_VERSION);
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 859c16896877..ed4725e67c96 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -11,6 +11,19 @@
 
 #define THIN_METADATA_BLOCK_SIZE 4096
 
+/*
+ * The metadata device is currently limited in size.
+ *
+ * We have one block of index, which can hold 255 index entries.  Each
+ * index entry contains allocation info about 16k metadata blocks.
+ */
+#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
+
+/*
+ * A metadata device larger than 16GB triggers a warning.
+ */
+#define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
+
 /*----------------------------------------------------------------*/
 
 struct dm_pool_metadata;
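
As a sanity check on the two constants above (an illustration, assuming 512-byte sectors, i.e. SECTOR_SHIFT == 9): 255 index entries x 16384 metadata blocks per entry x 8 sectors per 4096-byte metadata block gives 33,423,360 sectors, a little under 16 GiB, which is why the hard cap sits just below the 16 GB warning threshold. A tiny standalone check:

#include <stdio.h>

int main(void)
{
	/* Mirrors THIN_METADATA_MAX_SECTORS with SECTOR_SHIFT == 9 assumed. */
	unsigned long long max_sectors = 255ULL * (1 << 14) * (4096 / 512);
	/* Mirrors THIN_METADATA_MAX_SECTORS_WARNING. */
	unsigned long long warn_sectors = 16ULL * ((1024 * 1024 * 1024) >> 9);

	printf("max:  %llu sectors = %llu MiB\n", max_sectors, max_sectors * 512 >> 20);
	printf("warn: %llu sectors = %llu MiB\n", warn_sectors, warn_sectors * 512 >> 20);
	return 0;
}
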
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c3087575fef0..213ae32a0fc4 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -23,6 +23,7 @@
 #define DEFERRED_SET_SIZE 64
 #define MAPPING_POOL_SIZE 1024
 #define PRISON_CELLS 1024
+#define COMMIT_PERIOD HZ
 
 /*
  * The block size of the device holding pool data must be
@@ -32,16 +33,6 @@
 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
 
 /*
- * The metadata device is currently limited in size.  The limitation is
- * checked lower down in dm-space-map-metadata, but we also check it here
- * so we can fail early.
- *
- * We have one block of index, which can hold 255 index entries.  Each
- * index entry contains allocation info about 16k metadata blocks.
- */
-#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
-
-/*
  * Device id is restricted to 24 bits.
  */
 #define MAX_DEV_ID ((1 << 24) - 1)
@@ -72,7 +63,7 @@
  * missed out if the io covers the block. (schedule_copy).
  *
  * iv) insert the new mapping into the origin's btree
- * (process_prepared_mappings).  This act of inserting breaks some
+ * (process_prepared_mapping).  This act of inserting breaks some
  * sharing of btree nodes between the two devices.  Breaking sharing only
  * effects the btree of that specific device.  Btrees for the other
  * devices that share the block never change.  The btree for the origin
@@ -124,7 +115,7 @@ struct cell {
 	struct hlist_node list;
 	struct bio_prison *prison;
 	struct cell_key key;
-	unsigned count;
+	struct bio *holder;
 	struct bio_list bios;
 };
 
@@ -220,54 +211,59 @@ static struct cell *__search_bucket(struct hlist_head *bucket,
 * This may block if a new cell needs allocating.  You must ensure that
 * cells will be unlocked even if the calling thread is blocked.
 *
- * Returns the number of entries in the cell prior to the new addition
- * or < 0 on failure.
+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
 */
 static int bio_detain(struct bio_prison *prison, struct cell_key *key,
 		      struct bio *inmate, struct cell **ref)
 {
-	int r;
+	int r = 1;
 	unsigned long flags;
 	uint32_t hash = hash_key(prison, key);
-	struct cell *uninitialized_var(cell), *cell2 = NULL;
+	struct cell *cell, *cell2;
 
 	BUG_ON(hash > prison->nr_buckets);
 
 	spin_lock_irqsave(&prison->lock, flags);
+
 	cell = __search_bucket(prison->cells + hash, key);
+	if (cell) {
+		bio_list_add(&cell->bios, inmate);
+		goto out;
+	}
 
-	if (!cell) {
-		/*
-		 * Allocate a new cell
-		 */
-		spin_unlock_irqrestore(&prison->lock, flags);
-		cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
-		spin_lock_irqsave(&prison->lock, flags);
+	/*
+	 * Allocate a new cell
+	 */
+	spin_unlock_irqrestore(&prison->lock, flags);
+	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
+	spin_lock_irqsave(&prison->lock, flags);
 
-		/*
-		 * We've been unlocked, so we have to double check that
-		 * nobody else has inserted this cell in the meantime.
-		 */
-		cell = __search_bucket(prison->cells + hash, key);
+	/*
+	 * We've been unlocked, so we have to double check that
+	 * nobody else has inserted this cell in the meantime.
	 */
+	cell = __search_bucket(prison->cells + hash, key);
+	if (cell) {
+		mempool_free(cell2, prison->cell_pool);
+		bio_list_add(&cell->bios, inmate);
+		goto out;
+	}
 
-		if (!cell) {
-			cell = cell2;
-			cell2 = NULL;
+	/*
	 * Use new cell.
	 */
+	cell = cell2;
 
-			cell->prison = prison;
-			memcpy(&cell->key, key, sizeof(cell->key));
-			cell->count = 0;
-			bio_list_init(&cell->bios);
-			hlist_add_head(&cell->list, prison->cells + hash);
-		}
-	}
+	cell->prison = prison;
+	memcpy(&cell->key, key, sizeof(cell->key));
+	cell->holder = inmate;
+	bio_list_init(&cell->bios);
+	hlist_add_head(&cell->list, prison->cells + hash);
 
-	r = cell->count++;
-	bio_list_add(&cell->bios, inmate);
-	spin_unlock_irqrestore(&prison->lock, flags);
+	r = 0;
 
-	if (cell2)
-		mempool_free(cell2, prison->cell_pool);
+out:
+	spin_unlock_irqrestore(&prison->lock, flags);
 
 	*ref = cell;
 
@@ -283,8 +279,8 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
 
 	hlist_del(&cell->list);
 
-	if (inmates)
-		bio_list_merge(inmates, &cell->bios);
+	bio_list_add(inmates, cell->holder);
+	bio_list_merge(inmates, &cell->bios);
 
 	mempool_free(cell, prison->cell_pool);
 }
@@ -305,22 +301,44 @@ static void cell_release(struct cell *cell, struct bio_list *bios)
 * bio may be in the cell.  This function releases the cell, and also does
 * a sanity check.
 */
+static void __cell_release_singleton(struct cell *cell, struct bio *bio)
+{
+	hlist_del(&cell->list);
+	BUG_ON(cell->holder != bio);
+	BUG_ON(!bio_list_empty(&cell->bios));
+}
+
 static void cell_release_singleton(struct cell *cell, struct bio *bio)
 {
-	struct bio_prison *prison = cell->prison;
-	struct bio_list bios;
-	struct bio *b;
 	unsigned long flags;
-
-	bio_list_init(&bios);
+	struct bio_prison *prison = cell->prison;
 
 	spin_lock_irqsave(&prison->lock, flags);
-	__cell_release(cell, &bios);
+	__cell_release_singleton(cell, bio);
 	spin_unlock_irqrestore(&prison->lock, flags);
+}
+
+/*
+ * Sometimes we don't want the holder, just the additional bios.
+ */
+static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
+{
+	struct bio_prison *prison = cell->prison;
+
+	hlist_del(&cell->list);
+	bio_list_merge(inmates, &cell->bios);
 
-	b = bio_list_pop(&bios);
-	BUG_ON(b != bio);
-	BUG_ON(!bio_list_empty(&bios));
+	mempool_free(cell, prison->cell_pool);
+}
+
+static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
+{
+	unsigned long flags;
+	struct bio_prison *prison = cell->prison;
+
+	spin_lock_irqsave(&prison->lock, flags);
+	__cell_release_no_holder(cell, inmates);
+	spin_unlock_irqrestore(&prison->lock, flags);
 }
 
 static void cell_error(struct cell *cell)
@@ -471,6 +489,13 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 * devices.
 */
 struct new_mapping;
+
+struct pool_features {
+	unsigned zero_new_blocks:1;
+	unsigned discard_enabled:1;
+	unsigned discard_passdown:1;
+};
+
 struct pool {
 	struct list_head list;
 	struct dm_target *ti;	/* Only set if a pool target is bound */
@@ -484,7 +509,7 @@ struct pool {
 	dm_block_t offset_mask;
 	dm_block_t low_water_blocks;
 
-	unsigned zero_new_blocks:1;
+	struct pool_features pf;
 	unsigned low_water_triggered:1;	/* A dm event has been sent */
 	unsigned no_free_space:1;	/* A -ENOSPC warning has been issued */
 
@@ -493,17 +518,21 @@ struct pool {
 
 	struct workqueue_struct *wq;
 	struct work_struct worker;
+	struct delayed_work waker;
 
 	unsigned ref_count;
+	unsigned long last_commit_jiffies;
 
 	spinlock_t lock;
 	struct bio_list deferred_bios;
 	struct bio_list deferred_flush_bios;
 	struct list_head prepared_mappings;
+	struct list_head prepared_discards;
 
 	struct bio_list retry_on_resume_list;
 
-	struct deferred_set ds;	/* FIXME: move to thin_c */
+	struct deferred_set shared_read_ds;
+	struct deferred_set all_io_ds;
 
 	struct new_mapping *next_mapping;
 	mempool_t *mapping_pool;
@@ -521,7 +550,7 @@ struct pool_c {
 	struct dm_target_callbacks callbacks;
 
 	dm_block_t low_water_blocks;
-	unsigned zero_new_blocks:1;
+	struct pool_features pf;
 };
 
 /*
@@ -529,6 +558,7 @@ struct pool_c {
 */
 struct thin_c {
 	struct dm_dev *pool_dev;
+	struct dm_dev *origin_dev;
 	dm_thin_id dev_id;
 
 	struct pool *pool;
@@ -597,6 +627,13 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
 
 /*----------------------------------------------------------------*/
 
+struct endio_hook {
+	struct thin_c *tc;
+	struct deferred_entry *shared_read_entry;
+	struct deferred_entry *all_io_entry;
+	struct new_mapping *overwrite_mapping;
+};
+
 static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 {
 	struct bio *bio;
@@ -607,7 +644,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 	bio_list_init(master);
 
 	while ((bio = bio_list_pop(&bios))) {
-		if (dm_get_mapinfo(bio)->ptr == tc)
+		struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+		if (h->tc == tc)
 			bio_endio(bio, DM_ENDIO_REQUEUE);
 		else
 			bio_list_add(master, bio);
@@ -646,14 +684,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 		(bio->bi_sector & pool->offset_mask);
 }
 
-static void remap_and_issue(struct thin_c *tc, struct bio *bio,
-			    dm_block_t block)
+static void remap_to_origin(struct thin_c *tc, struct bio *bio)
+{
+	bio->bi_bdev = tc->origin_dev->bdev;
+}
+
+static void issue(struct thin_c *tc, struct bio *bio)
 {
 	struct pool *pool = tc->pool;
 	unsigned long flags;
 
-	remap(tc, bio, block);
-
 	/*
	 * Batch together any FUA/FLUSH bios we find and then issue
	 * a single commit for them in process_deferred_bios().
@@ -666,6 +706,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
 	generic_make_request(bio);
 }
 
+static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
+{
+	remap_to_origin(tc, bio);
+	issue(tc, bio);
+}
+
+static void remap_and_issue(struct thin_c *tc, struct bio *bio,
+			    dm_block_t block)
+{
+	remap(tc, bio, block);
+	issue(tc, bio);
+}
+
 /*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
@@ -680,21 +733,17 @@ static void wake_worker(struct pool *pool)
 /*
 * Bio endio functions.
 */
-struct endio_hook {
-	struct thin_c *tc;
-	bio_end_io_t *saved_bi_end_io;
-	struct deferred_entry *entry;
-};
-
 struct new_mapping {
 	struct list_head list;
691 738
692 int prepared; 739 unsigned quiesced:1;
740 unsigned prepared:1;
741 unsigned pass_discard:1;
693 742
694 struct thin_c *tc; 743 struct thin_c *tc;
695 dm_block_t virt_block; 744 dm_block_t virt_block;
696 dm_block_t data_block; 745 dm_block_t data_block;
697 struct cell *cell; 746 struct cell *cell, *cell2;
698 int err; 747 int err;
699 748
700 /* 749 /*
@@ -711,7 +760,7 @@ static void __maybe_add_mapping(struct new_mapping *m)
711{ 760{
712 struct pool *pool = m->tc->pool; 761 struct pool *pool = m->tc->pool;
713 762
714 if (list_empty(&m->list) && m->prepared) { 763 if (m->quiesced && m->prepared) {
715 list_add(&m->list, &pool->prepared_mappings); 764 list_add(&m->list, &pool->prepared_mappings);
716 wake_worker(pool); 765 wake_worker(pool);
717 } 766 }
@@ -734,7 +783,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
734static void overwrite_endio(struct bio *bio, int err) 783static void overwrite_endio(struct bio *bio, int err)
735{ 784{
736 unsigned long flags; 785 unsigned long flags;
737 struct new_mapping *m = dm_get_mapinfo(bio)->ptr; 786 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
787 struct new_mapping *m = h->overwrite_mapping;
738 struct pool *pool = m->tc->pool; 788 struct pool *pool = m->tc->pool;
739 789
740 m->err = err; 790 m->err = err;
@@ -745,31 +795,6 @@ static void overwrite_endio(struct bio *bio, int err)
745 spin_unlock_irqrestore(&pool->lock, flags); 795 spin_unlock_irqrestore(&pool->lock, flags);
746} 796}
747 797
748static void shared_read_endio(struct bio *bio, int err)
749{
750 struct list_head mappings;
751 struct new_mapping *m, *tmp;
752 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
753 unsigned long flags;
754 struct pool *pool = h->tc->pool;
755
756 bio->bi_end_io = h->saved_bi_end_io;
757 bio_endio(bio, err);
758
759 INIT_LIST_HEAD(&mappings);
760 ds_dec(h->entry, &mappings);
761
762 spin_lock_irqsave(&pool->lock, flags);
763 list_for_each_entry_safe(m, tmp, &mappings, list) {
764 list_del(&m->list);
765 INIT_LIST_HEAD(&m->list);
766 __maybe_add_mapping(m);
767 }
768 spin_unlock_irqrestore(&pool->lock, flags);
769
770 mempool_free(h, pool->endio_hook_pool);
771}
772
773/*----------------------------------------------------------------*/ 798/*----------------------------------------------------------------*/
774 799
775/* 800/*
@@ -800,21 +825,16 @@ static void cell_defer(struct thin_c *tc, struct cell *cell,
800 * Same as cell_defer above, except it omits one particular detainee, 825 * Same as cell_defer above, except it omits one particular detainee,
801 * a write bio that covers the block and has already been processed. 826 * a write bio that covers the block and has already been processed.
802 */ 827 */
803static void cell_defer_except(struct thin_c *tc, struct cell *cell, 828static void cell_defer_except(struct thin_c *tc, struct cell *cell)
804 struct bio *exception)
805{ 829{
806 struct bio_list bios; 830 struct bio_list bios;
807 struct bio *bio;
808 struct pool *pool = tc->pool; 831 struct pool *pool = tc->pool;
809 unsigned long flags; 832 unsigned long flags;
810 833
811 bio_list_init(&bios); 834 bio_list_init(&bios);
812 cell_release(cell, &bios);
813 835
814 spin_lock_irqsave(&pool->lock, flags); 836 spin_lock_irqsave(&pool->lock, flags);
815 while ((bio = bio_list_pop(&bios))) 837 cell_release_no_holder(cell, &pool->deferred_bios);
816 if (bio != exception)
817 bio_list_add(&pool->deferred_bios, bio);
818 spin_unlock_irqrestore(&pool->lock, flags); 838 spin_unlock_irqrestore(&pool->lock, flags);
819 839
820 wake_worker(pool); 840 wake_worker(pool);
@@ -854,7 +874,7 @@ static void process_prepared_mapping(struct new_mapping *m)
854 * the bios in the cell. 874 * the bios in the cell.
855 */ 875 */
856 if (bio) { 876 if (bio) {
857 cell_defer_except(tc, m->cell, bio); 877 cell_defer_except(tc, m->cell);
858 bio_endio(bio, 0); 878 bio_endio(bio, 0);
859 } else 879 } else
860 cell_defer(tc, m->cell, m->data_block); 880 cell_defer(tc, m->cell, m->data_block);
@@ -863,7 +883,30 @@ static void process_prepared_mapping(struct new_mapping *m)
863 mempool_free(m, tc->pool->mapping_pool); 883 mempool_free(m, tc->pool->mapping_pool);
864} 884}
865 885
866static void process_prepared_mappings(struct pool *pool) 886static void process_prepared_discard(struct new_mapping *m)
887{
888 int r;
889 struct thin_c *tc = m->tc;
890
891 r = dm_thin_remove_block(tc->td, m->virt_block);
892 if (r)
893 DMERR("dm_thin_remove_block() failed");
894
895 /*
896 * Pass the discard down to the underlying device?
897 */
898 if (m->pass_discard)
899 remap_and_issue(tc, m->bio, m->data_block);
900 else
901 bio_endio(m->bio, 0);
902
903 cell_defer_except(tc, m->cell);
904 cell_defer_except(tc, m->cell2);
905 mempool_free(m, tc->pool->mapping_pool);
906}
907
908static void process_prepared(struct pool *pool, struct list_head *head,
909 void (*fn)(struct new_mapping *))
867{ 910{
868 unsigned long flags; 911 unsigned long flags;
869 struct list_head maps; 912 struct list_head maps;
@@ -871,21 +914,27 @@ static void process_prepared_mappings(struct pool *pool)
871 914
872 INIT_LIST_HEAD(&maps); 915 INIT_LIST_HEAD(&maps);
873 spin_lock_irqsave(&pool->lock, flags); 916 spin_lock_irqsave(&pool->lock, flags);
874 list_splice_init(&pool->prepared_mappings, &maps); 917 list_splice_init(head, &maps);
875 spin_unlock_irqrestore(&pool->lock, flags); 918 spin_unlock_irqrestore(&pool->lock, flags);
876 919
877 list_for_each_entry_safe(m, tmp, &maps, list) 920 list_for_each_entry_safe(m, tmp, &maps, list)
878 process_prepared_mapping(m); 921 fn(m);
879} 922}
880 923
881/* 924/*
882 * Deferred bio jobs. 925 * Deferred bio jobs.
883 */ 926 */
884static int io_overwrites_block(struct pool *pool, struct bio *bio) 927static int io_overlaps_block(struct pool *pool, struct bio *bio)
885{ 928{
886 return ((bio_data_dir(bio) == WRITE) && 929 return !(bio->bi_sector & pool->offset_mask) &&
887 !(bio->bi_sector & pool->offset_mask)) &&
888 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); 930 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
931
932}
933
934static int io_overwrites_block(struct pool *pool, struct bio *bio)
935{
936 return (bio_data_dir(bio) == WRITE) &&
937 io_overlaps_block(pool, bio);
889} 938}
890 939
891static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, 940static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
@@ -917,7 +966,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool)
917} 966}
918 967
919static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 968static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
920 dm_block_t data_origin, dm_block_t data_dest, 969 struct dm_dev *origin, dm_block_t data_origin,
970 dm_block_t data_dest,
921 struct cell *cell, struct bio *bio) 971 struct cell *cell, struct bio *bio)
922{ 972{
923 int r; 973 int r;
@@ -925,6 +975,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
925 struct new_mapping *m = get_next_mapping(pool); 975 struct new_mapping *m = get_next_mapping(pool);
926 976
927 INIT_LIST_HEAD(&m->list); 977 INIT_LIST_HEAD(&m->list);
978 m->quiesced = 0;
928 m->prepared = 0; 979 m->prepared = 0;
929 m->tc = tc; 980 m->tc = tc;
930 m->virt_block = virt_block; 981 m->virt_block = virt_block;
@@ -933,7 +984,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
933 m->err = 0; 984 m->err = 0;
934 m->bio = NULL; 985 m->bio = NULL;
935 986
936 ds_add_work(&pool->ds, &m->list); 987 if (!ds_add_work(&pool->shared_read_ds, &m->list))
988 m->quiesced = 1;
937 989
938 /* 990 /*
939 * IO to pool_dev remaps to the pool target's data_dev. 991 * IO to pool_dev remaps to the pool target's data_dev.
@@ -942,14 +994,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
942 * bio immediately. Otherwise we use kcopyd to clone the data first. 994 * bio immediately. Otherwise we use kcopyd to clone the data first.
943 */ 995 */
944 if (io_overwrites_block(pool, bio)) { 996 if (io_overwrites_block(pool, bio)) {
997 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
998 h->overwrite_mapping = m;
945 m->bio = bio; 999 m->bio = bio;
946 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1000 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
947 dm_get_mapinfo(bio)->ptr = m;
948 remap_and_issue(tc, bio, data_dest); 1001 remap_and_issue(tc, bio, data_dest);
949 } else { 1002 } else {
950 struct dm_io_region from, to; 1003 struct dm_io_region from, to;
951 1004
952 from.bdev = tc->pool_dev->bdev; 1005 from.bdev = origin->bdev;
953 from.sector = data_origin * pool->sectors_per_block; 1006 from.sector = data_origin * pool->sectors_per_block;
954 from.count = pool->sectors_per_block; 1007 from.count = pool->sectors_per_block;
955 1008
@@ -967,6 +1020,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
967 } 1020 }
968} 1021}
969 1022
1023static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
1024 dm_block_t data_origin, dm_block_t data_dest,
1025 struct cell *cell, struct bio *bio)
1026{
1027 schedule_copy(tc, virt_block, tc->pool_dev,
1028 data_origin, data_dest, cell, bio);
1029}
1030
1031static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
1032 dm_block_t data_dest,
1033 struct cell *cell, struct bio *bio)
1034{
1035 schedule_copy(tc, virt_block, tc->origin_dev,
1036 virt_block, data_dest, cell, bio);
1037}
1038
970static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, 1039static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
971 dm_block_t data_block, struct cell *cell, 1040 dm_block_t data_block, struct cell *cell,
972 struct bio *bio) 1041 struct bio *bio)
@@ -975,6 +1044,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
975 struct new_mapping *m = get_next_mapping(pool); 1044 struct new_mapping *m = get_next_mapping(pool);
976 1045
977 INIT_LIST_HEAD(&m->list); 1046 INIT_LIST_HEAD(&m->list);
1047 m->quiesced = 1;
978 m->prepared = 0; 1048 m->prepared = 0;
979 m->tc = tc; 1049 m->tc = tc;
980 m->virt_block = virt_block; 1050 m->virt_block = virt_block;
@@ -988,13 +1058,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
988 * zeroing pre-existing data, we can issue the bio immediately. 1058 * zeroing pre-existing data, we can issue the bio immediately.
989 * Otherwise we use kcopyd to zero the data first. 1059 * Otherwise we use kcopyd to zero the data first.
990 */ 1060 */
991 if (!pool->zero_new_blocks) 1061 if (!pool->pf.zero_new_blocks)
992 process_prepared_mapping(m); 1062 process_prepared_mapping(m);
993 1063
994 else if (io_overwrites_block(pool, bio)) { 1064 else if (io_overwrites_block(pool, bio)) {
1065 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1066 h->overwrite_mapping = m;
995 m->bio = bio; 1067 m->bio = bio;
996 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1068 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
997 dm_get_mapinfo(bio)->ptr = m;
998 remap_and_issue(tc, bio, data_block); 1069 remap_and_issue(tc, bio, data_block);
999 1070
1000 } else { 1071 } else {
@@ -1081,7 +1152,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1081 */ 1152 */
1082static void retry_on_resume(struct bio *bio) 1153static void retry_on_resume(struct bio *bio)
1083{ 1154{
1084 struct thin_c *tc = dm_get_mapinfo(bio)->ptr; 1155 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1156 struct thin_c *tc = h->tc;
1085 struct pool *pool = tc->pool; 1157 struct pool *pool = tc->pool;
1086 unsigned long flags; 1158 unsigned long flags;
1087 1159
@@ -1102,6 +1174,86 @@ static void no_space(struct cell *cell)
1102 retry_on_resume(bio); 1174 retry_on_resume(bio);
1103} 1175}
1104 1176
1177static void process_discard(struct thin_c *tc, struct bio *bio)
1178{
1179 int r;
1180 struct pool *pool = tc->pool;
1181 struct cell *cell, *cell2;
1182 struct cell_key key, key2;
1183 dm_block_t block = get_bio_block(tc, bio);
1184 struct dm_thin_lookup_result lookup_result;
1185 struct new_mapping *m;
1186
1187 build_virtual_key(tc->td, block, &key);
1188 if (bio_detain(tc->pool->prison, &key, bio, &cell))
1189 return;
1190
1191 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1192 switch (r) {
1193 case 0:
1194 /*
1195 * Check nobody is fiddling with this pool block. This can
1196 * happen if someone's in the process of breaking sharing
1197 * on this block.
1198 */
1199 build_data_key(tc->td, lookup_result.block, &key2);
1200 if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
1201 cell_release_singleton(cell, bio);
1202 break;
1203 }
1204
1205 if (io_overlaps_block(pool, bio)) {
1206 /*
1207 * IO may still be going to the destination block. We must
1208 * quiesce before we can do the removal.
1209 */
1210 m = get_next_mapping(pool);
1211 m->tc = tc;
1212 m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown;
1213 m->virt_block = block;
1214 m->data_block = lookup_result.block;
1215 m->cell = cell;
1216 m->cell2 = cell2;
1217 m->err = 0;
1218 m->bio = bio;
1219
1220 if (!ds_add_work(&pool->all_io_ds, &m->list)) {
1221 list_add(&m->list, &pool->prepared_discards);
1222 wake_worker(pool);
1223 }
1224 } else {
1225 /*
1226 * This path is hit if people are ignoring
1227 * limits->discard_granularity. It ignores any
1228 * part of the discard that is in a subsequent
1229 * block.
1230 */
1231 sector_t offset = bio->bi_sector - (block << pool->block_shift);
1232 unsigned remaining = (pool->sectors_per_block - offset) << 9;
1233 bio->bi_size = min(bio->bi_size, remaining);
1234
1235 cell_release_singleton(cell, bio);
1236 cell_release_singleton(cell2, bio);
1237 remap_and_issue(tc, bio, lookup_result.block);
1238 }
1239 break;
1240
1241 case -ENODATA:
1242 /*
1243 * It isn't provisioned, just forget it.
1244 */
1245 cell_release_singleton(cell, bio);
1246 bio_endio(bio, 0);
1247 break;
1248
1249 default:
1250 DMERR("discard: find block unexpectedly returned %d", r);
1251 cell_release_singleton(cell, bio);
1252 bio_io_error(bio);
1253 break;
1254 }
1255}
1256
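/*
 * Editor's note: a stand-alone user-space sketch of the clipping arithmetic
 * in the "ignoring discard_granularity" branch of process_discard() above.
 * A discard that starts part-way into a block is shortened so it never
 * crosses into the next block; the remainder is simply not discarded. The
 * 128-sector (64KiB) block size and all names below are illustrative
 * assumptions, not part of the kernel code.
 */
#include <assert.h>
#include <stdint.h>

#define SECTOR_SHIFT 9

/* Clip (start_sector, size_bytes) so it ends no later than its block does. */
static uint32_t clip_to_block(uint64_t start_sector, uint32_t size_bytes,
			      unsigned block_shift)
{
	uint64_t block = start_sector >> block_shift;
	uint64_t offset = start_sector - (block << block_shift);
	uint64_t sectors_per_block = 1ULL << block_shift;
	uint64_t remaining = (sectors_per_block - offset) << SECTOR_SHIFT;

	return size_bytes < remaining ? size_bytes : (uint32_t)remaining;
}

int main(void)
{
	/* A 96KiB discard starting 32KiB (64 sectors) into block 2. */
	uint32_t clipped = clip_to_block(2 * 128 + 64, 96 * 1024, 7);

	assert(clipped == 32 * 1024);	/* only the rest of block 2 is discarded */
	return 0;
}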
1105static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1257static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1106 struct cell_key *key, 1258 struct cell_key *key,
1107 struct dm_thin_lookup_result *lookup_result, 1259 struct dm_thin_lookup_result *lookup_result,
@@ -1113,8 +1265,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1113 r = alloc_data_block(tc, &data_block); 1265 r = alloc_data_block(tc, &data_block);
1114 switch (r) { 1266 switch (r) {
1115 case 0: 1267 case 0:
1116 schedule_copy(tc, block, lookup_result->block, 1268 schedule_internal_copy(tc, block, lookup_result->block,
1117 data_block, cell, bio); 1269 data_block, cell, bio);
1118 break; 1270 break;
1119 1271
1120 case -ENOSPC: 1272 case -ENOSPC:
@@ -1147,13 +1299,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1147 if (bio_data_dir(bio) == WRITE) 1299 if (bio_data_dir(bio) == WRITE)
1148 break_sharing(tc, bio, block, &key, lookup_result, cell); 1300 break_sharing(tc, bio, block, &key, lookup_result, cell);
1149 else { 1301 else {
1150 struct endio_hook *h; 1302 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1151 h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1152 1303
1153 h->tc = tc; 1304 h->shared_read_entry = ds_inc(&pool->shared_read_ds);
1154 h->entry = ds_inc(&pool->ds);
1155 save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
1156 dm_get_mapinfo(bio)->ptr = h;
1157 1305
1158 cell_release_singleton(cell, bio); 1306 cell_release_singleton(cell, bio);
1159 remap_and_issue(tc, bio, lookup_result->block); 1307 remap_and_issue(tc, bio, lookup_result->block);
@@ -1188,7 +1336,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1188 r = alloc_data_block(tc, &data_block); 1336 r = alloc_data_block(tc, &data_block);
1189 switch (r) { 1337 switch (r) {
1190 case 0: 1338 case 0:
1191 schedule_zero(tc, block, data_block, cell, bio); 1339 if (tc->origin_dev)
1340 schedule_external_copy(tc, block, data_block, cell, bio);
1341 else
1342 schedule_zero(tc, block, data_block, cell, bio);
1192 break; 1343 break;
1193 1344
1194 case -ENOSPC: 1345 case -ENOSPC:
@@ -1239,16 +1390,27 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
1239 break; 1390 break;
1240 1391
1241 case -ENODATA: 1392 case -ENODATA:
1242 provision_block(tc, bio, block, cell); 1393 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1394 cell_release_singleton(cell, bio);
1395 remap_to_origin_and_issue(tc, bio);
1396 } else
1397 provision_block(tc, bio, block, cell);
1243 break; 1398 break;
1244 1399
1245 default: 1400 default:
1246 DMERR("dm_thin_find_block() failed, error = %d", r); 1401 DMERR("dm_thin_find_block() failed, error = %d", r);
1402 cell_release_singleton(cell, bio);
1247 bio_io_error(bio); 1403 bio_io_error(bio);
1248 break; 1404 break;
1249 } 1405 }
1250} 1406}
1251 1407
1408static int need_commit_due_to_time(struct pool *pool)
1409{
1410 return jiffies < pool->last_commit_jiffies ||
1411 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1412}
1413
1252static void process_deferred_bios(struct pool *pool) 1414static void process_deferred_bios(struct pool *pool)
1253{ 1415{
1254 unsigned long flags; 1416 unsigned long flags;
@@ -1264,7 +1426,9 @@ static void process_deferred_bios(struct pool *pool)
1264 spin_unlock_irqrestore(&pool->lock, flags); 1426 spin_unlock_irqrestore(&pool->lock, flags);
1265 1427
1266 while ((bio = bio_list_pop(&bios))) { 1428 while ((bio = bio_list_pop(&bios))) {
1267 struct thin_c *tc = dm_get_mapinfo(bio)->ptr; 1429 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1430 struct thin_c *tc = h->tc;
1431
1268 /* 1432 /*
1269 * If we've got no free new_mapping structs, and processing 1433 * If we've got no free new_mapping structs, and processing
1270 * this bio might require one, we pause until there are some 1434 * this bio might require one, we pause until there are some
@@ -1277,7 +1441,11 @@ static void process_deferred_bios(struct pool *pool)
1277 1441
1278 break; 1442 break;
1279 } 1443 }
1280 process_bio(tc, bio); 1444
1445 if (bio->bi_rw & REQ_DISCARD)
1446 process_discard(tc, bio);
1447 else
1448 process_bio(tc, bio);
1281 } 1449 }
1282 1450
1283 /* 1451 /*
@@ -1290,7 +1458,7 @@ static void process_deferred_bios(struct pool *pool)
1290 bio_list_init(&pool->deferred_flush_bios); 1458 bio_list_init(&pool->deferred_flush_bios);
1291 spin_unlock_irqrestore(&pool->lock, flags); 1459 spin_unlock_irqrestore(&pool->lock, flags);
1292 1460
1293 if (bio_list_empty(&bios)) 1461 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1294 return; 1462 return;
1295 1463
1296 r = dm_pool_commit_metadata(pool->pmd); 1464 r = dm_pool_commit_metadata(pool->pmd);
@@ -1301,6 +1469,7 @@ static void process_deferred_bios(struct pool *pool)
1301 bio_io_error(bio); 1469 bio_io_error(bio);
1302 return; 1470 return;
1303 } 1471 }
1472 pool->last_commit_jiffies = jiffies;
1304 1473
1305 while ((bio = bio_list_pop(&bios))) 1474 while ((bio = bio_list_pop(&bios)))
1306 generic_make_request(bio); 1475 generic_make_request(bio);
@@ -1310,10 +1479,22 @@ static void do_worker(struct work_struct *ws)
1310{ 1479{
1311 struct pool *pool = container_of(ws, struct pool, worker); 1480 struct pool *pool = container_of(ws, struct pool, worker);
1312 1481
1313 process_prepared_mappings(pool); 1482 process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
1483 process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
1314 process_deferred_bios(pool); 1484 process_deferred_bios(pool);
1315} 1485}
1316 1486
1487/*
1488 * We want to commit periodically so that not too much
1489 * unwritten data builds up.
1490 */
1491static void do_waker(struct work_struct *ws)
1492{
1493 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1494 wake_worker(pool);
1495 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1496}
1497
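/*
 * Editor's note: do_waker() above re-queues itself every COMMIT_PERIOD so
 * that process_deferred_bios() commits outstanding metadata even when no
 * FLUSH/FUA bio forces a commit. Below is a small user-space model of the
 * need_commit_due_to_time() test, with an unsigned tick counter standing in
 * for jiffies; the "now < last" half of the test simply forces a commit
 * whenever the counter has wrapped. The names and period are illustrative.
 */
#include <assert.h>
#include <stdint.h>

#define COMMIT_PERIOD 100u	/* ticks */

static int need_commit_due_to_time(uint32_t now, uint32_t last_commit)
{
	return now < last_commit || now > last_commit + COMMIT_PERIOD;
}

int main(void)
{
	assert(!need_commit_due_to_time(150, 100));		/* still within the period */
	assert(need_commit_due_to_time(250, 100));		/* period has expired */
	assert(need_commit_due_to_time(5, UINT32_MAX - 10));	/* counter wrapped */
	return 0;
}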
1317/*----------------------------------------------------------------*/ 1498/*----------------------------------------------------------------*/
1318 1499
1319/* 1500/*
@@ -1335,6 +1516,19 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1335 wake_worker(pool); 1516 wake_worker(pool);
1336} 1517}
1337 1518
1519static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
1520{
1521 struct pool *pool = tc->pool;
1522 struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1523
1524 h->tc = tc;
1525 h->shared_read_entry = NULL;
1526 h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
1527 h->overwrite_mapping = NULL;
1528
1529 return h;
1530}
1531
1338/* 1532/*
1339 * Non-blocking function called from the thin target's map function. 1533 * Non-blocking function called from the thin target's map function.
1340 */ 1534 */
@@ -1347,12 +1541,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1347 struct dm_thin_device *td = tc->td; 1541 struct dm_thin_device *td = tc->td;
1348 struct dm_thin_lookup_result result; 1542 struct dm_thin_lookup_result result;
1349 1543
1350 /* 1544 map_context->ptr = thin_hook_bio(tc, bio);
1351 * Save the thin context for easy access from the deferred bio later. 1545 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1352 */
1353 map_context->ptr = tc;
1354
1355 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1356 thin_defer_bio(tc, bio); 1546 thin_defer_bio(tc, bio);
1357 return DM_MAPIO_SUBMITTED; 1547 return DM_MAPIO_SUBMITTED;
1358 } 1548 }
@@ -1434,7 +1624,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1434 1624
1435 pool->ti = ti; 1625 pool->ti = ti;
1436 pool->low_water_blocks = pt->low_water_blocks; 1626 pool->low_water_blocks = pt->low_water_blocks;
1437 pool->zero_new_blocks = pt->zero_new_blocks; 1627 pool->pf = pt->pf;
1438 1628
1439 return 0; 1629 return 0;
1440} 1630}
@@ -1448,6 +1638,14 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1448/*---------------------------------------------------------------- 1638/*----------------------------------------------------------------
1449 * Pool creation 1639 * Pool creation
1450 *--------------------------------------------------------------*/ 1640 *--------------------------------------------------------------*/
1641/* Initialize pool features. */
1642static void pool_features_init(struct pool_features *pf)
1643{
1644 pf->zero_new_blocks = 1;
1645 pf->discard_enabled = 1;
1646 pf->discard_passdown = 1;
1647}
1648
1451static void __pool_destroy(struct pool *pool) 1649static void __pool_destroy(struct pool *pool)
1452{ 1650{
1453 __pool_table_remove(pool); 1651 __pool_table_remove(pool);
@@ -1495,7 +1693,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1495 pool->block_shift = ffs(block_size) - 1; 1693 pool->block_shift = ffs(block_size) - 1;
1496 pool->offset_mask = block_size - 1; 1694 pool->offset_mask = block_size - 1;
1497 pool->low_water_blocks = 0; 1695 pool->low_water_blocks = 0;
1498 pool->zero_new_blocks = 1; 1696 pool_features_init(&pool->pf);
1499 pool->prison = prison_create(PRISON_CELLS); 1697 pool->prison = prison_create(PRISON_CELLS);
1500 if (!pool->prison) { 1698 if (!pool->prison) {
1501 *error = "Error creating pool's bio prison"; 1699 *error = "Error creating pool's bio prison";
@@ -1523,14 +1721,17 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1523 } 1721 }
1524 1722
1525 INIT_WORK(&pool->worker, do_worker); 1723 INIT_WORK(&pool->worker, do_worker);
1724 INIT_DELAYED_WORK(&pool->waker, do_waker);
1526 spin_lock_init(&pool->lock); 1725 spin_lock_init(&pool->lock);
1527 bio_list_init(&pool->deferred_bios); 1726 bio_list_init(&pool->deferred_bios);
1528 bio_list_init(&pool->deferred_flush_bios); 1727 bio_list_init(&pool->deferred_flush_bios);
1529 INIT_LIST_HEAD(&pool->prepared_mappings); 1728 INIT_LIST_HEAD(&pool->prepared_mappings);
1729 INIT_LIST_HEAD(&pool->prepared_discards);
1530 pool->low_water_triggered = 0; 1730 pool->low_water_triggered = 0;
1531 pool->no_free_space = 0; 1731 pool->no_free_space = 0;
1532 bio_list_init(&pool->retry_on_resume_list); 1732 bio_list_init(&pool->retry_on_resume_list);
1533 ds_init(&pool->ds); 1733 ds_init(&pool->shared_read_ds);
1734 ds_init(&pool->all_io_ds);
1534 1735
1535 pool->next_mapping = NULL; 1736 pool->next_mapping = NULL;
1536 pool->mapping_pool = 1737 pool->mapping_pool =
@@ -1549,6 +1750,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1549 goto bad_endio_hook_pool; 1750 goto bad_endio_hook_pool;
1550 } 1751 }
1551 pool->ref_count = 1; 1752 pool->ref_count = 1;
1753 pool->last_commit_jiffies = jiffies;
1552 pool->pool_md = pool_md; 1754 pool->pool_md = pool_md;
1553 pool->md_dev = metadata_dev; 1755 pool->md_dev = metadata_dev;
1554 __pool_table_insert(pool); 1756 __pool_table_insert(pool);
@@ -1588,7 +1790,8 @@ static void __pool_dec(struct pool *pool)
1588 1790
1589static struct pool *__pool_find(struct mapped_device *pool_md, 1791static struct pool *__pool_find(struct mapped_device *pool_md,
1590 struct block_device *metadata_dev, 1792 struct block_device *metadata_dev,
1591 unsigned long block_size, char **error) 1793 unsigned long block_size, char **error,
1794 int *created)
1592{ 1795{
1593 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 1796 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1594 1797
@@ -1604,8 +1807,10 @@ static struct pool *__pool_find(struct mapped_device *pool_md,
1604 return ERR_PTR(-EINVAL); 1807 return ERR_PTR(-EINVAL);
1605 __pool_inc(pool); 1808 __pool_inc(pool);
1606 1809
1607 } else 1810 } else {
1608 pool = pool_create(pool_md, metadata_dev, block_size, error); 1811 pool = pool_create(pool_md, metadata_dev, block_size, error);
1812 *created = 1;
1813 }
1609 } 1814 }
1610 1815
1611 return pool; 1816 return pool;
@@ -1629,10 +1834,6 @@ static void pool_dtr(struct dm_target *ti)
1629 mutex_unlock(&dm_thin_pool_table.mutex); 1834 mutex_unlock(&dm_thin_pool_table.mutex);
1630} 1835}
1631 1836
1632struct pool_features {
1633 unsigned zero_new_blocks:1;
1634};
1635
1636static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, 1837static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1637 struct dm_target *ti) 1838 struct dm_target *ti)
1638{ 1839{
@@ -1641,7 +1842,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1641 const char *arg_name; 1842 const char *arg_name;
1642 1843
1643 static struct dm_arg _args[] = { 1844 static struct dm_arg _args[] = {
1644 {0, 1, "Invalid number of pool feature arguments"}, 1845 {0, 3, "Invalid number of pool feature arguments"},
1645 }; 1846 };
1646 1847
1647 /* 1848 /*
@@ -1661,6 +1862,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1661 if (!strcasecmp(arg_name, "skip_block_zeroing")) { 1862 if (!strcasecmp(arg_name, "skip_block_zeroing")) {
1662 pf->zero_new_blocks = 0; 1863 pf->zero_new_blocks = 0;
1663 continue; 1864 continue;
1865 } else if (!strcasecmp(arg_name, "ignore_discard")) {
1866 pf->discard_enabled = 0;
1867 continue;
1868 } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
1869 pf->discard_passdown = 0;
1870 continue;
1664 } 1871 }
1665 1872
1666 ti->error = "Unrecognised pool feature requested"; 1873 ti->error = "Unrecognised pool feature requested";
@@ -1678,10 +1885,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1678 * 1885 *
1679 * Optional feature arguments are: 1886 * Optional feature arguments are:
1680 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 1887 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1888 * ignore_discard: disable discard
1889 * no_discard_passdown: don't pass discards down to the data device
1681 */ 1890 */
1682static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 1891static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1683{ 1892{
1684 int r; 1893 int r, pool_created = 0;
1685 struct pool_c *pt; 1894 struct pool_c *pt;
1686 struct pool *pool; 1895 struct pool *pool;
1687 struct pool_features pf; 1896 struct pool_features pf;
@@ -1691,6 +1900,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1691 dm_block_t low_water_blocks; 1900 dm_block_t low_water_blocks;
1692 struct dm_dev *metadata_dev; 1901 struct dm_dev *metadata_dev;
1693 sector_t metadata_dev_size; 1902 sector_t metadata_dev_size;
1903 char b[BDEVNAME_SIZE];
1694 1904
1695 /* 1905 /*
1696 * FIXME Remove validation from scope of lock. 1906 * FIXME Remove validation from scope of lock.
@@ -1712,11 +1922,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1712 } 1922 }
1713 1923
1714 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; 1924 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1715 if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) { 1925 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1716 ti->error = "Metadata device is too large"; 1926 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1717 r = -EINVAL; 1927 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1718 goto out_metadata;
1719 }
1720 1928
1721 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 1929 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1722 if (r) { 1930 if (r) {
@@ -1742,8 +1950,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1742 /* 1950 /*
1743 * Set default pool features. 1951 * Set default pool features.
1744 */ 1952 */
1745 memset(&pf, 0, sizeof(pf)); 1953 pool_features_init(&pf);
1746 pf.zero_new_blocks = 1;
1747 1954
1748 dm_consume_args(&as, 4); 1955 dm_consume_args(&as, 4);
1749 r = parse_pool_features(&as, &pf, ti); 1956 r = parse_pool_features(&as, &pf, ti);
@@ -1757,20 +1964,58 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1757 } 1964 }
1758 1965
1759 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 1966 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1760 block_size, &ti->error); 1967 block_size, &ti->error, &pool_created);
1761 if (IS_ERR(pool)) { 1968 if (IS_ERR(pool)) {
1762 r = PTR_ERR(pool); 1969 r = PTR_ERR(pool);
1763 goto out_free_pt; 1970 goto out_free_pt;
1764 } 1971 }
1765 1972
1973 /*
1974 * 'pool_created' reflects whether this is the first table load.
1975 * Top level discard support is not allowed to be changed after
1976 * initial load. This would require a pool reload to trigger thin
1977 * device changes.
1978 */
1979 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
1980 ti->error = "Discard support cannot be disabled once enabled";
1981 r = -EINVAL;
1982 goto out_flags_changed;
1983 }
1984
1985 /*
1986 * If discard_passdown was enabled verify that the data device
1987 * supports discards. Disable discard_passdown if not; otherwise
1988 * -EOPNOTSUPP will be returned.
1989 */
1990 if (pf.discard_passdown) {
1991 struct request_queue *q = bdev_get_queue(data_dev->bdev);
1992 if (!q || !blk_queue_discard(q)) {
1993 DMWARN("Discard unsupported by data device: Disabling discard passdown.");
1994 pf.discard_passdown = 0;
1995 }
1996 }
1997
1766 pt->pool = pool; 1998 pt->pool = pool;
1767 pt->ti = ti; 1999 pt->ti = ti;
1768 pt->metadata_dev = metadata_dev; 2000 pt->metadata_dev = metadata_dev;
1769 pt->data_dev = data_dev; 2001 pt->data_dev = data_dev;
1770 pt->low_water_blocks = low_water_blocks; 2002 pt->low_water_blocks = low_water_blocks;
1771 pt->zero_new_blocks = pf.zero_new_blocks; 2003 pt->pf = pf;
1772 ti->num_flush_requests = 1; 2004 ti->num_flush_requests = 1;
1773 ti->num_discard_requests = 0; 2005 /*
2006 * Only need to enable discards if the pool should pass
2007 * them down to the data device. The thin device's discard
2008 * processing will cause mappings to be removed from the btree.
2009 */
2010 if (pf.discard_enabled && pf.discard_passdown) {
2011 ti->num_discard_requests = 1;
2012 /*
2013 * Setting 'discards_supported' circumvents the normal
2014 * stacking of discard limits (this keeps the pool and
2015 * thin devices' discard limits consistent).
2016 */
2017 ti->discards_supported = 1;
2018 }
1774 ti->private = pt; 2019 ti->private = pt;
1775 2020
1776 pt->callbacks.congested_fn = pool_is_congested; 2021 pt->callbacks.congested_fn = pool_is_congested;
@@ -1780,6 +2025,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1780 2025
1781 return 0; 2026 return 0;
1782 2027
2028out_flags_changed:
2029 __pool_dec(pool);
1783out_free_pt: 2030out_free_pt:
1784 kfree(pt); 2031 kfree(pt);
1785out: 2032out:
@@ -1878,7 +2125,7 @@ static void pool_resume(struct dm_target *ti)
1878 __requeue_bios(pool); 2125 __requeue_bios(pool);
1879 spin_unlock_irqrestore(&pool->lock, flags); 2126 spin_unlock_irqrestore(&pool->lock, flags);
1880 2127
1881 wake_worker(pool); 2128 do_waker(&pool->waker.work);
1882} 2129}
1883 2130
1884static void pool_postsuspend(struct dm_target *ti) 2131static void pool_postsuspend(struct dm_target *ti)
@@ -1887,6 +2134,7 @@ static void pool_postsuspend(struct dm_target *ti)
1887 struct pool_c *pt = ti->private; 2134 struct pool_c *pt = ti->private;
1888 struct pool *pool = pt->pool; 2135 struct pool *pool = pt->pool;
1889 2136
2137 cancel_delayed_work(&pool->waker);
1890 flush_workqueue(pool->wq); 2138 flush_workqueue(pool->wq);
1891 2139
1892 r = dm_pool_commit_metadata(pool->pmd); 2140 r = dm_pool_commit_metadata(pool->pmd);
@@ -2067,7 +2315,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2067static int pool_status(struct dm_target *ti, status_type_t type, 2315static int pool_status(struct dm_target *ti, status_type_t type,
2068 char *result, unsigned maxlen) 2316 char *result, unsigned maxlen)
2069{ 2317{
2070 int r; 2318 int r, count;
2071 unsigned sz = 0; 2319 unsigned sz = 0;
2072 uint64_t transaction_id; 2320 uint64_t transaction_id;
2073 dm_block_t nr_free_blocks_data; 2321 dm_block_t nr_free_blocks_data;
@@ -2130,10 +2378,19 @@ static int pool_status(struct dm_target *ti, status_type_t type,
2130 (unsigned long)pool->sectors_per_block, 2378 (unsigned long)pool->sectors_per_block,
2131 (unsigned long long)pt->low_water_blocks); 2379 (unsigned long long)pt->low_water_blocks);
2132 2380
2133 DMEMIT("%u ", !pool->zero_new_blocks); 2381 count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
2382 !pool->pf.discard_passdown;
2383 DMEMIT("%u ", count);
2134 2384
2135 if (!pool->zero_new_blocks) 2385 if (!pool->pf.zero_new_blocks)
2136 DMEMIT("skip_block_zeroing "); 2386 DMEMIT("skip_block_zeroing ");
2387
2388 if (!pool->pf.discard_enabled)
2389 DMEMIT("ignore_discard ");
2390
2391 if (!pool->pf.discard_passdown)
2392 DMEMIT("no_discard_passdown ");
2393
2137 break; 2394 break;
2138 } 2395 }
2139 2396
@@ -2162,6 +2419,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2162 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2419 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2163} 2420}
2164 2421
2422static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
2423{
2424 /*
2425 * FIXME: these limits may be incompatible with the pool's data device
2426 */
2427 limits->max_discard_sectors = pool->sectors_per_block;
2428
2429 /*
2430 * This is just a hint, and not enforced. We have to cope with
2431 * bios that overlap 2 blocks.
2432 */
2433 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2434 limits->discard_zeroes_data = pool->pf.zero_new_blocks;
2435}
2436
2165static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) 2437static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2166{ 2438{
2167 struct pool_c *pt = ti->private; 2439 struct pool_c *pt = ti->private;
@@ -2169,13 +2441,15 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2169 2441
2170 blk_limits_io_min(limits, 0); 2442 blk_limits_io_min(limits, 0);
2171 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2443 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2444 if (pool->pf.discard_enabled)
2445 set_discard_limits(pool, limits);
2172} 2446}
2173 2447
2174static struct target_type pool_target = { 2448static struct target_type pool_target = {
2175 .name = "thin-pool", 2449 .name = "thin-pool",
2176 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2450 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2177 DM_TARGET_IMMUTABLE, 2451 DM_TARGET_IMMUTABLE,
2178 .version = {1, 0, 0}, 2452 .version = {1, 1, 0},
2179 .module = THIS_MODULE, 2453 .module = THIS_MODULE,
2180 .ctr = pool_ctr, 2454 .ctr = pool_ctr,
2181 .dtr = pool_dtr, 2455 .dtr = pool_dtr,
@@ -2202,6 +2476,8 @@ static void thin_dtr(struct dm_target *ti)
2202 __pool_dec(tc->pool); 2476 __pool_dec(tc->pool);
2203 dm_pool_close_thin_device(tc->td); 2477 dm_pool_close_thin_device(tc->td);
2204 dm_put_device(ti, tc->pool_dev); 2478 dm_put_device(ti, tc->pool_dev);
2479 if (tc->origin_dev)
2480 dm_put_device(ti, tc->origin_dev);
2205 kfree(tc); 2481 kfree(tc);
2206 2482
2207 mutex_unlock(&dm_thin_pool_table.mutex); 2483 mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2210,21 +2486,25 @@ static void thin_dtr(struct dm_target *ti)
2210/* 2486/*
2211 * Thin target parameters: 2487 * Thin target parameters:
2212 * 2488 *
2213 * <pool_dev> <dev_id> 2489 * <pool_dev> <dev_id> [origin_dev]
2214 * 2490 *
2215 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) 2491 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2216 * dev_id: the internal device identifier 2492 * dev_id: the internal device identifier
2493 * origin_dev: a device external to the pool that should act as the origin
2494 *
2495 * If the pool device has discards disabled, they get disabled for the thin
2496 * device as well.
2217 */ 2497 */
2218static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) 2498static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2219{ 2499{
2220 int r; 2500 int r;
2221 struct thin_c *tc; 2501 struct thin_c *tc;
2222 struct dm_dev *pool_dev; 2502 struct dm_dev *pool_dev, *origin_dev;
2223 struct mapped_device *pool_md; 2503 struct mapped_device *pool_md;
2224 2504
2225 mutex_lock(&dm_thin_pool_table.mutex); 2505 mutex_lock(&dm_thin_pool_table.mutex);
2226 2506
2227 if (argc != 2) { 2507 if (argc != 2 && argc != 3) {
2228 ti->error = "Invalid argument count"; 2508 ti->error = "Invalid argument count";
2229 r = -EINVAL; 2509 r = -EINVAL;
2230 goto out_unlock; 2510 goto out_unlock;
@@ -2237,6 +2517,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2237 goto out_unlock; 2517 goto out_unlock;
2238 } 2518 }
2239 2519
2520 if (argc == 3) {
2521 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2522 if (r) {
2523 ti->error = "Error opening origin device";
2524 goto bad_origin_dev;
2525 }
2526 tc->origin_dev = origin_dev;
2527 }
2528
2240 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); 2529 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2241 if (r) { 2530 if (r) {
2242 ti->error = "Error opening pool device"; 2531 ti->error = "Error opening pool device";
@@ -2273,8 +2562,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2273 2562
2274 ti->split_io = tc->pool->sectors_per_block; 2563 ti->split_io = tc->pool->sectors_per_block;
2275 ti->num_flush_requests = 1; 2564 ti->num_flush_requests = 1;
2276 ti->num_discard_requests = 0; 2565
2277 ti->discards_supported = 0; 2566 /* In case the pool supports discards, pass them on. */
2567 if (tc->pool->pf.discard_enabled) {
2568 ti->discards_supported = 1;
2569 ti->num_discard_requests = 1;
2570 }
2278 2571
2279 dm_put(pool_md); 2572 dm_put(pool_md);
2280 2573
@@ -2289,6 +2582,9 @@ bad_pool_lookup:
2289bad_common: 2582bad_common:
2290 dm_put_device(ti, tc->pool_dev); 2583 dm_put_device(ti, tc->pool_dev);
2291bad_pool_dev: 2584bad_pool_dev:
2585 if (tc->origin_dev)
2586 dm_put_device(ti, tc->origin_dev);
2587bad_origin_dev:
2292 kfree(tc); 2588 kfree(tc);
2293out_unlock: 2589out_unlock:
2294 mutex_unlock(&dm_thin_pool_table.mutex); 2590 mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2299,11 +2595,46 @@ out_unlock:
2299static int thin_map(struct dm_target *ti, struct bio *bio, 2595static int thin_map(struct dm_target *ti, struct bio *bio,
2300 union map_info *map_context) 2596 union map_info *map_context)
2301{ 2597{
2302 bio->bi_sector -= ti->begin; 2598 bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
2303 2599
2304 return thin_bio_map(ti, bio, map_context); 2600 return thin_bio_map(ti, bio, map_context);
2305} 2601}
2306 2602
2603static int thin_endio(struct dm_target *ti,
2604 struct bio *bio, int err,
2605 union map_info *map_context)
2606{
2607 unsigned long flags;
2608 struct endio_hook *h = map_context->ptr;
2609 struct list_head work;
2610 struct new_mapping *m, *tmp;
2611 struct pool *pool = h->tc->pool;
2612
2613 if (h->shared_read_entry) {
2614 INIT_LIST_HEAD(&work);
2615 ds_dec(h->shared_read_entry, &work);
2616
2617 spin_lock_irqsave(&pool->lock, flags);
2618 list_for_each_entry_safe(m, tmp, &work, list) {
2619 list_del(&m->list);
2620 m->quiesced = 1;
2621 __maybe_add_mapping(m);
2622 }
2623 spin_unlock_irqrestore(&pool->lock, flags);
2624 }
2625
2626 if (h->all_io_entry) {
2627 INIT_LIST_HEAD(&work);
2628 ds_dec(h->all_io_entry, &work);
2629 list_for_each_entry_safe(m, tmp, &work, list)
2630 list_add(&m->list, &pool->prepared_discards);
2631 }
2632
2633 mempool_free(h, pool->endio_hook_pool);
2634
2635 return 0;
2636}
2637
2307static void thin_postsuspend(struct dm_target *ti) 2638static void thin_postsuspend(struct dm_target *ti)
2308{ 2639{
2309 if (dm_noflush_suspending(ti)) 2640 if (dm_noflush_suspending(ti))
@@ -2347,6 +2678,8 @@ static int thin_status(struct dm_target *ti, status_type_t type,
2347 DMEMIT("%s %lu", 2678 DMEMIT("%s %lu",
2348 format_dev_t(buf, tc->pool_dev->bdev->bd_dev), 2679 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2349 (unsigned long) tc->dev_id); 2680 (unsigned long) tc->dev_id);
2681 if (tc->origin_dev)
2682 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
2350 break; 2683 break;
2351 } 2684 }
2352 } 2685 }
@@ -2377,18 +2710,21 @@ static int thin_iterate_devices(struct dm_target *ti,
2377static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) 2710static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2378{ 2711{
2379 struct thin_c *tc = ti->private; 2712 struct thin_c *tc = ti->private;
2713 struct pool *pool = tc->pool;
2380 2714
2381 blk_limits_io_min(limits, 0); 2715 blk_limits_io_min(limits, 0);
2382 blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT); 2716 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2717 set_discard_limits(pool, limits);
2383} 2718}
2384 2719
2385static struct target_type thin_target = { 2720static struct target_type thin_target = {
2386 .name = "thin", 2721 .name = "thin",
2387 .version = {1, 0, 0}, 2722 .version = {1, 1, 0},
2388 .module = THIS_MODULE, 2723 .module = THIS_MODULE,
2389 .ctr = thin_ctr, 2724 .ctr = thin_ctr,
2390 .dtr = thin_dtr, 2725 .dtr = thin_dtr,
2391 .map = thin_map, 2726 .map = thin_map,
2727 .end_io = thin_endio,
2392 .postsuspend = thin_postsuspend, 2728 .postsuspend = thin_postsuspend,
2393 .status = thin_status, 2729 .status = thin_status,
2394 .iterate_devices = thin_iterate_devices, 2730 .iterate_devices = thin_iterate_devices,
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
new file mode 100644
index 000000000000..fa365d39b612
--- /dev/null
+++ b/drivers/md/dm-verity.c
@@ -0,0 +1,913 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * Author: Mikulas Patocka <mpatocka@redhat.com>
5 *
6 * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
7 *
8 * This file is released under the GPLv2.
9 *
10 * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set
11 * the default prefetch value. Data are read in "prefetch_cluster" chunks from the
12 * hash device. Setting this greatly improves performance when data and hash
13 * are on the same disk on different partitions on devices with poor random
14 * access behavior.
15 */
16
17#include "dm-bufio.h"
18
19#include <linux/module.h>
20#include <linux/device-mapper.h>
21#include <crypto/hash.h>
22
23#define DM_MSG_PREFIX "verity"
24
25#define DM_VERITY_IO_VEC_INLINE 16
26#define DM_VERITY_MEMPOOL_SIZE 4
27#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
28
29#define DM_VERITY_MAX_LEVELS 63
30
31static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
32
33module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
34
35struct dm_verity {
36 struct dm_dev *data_dev;
37 struct dm_dev *hash_dev;
38 struct dm_target *ti;
39 struct dm_bufio_client *bufio;
40 char *alg_name;
41 struct crypto_shash *tfm;
42 u8 *root_digest; /* digest of the root block */
43 u8 *salt; /* salt: its size is salt_size */
44 unsigned salt_size;
45 sector_t data_start; /* data offset in 512-byte sectors */
46 sector_t hash_start; /* hash start in blocks */
47 sector_t data_blocks; /* the number of data blocks */
48 sector_t hash_blocks; /* the number of hash blocks */
49 unsigned char data_dev_block_bits; /* log2(data blocksize) */
50 unsigned char hash_dev_block_bits; /* log2(hash blocksize) */
51 unsigned char hash_per_block_bits; /* log2(hashes in hash block) */
52 unsigned char levels; /* the number of tree levels */
53 unsigned char version;
54 unsigned digest_size; /* digest size for the current hash algorithm */
55 unsigned shash_descsize;/* the size of temporary space for crypto */
56 int hash_failed; /* set to 1 if hash of any block failed */
57
58 mempool_t *io_mempool; /* mempool of struct dm_verity_io */
59 mempool_t *vec_mempool; /* mempool of bio vector */
60
61 struct workqueue_struct *verify_wq;
62
63 /* starting blocks for each tree level. 0 is the lowest level. */
64 sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
65};
66
67struct dm_verity_io {
68 struct dm_verity *v;
69 struct bio *bio;
70
71 /* original values of bio->bi_end_io and bio->bi_private */
72 bio_end_io_t *orig_bi_end_io;
73 void *orig_bi_private;
74
75 sector_t block;
76 unsigned n_blocks;
77
78 /* saved bio vector */
79 struct bio_vec *io_vec;
80 unsigned io_vec_size;
81
82 struct work_struct work;
83
84 /* A space for short vectors; longer vectors are allocated separately. */
85 struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE];
86
87 /*
88 * Three variably-sized fields follow this struct:
89 *
90 * u8 hash_desc[v->shash_descsize];
91 * u8 real_digest[v->digest_size];
92 * u8 want_digest[v->digest_size];
93 *
94 * To access them use: io_hash_desc(), io_real_digest() and io_want_digest().
95 */
96};
97
98static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
99{
100 return (struct shash_desc *)(io + 1);
101}
102
103static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io)
104{
105 return (u8 *)(io + 1) + v->shash_descsize;
106}
107
108static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io)
109{
110 return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
111}
112
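/*
 * Editor's note: the accessors above use the common kernel trick of carving
 * several variably-sized per-request areas out of one allocation placed
 * directly after the fixed struct. A minimal user-space sketch of the same
 * layout follows; the 104/32 byte sizes and all names are made up, standing
 * in for shash_descsize and digest_size.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct fake_io {
	unsigned desc_size;	/* size of the first trailing area */
	unsigned digest_size;	/* size of each of the two digest areas */
	/* followed by: u8 desc[desc_size]; u8 real[digest_size]; u8 want[digest_size]; */
};

static uint8_t *io_desc(struct fake_io *io)
{
	return (uint8_t *)(io + 1);
}

static uint8_t *io_real(struct fake_io *io)
{
	return (uint8_t *)(io + 1) + io->desc_size;
}

static uint8_t *io_want(struct fake_io *io)
{
	return (uint8_t *)(io + 1) + io->desc_size + io->digest_size;
}

int main(void)
{
	unsigned desc_size = 104, digest_size = 32;	/* illustrative only */
	struct fake_io *io = malloc(sizeof(*io) + desc_size + 2 * digest_size);

	if (!io)
		return 1;
	io->desc_size = desc_size;
	io->digest_size = digest_size;
	/* Each accessor points into the single trailing allocation. */
	memset(io_desc(io), 0, desc_size);
	memset(io_real(io), 0, digest_size);
	memset(io_want(io), 0, digest_size);
	free(io);
	return 0;
}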
113/*
114 * Auxiliary structure appended to each dm-bufio buffer. If the value
115 * hash_verified is nonzero, the hash of the block has been verified.
116 *
117 * The variable hash_verified is set to 0 when allocating the buffer, then
118 * it can be changed to 1 and it is never reset to 0 again.
119 *
120 * There is no lock around this value; at worst, a race can cause multiple
121 * processes to verify the hash of the same buffer simultaneously and all of
122 * them to write 1 to hash_verified.
123 * This condition is harmless, so we don't need locking.
124 */
125struct buffer_aux {
126 int hash_verified;
127};
128
129/*
130 * Initialize struct buffer_aux for a freshly created buffer.
131 */
132static void dm_bufio_alloc_callback(struct dm_buffer *buf)
133{
134 struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
135
136 aux->hash_verified = 0;
137}
138
139/*
140 * Translate input sector number to the sector number on the target device.
141 */
142static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector)
143{
144 return v->data_start + dm_target_offset(v->ti, bi_sector);
145}
146
147/*
148 * Return hash position of a specified block at a specified tree level
149 * (0 is the lowest level).
150 * The lowest "hash_per_block_bits"-bits of the result denote hash position
151 * inside a hash block. The remaining bits denote location of the hash block.
152 */
153static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
154 int level)
155{
156 return block >> (level * v->hash_per_block_bits);
157}
158
159static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
160 sector_t *hash_block, unsigned *offset)
161{
162 sector_t position = verity_position_at_level(v, block, level);
163 unsigned idx;
164
165 *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits);
166
167 if (!offset)
168 return;
169
170 idx = position & ((1 << v->hash_per_block_bits) - 1);
171 if (!v->version)
172 *offset = idx * v->digest_size;
173 else
174 *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits);
175}
176
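/*
 * Editor's note: a stand-alone sketch of the index arithmetic performed by
 * verity_position_at_level() and verity_hash_at_level() above. The concrete
 * numbers are assumptions for illustration only: 4096-byte hash blocks and
 * 32-byte digests give 128 hashes per block (hash_per_block_bits = 7), and
 * the per-level starting blocks below model a two-level tree over roughly
 * 200000 data blocks. The real values come from the verity superblock and
 * constructor arguments.
 */
#include <stdint.h>
#include <stdio.h>

#define HASH_PER_BLOCK_BITS 7	/* log2(4096 / 32) */
#define DIGEST_SIZE 32

/* Index of the hash that covers 'block' at the given tree level. */
static uint64_t position_at_level(uint64_t block, int level)
{
	return block >> (level * HASH_PER_BLOCK_BITS);
}

static void hash_at_level(const uint64_t *level_start, uint64_t block,
			  int level, uint64_t *hash_block, unsigned *offset)
{
	uint64_t position = position_at_level(block, level);

	*hash_block = level_start[level] + (position >> HASH_PER_BLOCK_BITS);
	/* Format version 0 packs digests back to back within a hash block. */
	*offset = (unsigned)(position & ((1u << HASH_PER_BLOCK_BITS) - 1)) * DIGEST_SIZE;
}

int main(void)
{
	/*
	 * Level 1 (hashes of hash blocks, ~13 blocks) is laid out first,
	 * then level 0 (hashes of data blocks, ~1563 blocks) follows it.
	 */
	uint64_t level_start[2] = { 13, 0 };
	uint64_t hash_block;
	unsigned offset;

	hash_at_level(level_start, 200000, 0, &hash_block, &offset);
	printf("data block 200000: level-0 hash in hash block %llu, byte offset %u\n",
	       (unsigned long long)hash_block, offset);
	return 0;
}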
177/*
178 * Verify hash of a metadata block pertaining to the specified data block
179 * ("block" argument) at a specified level ("level" argument).
180 *
181 * On successful return, io_want_digest(v, io) contains the hash value for
182 * a lower tree level or for the data block (if we're at the lowest level).
183 *
184 * If "skip_unverified" is true, an unverified buffer is skipped and 1 is returned.
185 * If "skip_unverified" is false, an unverified buffer is hashed and verified
186 * against the current value of io_want_digest(v, io).
187 */
188static int verity_verify_level(struct dm_verity_io *io, sector_t block,
189 int level, bool skip_unverified)
190{
191 struct dm_verity *v = io->v;
192 struct dm_buffer *buf;
193 struct buffer_aux *aux;
194 u8 *data;
195 int r;
196 sector_t hash_block;
197 unsigned offset;
198
199 verity_hash_at_level(v, block, level, &hash_block, &offset);
200
201 data = dm_bufio_read(v->bufio, hash_block, &buf);
202 if (unlikely(IS_ERR(data)))
203 return PTR_ERR(data);
204
205 aux = dm_bufio_get_aux_data(buf);
206
207 if (!aux->hash_verified) {
208 struct shash_desc *desc;
209 u8 *result;
210
211 if (skip_unverified) {
212 r = 1;
213 goto release_ret_r;
214 }
215
216 desc = io_hash_desc(v, io);
217 desc->tfm = v->tfm;
218 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
219 r = crypto_shash_init(desc);
220 if (r < 0) {
221 DMERR("crypto_shash_init failed: %d", r);
222 goto release_ret_r;
223 }
224
225 if (likely(v->version >= 1)) {
226 r = crypto_shash_update(desc, v->salt, v->salt_size);
227 if (r < 0) {
228 DMERR("crypto_shash_update failed: %d", r);
229 goto release_ret_r;
230 }
231 }
232
233 r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits);
234 if (r < 0) {
235 DMERR("crypto_shash_update failed: %d", r);
236 goto release_ret_r;
237 }
238
239 if (!v->version) {
240 r = crypto_shash_update(desc, v->salt, v->salt_size);
241 if (r < 0) {
242 DMERR("crypto_shash_update failed: %d", r);
243 goto release_ret_r;
244 }
245 }
246
247 result = io_real_digest(v, io);
248 r = crypto_shash_final(desc, result);
249 if (r < 0) {
250 DMERR("crypto_shash_final failed: %d", r);
251 goto release_ret_r;
252 }
253 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
254 DMERR_LIMIT("metadata block %llu is corrupted",
255 (unsigned long long)hash_block);
256 v->hash_failed = 1;
257 r = -EIO;
258 goto release_ret_r;
259 } else
260 aux->hash_verified = 1;
261 }
262
263 data += offset;
264
265 memcpy(io_want_digest(v, io), data, v->digest_size);
266
267 dm_bufio_release(buf);
268 return 0;
269
270release_ret_r:
271 dm_bufio_release(buf);
272
273 return r;
274}
275
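/*
 * Editor's note: verity_verify_level() above and verity_verify_io() below
 * feed the salt and the block contents to the digest in an order that
 * depends on the on-disk format version: version 0 hashes block||salt,
 * version 1 and later hash salt||block. The user-space sketch below only
 * assembles the byte stream that would be hashed; the sizes and names are
 * assumptions, and the actual digest computation is left to whatever hash
 * the target was configured with.
 */
#include <stdlib.h>
#include <string.h>

static unsigned char *verity_hash_input(int version,
					const unsigned char *salt, size_t salt_size,
					const unsigned char *block, size_t block_size,
					size_t *out_len)
{
	unsigned char *buf = malloc(salt_size + block_size);

	if (!buf)
		return NULL;

	if (version >= 1) {
		/* salt first, then the (hash or data) block */
		memcpy(buf, salt, salt_size);
		memcpy(buf + salt_size, block, block_size);
	} else {
		/* legacy version 0: block first, salt appended */
		memcpy(buf, block, block_size);
		memcpy(buf + block_size, salt, salt_size);
	}

	*out_len = salt_size + block_size;
	return buf;
}

int main(void)
{
	unsigned char salt[4] = { 1, 2, 3, 4 };
	unsigned char block[8] = { 0 };
	size_t len;
	unsigned char *msg = verity_hash_input(1, salt, sizeof(salt),
					       block, sizeof(block), &len);
	int ok = msg != NULL;

	free(msg);
	return ok ? 0 : 1;
}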
276/*
277 * Verify one "dm_verity_io" structure.
278 */
279static int verity_verify_io(struct dm_verity_io *io)
280{
281 struct dm_verity *v = io->v;
282 unsigned b;
283 int i;
284 unsigned vector = 0, offset = 0;
285
286 for (b = 0; b < io->n_blocks; b++) {
287 struct shash_desc *desc;
288 u8 *result;
289 int r;
290 unsigned todo;
291
292 if (likely(v->levels)) {
293 /*
294 * First, we try to get the requested hash for
295 * the current block. If the hash block itself is
296 * verified, zero is returned. If it isn't, this
297 * function returns 0 and we fall back to whole
298 * chain verification.
299 */
300 int r = verity_verify_level(io, io->block + b, 0, true);
301 if (likely(!r))
302 goto test_block_hash;
303 if (r < 0)
304 return r;
305 }
306
307 memcpy(io_want_digest(v, io), v->root_digest, v->digest_size);
308
309 for (i = v->levels - 1; i >= 0; i--) {
310 int r = verity_verify_level(io, io->block + b, i, false);
311 if (unlikely(r))
312 return r;
313 }
314
315test_block_hash:
316 desc = io_hash_desc(v, io);
317 desc->tfm = v->tfm;
318 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
319 r = crypto_shash_init(desc);
320 if (r < 0) {
321 DMERR("crypto_shash_init failed: %d", r);
322 return r;
323 }
324
325 if (likely(v->version >= 1)) {
326 r = crypto_shash_update(desc, v->salt, v->salt_size);
327 if (r < 0) {
328 DMERR("crypto_shash_update failed: %d", r);
329 return r;
330 }
331 }
332
333 todo = 1 << v->data_dev_block_bits;
334 do {
335 struct bio_vec *bv;
336 u8 *page;
337 unsigned len;
338
339 BUG_ON(vector >= io->io_vec_size);
340 bv = &io->io_vec[vector];
341 page = kmap_atomic(bv->bv_page);
342 len = bv->bv_len - offset;
343 if (likely(len >= todo))
344 len = todo;
345 r = crypto_shash_update(desc,
346 page + bv->bv_offset + offset, len);
347 kunmap_atomic(page);
348 if (r < 0) {
349 DMERR("crypto_shash_update failed: %d", r);
350 return r;
351 }
352 offset += len;
353 if (likely(offset == bv->bv_len)) {
354 offset = 0;
355 vector++;
356 }
357 todo -= len;
358 } while (todo);
359
360 if (!v->version) {
361 r = crypto_shash_update(desc, v->salt, v->salt_size);
362 if (r < 0) {
363 DMERR("crypto_shash_update failed: %d", r);
364 return r;
365 }
366 }
367
368 result = io_real_digest(v, io);
369 r = crypto_shash_final(desc, result);
370 if (r < 0) {
371 DMERR("crypto_shash_final failed: %d", r);
372 return r;
373 }
374 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
375 DMERR_LIMIT("data block %llu is corrupted",
376 (unsigned long long)(io->block + b));
377 v->hash_failed = 1;
378 return -EIO;
379 }
380 }
381 BUG_ON(vector != io->io_vec_size);
382 BUG_ON(offset);
383
384 return 0;
385}
386
387/*
388 * End one "io" structure with a given error.
389 */
390static void verity_finish_io(struct dm_verity_io *io, int error)
391{
392 struct bio *bio = io->bio;
393 struct dm_verity *v = io->v;
394
395 bio->bi_end_io = io->orig_bi_end_io;
396 bio->bi_private = io->orig_bi_private;
397
398 if (io->io_vec != io->io_vec_inline)
399 mempool_free(io->io_vec, v->vec_mempool);
400
401 mempool_free(io, v->io_mempool);
402
403 bio_endio(bio, error);
404}
405
406static void verity_work(struct work_struct *w)
407{
408 struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
409
410 verity_finish_io(io, verity_verify_io(io));
411}
412
413static void verity_end_io(struct bio *bio, int error)
414{
415 struct dm_verity_io *io = bio->bi_private;
416
417 if (error) {
418 verity_finish_io(io, error);
419 return;
420 }
421
422 INIT_WORK(&io->work, verity_work);
423 queue_work(io->v->verify_wq, &io->work);
424}
425
426/*
427 * Prefetch buffers for the specified io.
428 * The root buffer is not prefetched; it is assumed that it will be cached
429 * all the time.
430 */
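/*
 * Worked example (editorial note, with assumed numbers): with 4096-byte data
 * blocks and a prefetch cluster of 32768 bytes, "cluster" below becomes
 * 8 hash blocks.  A request whose level-0 hash blocks span 21..22 is then
 * widened to 16..23: the start is rounded down and the end rounded up to the
 * 8-block boundary, capped at the last block of the hash area.
 */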
431static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
432{
433 int i;
434
435 for (i = v->levels - 2; i >= 0; i--) {
436 sector_t hash_block_start;
437 sector_t hash_block_end;
438 verity_hash_at_level(v, io->block, i, &hash_block_start, NULL);
439 verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL);
440 if (!i) {
441 unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster;
442
443 cluster >>= v->data_dev_block_bits;
444 if (unlikely(!cluster))
445 goto no_prefetch_cluster;
446
447 if (unlikely(cluster & (cluster - 1)))
448 cluster = 1 << (fls(cluster) - 1);
449
450 hash_block_start &= ~(sector_t)(cluster - 1);
451 hash_block_end |= cluster - 1;
452 if (unlikely(hash_block_end >= v->hash_blocks))
453 hash_block_end = v->hash_blocks - 1;
454 }
455no_prefetch_cluster:
456 dm_bufio_prefetch(v->bufio, hash_block_start,
457 hash_block_end - hash_block_start + 1);
458 }
459}
460
461/*
462 * Bio map function. It allocates dm_verity_io structure and bio vector and
463 * fills them. Then it issues prefetches and the I/O.
464 */
465static int verity_map(struct dm_target *ti, struct bio *bio,
466 union map_info *map_context)
467{
468 struct dm_verity *v = ti->private;
469 struct dm_verity_io *io;
470
471 bio->bi_bdev = v->data_dev->bdev;
472 bio->bi_sector = verity_map_sector(v, bio->bi_sector);
473
474 if (((unsigned)bio->bi_sector | bio_sectors(bio)) &
475 ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
476 DMERR_LIMIT("unaligned io");
477 return -EIO;
478 }
479
480 if ((bio->bi_sector + bio_sectors(bio)) >>
481 (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
482 DMERR_LIMIT("io out of range");
483 return -EIO;
484 }
485
486 if (bio_data_dir(bio) == WRITE)
487 return -EIO;
488
489 io = mempool_alloc(v->io_mempool, GFP_NOIO);
490 io->v = v;
491 io->bio = bio;
492 io->orig_bi_end_io = bio->bi_end_io;
493 io->orig_bi_private = bio->bi_private;
494 io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
495 io->n_blocks = bio->bi_size >> v->data_dev_block_bits;
496
497 bio->bi_end_io = verity_end_io;
498 bio->bi_private = io;
499 io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
500 if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
501 io->io_vec = io->io_vec_inline;
502 else
503 io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO);
504 memcpy(io->io_vec, bio_iovec(bio),
505 io->io_vec_size * sizeof(struct bio_vec));
506
507 verity_prefetch_io(v, io);
508
509 generic_make_request(bio);
510
511 return DM_MAPIO_SUBMITTED;
512}
513
514/*
515 * Status: V (valid) or C (corruption found)
516 */
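/*
 * Illustrative output (editorial note; device numbers and digests are made
 * up): "dmsetup status" reports the STATUSTYPE_INFO line, e.g. "V" while no
 * mismatch has been seen, and "dmsetup table" reports the STATUSTYPE_TABLE
 * arguments built below, e.g.
 *
 *   1 253:0 253:1 4096 4096 262144 1 sha256 <64 hex digits> -
 */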
517static int verity_status(struct dm_target *ti, status_type_t type,
518 char *result, unsigned maxlen)
519{
520 struct dm_verity *v = ti->private;
521 unsigned sz = 0;
522 unsigned x;
523
524 switch (type) {
525 case STATUSTYPE_INFO:
526 DMEMIT("%c", v->hash_failed ? 'C' : 'V');
527 break;
528 case STATUSTYPE_TABLE:
529 DMEMIT("%u %s %s %u %u %llu %llu %s ",
530 v->version,
531 v->data_dev->name,
532 v->hash_dev->name,
533 1 << v->data_dev_block_bits,
534 1 << v->hash_dev_block_bits,
535 (unsigned long long)v->data_blocks,
536 (unsigned long long)v->hash_start,
537 v->alg_name
538 );
539 for (x = 0; x < v->digest_size; x++)
540 DMEMIT("%02x", v->root_digest[x]);
541 DMEMIT(" ");
542 if (!v->salt_size)
543 DMEMIT("-");
544 else
545 for (x = 0; x < v->salt_size; x++)
546 DMEMIT("%02x", v->salt[x]);
547 break;
548 }
549
550 return 0;
551}
552
553static int verity_ioctl(struct dm_target *ti, unsigned cmd,
554 unsigned long arg)
555{
556 struct dm_verity *v = ti->private;
557 int r = 0;
558
559 if (v->data_start ||
560 ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT)
561 r = scsi_verify_blk_ioctl(NULL, cmd);
562
563 return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode,
564 cmd, arg);
565}
566
567static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
568 struct bio_vec *biovec, int max_size)
569{
570 struct dm_verity *v = ti->private;
571 struct request_queue *q = bdev_get_queue(v->data_dev->bdev);
572
573 if (!q->merge_bvec_fn)
574 return max_size;
575
576 bvm->bi_bdev = v->data_dev->bdev;
577 bvm->bi_sector = verity_map_sector(v, bvm->bi_sector);
578
579 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
580}
581
582static int verity_iterate_devices(struct dm_target *ti,
583 iterate_devices_callout_fn fn, void *data)
584{
585 struct dm_verity *v = ti->private;
586
587 return fn(ti, v->data_dev, v->data_start, ti->len, data);
588}
589
590static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
591{
592 struct dm_verity *v = ti->private;
593
594 if (limits->logical_block_size < 1 << v->data_dev_block_bits)
595 limits->logical_block_size = 1 << v->data_dev_block_bits;
596
597 if (limits->physical_block_size < 1 << v->data_dev_block_bits)
598 limits->physical_block_size = 1 << v->data_dev_block_bits;
599
600 blk_limits_io_min(limits, limits->logical_block_size);
601}
602
603static void verity_dtr(struct dm_target *ti)
604{
605 struct dm_verity *v = ti->private;
606
607 if (v->verify_wq)
608 destroy_workqueue(v->verify_wq);
609
610 if (v->vec_mempool)
611 mempool_destroy(v->vec_mempool);
612
613 if (v->io_mempool)
614 mempool_destroy(v->io_mempool);
615
616 if (v->bufio)
617 dm_bufio_client_destroy(v->bufio);
618
619 kfree(v->salt);
620 kfree(v->root_digest);
621
622 if (v->tfm)
623 crypto_free_shash(v->tfm);
624
625 kfree(v->alg_name);
626
627 if (v->hash_dev)
628 dm_put_device(ti, v->hash_dev);
629
630 if (v->data_dev)
631 dm_put_device(ti, v->data_dev);
632
633 kfree(v);
634}
635
636/*
637 * Target parameters:
638 * <version> The current format is version 1.
639 * Vsn 0 is compatible with original Chromium OS releases.
640 * <data device>
641 * <hash device>
642 * <data block size>
643 * <hash block size>
644 * <the number of data blocks>
645 * <hash start block>
646 * <algorithm>
647 * <digest>
648 * <salt> Hex string or "-" if no salt.
649 */
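/*
 * Illustrative table line (editorial note; devices, sizes and digest are
 * made up): a version 1 target covering 1 GiB of 4096-byte data blocks,
 * hashed with sha256, with the hash tree starting at hash block 1 (block 0
 * is commonly left for a user-space superblock, if one is used):
 *
 *   0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 \
 *       <64 hex digits of root digest> <hex salt or "-">
 */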
650static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
651{
652 struct dm_verity *v;
653 unsigned num;
654 unsigned long long num_ll;
655 int r;
656 int i;
657 sector_t hash_position;
658 char dummy;
659
660 v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
661 if (!v) {
662 ti->error = "Cannot allocate verity structure";
663 return -ENOMEM;
664 }
665 ti->private = v;
666 v->ti = ti;
667
668 if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) {
669 ti->error = "Device must be readonly";
670 r = -EINVAL;
671 goto bad;
672 }
673
674 if (argc != 10) {
675 ti->error = "Invalid argument count: exactly 10 arguments required";
676 r = -EINVAL;
677 goto bad;
678 }
679
680 if (sscanf(argv[0], "%u%c", &num, &dummy) != 1 ||
681 num > 1) {
682 ti->error = "Invalid version";
683 r = -EINVAL;
684 goto bad;
685 }
686 v->version = num;
687
688 r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev);
689 if (r) {
690 ti->error = "Data device lookup failed";
691 goto bad;
692 }
693
694 r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev);
695 if (r) {
696 ti->error = "Data device lookup failed";
697 goto bad;
698 }
699
700 if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 ||
701 !num || (num & (num - 1)) ||
702 num < bdev_logical_block_size(v->data_dev->bdev) ||
703 num > PAGE_SIZE) {
704 ti->error = "Invalid data device block size";
705 r = -EINVAL;
706 goto bad;
707 }
708 v->data_dev_block_bits = ffs(num) - 1;
709
710 if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
711 !num || (num & (num - 1)) ||
712 num < bdev_logical_block_size(v->hash_dev->bdev) ||
713 num > INT_MAX) {
714 ti->error = "Invalid hash device block size";
715 r = -EINVAL;
716 goto bad;
717 }
718 v->hash_dev_block_bits = ffs(num) - 1;
719
720 if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
721 num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) !=
722 (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) {
723 ti->error = "Invalid data blocks";
724 r = -EINVAL;
725 goto bad;
726 }
727 v->data_blocks = num_ll;
728
729 if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) {
730 ti->error = "Data device is too small";
731 r = -EINVAL;
732 goto bad;
733 }
734
735 if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 ||
736 num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) !=
737 (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) {
738 ti->error = "Invalid hash start";
739 r = -EINVAL;
740 goto bad;
741 }
742 v->hash_start = num_ll;
743
744 v->alg_name = kstrdup(argv[7], GFP_KERNEL);
745 if (!v->alg_name) {
746 ti->error = "Cannot allocate algorithm name";
747 r = -ENOMEM;
748 goto bad;
749 }
750
751 v->tfm = crypto_alloc_shash(v->alg_name, 0, 0);
752 if (IS_ERR(v->tfm)) {
753 ti->error = "Cannot initialize hash function";
754 r = PTR_ERR(v->tfm);
755 v->tfm = NULL;
756 goto bad;
757 }
758 v->digest_size = crypto_shash_digestsize(v->tfm);
759 if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
760 ti->error = "Digest size too big";
761 r = -EINVAL;
762 goto bad;
763 }
764 v->shash_descsize =
765 sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm);
766
767 v->root_digest = kmalloc(v->digest_size, GFP_KERNEL);
768 if (!v->root_digest) {
769 ti->error = "Cannot allocate root digest";
770 r = -ENOMEM;
771 goto bad;
772 }
773 if (strlen(argv[8]) != v->digest_size * 2 ||
774 hex2bin(v->root_digest, argv[8], v->digest_size)) {
775 ti->error = "Invalid root digest";
776 r = -EINVAL;
777 goto bad;
778 }
779
780 if (strcmp(argv[9], "-")) {
781 v->salt_size = strlen(argv[9]) / 2;
782 v->salt = kmalloc(v->salt_size, GFP_KERNEL);
783 if (!v->salt) {
784 ti->error = "Cannot allocate salt";
785 r = -ENOMEM;
786 goto bad;
787 }
788 if (strlen(argv[9]) != v->salt_size * 2 ||
789 hex2bin(v->salt, argv[9], v->salt_size)) {
790 ti->error = "Invalid salt";
791 r = -EINVAL;
792 goto bad;
793 }
794 }
795
796 v->hash_per_block_bits =
797 fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1;
798
799 v->levels = 0;
800 if (v->data_blocks)
801 while (v->hash_per_block_bits * v->levels < 64 &&
802 (unsigned long long)(v->data_blocks - 1) >>
803 (v->hash_per_block_bits * v->levels))
804 v->levels++;
805
806 if (v->levels > DM_VERITY_MAX_LEVELS) {
807 ti->error = "Too many tree levels";
808 r = -E2BIG;
809 goto bad;
810 }
811
812 hash_position = v->hash_start;
813 for (i = v->levels - 1; i >= 0; i--) {
814 sector_t s;
815 v->hash_level_block[i] = hash_position;
816 s = verity_position_at_level(v, v->data_blocks, i);
817 s = (s >> v->hash_per_block_bits) +
818 !!(s & ((1 << v->hash_per_block_bits) - 1));
819 if (hash_position + s < hash_position) {
820 ti->error = "Hash device offset overflow";
821 r = -E2BIG;
822 goto bad;
823 }
824 hash_position += s;
825 }
826 v->hash_blocks = hash_position;
827
828 v->bufio = dm_bufio_client_create(v->hash_dev->bdev,
829 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux),
830 dm_bufio_alloc_callback, NULL);
831 if (IS_ERR(v->bufio)) {
832 ti->error = "Cannot initialize dm-bufio";
833 r = PTR_ERR(v->bufio);
834 v->bufio = NULL;
835 goto bad;
836 }
837
838 if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) {
839 ti->error = "Hash device is too small";
840 r = -E2BIG;
841 goto bad;
842 }
843
844 v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
845 sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2);
846 if (!v->io_mempool) {
847 ti->error = "Cannot allocate io mempool";
848 r = -ENOMEM;
849 goto bad;
850 }
851
852 v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
853 BIO_MAX_PAGES * sizeof(struct bio_vec));
854 if (!v->vec_mempool) {
855 ti->error = "Cannot allocate vector mempool";
856 r = -ENOMEM;
857 goto bad;
858 }
859
860 /* WQ_UNBOUND greatly improves performance when running on ramdisk */
861 v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
862 if (!v->verify_wq) {
863 ti->error = "Cannot allocate workqueue";
864 r = -ENOMEM;
865 goto bad;
866 }
867
868 return 0;
869
870bad:
871 verity_dtr(ti);
872
873 return r;
874}
875
876static struct target_type verity_target = {
877 .name = "verity",
878 .version = {1, 0, 0},
879 .module = THIS_MODULE,
880 .ctr = verity_ctr,
881 .dtr = verity_dtr,
882 .map = verity_map,
883 .status = verity_status,
884 .ioctl = verity_ioctl,
885 .merge = verity_merge,
886 .iterate_devices = verity_iterate_devices,
887 .io_hints = verity_io_hints,
888};
889
890static int __init dm_verity_init(void)
891{
892 int r;
893
894 r = dm_register_target(&verity_target);
895 if (r < 0)
896 DMERR("register failed %d", r);
897
898 return r;
899}
900
901static void __exit dm_verity_exit(void)
902{
903 dm_unregister_target(&verity_target);
904}
905
906module_init(dm_verity_init);
907module_exit(dm_verity_exit);
908
909MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
910MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>");
911MODULE_AUTHOR("Will Drewry <wad@chromium.org>");
912MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking");
913MODULE_LICENSE("GPL");
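The constructor above derives the hash-tree geometry (the number of levels and the starting block of each level) purely from the data block count, the hash block size and the digest size. The following user-space sketch is an editorial illustration of that arithmetic, not part of the patch; it assumes verity_position_at_level() computes block >> (level * hash_per_block_bits), and all names and example values in it are local to the sketch.

#include <stdint.h>
#include <stdio.h>

#define MAX_LEVELS 63

int main(void)
{
	/* Example geometry: 1 GiB of 4096-byte data blocks, sha256 digests. */
	uint64_t data_blocks = 262144;
	unsigned hash_block_size = 4096;
	unsigned digest_size = 32;

	/* Digests per hash block, as a power of two: fls(128) - 1 == 7. */
	unsigned hash_per_block_bits = 0;
	while ((1u << (hash_per_block_bits + 1)) <= hash_block_size / digest_size)
		hash_per_block_bits++;

	/* Number of tree levels, mirroring the loop in verity_ctr(). */
	int levels = 0;
	while (hash_per_block_bits * levels < 64 &&
	       (data_blocks - 1) >> (hash_per_block_bits * levels))
		levels++;

	/*
	 * Blocks needed per level, walking from the level just below the
	 * root down to level 0, mirroring the hash_position loop above.
	 */
	uint64_t hash_position = 0, level_block[MAX_LEVELS];
	for (int i = levels - 1; i >= 0; i--) {
		/* assumed verity_position_at_level(): scale the block index */
		uint64_t s = data_blocks >> (hash_per_block_bits * i);

		/* round up to whole hash blocks */
		s = (s >> hash_per_block_bits) +
		    !!(s & ((1u << hash_per_block_bits) - 1));
		level_block[i] = hash_position;
		hash_position += s;
	}

	printf("levels = %d, total hash blocks = %llu\n",
	       levels, (unsigned long long)hash_position);
	for (int i = levels - 1; i >= 0; i--)
		printf("level %d starts at hash block %llu (relative to hash_start)\n",
		       i, (unsigned long long)level_block[i]);
	return 0;
}

With the example values this prints 3 levels and 2065 hash blocks (2048 at level 0, 16 at level 1, 1 just below the root), which is the kind of layout the "Hash device is too small" check in verity_ctr() guards against.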
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b89c548ec3f8..e24143cc2040 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1016,6 +1016,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
1016 /* 1016 /*
1017 * Store bio_set for cleanup. 1017 * Store bio_set for cleanup.
1018 */ 1018 */
1019 clone->bi_end_io = NULL;
1019 clone->bi_private = md->bs; 1020 clone->bi_private = md->bs;
1020 bio_put(clone); 1021 bio_put(clone);
1021 free_tio(md, tio); 1022 free_tio(md, tio);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index b0fcc7d02adb..fa211d80fc0a 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -198,6 +198,7 @@ out:
198static int linear_run (struct mddev *mddev) 198static int linear_run (struct mddev *mddev)
199{ 199{
200 struct linear_conf *conf; 200 struct linear_conf *conf;
201 int ret;
201 202
202 if (md_check_no_bitmap(mddev)) 203 if (md_check_no_bitmap(mddev))
203 return -EINVAL; 204 return -EINVAL;
@@ -211,7 +212,13 @@ static int linear_run (struct mddev *mddev)
211 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); 212 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
212 mddev->queue->backing_dev_info.congested_fn = linear_congested; 213 mddev->queue->backing_dev_info.congested_fn = linear_congested;
213 mddev->queue->backing_dev_info.congested_data = mddev; 214 mddev->queue->backing_dev_info.congested_data = mddev;
214 return md_integrity_register(mddev); 215
216 ret = md_integrity_register(mddev);
217 if (ret) {
218 kfree(conf);
219 mddev->private = NULL;
220 }
221 return ret;
215} 222}
216 223
217static int linear_add(struct mddev *mddev, struct md_rdev *rdev) 224static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index d279c768f8f1..5709bfeab1e8 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -108,12 +108,9 @@ static inline void *value_base(struct node *n)
108 return &n->keys[le32_to_cpu(n->header.max_entries)]; 108 return &n->keys[le32_to_cpu(n->header.max_entries)];
109} 109}
110 110
111/* 111static inline void *value_ptr(struct node *n, uint32_t index)
112 * FIXME: Now that value size is stored in node we don't need the third parm.
113 */
114static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size)
115{ 112{
116 BUG_ON(value_size != le32_to_cpu(n->header.value_size)); 113 uint32_t value_size = le32_to_cpu(n->header.value_size);
117 return value_base(n) + (value_size * index); 114 return value_base(n) + (value_size * index);
118} 115}
119 116
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 023fbc2d389e..aa71e2359a07 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -61,20 +61,20 @@ static void node_shift(struct node *n, int shift)
61 if (shift < 0) { 61 if (shift < 0) {
62 shift = -shift; 62 shift = -shift;
63 BUG_ON(shift > nr_entries); 63 BUG_ON(shift > nr_entries);
64 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size)); 64 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
65 memmove(key_ptr(n, 0), 65 memmove(key_ptr(n, 0),
66 key_ptr(n, shift), 66 key_ptr(n, shift),
67 (nr_entries - shift) * sizeof(__le64)); 67 (nr_entries - shift) * sizeof(__le64));
68 memmove(value_ptr(n, 0, value_size), 68 memmove(value_ptr(n, 0),
69 value_ptr(n, shift, value_size), 69 value_ptr(n, shift),
70 (nr_entries - shift) * value_size); 70 (nr_entries - shift) * value_size);
71 } else { 71 } else {
72 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); 72 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
73 memmove(key_ptr(n, shift), 73 memmove(key_ptr(n, shift),
74 key_ptr(n, 0), 74 key_ptr(n, 0),
75 nr_entries * sizeof(__le64)); 75 nr_entries * sizeof(__le64));
76 memmove(value_ptr(n, shift, value_size), 76 memmove(value_ptr(n, shift),
77 value_ptr(n, 0, value_size), 77 value_ptr(n, 0),
78 nr_entries * value_size); 78 nr_entries * value_size);
79 } 79 }
80} 80}
@@ -91,16 +91,16 @@ static void node_copy(struct node *left, struct node *right, int shift)
91 memcpy(key_ptr(left, nr_left), 91 memcpy(key_ptr(left, nr_left),
92 key_ptr(right, 0), 92 key_ptr(right, 0),
93 shift * sizeof(__le64)); 93 shift * sizeof(__le64));
94 memcpy(value_ptr(left, nr_left, value_size), 94 memcpy(value_ptr(left, nr_left),
95 value_ptr(right, 0, value_size), 95 value_ptr(right, 0),
96 shift * value_size); 96 shift * value_size);
97 } else { 97 } else {
98 BUG_ON(shift > le32_to_cpu(right->header.max_entries)); 98 BUG_ON(shift > le32_to_cpu(right->header.max_entries));
99 memcpy(key_ptr(right, 0), 99 memcpy(key_ptr(right, 0),
100 key_ptr(left, nr_left - shift), 100 key_ptr(left, nr_left - shift),
101 shift * sizeof(__le64)); 101 shift * sizeof(__le64));
102 memcpy(value_ptr(right, 0, value_size), 102 memcpy(value_ptr(right, 0),
103 value_ptr(left, nr_left - shift, value_size), 103 value_ptr(left, nr_left - shift),
104 shift * value_size); 104 shift * value_size);
105 } 105 }
106} 106}
@@ -120,26 +120,17 @@ static void delete_at(struct node *n, unsigned index)
120 key_ptr(n, index + 1), 120 key_ptr(n, index + 1),
121 nr_to_copy * sizeof(__le64)); 121 nr_to_copy * sizeof(__le64));
122 122
123 memmove(value_ptr(n, index, value_size), 123 memmove(value_ptr(n, index),
124 value_ptr(n, index + 1, value_size), 124 value_ptr(n, index + 1),
125 nr_to_copy * value_size); 125 nr_to_copy * value_size);
126 } 126 }
127 127
128 n->header.nr_entries = cpu_to_le32(nr_entries - 1); 128 n->header.nr_entries = cpu_to_le32(nr_entries - 1);
129} 129}
130 130
131static unsigned del_threshold(struct node *n)
132{
133 return le32_to_cpu(n->header.max_entries) / 3;
134}
135
136static unsigned merge_threshold(struct node *n) 131static unsigned merge_threshold(struct node *n)
137{ 132{
138 /* 133 return le32_to_cpu(n->header.max_entries) / 3;
139 * The extra one is because we know we're potentially going to
140 * delete an entry.
141 */
142 return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
143} 134}
144 135
145struct child { 136struct child {
@@ -175,7 +166,7 @@ static int init_child(struct dm_btree_info *info, struct node *parent,
175 if (inc) 166 if (inc)
176 inc_children(info->tm, result->n, &le64_type); 167 inc_children(info->tm, result->n, &le64_type);
177 168
178 *((__le64 *) value_ptr(parent, index, sizeof(__le64))) = 169 *((__le64 *) value_ptr(parent, index)) =
179 cpu_to_le64(dm_block_location(result->block)); 170 cpu_to_le64(dm_block_location(result->block));
180 171
181 return 0; 172 return 0;
@@ -188,6 +179,15 @@ static int exit_child(struct dm_btree_info *info, struct child *c)
188 179
189static void shift(struct node *left, struct node *right, int count) 180static void shift(struct node *left, struct node *right, int count)
190{ 181{
182 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
183 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
184 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
185 uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
186
187 BUG_ON(max_entries != r_max_entries);
188 BUG_ON(nr_left - count > max_entries);
189 BUG_ON(nr_right + count > max_entries);
190
191 if (!count) 191 if (!count)
192 return; 192 return;
193 193
@@ -199,13 +199,8 @@ static void shift(struct node *left, struct node *right, int count)
199 node_shift(right, count); 199 node_shift(right, count);
200 } 200 }
201 201
202 left->header.nr_entries = 202 left->header.nr_entries = cpu_to_le32(nr_left - count);
203 cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count); 203 right->header.nr_entries = cpu_to_le32(nr_right + count);
204 BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));
205
206 right->header.nr_entries =
207 cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
208 BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
209} 204}
210 205
211static void __rebalance2(struct dm_btree_info *info, struct node *parent, 206static void __rebalance2(struct dm_btree_info *info, struct node *parent,
@@ -215,8 +210,9 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
215 struct node *right = r->n; 210 struct node *right = r->n;
216 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 211 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
217 uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 212 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
213 unsigned threshold = 2 * merge_threshold(left) + 1;
218 214
219 if (nr_left + nr_right <= merge_threshold(left)) { 215 if (nr_left + nr_right < threshold) {
220 /* 216 /*
221 * Merge 217 * Merge
222 */ 218 */
@@ -234,9 +230,6 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
234 * Rebalance. 230 * Rebalance.
235 */ 231 */
236 unsigned target_left = (nr_left + nr_right) / 2; 232 unsigned target_left = (nr_left + nr_right) / 2;
237 unsigned shift_ = nr_left - target_left;
238 BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
239 BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
240 shift(left, right, nr_left - target_left); 233 shift(left, right, nr_left - target_left);
241 *key_ptr(parent, r->index) = right->keys[0]; 234 *key_ptr(parent, r->index) = right->keys[0];
242 } 235 }
@@ -272,6 +265,84 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
272 return exit_child(info, &right); 265 return exit_child(info, &right);
273} 266}
274 267
268/*
269 * We dump as many entries from center as possible into left, then the rest
270 * in right, then rebalance2. This wastes some cpu, but I want something
271 * simple atm.
272 */
273static void delete_center_node(struct dm_btree_info *info, struct node *parent,
274 struct child *l, struct child *c, struct child *r,
275 struct node *left, struct node *center, struct node *right,
276 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
277{
278 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
279 unsigned shift = min(max_entries - nr_left, nr_center);
280
281 BUG_ON(nr_left + shift > max_entries);
282 node_copy(left, center, -shift);
283 left->header.nr_entries = cpu_to_le32(nr_left + shift);
284
285 if (shift != nr_center) {
286 shift = nr_center - shift;
287 BUG_ON((nr_right + shift) > max_entries);
288 node_shift(right, shift);
289 node_copy(center, right, shift);
290 right->header.nr_entries = cpu_to_le32(nr_right + shift);
291 }
292 *key_ptr(parent, r->index) = right->keys[0];
293
294 delete_at(parent, c->index);
295 r->index--;
296
297 dm_tm_dec(info->tm, dm_block_location(c->block));
298 __rebalance2(info, parent, l, r);
299}
300
301/*
302 * Redistributes entries among 3 sibling nodes.
303 */
304static void redistribute3(struct dm_btree_info *info, struct node *parent,
305 struct child *l, struct child *c, struct child *r,
306 struct node *left, struct node *center, struct node *right,
307 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
308{
309 int s;
310 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
311 unsigned target = (nr_left + nr_center + nr_right) / 3;
312 BUG_ON(target > max_entries);
313
314 if (nr_left < nr_right) {
315 s = nr_left - target;
316
317 if (s < 0 && nr_center < -s) {
318 /* not enough in central node */
319 shift(left, center, nr_center);
320 s = nr_center - target;
321 shift(left, right, s);
322 nr_right += s;
323 } else
324 shift(left, center, s);
325
326 shift(center, right, target - nr_right);
327
328 } else {
329 s = target - nr_right;
330 if (s > 0 && nr_center < s) {
331 /* not enough in central node */
332 shift(center, right, nr_center);
333 s = target - nr_center;
334 shift(left, right, s);
335 nr_left -= s;
336 } else
337 shift(center, right, s);
338
339 shift(left, center, nr_left - target);
340 }
341
342 *key_ptr(parent, c->index) = center->keys[0];
343 *key_ptr(parent, r->index) = right->keys[0];
344}
345
275static void __rebalance3(struct dm_btree_info *info, struct node *parent, 346static void __rebalance3(struct dm_btree_info *info, struct node *parent,
276 struct child *l, struct child *c, struct child *r) 347 struct child *l, struct child *c, struct child *r)
277{ 348{
@@ -282,62 +353,18 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent,
282 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 353 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
283 uint32_t nr_center = le32_to_cpu(center->header.nr_entries); 354 uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
284 uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 355 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
285 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
286 356
287 unsigned target; 357 unsigned threshold = merge_threshold(left) * 4 + 1;
288 358
289 BUG_ON(left->header.max_entries != center->header.max_entries); 359 BUG_ON(left->header.max_entries != center->header.max_entries);
290 BUG_ON(center->header.max_entries != right->header.max_entries); 360 BUG_ON(center->header.max_entries != right->header.max_entries);
291 361
292 if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) { 362 if ((nr_left + nr_center + nr_right) < threshold)
293 /* 363 delete_center_node(info, parent, l, c, r, left, center, right,
294 * Delete center node: 364 nr_left, nr_center, nr_right);
295 * 365 else
296 * We dump as many entries from center as possible into 366 redistribute3(info, parent, l, c, r, left, center, right,
297 * left, then the rest in right, then rebalance2. This 367 nr_left, nr_center, nr_right);
298 * wastes some cpu, but I want something simple atm.
299 */
300 unsigned shift = min(max_entries - nr_left, nr_center);
301
302 BUG_ON(nr_left + shift > max_entries);
303 node_copy(left, center, -shift);
304 left->header.nr_entries = cpu_to_le32(nr_left + shift);
305
306 if (shift != nr_center) {
307 shift = nr_center - shift;
308 BUG_ON((nr_right + shift) >= max_entries);
309 node_shift(right, shift);
310 node_copy(center, right, shift);
311 right->header.nr_entries = cpu_to_le32(nr_right + shift);
312 }
313 *key_ptr(parent, r->index) = right->keys[0];
314
315 delete_at(parent, c->index);
316 r->index--;
317
318 dm_tm_dec(info->tm, dm_block_location(c->block));
319 __rebalance2(info, parent, l, r);
320
321 return;
322 }
323
324 /*
325 * Rebalance
326 */
327 target = (nr_left + nr_center + nr_right) / 3;
328 BUG_ON(target > max_entries);
329
330 /*
331 * Adjust the left node
332 */
333 shift(left, center, nr_left - target);
334
335 /*
336 * Adjust the right node
337 */
338 shift(center, right, target - nr_right);
339 *key_ptr(parent, c->index) = center->keys[0];
340 *key_ptr(parent, r->index) = right->keys[0];
341} 368}
342 369
343static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, 370static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
@@ -441,9 +468,6 @@ static int rebalance_children(struct shadow_spine *s,
441 if (r) 468 if (r)
442 return r; 469 return r;
443 470
444 if (child_entries > del_threshold(n))
445 return 0;
446
447 has_left_sibling = i > 0; 471 has_left_sibling = i > 0;
448 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); 472 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
449 473
@@ -496,7 +520,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
496 */ 520 */
497 if (shadow_has_parent(s)) { 521 if (shadow_has_parent(s)) {
498 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); 522 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
499 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)), 523 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
500 &location, sizeof(__le64)); 524 &location, sizeof(__le64));
501 } 525 }
502 526
@@ -553,7 +577,7 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
553 577
554 if (info->value_type.dec) 578 if (info->value_type.dec)
555 info->value_type.dec(info->value_type.context, 579 info->value_type.dec(info->value_type.context,
556 value_ptr(n, index, info->value_type.size)); 580 value_ptr(n, index));
557 581
558 delete_at(n, index); 582 delete_at(n, index);
559 } 583 }
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index bd1e7ffbe26c..d12b2cc51f1a 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -74,8 +74,7 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n,
74 dm_tm_inc(tm, value64(n, i)); 74 dm_tm_inc(tm, value64(n, i));
75 else if (vt->inc) 75 else if (vt->inc)
76 for (i = 0; i < nr_entries; i++) 76 for (i = 0; i < nr_entries; i++)
77 vt->inc(vt->context, 77 vt->inc(vt->context, value_ptr(n, i));
78 value_ptr(n, i, vt->size));
79} 78}
80 79
81static int insert_at(size_t value_size, struct node *node, unsigned index, 80static int insert_at(size_t value_size, struct node *node, unsigned index,
@@ -281,7 +280,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
281 280
282 for (i = 0; i < f->nr_children; i++) 281 for (i = 0; i < f->nr_children; i++)
283 info->value_type.dec(info->value_type.context, 282 info->value_type.dec(info->value_type.context,
284 value_ptr(f->n, i, info->value_type.size)); 283 value_ptr(f->n, i));
285 } 284 }
286 f->current_child = f->nr_children; 285 f->current_child = f->nr_children;
287 } 286 }
@@ -320,7 +319,7 @@ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
320 } while (!(flags & LEAF_NODE)); 319 } while (!(flags & LEAF_NODE));
321 320
322 *result_key = le64_to_cpu(ro_node(s)->keys[i]); 321 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
323 memcpy(v, value_ptr(ro_node(s), i, value_size), value_size); 322 memcpy(v, value_ptr(ro_node(s), i), value_size);
324 323
325 return 0; 324 return 0;
326} 325}
@@ -432,7 +431,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
432 431
433 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? 432 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
434 sizeof(uint64_t) : s->info->value_type.size; 433 sizeof(uint64_t) : s->info->value_type.size;
435 memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size), 434 memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
436 size * nr_right); 435 size * nr_right);
437 436
438 /* 437 /*
@@ -443,7 +442,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
443 pn = dm_block_data(parent); 442 pn = dm_block_data(parent);
444 location = cpu_to_le64(dm_block_location(left)); 443 location = cpu_to_le64(dm_block_location(left));
445 __dm_bless_for_disk(&location); 444 __dm_bless_for_disk(&location);
446 memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)), 445 memcpy_disk(value_ptr(pn, parent_index),
447 &location, sizeof(__le64)); 446 &location, sizeof(__le64));
448 447
449 location = cpu_to_le64(dm_block_location(right)); 448 location = cpu_to_le64(dm_block_location(right));
@@ -529,8 +528,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
529 528
530 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? 529 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
531 sizeof(__le64) : s->info->value_type.size; 530 sizeof(__le64) : s->info->value_type.size;
532 memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size); 531 memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
533 memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size), 532 memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
534 nr_right * size); 533 nr_right * size);
535 534
536 /* new_parent should just point to l and r now */ 535 /* new_parent should just point to l and r now */
@@ -545,12 +544,12 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
545 val = cpu_to_le64(dm_block_location(left)); 544 val = cpu_to_le64(dm_block_location(left));
546 __dm_bless_for_disk(&val); 545 __dm_bless_for_disk(&val);
547 pn->keys[0] = ln->keys[0]; 546 pn->keys[0] = ln->keys[0];
548 memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64)); 547 memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
549 548
550 val = cpu_to_le64(dm_block_location(right)); 549 val = cpu_to_le64(dm_block_location(right));
551 __dm_bless_for_disk(&val); 550 __dm_bless_for_disk(&val);
552 pn->keys[1] = rn->keys[0]; 551 pn->keys[1] = rn->keys[0];
553 memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64)); 552 memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
554 553
555 /* 554 /*
556 * rejig the spine. This is ugly, since it knows too 555 * rejig the spine. This is ugly, since it knows too
@@ -595,7 +594,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
595 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); 594 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
596 595
597 __dm_bless_for_disk(&location); 596 __dm_bless_for_disk(&location);
598 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)), 597 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
599 &location, sizeof(__le64)); 598 &location, sizeof(__le64));
600 } 599 }
601 600
@@ -710,12 +709,12 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
710 (!info->value_type.equal || 709 (!info->value_type.equal ||
711 !info->value_type.equal( 710 !info->value_type.equal(
712 info->value_type.context, 711 info->value_type.context,
713 value_ptr(n, index, info->value_type.size), 712 value_ptr(n, index),
714 value))) { 713 value))) {
715 info->value_type.dec(info->value_type.context, 714 info->value_type.dec(info->value_type.context,
716 value_ptr(n, index, info->value_type.size)); 715 value_ptr(n, index));
717 } 716 }
718 memcpy_disk(value_ptr(n, index, info->value_type.size), 717 memcpy_disk(value_ptr(n, index),
719 value, info->value_type.size); 718 value, info->value_type.size);
720 } 719 }
721 720
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index df2494c06cdc..ff3beed6ad2d 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -405,8 +405,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
405 if (r < 0) 405 if (r < 0)
406 return r; 406 return r;
407 407
408#if 0
409 /* FIXME: dm_btree_remove doesn't handle this yet */
410 if (old > 2) { 408 if (old > 2) {
411 r = dm_btree_remove(&ll->ref_count_info, 409 r = dm_btree_remove(&ll->ref_count_info,
412 ll->ref_count_root, 410 ll->ref_count_root,
@@ -414,7 +412,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
414 if (r) 412 if (r)
415 return r; 413 return r;
416 } 414 }
417#endif
418 415
419 } else { 416 } else {
420 __le32 le_rc = cpu_to_le32(ref_count); 417 __le32 le_rc = cpu_to_le32(ref_count);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 6f31f5596e01..de63a1fc3737 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -407,6 +407,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
407 return array_sectors; 407 return array_sectors;
408} 408}
409 409
410static int raid0_stop(struct mddev *mddev);
411
410static int raid0_run(struct mddev *mddev) 412static int raid0_run(struct mddev *mddev)
411{ 413{
412 struct r0conf *conf; 414 struct r0conf *conf;
@@ -454,7 +456,12 @@ static int raid0_run(struct mddev *mddev)
454 456
455 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); 457 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
456 dump_zones(mddev); 458 dump_zones(mddev);
457 return md_integrity_register(mddev); 459
460 ret = md_integrity_register(mddev);
461 if (ret)
462 raid0_stop(mddev);
463
464 return ret;
458} 465}
459 466
460static int raid0_stop(struct mddev *mddev) 467static int raid0_stop(struct mddev *mddev)
@@ -625,6 +632,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
625static void *raid0_takeover_raid1(struct mddev *mddev) 632static void *raid0_takeover_raid1(struct mddev *mddev)
626{ 633{
627 struct r0conf *priv_conf; 634 struct r0conf *priv_conf;
635 int chunksect;
628 636
629 /* Check layout: 637 /* Check layout:
630 * - (N - 1) mirror drives must be already faulty 638 * - (N - 1) mirror drives must be already faulty
@@ -635,10 +643,25 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
635 return ERR_PTR(-EINVAL); 643 return ERR_PTR(-EINVAL);
636 } 644 }
637 645
646 /*
647 * a raid1 doesn't have the notion of chunk size, so
648 * figure out the largest suitable size we can use.
649 */
650 chunksect = 64 * 2; /* 64K by default */
651
652 /* The array must be an exact multiple of chunksize */
653 while (chunksect && (mddev->array_sectors & (chunksect - 1)))
654 chunksect >>= 1;
655
656 if ((chunksect << 9) < PAGE_SIZE)
657 /* array size does not allow a suitable chunk size */
658 return ERR_PTR(-EINVAL);
659
638 /* Set new parameters */ 660 /* Set new parameters */
639 mddev->new_level = 0; 661 mddev->new_level = 0;
640 mddev->new_layout = 0; 662 mddev->new_layout = 0;
641 mddev->new_chunk_sectors = 128; /* by default set chunk size to 64k */ 663 mddev->new_chunk_sectors = chunksect;
664 mddev->chunk_sectors = chunksect;
642 mddev->delta_disks = 1 - mddev->raid_disks; 665 mddev->delta_disks = 1 - mddev->raid_disks;
643 mddev->raid_disks = 1; 666 mddev->raid_disks = 1;
644 /* make sure it will be not marked as dirty */ 667 /* make sure it will be not marked as dirty */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4a40a200d769..d35e4c991e38 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1738,7 +1738,7 @@ static int process_checks(struct r1bio *r1_bio)
1738 s = sbio->bi_io_vec[j].bv_page; 1738 s = sbio->bi_io_vec[j].bv_page;
1739 if (memcmp(page_address(p), 1739 if (memcmp(page_address(p),
1740 page_address(s), 1740 page_address(s),
1741 PAGE_SIZE)) 1741 sbio->bi_io_vec[j].bv_len))
1742 break; 1742 break;
1743 } 1743 }
1744 } else 1744 } else
@@ -2386,8 +2386,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2386 int ok = 1; 2386 int ok = 1;
2387 for (i = 0 ; i < conf->raid_disks * 2 ; i++) 2387 for (i = 0 ; i < conf->raid_disks * 2 ; i++)
2388 if (r1_bio->bios[i]->bi_end_io == end_sync_write) { 2388 if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
2389 struct md_rdev *rdev = 2389 struct md_rdev *rdev = conf->mirrors[i].rdev;
2390 rcu_dereference(conf->mirrors[i].rdev);
2391 ok = rdev_set_badblocks(rdev, sector_nr, 2390 ok = rdev_set_badblocks(rdev, sector_nr,
2392 min_bad, 0 2391 min_bad, 0
2393 ) && ok; 2392 ) && ok;
@@ -2636,11 +2635,13 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2636 return ERR_PTR(err); 2635 return ERR_PTR(err);
2637} 2636}
2638 2637
2638static int stop(struct mddev *mddev);
2639static int run(struct mddev *mddev) 2639static int run(struct mddev *mddev)
2640{ 2640{
2641 struct r1conf *conf; 2641 struct r1conf *conf;
2642 int i; 2642 int i;
2643 struct md_rdev *rdev; 2643 struct md_rdev *rdev;
2644 int ret;
2644 2645
2645 if (mddev->level != 1) { 2646 if (mddev->level != 1) {
2646 printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", 2647 printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
@@ -2705,7 +2706,11 @@ static int run(struct mddev *mddev)
2705 mddev->queue->backing_dev_info.congested_data = mddev; 2706 mddev->queue->backing_dev_info.congested_data = mddev;
2706 blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); 2707 blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
2707 } 2708 }
2708 return md_integrity_register(mddev); 2709
2710 ret = md_integrity_register(mddev);
2711 if (ret)
2712 stop(mddev);
2713 return ret;
2709} 2714}
2710 2715
2711static int stop(struct mddev *mddev) 2716static int stop(struct mddev *mddev)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3540316886f2..fff782189e48 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1821,7 +1821,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1821 for (j = 0; j < vcnt; j++) 1821 for (j = 0; j < vcnt; j++)
1822 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), 1822 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1823 page_address(tbio->bi_io_vec[j].bv_page), 1823 page_address(tbio->bi_io_vec[j].bv_page),
1824 PAGE_SIZE)) 1824 fbio->bi_io_vec[j].bv_len))
1825 break; 1825 break;
1826 if (j == vcnt) 1826 if (j == vcnt)
1827 continue; 1827 continue;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 23ac880bba9a..f351422938e0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2471,39 +2471,41 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2471 int abort = 0; 2471 int abort = 0;
2472 int i; 2472 int i;
2473 2473
2474 md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
2475 clear_bit(STRIPE_SYNCING, &sh->state); 2474 clear_bit(STRIPE_SYNCING, &sh->state);
2476 s->syncing = 0; 2475 s->syncing = 0;
2477 s->replacing = 0; 2476 s->replacing = 0;
2478 /* There is nothing more to do for sync/check/repair. 2477 /* There is nothing more to do for sync/check/repair.
2478 * Don't even need to abort as that is handled elsewhere
2479 * if needed, and not always wanted e.g. if there is a known
2480 * bad block here.
2479 * For recover/replace we need to record a bad block on all 2481 * For recover/replace we need to record a bad block on all
2480 * non-sync devices, or abort the recovery 2482 * non-sync devices, or abort the recovery
2481 */ 2483 */
2482 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) 2484 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
2483 return; 2485 /* During recovery devices cannot be removed, so
2484 /* During recovery devices cannot be removed, so locking and 2486 * locking and refcounting of rdevs is not needed
2485 * refcounting of rdevs is not needed 2487 */
2486 */ 2488 for (i = 0; i < conf->raid_disks; i++) {
2487 for (i = 0; i < conf->raid_disks; i++) { 2489 struct md_rdev *rdev = conf->disks[i].rdev;
2488 struct md_rdev *rdev = conf->disks[i].rdev; 2490 if (rdev
2489 if (rdev 2491 && !test_bit(Faulty, &rdev->flags)
2490 && !test_bit(Faulty, &rdev->flags) 2492 && !test_bit(In_sync, &rdev->flags)
2491 && !test_bit(In_sync, &rdev->flags) 2493 && !rdev_set_badblocks(rdev, sh->sector,
2492 && !rdev_set_badblocks(rdev, sh->sector, 2494 STRIPE_SECTORS, 0))
2493 STRIPE_SECTORS, 0)) 2495 abort = 1;
2494 abort = 1; 2496 rdev = conf->disks[i].replacement;
2495 rdev = conf->disks[i].replacement; 2497 if (rdev
2496 if (rdev 2498 && !test_bit(Faulty, &rdev->flags)
2497 && !test_bit(Faulty, &rdev->flags) 2499 && !test_bit(In_sync, &rdev->flags)
2498 && !test_bit(In_sync, &rdev->flags) 2500 && !rdev_set_badblocks(rdev, sh->sector,
2499 && !rdev_set_badblocks(rdev, sh->sector, 2501 STRIPE_SECTORS, 0))
2500 STRIPE_SECTORS, 0)) 2502 abort = 1;
2501 abort = 1; 2503 }
2502 } 2504 if (abort)
2503 if (abort) { 2505 conf->recovery_disabled =
2504 conf->recovery_disabled = conf->mddev->recovery_disabled; 2506 conf->mddev->recovery_disabled;
2505 set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
2506 } 2507 }
2508 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
2507} 2509}
2508 2510
2509static int want_replace(struct stripe_head *sh, int disk_idx) 2511static int want_replace(struct stripe_head *sh, int disk_idx)
@@ -3203,7 +3205,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3203 /* Not in-sync */; 3205 /* Not in-sync */;
3204 else if (is_bad) { 3206 else if (is_bad) {
3205 /* also not in-sync */ 3207 /* also not in-sync */
3206 if (!test_bit(WriteErrorSeen, &rdev->flags)) { 3208 if (!test_bit(WriteErrorSeen, &rdev->flags) &&
3209 test_bit(R5_UPTODATE, &dev->flags)) {
3207 /* treat as in-sync, but with a read error 3210 /* treat as in-sync, but with a read error
3208 * which we can now try to correct 3211 * which we can now try to correct
3209 */ 3212 */
@@ -3276,12 +3279,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3276 /* If there is a failed device being replaced, 3279 /* If there is a failed device being replaced,
3277 * we must be recovering. 3280 * we must be recovering.
3278 * else if we are after recovery_cp, we must be syncing 3281 * else if we are after recovery_cp, we must be syncing
3282 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
3279 * else we can only be replacing 3283 * else we can only be replacing
3280 * sync and recovery both need to read all devices, and so 3284 * sync and recovery both need to read all devices, and so
3281 * use the same flag. 3285 * use the same flag.
3282 */ 3286 */
3283 if (do_recovery || 3287 if (do_recovery ||
3284 sh->sector >= conf->mddev->recovery_cp) 3288 sh->sector >= conf->mddev->recovery_cp ||
3289 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
3285 s->syncing = 1; 3290 s->syncing = 1;
3286 else 3291 else
3287 s->replacing = 1; 3292 s->replacing = 1;