author     Linus Torvalds <torvalds@linux-foundation.org>    2009-12-15 12:12:01 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2009-12-15 12:12:01 -0500
commit     53365383c4667aba55385cd1858582c19a7a8a36 (patch)
tree       b290d003534b3947834762c2fb492d9d0beb985f
parent     51b736b85155a56543fda8aeca5f8592795d7983 (diff)
parent     d2fdb776e08d4231d7e86a879cc663a93913c202 (diff)

Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm
* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (80 commits)
  dm snapshot: use merge origin if snapshot invalid
  dm snapshot: report merge failure in status
  dm snapshot: merge consecutive chunks together
  dm snapshot: trigger exceptions in remaining snapshots during merge
  dm snapshot: delay merging a chunk until writes to it complete
  dm snapshot: queue writes to chunks being merged
  dm snapshot: add merging
  dm snapshot: permit only one merge at once
  dm snapshot: support barriers in snapshot merge target
  dm snapshot: avoid allocating exceptions in merge
  dm snapshot: rework writing to origin
  dm snapshot: add merge target
  dm exception store: add merge specific methods
  dm snapshot: create function for chunk_is_tracked wait
  dm snapshot: make bio optional in __origin_write
  dm mpath: reject messages when device is suspended
  dm: export suspended state to targets
  dm: rename dm_suspended to dm_suspended_md
  dm: swap target postsuspend call and setting suspended flag
  dm crypt: add plain64 iv
  ...
 Documentation/device-mapper/snapshot.txt |   60
 drivers/md/dm-crypt.c                    |  207
 drivers/md/dm-exception-store.c          |   33
 drivers/md/dm-exception-store.h          |   62
 drivers/md/dm-io.c                       |  120
 drivers/md/dm-ioctl.c                    |  123
 drivers/md/dm-kcopyd.c                   |    5
 drivers/md/dm-log.c                      |   77
 drivers/md/dm-mpath.c                    |   95
 drivers/md/dm-raid1.c                    |  219
 drivers/md/dm-region-hash.c              |   31
 drivers/md/dm-snap-persistent.c          |  195
 drivers/md/dm-snap-transient.c           |   24
 drivers/md/dm-snap.c                     | 1279
 drivers/md/dm-sysfs.c                    |   10
 drivers/md/dm-table.c                    |    3
 drivers/md/dm-uevent.c                   |    9
 drivers/md/dm.c                          |  643
 drivers/md/dm.h                          |   13
 include/linux/device-mapper.h            |    8
 include/linux/dm-dirty-log.h             |    6
 include/linux/dm-ioctl.h                 |   13
 include/linux/dm-region-hash.h           |    3
 23 files changed, 2349 insertions(+), 889 deletions(-)
diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt
index a5009c8300f3..e3a77b215135 100644
--- a/Documentation/device-mapper/snapshot.txt
+++ b/Documentation/device-mapper/snapshot.txt
@@ -8,13 +8,19 @@ the block device which are also writable without interfering with the
 original content;
 *) To create device "forks", i.e. multiple different versions of the
 same data stream.
+*) To merge a snapshot of a block device back into the snapshot's origin
+device.
 
+In the first two cases, dm copies only the chunks of data that get
+changed and uses a separate copy-on-write (COW) block device for
+storage.
 
-In both cases, dm copies only the chunks of data that get changed and
-uses a separate copy-on-write (COW) block device for storage.
+For snapshot merge the contents of the COW storage are merged back into
+the origin device.
 
 
-There are two dm targets available: snapshot and snapshot-origin.
+There are three dm targets available:
+snapshot, snapshot-origin, and snapshot-merge.
 
 *) snapshot-origin <origin>
 
@@ -40,8 +46,25 @@ The difference is that for transient snapshots less metadata must be
 saved on disk - they can be kept in memory by the kernel.
 
 
-How this is used by LVM2
-========================
+* snapshot-merge <origin> <COW device> <persistent> <chunksize>
+
+takes the same table arguments as the snapshot target except it only
+works with persistent snapshots. This target assumes the role of the
+"snapshot-origin" target and must not be loaded if the "snapshot-origin"
+is still present for <origin>.
+
+Creates a merging snapshot that takes control of the changed chunks
+stored in the <COW device> of an existing snapshot, through a handover
+procedure, and merges these chunks back into the <origin>. Once merging
+has started (in the background) the <origin> may be opened and the merge
+will continue while I/O is flowing to it. Changes to the <origin> are
+deferred until the merging snapshot's corresponding chunk(s) have been
+merged. Once merging has started the snapshot device, associated with
+the "snapshot" target, will return -EIO when accessed.
+
+
+How snapshot is used by LVM2
+============================
 When you create the first LVM2 snapshot of a volume, four dm devices are used:
 
 1) a device containing the original mapping table of the source volume;
@@ -72,3 +95,30 @@ brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow
 brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap
 brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base
 
+
+How snapshot-merge is used by LVM2
+==================================
+A merging snapshot assumes the role of the "snapshot-origin" while
+merging. As such the "snapshot-origin" is replaced with
+"snapshot-merge". The "-real" device is not changed and the "-cow"
+device is renamed to <origin name>-cow to aid LVM2's cleanup of the
+merging snapshot after it completes. The "snapshot" that hands over its
+COW device to the "snapshot-merge" is deactivated (unless using lvchange
+--refresh); but if it is left active it will simply return I/O errors.
+
+A snapshot will merge into its origin with the following command:
+
+lvconvert --merge volumeGroup/snap
+
+we'll now have this situation:
+
+# dmsetup table|grep volumeGroup
+
+volumeGroup-base-real: 0 2097152 linear 8:19 384
+volumeGroup-base-cow: 0 204800 linear 8:19 2097536
+volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16
+
+# ls -lL /dev/mapper/volumeGroup-*
+brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
+brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
+brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index e412980763bd..a93637223c8d 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
  * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -71,10 +71,21 @@ struct crypt_iv_operations {
71 int (*ctr)(struct crypt_config *cc, struct dm_target *ti, 71 int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
72 const char *opts); 72 const char *opts);
73 void (*dtr)(struct crypt_config *cc); 73 void (*dtr)(struct crypt_config *cc);
74 const char *(*status)(struct crypt_config *cc); 74 int (*init)(struct crypt_config *cc);
75 int (*wipe)(struct crypt_config *cc);
75 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); 76 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
76}; 77};
77 78
79struct iv_essiv_private {
80 struct crypto_cipher *tfm;
81 struct crypto_hash *hash_tfm;
82 u8 *salt;
83};
84
85struct iv_benbi_private {
86 int shift;
87};
88
78/* 89/*
79 * Crypt: maps a linear range of a block device 90 * Crypt: maps a linear range of a block device
80 * and encrypts / decrypts at the same time. 91 * and encrypts / decrypts at the same time.
@@ -102,8 +113,8 @@ struct crypt_config {
102 struct crypt_iv_operations *iv_gen_ops; 113 struct crypt_iv_operations *iv_gen_ops;
103 char *iv_mode; 114 char *iv_mode;
104 union { 115 union {
105 struct crypto_cipher *essiv_tfm; 116 struct iv_essiv_private essiv;
106 int benbi_shift; 117 struct iv_benbi_private benbi;
107 } iv_gen_private; 118 } iv_gen_private;
108 sector_t iv_offset; 119 sector_t iv_offset;
109 unsigned int iv_size; 120 unsigned int iv_size;
@@ -147,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
  * plain: the initial vector is the 32-bit little-endian version of the sector
  *        number, padded with zeros if necessary.
  *
+ * plain64: the initial vector is the 64-bit little-endian version of the sector
+ *          number, padded with zeros if necessary.
+ *
  * essiv: "encrypted sector|salt initial vector", the sector number is
  *        encrypted with the bulk cipher using a salt as key. The salt
  *        should be derived from the bulk cipher's key via hashing.
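
The comment block above documents both plain IV modes. As a standalone illustration (not the kernel code), the following userspace C sketch shows the practical difference: "plain" truncates the sector number to 32 bits, so it repeats on devices larger than 2^32 sectors, while "plain64" keeps the full 64-bit value. The helper names and the little-endian host are assumptions of the example.

#include <stdint.h>
#include <string.h>

/* "plain": only the low 32 bits of the sector survive (little-endian host assumed) */
static void plain_iv(uint8_t *iv, size_t iv_size, uint64_t sector)
{
	uint32_t s32 = (uint32_t)sector;	/* sectors >= 2^32 wrap around */

	memset(iv, 0, iv_size);
	memcpy(iv, &s32, sizeof(s32));
}

/* "plain64": the whole 64-bit sector number is used */
static void plain64_iv(uint8_t *iv, size_t iv_size, uint64_t sector)
{
	memset(iv, 0, iv_size);
	memcpy(iv, &sector, sizeof(sector));
}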
@@ -169,88 +183,123 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
169 return 0; 183 return 0;
170} 184}
171 185
172static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, 186static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
173 const char *opts) 187 sector_t sector)
174{ 188{
175 struct crypto_cipher *essiv_tfm; 189 memset(iv, 0, cc->iv_size);
176 struct crypto_hash *hash_tfm; 190 *(u64 *)iv = cpu_to_le64(sector);
191
192 return 0;
193}
194
195/* Initialise ESSIV - compute salt but no local memory allocations */
196static int crypt_iv_essiv_init(struct crypt_config *cc)
197{
198 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
177 struct hash_desc desc; 199 struct hash_desc desc;
178 struct scatterlist sg; 200 struct scatterlist sg;
179 unsigned int saltsize;
180 u8 *salt;
181 int err; 201 int err;
182 202
183 if (opts == NULL) { 203 sg_init_one(&sg, cc->key, cc->key_size);
204 desc.tfm = essiv->hash_tfm;
205 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
206
207 err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt);
208 if (err)
209 return err;
210
211 return crypto_cipher_setkey(essiv->tfm, essiv->salt,
212 crypto_hash_digestsize(essiv->hash_tfm));
213}
214
215/* Wipe salt and reset key derived from volume key */
216static int crypt_iv_essiv_wipe(struct crypt_config *cc)
217{
218 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
219 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
220
221 memset(essiv->salt, 0, salt_size);
222
223 return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
224}
225
226static void crypt_iv_essiv_dtr(struct crypt_config *cc)
227{
228 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
229
230 crypto_free_cipher(essiv->tfm);
231 essiv->tfm = NULL;
232
233 crypto_free_hash(essiv->hash_tfm);
234 essiv->hash_tfm = NULL;
235
236 kzfree(essiv->salt);
237 essiv->salt = NULL;
238}
239
240static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
241 const char *opts)
242{
243 struct crypto_cipher *essiv_tfm = NULL;
244 struct crypto_hash *hash_tfm = NULL;
245 u8 *salt = NULL;
246 int err;
247
248 if (!opts) {
184 ti->error = "Digest algorithm missing for ESSIV mode"; 249 ti->error = "Digest algorithm missing for ESSIV mode";
185 return -EINVAL; 250 return -EINVAL;
186 } 251 }
187 252
188 /* Hash the cipher key with the given hash algorithm */ 253 /* Allocate hash algorithm */
189 hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); 254 hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
190 if (IS_ERR(hash_tfm)) { 255 if (IS_ERR(hash_tfm)) {
191 ti->error = "Error initializing ESSIV hash"; 256 ti->error = "Error initializing ESSIV hash";
192 return PTR_ERR(hash_tfm); 257 err = PTR_ERR(hash_tfm);
258 goto bad;
193 } 259 }
194 260
195 saltsize = crypto_hash_digestsize(hash_tfm); 261 salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL);
196 salt = kmalloc(saltsize, GFP_KERNEL); 262 if (!salt) {
197 if (salt == NULL) {
198 ti->error = "Error kmallocing salt storage in ESSIV"; 263 ti->error = "Error kmallocing salt storage in ESSIV";
199 crypto_free_hash(hash_tfm); 264 err = -ENOMEM;
200 return -ENOMEM; 265 goto bad;
201 } 266 }
202 267
203 sg_init_one(&sg, cc->key, cc->key_size); 268 /* Allocate essiv_tfm */
204 desc.tfm = hash_tfm;
205 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
206 err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
207 crypto_free_hash(hash_tfm);
208
209 if (err) {
210 ti->error = "Error calculating hash in ESSIV";
211 kfree(salt);
212 return err;
213 }
214
215 /* Setup the essiv_tfm with the given salt */
216 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); 269 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
217 if (IS_ERR(essiv_tfm)) { 270 if (IS_ERR(essiv_tfm)) {
218 ti->error = "Error allocating crypto tfm for ESSIV"; 271 ti->error = "Error allocating crypto tfm for ESSIV";
219 kfree(salt); 272 err = PTR_ERR(essiv_tfm);
220 return PTR_ERR(essiv_tfm); 273 goto bad;
221 } 274 }
222 if (crypto_cipher_blocksize(essiv_tfm) != 275 if (crypto_cipher_blocksize(essiv_tfm) !=
223 crypto_ablkcipher_ivsize(cc->tfm)) { 276 crypto_ablkcipher_ivsize(cc->tfm)) {
224 ti->error = "Block size of ESSIV cipher does " 277 ti->error = "Block size of ESSIV cipher does "
225 "not match IV size of block cipher"; 278 "not match IV size of block cipher";
226 crypto_free_cipher(essiv_tfm); 279 err = -EINVAL;
227 kfree(salt); 280 goto bad;
228 return -EINVAL;
229 } 281 }
230 err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
231 if (err) {
232 ti->error = "Failed to set key for ESSIV cipher";
233 crypto_free_cipher(essiv_tfm);
234 kfree(salt);
235 return err;
236 }
237 kfree(salt);
238 282
239 cc->iv_gen_private.essiv_tfm = essiv_tfm; 283 cc->iv_gen_private.essiv.salt = salt;
284 cc->iv_gen_private.essiv.tfm = essiv_tfm;
285 cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
286
240 return 0; 287 return 0;
241}
242 288
243static void crypt_iv_essiv_dtr(struct crypt_config *cc) 289bad:
244{ 290 if (essiv_tfm && !IS_ERR(essiv_tfm))
245 crypto_free_cipher(cc->iv_gen_private.essiv_tfm); 291 crypto_free_cipher(essiv_tfm);
246 cc->iv_gen_private.essiv_tfm = NULL; 292 if (hash_tfm && !IS_ERR(hash_tfm))
293 crypto_free_hash(hash_tfm);
294 kfree(salt);
295 return err;
247} 296}
248 297
249static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 298static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
250{ 299{
251 memset(iv, 0, cc->iv_size); 300 memset(iv, 0, cc->iv_size);
252 *(u64 *)iv = cpu_to_le64(sector); 301 *(u64 *)iv = cpu_to_le64(sector);
253 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv); 302 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
254 return 0; 303 return 0;
255} 304}
256 305
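
The refactoring above only moves the ESSIV salt handling into separate init/wipe hooks; the scheme itself is the one described in the comment near the top of the file: the salt is a hash of the volume key, and the IV is that salt-keyed encryption of the little-endian sector number. Below is a userspace sketch of the same construction, assuming AES-256 as the bulk cipher and SHA-256 as the ESSIV hash (so the digest is itself a valid AES-256 key); it uses OpenSSL purely for illustration.

#include <stdint.h>
#include <string.h>
#include <openssl/aes.h>
#include <openssl/sha.h>

/* salt = H(volume key); IV = E_salt(little-endian sector number) */
static void essiv_iv(uint8_t iv[AES_BLOCK_SIZE],
		     const uint8_t *volume_key, size_t key_len,
		     uint64_t sector)
{
	uint8_t salt[SHA256_DIGEST_LENGTH];
	uint8_t block[AES_BLOCK_SIZE] = { 0 };
	AES_KEY essiv_key;

	SHA256(volume_key, key_len, salt);		/* like crypt_iv_essiv_init() */
	AES_set_encrypt_key(salt, 8 * sizeof(salt), &essiv_key);

	memcpy(block, &sector, sizeof(sector));		/* little-endian host assumed */
	AES_encrypt(block, iv, &essiv_key);		/* like crypt_iv_essiv_gen() */
}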
@@ -273,7 +322,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
273 return -EINVAL; 322 return -EINVAL;
274 } 323 }
275 324
276 cc->iv_gen_private.benbi_shift = 9 - log; 325 cc->iv_gen_private.benbi.shift = 9 - log;
277 326
278 return 0; 327 return 0;
279} 328}
@@ -288,7 +337,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
288 337
289 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ 338 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
290 339
291 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1); 340 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
292 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); 341 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
293 342
294 return 0; 343 return 0;
@@ -305,9 +354,15 @@ static struct crypt_iv_operations crypt_iv_plain_ops = {
305 .generator = crypt_iv_plain_gen 354 .generator = crypt_iv_plain_gen
306}; 355};
307 356
357static struct crypt_iv_operations crypt_iv_plain64_ops = {
358 .generator = crypt_iv_plain64_gen
359};
360
308static struct crypt_iv_operations crypt_iv_essiv_ops = { 361static struct crypt_iv_operations crypt_iv_essiv_ops = {
309 .ctr = crypt_iv_essiv_ctr, 362 .ctr = crypt_iv_essiv_ctr,
310 .dtr = crypt_iv_essiv_dtr, 363 .dtr = crypt_iv_essiv_dtr,
364 .init = crypt_iv_essiv_init,
365 .wipe = crypt_iv_essiv_wipe,
311 .generator = crypt_iv_essiv_gen 366 .generator = crypt_iv_essiv_gen
312}; 367};
313 368
@@ -934,14 +989,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
934 989
935 set_bit(DM_CRYPT_KEY_VALID, &cc->flags); 990 set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
936 991
937 return 0; 992 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
938} 993}
939 994
940static int crypt_wipe_key(struct crypt_config *cc) 995static int crypt_wipe_key(struct crypt_config *cc)
941{ 996{
942 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); 997 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
943 memset(&cc->key, 0, cc->key_size * sizeof(u8)); 998 memset(&cc->key, 0, cc->key_size * sizeof(u8));
944 return 0; 999 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
945} 1000}
946 1001
947/* 1002/*
@@ -983,11 +1038,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
983 return -ENOMEM; 1038 return -ENOMEM;
984 } 1039 }
985 1040
986 if (crypt_set_key(cc, argv[1])) {
987 ti->error = "Error decoding key";
988 goto bad_cipher;
989 }
990
991 /* Compatibility mode for old dm-crypt cipher strings */ 1041 /* Compatibility mode for old dm-crypt cipher strings */
992 if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { 1042 if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
993 chainmode = "cbc"; 1043 chainmode = "cbc";
@@ -1015,6 +1065,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1015 strcpy(cc->chainmode, chainmode); 1065 strcpy(cc->chainmode, chainmode);
1016 cc->tfm = tfm; 1066 cc->tfm = tfm;
1017 1067
1068 if (crypt_set_key(cc, argv[1]) < 0) {
1069 ti->error = "Error decoding and setting key";
1070 goto bad_ivmode;
1071 }
1072
1018 /* 1073 /*
1019 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". 1074 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
1020 * See comments at iv code 1075 * See comments at iv code
@@ -1024,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1024 cc->iv_gen_ops = NULL; 1079 cc->iv_gen_ops = NULL;
1025 else if (strcmp(ivmode, "plain") == 0) 1080 else if (strcmp(ivmode, "plain") == 0)
1026 cc->iv_gen_ops = &crypt_iv_plain_ops; 1081 cc->iv_gen_ops = &crypt_iv_plain_ops;
1082 else if (strcmp(ivmode, "plain64") == 0)
1083 cc->iv_gen_ops = &crypt_iv_plain64_ops;
1027 else if (strcmp(ivmode, "essiv") == 0) 1084 else if (strcmp(ivmode, "essiv") == 0)
1028 cc->iv_gen_ops = &crypt_iv_essiv_ops; 1085 cc->iv_gen_ops = &crypt_iv_essiv_ops;
1029 else if (strcmp(ivmode, "benbi") == 0) 1086 else if (strcmp(ivmode, "benbi") == 0)
@@ -1039,6 +1096,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1039 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) 1096 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
1040 goto bad_ivmode; 1097 goto bad_ivmode;
1041 1098
1099 if (cc->iv_gen_ops && cc->iv_gen_ops->init &&
1100 cc->iv_gen_ops->init(cc) < 0) {
1101 ti->error = "Error initialising IV";
1102 goto bad_slab_pool;
1103 }
1104
1042 cc->iv_size = crypto_ablkcipher_ivsize(tfm); 1105 cc->iv_size = crypto_ablkcipher_ivsize(tfm);
1043 if (cc->iv_size) 1106 if (cc->iv_size)
1044 /* at least a 64 bit sector number should fit in our buffer */ 1107 /* at least a 64 bit sector number should fit in our buffer */
@@ -1085,11 +1148,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1085 goto bad_bs; 1148 goto bad_bs;
1086 } 1149 }
1087 1150
1088 if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) {
1089 ti->error = "Error setting key";
1090 goto bad_device;
1091 }
1092
1093 if (sscanf(argv[2], "%llu", &tmpll) != 1) { 1151 if (sscanf(argv[2], "%llu", &tmpll) != 1) {
1094 ti->error = "Invalid iv_offset sector"; 1152 ti->error = "Invalid iv_offset sector";
1095 goto bad_device; 1153 goto bad_device;
@@ -1278,6 +1336,7 @@ static void crypt_resume(struct dm_target *ti)
1278static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) 1336static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
1279{ 1337{
1280 struct crypt_config *cc = ti->private; 1338 struct crypt_config *cc = ti->private;
1339 int ret = -EINVAL;
1281 1340
1282 if (argc < 2) 1341 if (argc < 2)
1283 goto error; 1342 goto error;
@@ -1287,10 +1346,22 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
1287 DMWARN("not suspended during key manipulation."); 1346 DMWARN("not suspended during key manipulation.");
1288 return -EINVAL; 1347 return -EINVAL;
1289 } 1348 }
1290 if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) 1349 if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
1291 return crypt_set_key(cc, argv[2]); 1350 ret = crypt_set_key(cc, argv[2]);
1292 if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) 1351 if (ret)
1352 return ret;
1353 if (cc->iv_gen_ops && cc->iv_gen_ops->init)
1354 ret = cc->iv_gen_ops->init(cc);
1355 return ret;
1356 }
1357 if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
1358 if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
1359 ret = cc->iv_gen_ops->wipe(cc);
1360 if (ret)
1361 return ret;
1362 }
1293 return crypt_wipe_key(cc); 1363 return crypt_wipe_key(cc);
1364 }
1294 } 1365 }
1295 1366
1296error: 1367error:
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 7dbe652efb5a..2b7907b6dd09 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -172,7 +172,8 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
172 } 172 }
173 173
174 /* Validate the chunk size against the device block size */ 174 /* Validate the chunk size against the device block size */
175 if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) { 175 if (chunk_size %
176 (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) {
176 *error = "Chunk size is not a multiple of device blocksize"; 177 *error = "Chunk size is not a multiple of device blocksize";
177 return -EINVAL; 178 return -EINVAL;
178 } 179 }
@@ -190,6 +191,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
190} 191}
191 192
192int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, 193int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
194 struct dm_snapshot *snap,
193 unsigned *args_used, 195 unsigned *args_used,
194 struct dm_exception_store **store) 196 struct dm_exception_store **store)
195{ 197{
@@ -198,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
198 struct dm_exception_store *tmp_store; 200 struct dm_exception_store *tmp_store;
199 char persistent; 201 char persistent;
200 202
201 if (argc < 3) { 203 if (argc < 2) {
202 ti->error = "Insufficient exception store arguments"; 204 ti->error = "Insufficient exception store arguments";
203 return -EINVAL; 205 return -EINVAL;
204 } 206 }
@@ -209,14 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
209 return -ENOMEM; 211 return -ENOMEM;
210 } 212 }
211 213
212 persistent = toupper(*argv[1]); 214 persistent = toupper(*argv[0]);
213 if (persistent == 'P') 215 if (persistent == 'P')
214 type = get_type("P"); 216 type = get_type("P");
215 else if (persistent == 'N') 217 else if (persistent == 'N')
216 type = get_type("N"); 218 type = get_type("N");
217 else { 219 else {
218 ti->error = "Persistent flag is not P or N"; 220 ti->error = "Persistent flag is not P or N";
219 return -EINVAL; 221 r = -EINVAL;
222 goto bad_type;
220 } 223 }
221 224
222 if (!type) { 225 if (!type) {
@@ -226,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
226 } 229 }
227 230
228 tmp_store->type = type; 231 tmp_store->type = type;
229 tmp_store->ti = ti; 232 tmp_store->snap = snap;
230
231 r = dm_get_device(ti, argv[0], 0, 0,
232 FMODE_READ | FMODE_WRITE, &tmp_store->cow);
233 if (r) {
234 ti->error = "Cannot get COW device";
235 goto bad_cow;
236 }
237 233
238 r = set_chunk_size(tmp_store, argv[2], &ti->error); 234 r = set_chunk_size(tmp_store, argv[1], &ti->error);
239 if (r) 235 if (r)
240 goto bad_ctr; 236 goto bad;
241 237
242 r = type->ctr(tmp_store, 0, NULL); 238 r = type->ctr(tmp_store, 0, NULL);
243 if (r) { 239 if (r) {
244 ti->error = "Exception store type constructor failed"; 240 ti->error = "Exception store type constructor failed";
245 goto bad_ctr; 241 goto bad;
246 } 242 }
247 243
248 *args_used = 3; 244 *args_used = 2;
249 *store = tmp_store; 245 *store = tmp_store;
250 return 0; 246 return 0;
251 247
252bad_ctr: 248bad:
253 dm_put_device(ti, tmp_store->cow);
254bad_cow:
255 put_type(type); 249 put_type(type);
256bad_type: 250bad_type:
257 kfree(tmp_store); 251 kfree(tmp_store);
@@ -262,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create);
262void dm_exception_store_destroy(struct dm_exception_store *store) 256void dm_exception_store_destroy(struct dm_exception_store *store)
263{ 257{
264 store->type->dtr(store); 258 store->type->dtr(store);
265 dm_put_device(store->ti, store->cow);
266 put_type(store->type); 259 put_type(store->type);
267 kfree(store); 260 kfree(store);
268} 261}
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 8a223a48802c..e8dfa06af3ba 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -26,7 +26,7 @@ typedef sector_t chunk_t;
  * of chunks that follow contiguously. Remaining bits hold the number of the
  * chunk within the device.
  */
-struct dm_snap_exception {
+struct dm_exception {
 	struct list_head hash_list;
 
 	chunk_t old_chunk;
@@ -64,17 +64,34 @@ struct dm_exception_store_type {
64 * Find somewhere to store the next exception. 64 * Find somewhere to store the next exception.
65 */ 65 */
66 int (*prepare_exception) (struct dm_exception_store *store, 66 int (*prepare_exception) (struct dm_exception_store *store,
67 struct dm_snap_exception *e); 67 struct dm_exception *e);
68 68
69 /* 69 /*
70 * Update the metadata with this exception. 70 * Update the metadata with this exception.
71 */ 71 */
72 void (*commit_exception) (struct dm_exception_store *store, 72 void (*commit_exception) (struct dm_exception_store *store,
73 struct dm_snap_exception *e, 73 struct dm_exception *e,
74 void (*callback) (void *, int success), 74 void (*callback) (void *, int success),
75 void *callback_context); 75 void *callback_context);
76 76
77 /* 77 /*
78 * Returns 0 if the exception store is empty.
79 *
80 * If there are exceptions still to be merged, sets
81 * *last_old_chunk and *last_new_chunk to the most recent
82 * still-to-be-merged chunk and returns the number of
83 * consecutive previous ones.
84 */
85 int (*prepare_merge) (struct dm_exception_store *store,
86 chunk_t *last_old_chunk, chunk_t *last_new_chunk);
87
88 /*
89 * Clear the last n exceptions.
90 * nr_merged must be <= the value returned by prepare_merge.
91 */
92 int (*commit_merge) (struct dm_exception_store *store, int nr_merged);
93
94 /*
78 * The snapshot is invalid, note this in the metadata. 95 * The snapshot is invalid, note this in the metadata.
79 */ 96 */
80 void (*drop_snapshot) (struct dm_exception_store *store); 97 void (*drop_snapshot) (struct dm_exception_store *store);
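
The two new callbacks above define the contract the snapshot-merge target drives. A hedged sketch of that merge cycle follows, using a simplified stand-in for struct dm_exception_store; copy_back() is a hypothetical placeholder for the kcopyd copy from the COW device back to the origin, and the return-value semantics are taken from the header comments above.

#include <stdio.h>

typedef unsigned long long chunk_t;

struct store {
	/* same contract as ->prepare_merge() / ->commit_merge() above */
	int (*prepare_merge)(struct store *s, chunk_t *last_old, chunk_t *last_new);
	int (*commit_merge)(struct store *s, int nr_merged);
};

/* placeholder for the kcopyd-based copy from COW back to the origin */
static void copy_back(chunk_t last_old, chunk_t last_new, int count)
{
	printf("merging %d chunk(s) ending at origin %llu / cow %llu\n",
	       count, last_old, last_new);
}

static int merge_all(struct store *s)
{
	chunk_t last_old, last_new;
	int nr;

	/* 0 means the store is empty; > 0 is a batch of mergeable chunks */
	while ((nr = s->prepare_merge(s, &last_old, &last_new)) > 0) {
		copy_back(last_old, last_new, nr);

		nr = s->commit_merge(s, nr);	/* forget what was merged */
		if (nr < 0)
			return nr;
	}

	return nr;	/* 0 on success, negative value on error */
}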
@@ -86,19 +103,19 @@ struct dm_exception_store_type {
86 /* 103 /*
87 * Return how full the snapshot is. 104 * Return how full the snapshot is.
88 */ 105 */
89 void (*fraction_full) (struct dm_exception_store *store, 106 void (*usage) (struct dm_exception_store *store,
90 sector_t *numerator, 107 sector_t *total_sectors, sector_t *sectors_allocated,
91 sector_t *denominator); 108 sector_t *metadata_sectors);
92 109
93 /* For internal device-mapper use only. */ 110 /* For internal device-mapper use only. */
94 struct list_head list; 111 struct list_head list;
95}; 112};
96 113
114struct dm_snapshot;
115
97struct dm_exception_store { 116struct dm_exception_store {
98 struct dm_exception_store_type *type; 117 struct dm_exception_store_type *type;
99 struct dm_target *ti; 118 struct dm_snapshot *snap;
100
101 struct dm_dev *cow;
102 119
103 /* Size of data blocks saved - must be a power of 2 */ 120 /* Size of data blocks saved - must be a power of 2 */
104 unsigned chunk_size; 121 unsigned chunk_size;
@@ -109,6 +126,11 @@ struct dm_exception_store {
109}; 126};
110 127
111/* 128/*
129 * Obtain the cow device used by a given snapshot.
130 */
131struct dm_dev *dm_snap_cow(struct dm_snapshot *snap);
132
133/*
112 * Funtions to manipulate consecutive chunks 134 * Funtions to manipulate consecutive chunks
113 */ 135 */
114# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) 136# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
@@ -120,18 +142,25 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
 	return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
 }
 
-static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
 {
 	return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
 }
 
-static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
 {
 	e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
 
 	BUG_ON(!dm_consecutive_chunk_count(e));
 }
 
+static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
+{
+	BUG_ON(!dm_consecutive_chunk_count(e));
+
+	e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS);
+}
+
 # else
 # define DM_CHUNK_CONSECUTIVE_BITS 0
 
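
The helpers above pack a run of contiguous chunks into a single exception: the low DM_CHUNK_NUMBER_BITS of new_chunk hold the chunk number and the top bits count how many chunks follow. A standalone C sketch of that encoding; the 56/8 bit split is assumed here to mirror the 64-bit case.

#include <assert.h>
#include <stdint.h>

#define CHUNK_NUMBER_BITS 56ULL		/* assumed split, for the example only */

typedef uint64_t chunk_t;

static chunk_t chunk_number(chunk_t chunk)
{
	return chunk & ((1ULL << CHUNK_NUMBER_BITS) - 1);
}

static unsigned consecutive_count(chunk_t chunk)
{
	return chunk >> CHUNK_NUMBER_BITS;
}

int main(void)
{
	chunk_t new_chunk = 1000;			/* single chunk, run length 0 */

	new_chunk += 1ULL << CHUNK_NUMBER_BITS;		/* like ..._count_inc() */
	new_chunk += 1ULL << CHUNK_NUMBER_BITS;

	assert(chunk_number(new_chunk) == 1000);
	assert(consecutive_count(new_chunk) == 2);	/* two more chunks follow */

	new_chunk -= 1ULL << CHUNK_NUMBER_BITS;		/* like ..._count_dec() */
	assert(consecutive_count(new_chunk) == 1);

	return 0;
}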
@@ -140,12 +169,16 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
140 return chunk; 169 return chunk;
141} 170}
142 171
143static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) 172static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
144{ 173{
145 return 0; 174 return 0;
146} 175}
147 176
148static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) 177static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
178{
179}
180
181static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
149{ 182{
150} 183}
151 184
@@ -162,7 +195,7 @@ static inline sector_t get_dev_size(struct block_device *bdev)
162static inline chunk_t sector_to_chunk(struct dm_exception_store *store, 195static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
163 sector_t sector) 196 sector_t sector)
164{ 197{
165 return (sector & ~store->chunk_mask) >> store->chunk_shift; 198 return sector >> store->chunk_shift;
166} 199}
167 200
168int dm_exception_store_type_register(struct dm_exception_store_type *type); 201int dm_exception_store_type_register(struct dm_exception_store_type *type);
@@ -173,6 +206,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
173 char **error); 206 char **error);
174 207
175int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, 208int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
209 struct dm_snapshot *snap,
176 unsigned *args_used, 210 unsigned *args_used,
177 struct dm_exception_store **store); 211 struct dm_exception_store **store);
178void dm_exception_store_destroy(struct dm_exception_store *store); 212void dm_exception_store_destroy(struct dm_exception_store *store);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 3a2e6a2f8bdd..10f457ca6af2 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -5,6 +5,8 @@
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8#include "dm.h"
9
8#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
9 11
10#include <linux/bio.h> 12#include <linux/bio.h>
@@ -14,12 +16,19 @@
14#include <linux/slab.h> 16#include <linux/slab.h>
15#include <linux/dm-io.h> 17#include <linux/dm-io.h>
16 18
19#define DM_MSG_PREFIX "io"
20
21#define DM_IO_MAX_REGIONS BITS_PER_LONG
22
17struct dm_io_client { 23struct dm_io_client {
18 mempool_t *pool; 24 mempool_t *pool;
19 struct bio_set *bios; 25 struct bio_set *bios;
20}; 26};
21 27
22/* FIXME: can we shrink this ? */ 28/*
29 * Aligning 'struct io' reduces the number of bits required to store
30 * its address. Refer to store_io_and_region_in_bio() below.
31 */
23struct io { 32struct io {
24 unsigned long error_bits; 33 unsigned long error_bits;
25 unsigned long eopnotsupp_bits; 34 unsigned long eopnotsupp_bits;
@@ -28,7 +37,9 @@ struct io {
28 struct dm_io_client *client; 37 struct dm_io_client *client;
29 io_notify_fn callback; 38 io_notify_fn callback;
30 void *context; 39 void *context;
31}; 40} __attribute__((aligned(DM_IO_MAX_REGIONS)));
41
42static struct kmem_cache *_dm_io_cache;
32 43
33/* 44/*
34 * io contexts are only dynamically allocated for asynchronous 45 * io contexts are only dynamically allocated for asynchronous
@@ -53,7 +64,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
53 if (!client) 64 if (!client)
54 return ERR_PTR(-ENOMEM); 65 return ERR_PTR(-ENOMEM);
55 66
56 client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io)); 67 client->pool = mempool_create_slab_pool(ios, _dm_io_cache);
57 if (!client->pool) 68 if (!client->pool)
58 goto bad; 69 goto bad;
59 70
@@ -88,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy);
 
 /*-----------------------------------------------------------------
  * We need to keep track of which region a bio is doing io for.
- * In order to save a memory allocation we store this the last
- * bvec which we know is unused (blech).
- * XXX This is ugly and can OOPS with some configs... find another way.
+ * To avoid a memory allocation to store just 5 or 6 bits, we
+ * ensure the 'struct io' pointer is aligned so enough low bits are
+ * always zero and then combine it with the region number directly in
+ * bi_private.
  *---------------------------------------------------------------*/
-static inline void bio_set_region(struct bio *bio, unsigned region)
+static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
+				       unsigned region)
 {
-	bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
+	if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
+		DMCRIT("Unaligned struct io pointer %p", io);
+		BUG();
+	}
+
+	bio->bi_private = (void *)((unsigned long)io | region);
 }
 
-static inline unsigned bio_get_region(struct bio *bio)
+static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
+					    unsigned *region)
 {
-	return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
+	unsigned long val = (unsigned long)bio->bi_private;
+
+	*io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
+	*region = val & (DM_IO_MAX_REGIONS - 1);
 }
 
 /*-----------------------------------------------------------------
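
The replacement helpers above drop the old "spare bvec" hack: because struct io is now aligned to DM_IO_MAX_REGIONS bytes, the low bits of its address are guaranteed to be zero and can carry the region index inside bi_private. A generic C sketch of that low-bit pointer tagging; the names are local to the example, not the dm-io ones.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define MAX_REGIONS 64			/* must be a power of two */

struct tagged_io {
	unsigned long error_bits;
} __attribute__((aligned(MAX_REGIONS)));

/* analogous to store_io_and_region_in_bio() */
static void *pack(struct tagged_io *io, unsigned region)
{
	assert((uintptr_t)io % MAX_REGIONS == 0 && region < MAX_REGIONS);
	return (void *)((uintptr_t)io | region);
}

/* analogous to retrieve_io_and_region_from_bio() */
static struct tagged_io *unpack(void *private, unsigned *region)
{
	uintptr_t val = (uintptr_t)private;

	*region = val & (MAX_REGIONS - 1);
	return (struct tagged_io *)(val & ~(uintptr_t)(MAX_REGIONS - 1));
}

int main(void)
{
	struct tagged_io *io = aligned_alloc(MAX_REGIONS, sizeof(*io));
	unsigned region;
	void *private;

	if (!io)
		return 1;

	private = pack(io, 5);		/* what would go into bio->bi_private */
	assert(unpack(private, &region) == io && region == 5);

	free(io);
	return 0;
}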
@@ -140,10 +162,8 @@ static void endio(struct bio *bio, int error)
140 /* 162 /*
141 * The bio destructor in bio_put() may use the io object. 163 * The bio destructor in bio_put() may use the io object.
142 */ 164 */
143 io = bio->bi_private; 165 retrieve_io_and_region_from_bio(bio, &io, &region);
144 region = bio_get_region(bio);
145 166
146 bio->bi_max_vecs++;
147 bio_put(bio); 167 bio_put(bio);
148 168
149 dec_count(io, region, error); 169 dec_count(io, region, error);
@@ -243,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data)
243 263
244static void dm_bio_destructor(struct bio *bio) 264static void dm_bio_destructor(struct bio *bio)
245{ 265{
246 struct io *io = bio->bi_private; 266 unsigned region;
267 struct io *io;
268
269 retrieve_io_and_region_from_bio(bio, &io, &region);
247 270
248 bio_free(bio, io->client->bios); 271 bio_free(bio, io->client->bios);
249} 272}
@@ -286,26 +309,23 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
286 unsigned num_bvecs; 309 unsigned num_bvecs;
287 sector_t remaining = where->count; 310 sector_t remaining = where->count;
288 311
289 while (remaining) { 312 /*
313 * where->count may be zero if rw holds a write barrier and we
314 * need to send a zero-sized barrier.
315 */
316 do {
290 /* 317 /*
291 * Allocate a suitably sized-bio: we add an extra 318 * Allocate a suitably sized-bio.
292 * bvec for bio_get/set_region() and decrement bi_max_vecs
293 * to hide it from bio_add_page().
294 */ 319 */
295 num_bvecs = dm_sector_div_up(remaining, 320 num_bvecs = dm_sector_div_up(remaining,
296 (PAGE_SIZE >> SECTOR_SHIFT)); 321 (PAGE_SIZE >> SECTOR_SHIFT));
297 num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev), 322 num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
298 num_bvecs);
299 if (unlikely(num_bvecs > BIO_MAX_PAGES))
300 num_bvecs = BIO_MAX_PAGES;
301 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); 323 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
302 bio->bi_sector = where->sector + (where->count - remaining); 324 bio->bi_sector = where->sector + (where->count - remaining);
303 bio->bi_bdev = where->bdev; 325 bio->bi_bdev = where->bdev;
304 bio->bi_end_io = endio; 326 bio->bi_end_io = endio;
305 bio->bi_private = io;
306 bio->bi_destructor = dm_bio_destructor; 327 bio->bi_destructor = dm_bio_destructor;
307 bio->bi_max_vecs--; 328 store_io_and_region_in_bio(bio, io, region);
308 bio_set_region(bio, region);
309 329
310 /* 330 /*
311 * Try and add as many pages as possible. 331 * Try and add as many pages as possible.
@@ -323,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
323 343
324 atomic_inc(&io->count); 344 atomic_inc(&io->count);
325 submit_bio(rw, bio); 345 submit_bio(rw, bio);
326 } 346 } while (remaining);
327} 347}
328 348
329static void dispatch_io(int rw, unsigned int num_regions, 349static void dispatch_io(int rw, unsigned int num_regions,
@@ -333,6 +353,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
333 int i; 353 int i;
334 struct dpages old_pages = *dp; 354 struct dpages old_pages = *dp;
335 355
356 BUG_ON(num_regions > DM_IO_MAX_REGIONS);
357
336 if (sync) 358 if (sync)
337 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 359 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
338 360
@@ -342,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
342 */ 364 */
343 for (i = 0; i < num_regions; i++) { 365 for (i = 0; i < num_regions; i++) {
344 *dp = old_pages; 366 *dp = old_pages;
345 if (where[i].count) 367 if (where[i].count || (rw & (1 << BIO_RW_BARRIER)))
346 do_region(rw, i, where + i, dp, io); 368 do_region(rw, i, where + i, dp, io);
347 } 369 }
348 370
@@ -357,7 +379,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 		   struct dm_io_region *where, int rw, struct dpages *dp,
 		   unsigned long *error_bits)
 {
-	struct io io;
+	/*
+	 * gcc <= 4.3 can't do the alignment for stack variables, so we must
+	 * align it on our own.
+	 * volatile prevents the optimizer from removing or reusing
+	 * "io_" field from the stack frame (allowed in ANSI C).
+	 */
+	volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
+	struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
 
 	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
 		WARN_ON(1);
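
The hunk above replaces the plain on-stack struct io with an over-sized char buffer plus PTR_ALIGN() because older compilers (gcc <= 4.3) do not honour large alignment requests for stack variables. A minimal sketch of the same round-up arithmetic, with PTR_ALIGN() written out by hand and ALIGNMENT standing in for __alignof__(struct io).

#include <assert.h>
#include <stdint.h>

#define ALIGNMENT 64			/* stand-in for __alignof__(struct io) */

struct io {
	unsigned long error_bits;
	unsigned long eopnotsupp_bits;
};

int main(void)
{
	/* over-allocate so an aligned object is guaranteed to fit */
	char buf[sizeof(struct io) + ALIGNMENT - 1];
	struct io *io = (struct io *)
		(((uintptr_t)buf + ALIGNMENT - 1) & ~(uintptr_t)(ALIGNMENT - 1));

	assert((uintptr_t)io % ALIGNMENT == 0);
	assert((char *)(io + 1) <= buf + sizeof(buf));	/* stays inside buf */

	io->error_bits = 0;
	return 0;
}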
@@ -365,33 +394,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
365 } 394 }
366 395
367retry: 396retry:
368 io.error_bits = 0; 397 io->error_bits = 0;
369 io.eopnotsupp_bits = 0; 398 io->eopnotsupp_bits = 0;
370 atomic_set(&io.count, 1); /* see dispatch_io() */ 399 atomic_set(&io->count, 1); /* see dispatch_io() */
371 io.sleeper = current; 400 io->sleeper = current;
372 io.client = client; 401 io->client = client;
373 402
374 dispatch_io(rw, num_regions, where, dp, &io, 1); 403 dispatch_io(rw, num_regions, where, dp, io, 1);
375 404
376 while (1) { 405 while (1) {
377 set_current_state(TASK_UNINTERRUPTIBLE); 406 set_current_state(TASK_UNINTERRUPTIBLE);
378 407
379 if (!atomic_read(&io.count)) 408 if (!atomic_read(&io->count))
380 break; 409 break;
381 410
382 io_schedule(); 411 io_schedule();
383 } 412 }
384 set_current_state(TASK_RUNNING); 413 set_current_state(TASK_RUNNING);
385 414
386 if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { 415 if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
387 rw &= ~(1 << BIO_RW_BARRIER); 416 rw &= ~(1 << BIO_RW_BARRIER);
388 goto retry; 417 goto retry;
389 } 418 }
390 419
391 if (error_bits) 420 if (error_bits)
392 *error_bits = io.error_bits; 421 *error_bits = io->error_bits;
393 422
394 return io.error_bits ? -EIO : 0; 423 return io->error_bits ? -EIO : 0;
395} 424}
396 425
397static int async_io(struct dm_io_client *client, unsigned int num_regions, 426static int async_io(struct dm_io_client *client, unsigned int num_regions,
@@ -472,3 +501,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
472 &dp, io_req->notify.fn, io_req->notify.context); 501 &dp, io_req->notify.fn, io_req->notify.context);
473} 502}
474EXPORT_SYMBOL(dm_io); 503EXPORT_SYMBOL(dm_io);
504
505int __init dm_io_init(void)
506{
507 _dm_io_cache = KMEM_CACHE(io, 0);
508 if (!_dm_io_cache)
509 return -ENOMEM;
510
511 return 0;
512}
513
514void dm_io_exit(void)
515{
516 kmem_cache_destroy(_dm_io_cache);
517 _dm_io_cache = NULL;
518}
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index a67942931582..1d669322b27c 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -56,6 +56,11 @@ static void dm_hash_remove_all(int keep_open_devices);
56 */ 56 */
57static DECLARE_RWSEM(_hash_lock); 57static DECLARE_RWSEM(_hash_lock);
58 58
59/*
60 * Protects use of mdptr to obtain hash cell name and uuid from mapped device.
61 */
62static DEFINE_MUTEX(dm_hash_cells_mutex);
63
59static void init_buckets(struct list_head *buckets) 64static void init_buckets(struct list_head *buckets)
60{ 65{
61 unsigned int i; 66 unsigned int i;
@@ -206,7 +211,9 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
206 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); 211 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
207 } 212 }
208 dm_get(md); 213 dm_get(md);
214 mutex_lock(&dm_hash_cells_mutex);
209 dm_set_mdptr(md, cell); 215 dm_set_mdptr(md, cell);
216 mutex_unlock(&dm_hash_cells_mutex);
210 up_write(&_hash_lock); 217 up_write(&_hash_lock);
211 218
212 return 0; 219 return 0;
@@ -224,9 +231,11 @@ static void __hash_remove(struct hash_cell *hc)
224 /* remove from the dev hash */ 231 /* remove from the dev hash */
225 list_del(&hc->uuid_list); 232 list_del(&hc->uuid_list);
226 list_del(&hc->name_list); 233 list_del(&hc->name_list);
234 mutex_lock(&dm_hash_cells_mutex);
227 dm_set_mdptr(hc->md, NULL); 235 dm_set_mdptr(hc->md, NULL);
236 mutex_unlock(&dm_hash_cells_mutex);
228 237
229 table = dm_get_table(hc->md); 238 table = dm_get_live_table(hc->md);
230 if (table) { 239 if (table) {
231 dm_table_event(table); 240 dm_table_event(table);
232 dm_table_put(table); 241 dm_table_put(table);
@@ -321,13 +330,15 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
321 */ 330 */
322 list_del(&hc->name_list); 331 list_del(&hc->name_list);
323 old_name = hc->name; 332 old_name = hc->name;
333 mutex_lock(&dm_hash_cells_mutex);
324 hc->name = new_name; 334 hc->name = new_name;
335 mutex_unlock(&dm_hash_cells_mutex);
325 list_add(&hc->name_list, _name_buckets + hash_str(new_name)); 336 list_add(&hc->name_list, _name_buckets + hash_str(new_name));
326 337
327 /* 338 /*
328 * Wake up any dm event waiters. 339 * Wake up any dm event waiters.
329 */ 340 */
330 table = dm_get_table(hc->md); 341 table = dm_get_live_table(hc->md);
331 if (table) { 342 if (table) {
332 dm_table_event(table); 343 dm_table_event(table);
333 dm_table_put(table); 344 dm_table_put(table);
@@ -512,8 +523,6 @@ static int list_versions(struct dm_ioctl *param, size_t param_size)
512 return 0; 523 return 0;
513} 524}
514 525
515
516
517static int check_name(const char *name) 526static int check_name(const char *name)
518{ 527{
519 if (strchr(name, '/')) { 528 if (strchr(name, '/')) {
@@ -525,6 +534,40 @@ static int check_name(const char *name)
525} 534}
526 535
527/* 536/*
537 * On successful return, the caller must not attempt to acquire
538 * _hash_lock without first calling dm_table_put, because dm_table_destroy
539 * waits for this dm_table_put and could be called under this lock.
540 */
541static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
542{
543 struct hash_cell *hc;
544 struct dm_table *table = NULL;
545
546 down_read(&_hash_lock);
547 hc = dm_get_mdptr(md);
548 if (!hc || hc->md != md) {
549 DMWARN("device has been removed from the dev hash table.");
550 goto out;
551 }
552
553 table = hc->new_map;
554 if (table)
555 dm_table_get(table);
556
557out:
558 up_read(&_hash_lock);
559
560 return table;
561}
562
563static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
564 struct dm_ioctl *param)
565{
566 return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ?
567 dm_get_inactive_table(md) : dm_get_live_table(md);
568}
569
570/*
528 * Fills in a dm_ioctl structure, ready for sending back to 571 * Fills in a dm_ioctl structure, ready for sending back to
529 * userland. 572 * userland.
530 */ 573 */
@@ -536,7 +579,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
536 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | 579 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
537 DM_ACTIVE_PRESENT_FLAG); 580 DM_ACTIVE_PRESENT_FLAG);
538 581
539 if (dm_suspended(md)) 582 if (dm_suspended_md(md))
540 param->flags |= DM_SUSPEND_FLAG; 583 param->flags |= DM_SUSPEND_FLAG;
541 584
542 param->dev = huge_encode_dev(disk_devt(disk)); 585 param->dev = huge_encode_dev(disk_devt(disk));
@@ -548,18 +591,30 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
548 */ 591 */
549 param->open_count = dm_open_count(md); 592 param->open_count = dm_open_count(md);
550 593
551 if (get_disk_ro(disk))
552 param->flags |= DM_READONLY_FLAG;
553
554 param->event_nr = dm_get_event_nr(md); 594 param->event_nr = dm_get_event_nr(md);
595 param->target_count = 0;
555 596
556 table = dm_get_table(md); 597 table = dm_get_live_table(md);
557 if (table) { 598 if (table) {
558 param->flags |= DM_ACTIVE_PRESENT_FLAG; 599 if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
559 param->target_count = dm_table_get_num_targets(table); 600 if (get_disk_ro(disk))
601 param->flags |= DM_READONLY_FLAG;
602 param->target_count = dm_table_get_num_targets(table);
603 }
560 dm_table_put(table); 604 dm_table_put(table);
561 } else 605
562 param->target_count = 0; 606 param->flags |= DM_ACTIVE_PRESENT_FLAG;
607 }
608
609 if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) {
610 table = dm_get_inactive_table(md);
611 if (table) {
612 if (!(dm_table_get_mode(table) & FMODE_WRITE))
613 param->flags |= DM_READONLY_FLAG;
614 param->target_count = dm_table_get_num_targets(table);
615 dm_table_put(table);
616 }
617 }
563 618
564 return 0; 619 return 0;
565} 620}
@@ -634,9 +689,9 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
 	 * Sneakily write in both the name and the uuid
 	 * while we have the cell.
 	 */
-	strncpy(param->name, hc->name, sizeof(param->name));
+	strlcpy(param->name, hc->name, sizeof(param->name));
 	if (hc->uuid)
-		strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1);
+		strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
 	else
 		param->uuid[0] = '\0';
 
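
The change above matters because strncpy() leaves the destination unterminated whenever the source is at least as long as the buffer, while strlcpy() always NUL-terminates and reports the untruncated length. A small sketch of the difference; strlcpy() is not part of ISO C, so a minimal local version is used here.

#include <assert.h>
#include <stdio.h>
#include <string.h>

/* minimal strlcpy(): always terminates, returns strlen(src) */
static size_t my_strlcpy(char *dst, const char *src, size_t size)
{
	size_t len = strlen(src);

	if (size) {
		size_t n = len < size - 1 ? len : size - 1;

		memcpy(dst, src, n);
		dst[n] = '\0';
	}

	return len;
}

int main(void)
{
	char a[8], b[8];

	strncpy(a, "0123456789", sizeof(a));	/* fills all 8 bytes, no '\0' */
	my_strlcpy(b, "0123456789", sizeof(b));	/* "0123456" plus '\0' */

	assert(b[sizeof(b) - 1] == '\0');
	printf("%.8s / %s\n", a, b);		/* 'a' needs an explicit length */

	return 0;
}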
@@ -784,7 +839,7 @@ static int do_suspend(struct dm_ioctl *param)
784 if (param->flags & DM_NOFLUSH_FLAG) 839 if (param->flags & DM_NOFLUSH_FLAG)
785 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; 840 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
786 841
787 if (!dm_suspended(md)) 842 if (!dm_suspended_md(md))
788 r = dm_suspend(md, suspend_flags); 843 r = dm_suspend(md, suspend_flags);
789 844
790 if (!r) 845 if (!r)
@@ -800,7 +855,7 @@ static int do_resume(struct dm_ioctl *param)
800 unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; 855 unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
801 struct hash_cell *hc; 856 struct hash_cell *hc;
802 struct mapped_device *md; 857 struct mapped_device *md;
803 struct dm_table *new_map; 858 struct dm_table *new_map, *old_map = NULL;
804 859
805 down_write(&_hash_lock); 860 down_write(&_hash_lock);
806 861
@@ -826,14 +881,14 @@ static int do_resume(struct dm_ioctl *param)
826 suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; 881 suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
827 if (param->flags & DM_NOFLUSH_FLAG) 882 if (param->flags & DM_NOFLUSH_FLAG)
828 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; 883 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
829 if (!dm_suspended(md)) 884 if (!dm_suspended_md(md))
830 dm_suspend(md, suspend_flags); 885 dm_suspend(md, suspend_flags);
831 886
832 r = dm_swap_table(md, new_map); 887 old_map = dm_swap_table(md, new_map);
833 if (r) { 888 if (IS_ERR(old_map)) {
834 dm_table_destroy(new_map); 889 dm_table_destroy(new_map);
835 dm_put(md); 890 dm_put(md);
836 return r; 891 return PTR_ERR(old_map);
837 } 892 }
838 893
839 if (dm_table_get_mode(new_map) & FMODE_WRITE) 894 if (dm_table_get_mode(new_map) & FMODE_WRITE)
@@ -842,9 +897,11 @@ static int do_resume(struct dm_ioctl *param)
842 set_disk_ro(dm_disk(md), 1); 897 set_disk_ro(dm_disk(md), 1);
843 } 898 }
844 899
845 if (dm_suspended(md)) 900 if (dm_suspended_md(md))
846 r = dm_resume(md); 901 r = dm_resume(md);
847 902
903 if (old_map)
904 dm_table_destroy(old_map);
848 905
849 if (!r) { 906 if (!r) {
850 dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); 907 dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr);
@@ -982,7 +1039,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
982 if (r) 1039 if (r)
983 goto out; 1040 goto out;
984 1041
985 table = dm_get_table(md); 1042 table = dm_get_live_or_inactive_table(md, param);
986 if (table) { 1043 if (table) {
987 retrieve_status(table, param, param_size); 1044 retrieve_status(table, param, param_size);
988 dm_table_put(table); 1045 dm_table_put(table);
@@ -1215,7 +1272,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
1215 if (r) 1272 if (r)
1216 goto out; 1273 goto out;
1217 1274
1218 table = dm_get_table(md); 1275 table = dm_get_live_or_inactive_table(md, param);
1219 if (table) { 1276 if (table) {
1220 retrieve_deps(table, param, param_size); 1277 retrieve_deps(table, param, param_size);
1221 dm_table_put(table); 1278 dm_table_put(table);
@@ -1244,13 +1301,13 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
1244 if (r) 1301 if (r)
1245 goto out; 1302 goto out;
1246 1303
1247 table = dm_get_table(md); 1304 table = dm_get_live_or_inactive_table(md, param);
1248 if (table) { 1305 if (table) {
1249 retrieve_status(table, param, param_size); 1306 retrieve_status(table, param, param_size);
1250 dm_table_put(table); 1307 dm_table_put(table);
1251 } 1308 }
1252 1309
1253 out: 1310out:
1254 dm_put(md); 1311 dm_put(md);
1255 return r; 1312 return r;
1256} 1313}
@@ -1288,10 +1345,15 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1288 goto out; 1345 goto out;
1289 } 1346 }
1290 1347
1291 table = dm_get_table(md); 1348 table = dm_get_live_table(md);
1292 if (!table) 1349 if (!table)
1293 goto out_argv; 1350 goto out_argv;
1294 1351
1352 if (dm_deleting_md(md)) {
1353 r = -ENXIO;
1354 goto out_table;
1355 }
1356
1295 ti = dm_table_find_target(table, tmsg->sector); 1357 ti = dm_table_find_target(table, tmsg->sector);
1296 if (!dm_target_is_valid(ti)) { 1358 if (!dm_target_is_valid(ti)) {
1297 DMWARN("Target message sector outside device."); 1359 DMWARN("Target message sector outside device.");
@@ -1303,6 +1365,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1303 r = -EINVAL; 1365 r = -EINVAL;
1304 } 1366 }
1305 1367
1368 out_table:
1306 dm_table_put(table); 1369 dm_table_put(table);
1307 out_argv: 1370 out_argv:
1308 kfree(argv); 1371 kfree(argv);
@@ -1582,8 +1645,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
1582 if (!md) 1645 if (!md)
1583 return -ENXIO; 1646 return -ENXIO;
1584 1647
1585 dm_get(md); 1648 mutex_lock(&dm_hash_cells_mutex);
1586 down_read(&_hash_lock);
1587 hc = dm_get_mdptr(md); 1649 hc = dm_get_mdptr(md);
1588 if (!hc || hc->md != md) { 1650 if (!hc || hc->md != md) {
1589 r = -ENXIO; 1651 r = -ENXIO;
@@ -1596,8 +1658,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
1596 strcpy(uuid, hc->uuid ? : ""); 1658 strcpy(uuid, hc->uuid ? : "");
1597 1659
1598out: 1660out:
1599 up_read(&_hash_lock); 1661 mutex_unlock(&dm_hash_cells_mutex);
1600 dm_put(md);
1601 1662
1602 return r; 1663 return r;
1603} 1664}
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 3e3fc06cb861..addf83475040 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -450,7 +450,10 @@ static void dispatch_job(struct kcopyd_job *job)
 {
 	struct dm_kcopyd_client *kc = job->kc;
 	atomic_inc(&kc->nr_jobs);
-	push(&kc->pages_jobs, job);
+	if (unlikely(!job->source.count))
+		push(&kc->complete_jobs, job);
+	else
+		push(&kc->pages_jobs, job);
 	wake(kc);
 }
 
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 9443896ede07..7035582786fb 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -145,8 +145,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
145EXPORT_SYMBOL(dm_dirty_log_type_unregister); 145EXPORT_SYMBOL(dm_dirty_log_type_unregister);
146 146
147struct dm_dirty_log *dm_dirty_log_create(const char *type_name, 147struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
148 struct dm_target *ti, 148 struct dm_target *ti,
149 unsigned int argc, char **argv) 149 int (*flush_callback_fn)(struct dm_target *ti),
150 unsigned int argc, char **argv)
150{ 151{
151 struct dm_dirty_log_type *type; 152 struct dm_dirty_log_type *type;
152 struct dm_dirty_log *log; 153 struct dm_dirty_log *log;
@@ -161,6 +162,7 @@ struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
161 return NULL; 162 return NULL;
162 } 163 }
163 164
165 log->flush_callback_fn = flush_callback_fn;
164 log->type = type; 166 log->type = type;
165 if (type->ctr(log, ti, argc, argv)) { 167 if (type->ctr(log, ti, argc, argv)) {
166 kfree(log); 168 kfree(log);
@@ -208,7 +210,9 @@ struct log_header {
208 210
209struct log_c { 211struct log_c {
210 struct dm_target *ti; 212 struct dm_target *ti;
211 int touched; 213 int touched_dirtied;
214 int touched_cleaned;
215 int flush_failed;
212 uint32_t region_size; 216 uint32_t region_size;
213 unsigned int region_count; 217 unsigned int region_count;
214 region_t sync_count; 218 region_t sync_count;
@@ -233,6 +237,7 @@ struct log_c {
233 * Disk log fields 237 * Disk log fields
234 */ 238 */
235 int log_dev_failed; 239 int log_dev_failed;
240 int log_dev_flush_failed;
236 struct dm_dev *log_dev; 241 struct dm_dev *log_dev;
237 struct log_header header; 242 struct log_header header;
238 243
@@ -253,14 +258,14 @@ static inline void log_set_bit(struct log_c *l,
253 uint32_t *bs, unsigned bit) 258 uint32_t *bs, unsigned bit)
254{ 259{
255 ext2_set_bit(bit, (unsigned long *) bs); 260 ext2_set_bit(bit, (unsigned long *) bs);
256 l->touched = 1; 261 l->touched_cleaned = 1;
257} 262}
258 263
259static inline void log_clear_bit(struct log_c *l, 264static inline void log_clear_bit(struct log_c *l,
260 uint32_t *bs, unsigned bit) 265 uint32_t *bs, unsigned bit)
261{ 266{
262 ext2_clear_bit(bit, (unsigned long *) bs); 267 ext2_clear_bit(bit, (unsigned long *) bs);
263 l->touched = 1; 268 l->touched_dirtied = 1;
264} 269}
265 270
266/*---------------------------------------------------------------- 271/*----------------------------------------------------------------
@@ -287,6 +292,19 @@ static int rw_header(struct log_c *lc, int rw)
287 return dm_io(&lc->io_req, 1, &lc->header_location, NULL); 292 return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
288} 293}
289 294
295static int flush_header(struct log_c *lc)
296{
297 struct dm_io_region null_location = {
298 .bdev = lc->header_location.bdev,
299 .sector = 0,
300 .count = 0,
301 };
302
303 lc->io_req.bi_rw = WRITE_BARRIER;
304
305 return dm_io(&lc->io_req, 1, &null_location, NULL);
306}
307
290static int read_header(struct log_c *log) 308static int read_header(struct log_c *log)
291{ 309{
292 int r; 310 int r;
@@ -378,7 +396,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
378 } 396 }
379 397
380 lc->ti = ti; 398 lc->ti = ti;
381 lc->touched = 0; 399 lc->touched_dirtied = 0;
400 lc->touched_cleaned = 0;
401 lc->flush_failed = 0;
382 lc->region_size = region_size; 402 lc->region_size = region_size;
383 lc->region_count = region_count; 403 lc->region_count = region_count;
384 lc->sync = sync; 404 lc->sync = sync;
@@ -406,6 +426,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
406 } else { 426 } else {
407 lc->log_dev = dev; 427 lc->log_dev = dev;
408 lc->log_dev_failed = 0; 428 lc->log_dev_failed = 0;
429 lc->log_dev_flush_failed = 0;
409 lc->header_location.bdev = lc->log_dev->bdev; 430 lc->header_location.bdev = lc->log_dev->bdev;
410 lc->header_location.sector = 0; 431 lc->header_location.sector = 0;
411 432
@@ -614,6 +635,11 @@ static int disk_resume(struct dm_dirty_log *log)
614 635
615 /* write the new header */ 636 /* write the new header */
616 r = rw_header(lc, WRITE); 637 r = rw_header(lc, WRITE);
638 if (!r) {
639 r = flush_header(lc);
640 if (r)
641 lc->log_dev_flush_failed = 1;
642 }
617 if (r) { 643 if (r) {
618 DMWARN("%s: Failed to write header on dirty region log device", 644 DMWARN("%s: Failed to write header on dirty region log device",
619 lc->log_dev->name); 645 lc->log_dev->name);
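The pattern above writes the log header and, only if that write succeeds, calls flush_header() so the header is pushed out of any volatile cache; a flush failure is recorded separately in log_dev_flush_failed. A simplified userspace model of that ordering, with stand-in I/O helpers rather than the kernel's dm_io interface:

/* Sketch of the write-then-flush ordering. The helpers are stand-ins
 * for rw_header()/flush_header(), not kernel APIs. */
#include <stdio.h>

struct log_state { int flush_failed; };

static int write_header(void) { return 0; }   /* pretend the write worked */
static int flush_device(void) { return -5; }  /* pretend the flush failed (-EIO) */

static int resume_header(struct log_state *lc)
{
    int r = write_header();

    if (!r) {
        /* Only flush once the header write itself has succeeded. */
        r = flush_device();
        if (r)
            lc->flush_failed = 1;  /* remembered for status reporting */
    }
    return r;
}

int main(void)
{
    struct log_state lc = { 0 };

    printf("result=%d flush_failed=%d\n", resume_header(&lc), lc.flush_failed);
    return 0;
}
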
@@ -656,18 +682,40 @@ static int core_flush(struct dm_dirty_log *log)
656 682
657static int disk_flush(struct dm_dirty_log *log) 683static int disk_flush(struct dm_dirty_log *log)
658{ 684{
659 int r; 685 int r, i;
660 struct log_c *lc = (struct log_c *) log->context; 686 struct log_c *lc = log->context;
661 687
662 /* only write if the log has changed */ 688 /* only write if the log has changed */
663 if (!lc->touched) 689 if (!lc->touched_cleaned && !lc->touched_dirtied)
664 return 0; 690 return 0;
665 691
692 if (lc->touched_cleaned && log->flush_callback_fn &&
693 log->flush_callback_fn(lc->ti)) {
694 /*
695 * At this point it is impossible to determine which
696 * regions are clean and which are dirty (without
697 * re-reading the log off disk). So mark all of them
698 * dirty.
699 */
700 lc->flush_failed = 1;
701 for (i = 0; i < lc->region_count; i++)
702 log_clear_bit(lc, lc->clean_bits, i);
703 }
704
666 r = rw_header(lc, WRITE); 705 r = rw_header(lc, WRITE);
667 if (r) 706 if (r)
668 fail_log_device(lc); 707 fail_log_device(lc);
669 else 708 else {
670 lc->touched = 0; 709 if (lc->touched_dirtied) {
710 r = flush_header(lc);
711 if (r) {
712 lc->log_dev_flush_failed = 1;
713 fail_log_device(lc);
714 } else
715 lc->touched_dirtied = 0;
716 }
717 lc->touched_cleaned = 0;
718 }
671 719
672 return r; 720 return r;
673} 721}
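disk_flush() above now distinguishes clean-bit updates from dirty-bit updates: if clean bits changed and the mirror's flush callback fails, every region is marked dirty again, and the empty-barrier flush is issued only when dirty bits were written. A compact model of that decision flow (all names illustrative):

/* Minimal model of the disk_flush() decision flow: which flags force a
 * header write, when the flush callback matters, and when the barrier
 * is issued. */
#include <stdbool.h>
#include <stdio.h>

struct flags { bool touched_cleaned, touched_dirtied; };

static void model_flush(struct flags f, bool callback_fails)
{
    if (!f.touched_cleaned && !f.touched_dirtied) {
        puts("nothing changed: skip header write");
        return;
    }
    if (f.touched_cleaned && callback_fails)
        puts("flush callback failed: mark every region dirty again");
    puts("write header");
    if (f.touched_dirtied)
        puts("issue empty-barrier flush so dirty bits are durable");
}

int main(void)
{
    model_flush((struct flags){ true, false }, true);
    model_flush((struct flags){ false, true }, false);
    return 0;
}
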
@@ -681,7 +729,8 @@ static void core_mark_region(struct dm_dirty_log *log, region_t region)
681static void core_clear_region(struct dm_dirty_log *log, region_t region) 729static void core_clear_region(struct dm_dirty_log *log, region_t region)
682{ 730{
683 struct log_c *lc = (struct log_c *) log->context; 731 struct log_c *lc = (struct log_c *) log->context;
684 log_set_bit(lc, lc->clean_bits, region); 732 if (likely(!lc->flush_failed))
733 log_set_bit(lc, lc->clean_bits, region);
685} 734}
686 735
687static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) 736static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
@@ -762,7 +811,9 @@ static int disk_status(struct dm_dirty_log *log, status_type_t status,
762 switch(status) { 811 switch(status) {
763 case STATUSTYPE_INFO: 812 case STATUSTYPE_INFO:
764 DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, 813 DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name,
765 lc->log_dev_failed ? 'D' : 'A'); 814 lc->log_dev_flush_failed ? 'F' :
815 lc->log_dev_failed ? 'D' :
816 'A');
766 break; 817 break;
767 818
768 case STATUSTYPE_TABLE: 819 case STATUSTYPE_TABLE:
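The status character emitted above follows a strict precedence: a failed header flush ('F') outranks a failed log device ('D'), which outranks healthy ('A'). A one-line model of that chained conditional:

/* Status character precedence for the dirty region log: F > D > A. */
#include <stdio.h>

static char log_status_char(int flush_failed, int dev_failed)
{
    return flush_failed ? 'F' : dev_failed ? 'D' : 'A';
}

int main(void)
{
    printf("%c %c %c\n",
           log_status_char(1, 1),   /* 'F' wins even if the device also failed */
           log_status_char(0, 1),   /* 'D' */
           log_status_char(0, 0));  /* 'A' */
    return 0;
}
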
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index dce971dbdfa3..e81345a1d08f 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -93,6 +93,10 @@ struct multipath {
93 * can resubmit bios on error. 93 * can resubmit bios on error.
94 */ 94 */
95 mempool_t *mpio_pool; 95 mempool_t *mpio_pool;
96
97 struct mutex work_mutex;
98
99 unsigned suspended; /* Don't create new I/O internally when set. */
96}; 100};
97 101
98/* 102/*
@@ -198,6 +202,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
198 m->queue_io = 1; 202 m->queue_io = 1;
199 INIT_WORK(&m->process_queued_ios, process_queued_ios); 203 INIT_WORK(&m->process_queued_ios, process_queued_ios);
200 INIT_WORK(&m->trigger_event, trigger_event); 204 INIT_WORK(&m->trigger_event, trigger_event);
205 mutex_init(&m->work_mutex);
201 m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); 206 m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
202 if (!m->mpio_pool) { 207 if (!m->mpio_pool) {
203 kfree(m); 208 kfree(m);
@@ -885,13 +890,18 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
885 return r; 890 return r;
886} 891}
887 892
888static void multipath_dtr(struct dm_target *ti) 893static void flush_multipath_work(void)
889{ 894{
890 struct multipath *m = (struct multipath *) ti->private;
891
892 flush_workqueue(kmpath_handlerd); 895 flush_workqueue(kmpath_handlerd);
893 flush_workqueue(kmultipathd); 896 flush_workqueue(kmultipathd);
894 flush_scheduled_work(); 897 flush_scheduled_work();
898}
899
900static void multipath_dtr(struct dm_target *ti)
901{
902 struct multipath *m = ti->private;
903
904 flush_multipath_work();
895 free_multipath(m); 905 free_multipath(m);
896} 906}
897 907
@@ -1261,6 +1271,16 @@ static void multipath_presuspend(struct dm_target *ti)
1261 queue_if_no_path(m, 0, 1); 1271 queue_if_no_path(m, 0, 1);
1262} 1272}
1263 1273
1274static void multipath_postsuspend(struct dm_target *ti)
1275{
1276 struct multipath *m = ti->private;
1277
1278 mutex_lock(&m->work_mutex);
1279 m->suspended = 1;
1280 flush_multipath_work();
1281 mutex_unlock(&m->work_mutex);
1282}
1283
1264/* 1284/*
1265 * Restore the queue_if_no_path setting. 1285 * Restore the queue_if_no_path setting.
1266 */ 1286 */
@@ -1269,6 +1289,10 @@ static void multipath_resume(struct dm_target *ti)
1269 struct multipath *m = (struct multipath *) ti->private; 1289 struct multipath *m = (struct multipath *) ti->private;
1270 unsigned long flags; 1290 unsigned long flags;
1271 1291
1292 mutex_lock(&m->work_mutex);
1293 m->suspended = 0;
1294 mutex_unlock(&m->work_mutex);
1295
1272 spin_lock_irqsave(&m->lock, flags); 1296 spin_lock_irqsave(&m->lock, flags);
1273 m->queue_if_no_path = m->saved_queue_if_no_path; 1297 m->queue_if_no_path = m->saved_queue_if_no_path;
1274 spin_unlock_irqrestore(&m->lock, flags); 1298 spin_unlock_irqrestore(&m->lock, flags);
@@ -1397,51 +1421,71 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
1397 1421
1398static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) 1422static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1399{ 1423{
1400 int r; 1424 int r = -EINVAL;
1401 struct dm_dev *dev; 1425 struct dm_dev *dev;
1402 struct multipath *m = (struct multipath *) ti->private; 1426 struct multipath *m = (struct multipath *) ti->private;
1403 action_fn action; 1427 action_fn action;
1404 1428
1429 mutex_lock(&m->work_mutex);
1430
1431 if (m->suspended) {
1432 r = -EBUSY;
1433 goto out;
1434 }
1435
1436 if (dm_suspended(ti)) {
1437 r = -EBUSY;
1438 goto out;
1439 }
1440
1405 if (argc == 1) { 1441 if (argc == 1) {
1406 if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) 1442 if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) {
1407 return queue_if_no_path(m, 1, 0); 1443 r = queue_if_no_path(m, 1, 0);
1408 else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) 1444 goto out;
1409 return queue_if_no_path(m, 0, 0); 1445 } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) {
1446 r = queue_if_no_path(m, 0, 0);
1447 goto out;
1448 }
1410 } 1449 }
1411 1450
1412 if (argc != 2) 1451 if (argc != 2) {
1413 goto error; 1452 DMWARN("Unrecognised multipath message received.");
1453 goto out;
1454 }
1414 1455
1415 if (!strnicmp(argv[0], MESG_STR("disable_group"))) 1456 if (!strnicmp(argv[0], MESG_STR("disable_group"))) {
1416 return bypass_pg_num(m, argv[1], 1); 1457 r = bypass_pg_num(m, argv[1], 1);
1417 else if (!strnicmp(argv[0], MESG_STR("enable_group"))) 1458 goto out;
1418 return bypass_pg_num(m, argv[1], 0); 1459 } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) {
1419 else if (!strnicmp(argv[0], MESG_STR("switch_group"))) 1460 r = bypass_pg_num(m, argv[1], 0);
1420 return switch_pg_num(m, argv[1]); 1461 goto out;
1421 else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) 1462 } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) {
1463 r = switch_pg_num(m, argv[1]);
1464 goto out;
1465 } else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
1422 action = reinstate_path; 1466 action = reinstate_path;
1423 else if (!strnicmp(argv[0], MESG_STR("fail_path"))) 1467 else if (!strnicmp(argv[0], MESG_STR("fail_path")))
1424 action = fail_path; 1468 action = fail_path;
1425 else 1469 else {
1426 goto error; 1470 DMWARN("Unrecognised multipath message received.");
1471 goto out;
1472 }
1427 1473
1428 r = dm_get_device(ti, argv[1], ti->begin, ti->len, 1474 r = dm_get_device(ti, argv[1], ti->begin, ti->len,
1429 dm_table_get_mode(ti->table), &dev); 1475 dm_table_get_mode(ti->table), &dev);
1430 if (r) { 1476 if (r) {
1431 DMWARN("message: error getting device %s", 1477 DMWARN("message: error getting device %s",
1432 argv[1]); 1478 argv[1]);
1433 return -EINVAL; 1479 goto out;
1434 } 1480 }
1435 1481
1436 r = action_dev(m, dev, action); 1482 r = action_dev(m, dev, action);
1437 1483
1438 dm_put_device(ti, dev); 1484 dm_put_device(ti, dev);
1439 1485
1486out:
1487 mutex_unlock(&m->work_mutex);
1440 return r; 1488 return r;
1441
1442error:
1443 DMWARN("Unrecognised multipath message received.");
1444 return -EINVAL;
1445} 1489}
1446 1490
1447static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, 1491static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
@@ -1567,13 +1611,14 @@ out:
1567 *---------------------------------------------------------------*/ 1611 *---------------------------------------------------------------*/
1568static struct target_type multipath_target = { 1612static struct target_type multipath_target = {
1569 .name = "multipath", 1613 .name = "multipath",
1570 .version = {1, 1, 0}, 1614 .version = {1, 1, 1},
1571 .module = THIS_MODULE, 1615 .module = THIS_MODULE,
1572 .ctr = multipath_ctr, 1616 .ctr = multipath_ctr,
1573 .dtr = multipath_dtr, 1617 .dtr = multipath_dtr,
1574 .map_rq = multipath_map, 1618 .map_rq = multipath_map,
1575 .rq_end_io = multipath_end_io, 1619 .rq_end_io = multipath_end_io,
1576 .presuspend = multipath_presuspend, 1620 .presuspend = multipath_presuspend,
1621 .postsuspend = multipath_postsuspend,
1577 .resume = multipath_resume, 1622 .resume = multipath_resume,
1578 .status = multipath_status, 1623 .status = multipath_status,
1579 .message = multipath_message, 1624 .message = multipath_message,
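multipath_message() above is restructured around a single work_mutex and one exit label: messages are refused with -EBUSY while the target is suspended, and every branch funnels through "out". A small userspace sketch of that guard pattern, using a pthread mutex as a stand-in for the kernel mutex:

/* Sketch of the message-handling guard: take one mutex, refuse new work
 * while suspended, leave through a single exit path. Error values keep
 * the -EBUSY/-EINVAL flavour. */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct mpath { pthread_mutex_t work_mutex; int suspended; };

static int handle_message(struct mpath *m, const char *msg)
{
    int r = -22;                       /* -EINVAL by default */

    pthread_mutex_lock(&m->work_mutex);

    if (m->suspended) {
        r = -16;                       /* -EBUSY: no new work while suspended */
        goto out;
    }
    if (!strcmp(msg, "queue_if_no_path") || !strcmp(msg, "fail_if_no_path"))
        r = 0;                         /* recognised single-word message */
out:
    pthread_mutex_unlock(&m->work_mutex);
    return r;
}

int main(void)
{
    struct mpath m = { PTHREAD_MUTEX_INITIALIZER, 0 };

    printf("%d\n", handle_message(&m, "queue_if_no_path")); /* 0 */
    m.suspended = 1;
    printf("%d\n", handle_message(&m, "queue_if_no_path")); /* -16 */
    return 0;
}
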
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index cc9dc79b0784..ad779bd13aec 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
35 *---------------------------------------------------------------*/ 35 *---------------------------------------------------------------*/
36enum dm_raid1_error { 36enum dm_raid1_error {
37 DM_RAID1_WRITE_ERROR, 37 DM_RAID1_WRITE_ERROR,
38 DM_RAID1_FLUSH_ERROR,
38 DM_RAID1_SYNC_ERROR, 39 DM_RAID1_SYNC_ERROR,
39 DM_RAID1_READ_ERROR 40 DM_RAID1_READ_ERROR
40}; 41};
@@ -57,6 +58,7 @@ struct mirror_set {
57 struct bio_list reads; 58 struct bio_list reads;
58 struct bio_list writes; 59 struct bio_list writes;
59 struct bio_list failures; 60 struct bio_list failures;
61 struct bio_list holds; /* bios are waiting until suspend */
60 62
61 struct dm_region_hash *rh; 63 struct dm_region_hash *rh;
62 struct dm_kcopyd_client *kcopyd_client; 64 struct dm_kcopyd_client *kcopyd_client;
@@ -67,6 +69,7 @@ struct mirror_set {
67 region_t nr_regions; 69 region_t nr_regions;
68 int in_sync; 70 int in_sync;
69 int log_failure; 71 int log_failure;
72 int leg_failure;
70 atomic_t suspend; 73 atomic_t suspend;
71 74
72 atomic_t default_mirror; /* Default mirror */ 75 atomic_t default_mirror; /* Default mirror */
@@ -179,6 +182,17 @@ static void set_default_mirror(struct mirror *m)
179 atomic_set(&ms->default_mirror, m - m0); 182 atomic_set(&ms->default_mirror, m - m0);
180} 183}
181 184
185static struct mirror *get_valid_mirror(struct mirror_set *ms)
186{
187 struct mirror *m;
188
189 for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++)
190 if (!atomic_read(&m->error_count))
191 return m;
192
193 return NULL;
194}
195
182/* fail_mirror 196/* fail_mirror
183 * @m: mirror device to fail 197 * @m: mirror device to fail
 184 * @error_type: one of the enums, DM_RAID1_*_ERROR 198 * @error_type: one of the enums, DM_RAID1_*_ERROR
 184 * @error_type: one of the enums, DM_RAID1_*_ERROR 198 * @error_type: one of the enums, DM_RAID1_*_ERROR
@@ -198,6 +212,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
198 struct mirror_set *ms = m->ms; 212 struct mirror_set *ms = m->ms;
199 struct mirror *new; 213 struct mirror *new;
200 214
215 ms->leg_failure = 1;
216
201 /* 217 /*
202 * error_count is used for nothing more than a 218 * error_count is used for nothing more than a
203 * simple way to tell if a device has encountered 219 * simple way to tell if a device has encountered
@@ -224,19 +240,50 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
224 goto out; 240 goto out;
225 } 241 }
226 242
227 for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) 243 new = get_valid_mirror(ms);
228 if (!atomic_read(&new->error_count)) { 244 if (new)
229 set_default_mirror(new); 245 set_default_mirror(new);
230 break; 246 else
231 }
232
233 if (unlikely(new == ms->mirror + ms->nr_mirrors))
234 DMWARN("All sides of mirror have failed."); 247 DMWARN("All sides of mirror have failed.");
235 248
236out: 249out:
237 schedule_work(&ms->trigger_event); 250 schedule_work(&ms->trigger_event);
238} 251}
239 252
253static int mirror_flush(struct dm_target *ti)
254{
255 struct mirror_set *ms = ti->private;
256 unsigned long error_bits;
257
258 unsigned int i;
259 struct dm_io_region io[ms->nr_mirrors];
260 struct mirror *m;
261 struct dm_io_request io_req = {
262 .bi_rw = WRITE_BARRIER,
263 .mem.type = DM_IO_KMEM,
264 .mem.ptr.bvec = NULL,
265 .client = ms->io_client,
266 };
267
268 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) {
269 io[i].bdev = m->dev->bdev;
270 io[i].sector = 0;
271 io[i].count = 0;
272 }
273
274 error_bits = -1;
275 dm_io(&io_req, ms->nr_mirrors, io, &error_bits);
276 if (unlikely(error_bits != 0)) {
277 for (i = 0; i < ms->nr_mirrors; i++)
278 if (test_bit(i, &error_bits))
279 fail_mirror(ms->mirror + i,
280 DM_RAID1_FLUSH_ERROR);
281 return -EIO;
282 }
283
284 return 0;
285}
286
240/*----------------------------------------------------------------- 287/*-----------------------------------------------------------------
241 * Recovery. 288 * Recovery.
242 * 289 *
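mirror_flush() above sends a zero-length WRITE_BARRIER to every mirror leg in one dm_io call and then walks the returned error bitmap, failing each leg whose bit is set. A simplified model of interpreting such an error bitmap (plain C bit tests instead of the kernel's test_bit()):

/* One bit per mirror leg: any set bit means that leg failed to flush. */
#include <stdio.h>

#define NR_MIRRORS 3

static int flush_result(unsigned long error_bits, int failed[NR_MIRRORS])
{
    int i, any = 0;

    for (i = 0; i < NR_MIRRORS; i++) {
        failed[i] = !!(error_bits & (1UL << i));
        any |= failed[i];
    }
    return any ? -5 : 0;               /* -EIO if any leg failed to flush */
}

int main(void)
{
    int failed[NR_MIRRORS];
    int r = flush_result(0x2, failed); /* leg 1 failed */

    printf("r=%d legs: %d %d %d\n", r, failed[0], failed[1], failed[2]);
    return 0;
}
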
@@ -396,6 +443,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
396 */ 443 */
397static sector_t map_sector(struct mirror *m, struct bio *bio) 444static sector_t map_sector(struct mirror *m, struct bio *bio)
398{ 445{
446 if (unlikely(!bio->bi_size))
447 return 0;
399 return m->offset + (bio->bi_sector - m->ms->ti->begin); 448 return m->offset + (bio->bi_sector - m->ms->ti->begin);
400} 449}
401 450
@@ -413,6 +462,27 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
413 io->count = bio->bi_size >> 9; 462 io->count = bio->bi_size >> 9;
414} 463}
415 464
465static void hold_bio(struct mirror_set *ms, struct bio *bio)
466{
467 /*
468 * If device is suspended, complete the bio.
469 */
470 if (atomic_read(&ms->suspend)) {
471 if (dm_noflush_suspending(ms->ti))
472 bio_endio(bio, DM_ENDIO_REQUEUE);
473 else
474 bio_endio(bio, -EIO);
475 return;
476 }
477
478 /*
479 * Hold bio until the suspend is complete.
480 */
481 spin_lock_irq(&ms->lock);
482 bio_list_add(&ms->holds, bio);
483 spin_unlock_irq(&ms->lock);
484}
485
416/*----------------------------------------------------------------- 486/*-----------------------------------------------------------------
417 * Reads 487 * Reads
418 *---------------------------------------------------------------*/ 488 *---------------------------------------------------------------*/
@@ -511,7 +581,6 @@ static void write_callback(unsigned long error, void *context)
511 unsigned i, ret = 0; 581 unsigned i, ret = 0;
512 struct bio *bio = (struct bio *) context; 582 struct bio *bio = (struct bio *) context;
513 struct mirror_set *ms; 583 struct mirror_set *ms;
514 int uptodate = 0;
515 int should_wake = 0; 584 int should_wake = 0;
516 unsigned long flags; 585 unsigned long flags;
517 586
@@ -524,36 +593,27 @@ static void write_callback(unsigned long error, void *context)
524 * This way we handle both writes to SYNC and NOSYNC 593 * This way we handle both writes to SYNC and NOSYNC
525 * regions with the same code. 594 * regions with the same code.
526 */ 595 */
527 if (likely(!error)) 596 if (likely(!error)) {
528 goto out; 597 bio_endio(bio, ret);
598 return;
599 }
529 600
530 for (i = 0; i < ms->nr_mirrors; i++) 601 for (i = 0; i < ms->nr_mirrors; i++)
531 if (test_bit(i, &error)) 602 if (test_bit(i, &error))
532 fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); 603 fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
533 else
534 uptodate = 1;
535 604
536 if (unlikely(!uptodate)) { 605 /*
537 DMERR("All replicated volumes dead, failing I/O"); 606 * Need to raise event. Since raising
538 /* None of the writes succeeded, fail the I/O. */ 607 * events can block, we need to do it in
539 ret = -EIO; 608 * the main thread.
540 } else if (errors_handled(ms)) { 609 */
541 /* 610 spin_lock_irqsave(&ms->lock, flags);
542 * Need to raise event. Since raising 611 if (!ms->failures.head)
543 * events can block, we need to do it in 612 should_wake = 1;
544 * the main thread. 613 bio_list_add(&ms->failures, bio);
545 */ 614 spin_unlock_irqrestore(&ms->lock, flags);
546 spin_lock_irqsave(&ms->lock, flags); 615 if (should_wake)
547 if (!ms->failures.head) 616 wakeup_mirrord(ms);
548 should_wake = 1;
549 bio_list_add(&ms->failures, bio);
550 spin_unlock_irqrestore(&ms->lock, flags);
551 if (should_wake)
552 wakeup_mirrord(ms);
553 return;
554 }
555out:
556 bio_endio(bio, ret);
557} 617}
558 618
559static void do_write(struct mirror_set *ms, struct bio *bio) 619static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -562,7 +622,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
562 struct dm_io_region io[ms->nr_mirrors], *dest = io; 622 struct dm_io_region io[ms->nr_mirrors], *dest = io;
563 struct mirror *m; 623 struct mirror *m;
564 struct dm_io_request io_req = { 624 struct dm_io_request io_req = {
565 .bi_rw = WRITE, 625 .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
566 .mem.type = DM_IO_BVEC, 626 .mem.type = DM_IO_BVEC,
567 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, 627 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
568 .notify.fn = write_callback, 628 .notify.fn = write_callback,
@@ -603,6 +663,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
603 bio_list_init(&requeue); 663 bio_list_init(&requeue);
604 664
605 while ((bio = bio_list_pop(writes))) { 665 while ((bio = bio_list_pop(writes))) {
666 if (unlikely(bio_empty_barrier(bio))) {
667 bio_list_add(&sync, bio);
668 continue;
669 }
670
606 region = dm_rh_bio_to_region(ms->rh, bio); 671 region = dm_rh_bio_to_region(ms->rh, bio);
607 672
608 if (log->type->is_remote_recovering && 673 if (log->type->is_remote_recovering &&
@@ -672,8 +737,12 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
672 dm_rh_delay(ms->rh, bio); 737 dm_rh_delay(ms->rh, bio);
673 738
674 while ((bio = bio_list_pop(&nosync))) { 739 while ((bio = bio_list_pop(&nosync))) {
675 map_bio(get_default_mirror(ms), bio); 740 if (unlikely(ms->leg_failure) && errors_handled(ms))
676 generic_make_request(bio); 741 hold_bio(ms, bio);
742 else {
743 map_bio(get_default_mirror(ms), bio);
744 generic_make_request(bio);
745 }
677 } 746 }
678} 747}
679 748
@@ -681,20 +750,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
681{ 750{
682 struct bio *bio; 751 struct bio *bio;
683 752
684 if (!failures->head) 753 if (likely(!failures->head))
685 return;
686
687 if (!ms->log_failure) {
688 while ((bio = bio_list_pop(failures))) {
689 ms->in_sync = 0;
690 dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
691 }
692 return; 754 return;
693 }
694 755
695 /* 756 /*
696 * If the log has failed, unattempted writes are being 757 * If the log has failed, unattempted writes are being
697 * put on the failures list. We can't issue those writes 758 * put on the holds list. We can't issue those writes
698 * until a log has been marked, so we must store them. 759 * until a log has been marked, so we must store them.
699 * 760 *
700 * If a 'noflush' suspend is in progress, we can requeue 761 * If a 'noflush' suspend is in progress, we can requeue
@@ -709,23 +770,27 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
709 * for us to treat them the same and requeue them 770 * for us to treat them the same and requeue them
710 * as well. 771 * as well.
711 */ 772 */
712 if (dm_noflush_suspending(ms->ti)) { 773 while ((bio = bio_list_pop(failures))) {
713 while ((bio = bio_list_pop(failures))) 774 if (!ms->log_failure) {
714 bio_endio(bio, DM_ENDIO_REQUEUE); 775 ms->in_sync = 0;
715 return; 776 dm_rh_mark_nosync(ms->rh, bio);
716 } 777 }
717 778
718 if (atomic_read(&ms->suspend)) { 779 /*
719 while ((bio = bio_list_pop(failures))) 780 * If all the legs are dead, fail the I/O.
781 * If we have been told to handle errors, hold the bio
782 * and wait for userspace to deal with the problem.
783 * Otherwise pretend that the I/O succeeded. (This would
784 * be wrong if the failed leg returned after reboot and
785 * got replicated back to the good legs.)
786 */
787 if (!get_valid_mirror(ms))
720 bio_endio(bio, -EIO); 788 bio_endio(bio, -EIO);
721 return; 789 else if (errors_handled(ms))
790 hold_bio(ms, bio);
791 else
792 bio_endio(bio, 0);
722 } 793 }
723
724 spin_lock_irq(&ms->lock);
725 bio_list_merge(&ms->failures, failures);
726 spin_unlock_irq(&ms->lock);
727
728 delayed_wake(ms);
729} 794}
730 795
731static void trigger_event(struct work_struct *work) 796static void trigger_event(struct work_struct *work)
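The reworked do_failures() above triages each failed bio: while the log is healthy the region is marked nosync, then the bio is failed if no leg survives, held for userspace if error handling is enabled, or completed as a success otherwise. A compact model of that per-bio decision (return strings stand in for bio_endio()/hold_bio()):

/* Per-bio triage once a write has failed on at least one mirror leg. */
#include <stdbool.h>
#include <stdio.h>

static const char *triage(bool any_valid_leg, bool errors_handled)
{
    if (!any_valid_leg)
        return "fail bio (-EIO)";
    if (errors_handled)
        return "hold bio until userspace repairs the mirror";
    return "complete bio as success";
}

int main(void)
{
    printf("%s\n", triage(false, true));
    printf("%s\n", triage(true, true));
    printf("%s\n", triage(true, false));
    return 0;
}
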
@@ -784,12 +849,17 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
784 } 849 }
785 850
786 spin_lock_init(&ms->lock); 851 spin_lock_init(&ms->lock);
852 bio_list_init(&ms->reads);
853 bio_list_init(&ms->writes);
854 bio_list_init(&ms->failures);
855 bio_list_init(&ms->holds);
787 856
788 ms->ti = ti; 857 ms->ti = ti;
789 ms->nr_mirrors = nr_mirrors; 858 ms->nr_mirrors = nr_mirrors;
790 ms->nr_regions = dm_sector_div_up(ti->len, region_size); 859 ms->nr_regions = dm_sector_div_up(ti->len, region_size);
791 ms->in_sync = 0; 860 ms->in_sync = 0;
792 ms->log_failure = 0; 861 ms->log_failure = 0;
862 ms->leg_failure = 0;
793 atomic_set(&ms->suspend, 0); 863 atomic_set(&ms->suspend, 0);
794 atomic_set(&ms->default_mirror, DEFAULT_MIRROR); 864 atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
795 865
@@ -889,7 +959,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
889 return NULL; 959 return NULL;
890 } 960 }
891 961
892 dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2); 962 dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count,
963 argv + 2);
893 if (!dl) { 964 if (!dl) {
894 ti->error = "Error creating mirror dirty log"; 965 ti->error = "Error creating mirror dirty log";
895 return NULL; 966 return NULL;
@@ -995,6 +1066,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
995 1066
996 ti->private = ms; 1067 ti->private = ms;
997 ti->split_io = dm_rh_get_region_size(ms->rh); 1068 ti->split_io = dm_rh_get_region_size(ms->rh);
1069 ti->num_flush_requests = 1;
998 1070
999 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); 1071 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
1000 if (!ms->kmirrord_wq) { 1072 if (!ms->kmirrord_wq) {
@@ -1122,7 +1194,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1122 * We need to dec pending if this was a write. 1194 * We need to dec pending if this was a write.
1123 */ 1195 */
1124 if (rw == WRITE) { 1196 if (rw == WRITE) {
1125 dm_rh_dec(ms->rh, map_context->ll); 1197 if (likely(!bio_empty_barrier(bio)))
1198 dm_rh_dec(ms->rh, map_context->ll);
1126 return error; 1199 return error;
1127 } 1200 }
1128 1201
@@ -1180,6 +1253,9 @@ static void mirror_presuspend(struct dm_target *ti)
1180 struct mirror_set *ms = (struct mirror_set *) ti->private; 1253 struct mirror_set *ms = (struct mirror_set *) ti->private;
1181 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 1254 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1182 1255
1256 struct bio_list holds;
1257 struct bio *bio;
1258
1183 atomic_set(&ms->suspend, 1); 1259 atomic_set(&ms->suspend, 1);
1184 1260
1185 /* 1261 /*
@@ -1202,6 +1278,22 @@ static void mirror_presuspend(struct dm_target *ti)
1202 * we know that all of our I/O has been pushed. 1278 * we know that all of our I/O has been pushed.
1203 */ 1279 */
1204 flush_workqueue(ms->kmirrord_wq); 1280 flush_workqueue(ms->kmirrord_wq);
1281
1282 /*
 1283 * Now that ms->suspend is set and the workqueue flushed, no more
 1284 * entries can be added to the ms->holds list, so process it.
1285 *
1286 * Bios can still arrive concurrently with or after this
1287 * presuspend function, but they cannot join the hold list
1288 * because ms->suspend is set.
1289 */
1290 spin_lock_irq(&ms->lock);
1291 holds = ms->holds;
1292 bio_list_init(&ms->holds);
1293 spin_unlock_irq(&ms->lock);
1294
1295 while ((bio = bio_list_pop(&holds)))
1296 hold_bio(ms, bio);
1205} 1297}
1206 1298
1207static void mirror_postsuspend(struct dm_target *ti) 1299static void mirror_postsuspend(struct dm_target *ti)
@@ -1244,7 +1336,8 @@ static char device_status_char(struct mirror *m)
1244 if (!atomic_read(&(m->error_count))) 1336 if (!atomic_read(&(m->error_count)))
1245 return 'A'; 1337 return 'A';
1246 1338
1247 return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : 1339 return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' :
1340 (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
1248 (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : 1341 (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
1249 (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; 1342 (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
1250} 1343}
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 36dbe29f2fd6..5f19ceb6fe91 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -79,6 +79,11 @@ struct dm_region_hash {
79 struct list_head recovered_regions; 79 struct list_head recovered_regions;
80 struct list_head failed_recovered_regions; 80 struct list_head failed_recovered_regions;
81 81
82 /*
 83 * If there was a barrier failure, no regions can be marked clean.
84 */
85 int barrier_failure;
86
82 void *context; 87 void *context;
83 sector_t target_begin; 88 sector_t target_begin;
84 89
@@ -211,6 +216,7 @@ struct dm_region_hash *dm_region_hash_create(
211 INIT_LIST_HEAD(&rh->quiesced_regions); 216 INIT_LIST_HEAD(&rh->quiesced_regions);
212 INIT_LIST_HEAD(&rh->recovered_regions); 217 INIT_LIST_HEAD(&rh->recovered_regions);
213 INIT_LIST_HEAD(&rh->failed_recovered_regions); 218 INIT_LIST_HEAD(&rh->failed_recovered_regions);
219 rh->barrier_failure = 0;
214 220
215 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, 221 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
216 sizeof(struct dm_region)); 222 sizeof(struct dm_region));
@@ -377,8 +383,6 @@ static void complete_resync_work(struct dm_region *reg, int success)
377/* dm_rh_mark_nosync 383/* dm_rh_mark_nosync
378 * @ms 384 * @ms
379 * @bio 385 * @bio
380 * @done
381 * @error
382 * 386 *
383 * The bio was written on some mirror(s) but failed on other mirror(s). 387 * The bio was written on some mirror(s) but failed on other mirror(s).
384 * We can successfully endio the bio but should avoid the region being 388 * We can successfully endio the bio but should avoid the region being
@@ -386,8 +390,7 @@ static void complete_resync_work(struct dm_region *reg, int success)
386 * 390 *
387 * This function is _not_ safe in interrupt context! 391 * This function is _not_ safe in interrupt context!
388 */ 392 */
389void dm_rh_mark_nosync(struct dm_region_hash *rh, 393void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
390 struct bio *bio, unsigned done, int error)
391{ 394{
392 unsigned long flags; 395 unsigned long flags;
393 struct dm_dirty_log *log = rh->log; 396 struct dm_dirty_log *log = rh->log;
@@ -395,6 +398,11 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
395 region_t region = dm_rh_bio_to_region(rh, bio); 398 region_t region = dm_rh_bio_to_region(rh, bio);
396 int recovering = 0; 399 int recovering = 0;
397 400
401 if (bio_empty_barrier(bio)) {
402 rh->barrier_failure = 1;
403 return;
404 }
405
398 /* We must inform the log that the sync count has changed. */ 406 /* We must inform the log that the sync count has changed. */
399 log->type->set_region_sync(log, region, 0); 407 log->type->set_region_sync(log, region, 0);
400 408
@@ -419,7 +427,6 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
419 BUG_ON(!list_empty(&reg->list)); 427 BUG_ON(!list_empty(&reg->list));
420 spin_unlock_irqrestore(&rh->region_lock, flags); 428 spin_unlock_irqrestore(&rh->region_lock, flags);
421 429
422 bio_endio(bio, error);
423 if (recovering) 430 if (recovering)
424 complete_resync_work(reg, 0); 431 complete_resync_work(reg, 0);
425} 432}
@@ -515,8 +522,11 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
515{ 522{
516 struct bio *bio; 523 struct bio *bio;
517 524
518 for (bio = bios->head; bio; bio = bio->bi_next) 525 for (bio = bios->head; bio; bio = bio->bi_next) {
526 if (bio_empty_barrier(bio))
527 continue;
519 rh_inc(rh, dm_rh_bio_to_region(rh, bio)); 528 rh_inc(rh, dm_rh_bio_to_region(rh, bio));
529 }
520} 530}
521EXPORT_SYMBOL_GPL(dm_rh_inc_pending); 531EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
522 532
@@ -544,7 +554,14 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region)
544 */ 554 */
545 555
546 /* do nothing for DM_RH_NOSYNC */ 556 /* do nothing for DM_RH_NOSYNC */
547 if (reg->state == DM_RH_RECOVERING) { 557 if (unlikely(rh->barrier_failure)) {
558 /*
559 * If a write barrier failed some time ago, we
560 * don't know whether or not this write made it
561 * to the disk, so we must resync the device.
562 */
563 reg->state = DM_RH_NOSYNC;
564 } else if (reg->state == DM_RH_RECOVERING) {
548 list_add_tail(&reg->list, &rh->quiesced_regions); 565 list_add_tail(&reg->list, &rh->quiesced_regions);
549 } else if (reg->state == DM_RH_DIRTY) { 566 } else if (reg->state == DM_RH_DIRTY) {
550 reg->state = DM_RH_CLEAN; 567 reg->state = DM_RH_CLEAN;
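Once rh->barrier_failure is set, dm_rh_dec() above can no longer mark a region clean; it forces the region to NOSYNC so it will be resynced. A simplified model of that state decision (recovery handling omitted):

/* Region state after the last pending write drops, once a barrier has
 * failed: never clean, only nosync. Enum names mirror the kernel's. */
#include <stdio.h>

enum rh_state { DM_RH_CLEAN, DM_RH_DIRTY, DM_RH_NOSYNC, DM_RH_RECOVERING };

static enum rh_state next_state(enum rh_state cur, int barrier_failure)
{
    if (barrier_failure)
        return DM_RH_NOSYNC;     /* write may never have hit the platter */
    if (cur == DM_RH_DIRTY)
        return DM_RH_CLEAN;
    return cur;
}

int main(void)
{
    printf("%d %d\n", next_state(DM_RH_DIRTY, 0), next_state(DM_RH_DIRTY, 1));
    return 0;
}
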
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 0c746420c008..7d08879689ac 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -55,6 +55,8 @@
55 */ 55 */
56#define SNAPSHOT_DISK_VERSION 1 56#define SNAPSHOT_DISK_VERSION 1
57 57
58#define NUM_SNAPSHOT_HDR_CHUNKS 1
59
58struct disk_header { 60struct disk_header {
59 uint32_t magic; 61 uint32_t magic;
60 62
@@ -120,7 +122,22 @@ struct pstore {
120 122
121 /* 123 /*
122 * The next free chunk for an exception. 124 * The next free chunk for an exception.
125 *
126 * When creating exceptions, all the chunks here and above are
127 * free. It holds the next chunk to be allocated. On rare
128 * occasions (e.g. after a system crash) holes can be left in
129 * the exception store because chunks can be committed out of
130 * order.
131 *
132 * When merging exceptions, it does not necessarily mean all the
133 * chunks here and above are free. It holds the value it would
134 * have held if all chunks had been committed in order of
135 * allocation. Consequently the value may occasionally be
136 * slightly too low, but since it's only used for 'status' and
137 * it can never reach its minimum value too early this doesn't
138 * matter.
123 */ 139 */
140
124 chunk_t next_free; 141 chunk_t next_free;
125 142
126 /* 143 /*
@@ -214,7 +231,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
214 int metadata) 231 int metadata)
215{ 232{
216 struct dm_io_region where = { 233 struct dm_io_region where = {
217 .bdev = ps->store->cow->bdev, 234 .bdev = dm_snap_cow(ps->store->snap)->bdev,
218 .sector = ps->store->chunk_size * chunk, 235 .sector = ps->store->chunk_size * chunk,
219 .count = ps->store->chunk_size, 236 .count = ps->store->chunk_size,
220 }; 237 };
@@ -294,7 +311,8 @@ static int read_header(struct pstore *ps, int *new_snapshot)
294 */ 311 */
295 if (!ps->store->chunk_size) { 312 if (!ps->store->chunk_size) {
296 ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, 313 ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
297 bdev_logical_block_size(ps->store->cow->bdev) >> 9); 314 bdev_logical_block_size(dm_snap_cow(ps->store->snap)->
315 bdev) >> 9);
298 ps->store->chunk_mask = ps->store->chunk_size - 1; 316 ps->store->chunk_mask = ps->store->chunk_size - 1;
299 ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; 317 ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
300 chunk_size_supplied = 0; 318 chunk_size_supplied = 0;
@@ -408,6 +426,15 @@ static void write_exception(struct pstore *ps,
408 e->new_chunk = cpu_to_le64(de->new_chunk); 426 e->new_chunk = cpu_to_le64(de->new_chunk);
409} 427}
410 428
429static void clear_exception(struct pstore *ps, uint32_t index)
430{
431 struct disk_exception *e = get_exception(ps, index);
432
433 /* clear it */
434 e->old_chunk = 0;
435 e->new_chunk = 0;
436}
437
411/* 438/*
412 * Registers the exceptions that are present in the current area. 439 * Registers the exceptions that are present in the current area.
413 * 'full' is filled in to indicate if the area has been 440 * 'full' is filled in to indicate if the area has been
@@ -489,11 +516,23 @@ static struct pstore *get_info(struct dm_exception_store *store)
489 return (struct pstore *) store->context; 516 return (struct pstore *) store->context;
490} 517}
491 518
492static void persistent_fraction_full(struct dm_exception_store *store, 519static void persistent_usage(struct dm_exception_store *store,
493 sector_t *numerator, sector_t *denominator) 520 sector_t *total_sectors,
521 sector_t *sectors_allocated,
522 sector_t *metadata_sectors)
494{ 523{
495 *numerator = get_info(store)->next_free * store->chunk_size; 524 struct pstore *ps = get_info(store);
496 *denominator = get_dev_size(store->cow->bdev); 525
526 *sectors_allocated = ps->next_free * store->chunk_size;
527 *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
528
529 /*
530 * First chunk is the fixed header.
531 * Then there are (ps->current_area + 1) metadata chunks, each one
532 * separated from the next by ps->exceptions_per_area data chunks.
533 */
534 *metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) *
535 store->chunk_size;
497} 536}
498 537
499static void persistent_dtr(struct dm_exception_store *store) 538static void persistent_dtr(struct dm_exception_store *store)
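persistent_usage() above counts one fixed header chunk plus (current_area + 1) metadata chunks, each chunk_size sectors long. A worked example with made-up numbers:

/* Metadata accounting example: values below are illustrative only. */
#include <stdio.h>

int main(void)
{
    unsigned long long chunk_size = 16;      /* sectors per chunk (8 KiB) */
    unsigned long long current_area = 3;     /* four metadata areas in use */
    unsigned long long hdr_chunks = 1;       /* NUM_SNAPSHOT_HDR_CHUNKS */

    unsigned long long metadata_sectors =
        (current_area + 1 + hdr_chunks) * chunk_size;

    printf("metadata_sectors = %llu\n", metadata_sectors);  /* 80 */
    return 0;
}
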
@@ -552,44 +591,40 @@ static int persistent_read_metadata(struct dm_exception_store *store,
552 ps->current_area = 0; 591 ps->current_area = 0;
553 zero_memory_area(ps); 592 zero_memory_area(ps);
554 r = zero_disk_area(ps, 0); 593 r = zero_disk_area(ps, 0);
555 if (r) { 594 if (r)
556 DMWARN("zero_disk_area(0) failed"); 595 DMWARN("zero_disk_area(0) failed");
557 return r; 596 return r;
558 } 597 }
559 } else { 598 /*
560 /* 599 * Sanity checks.
561 * Sanity checks. 600 */
562 */ 601 if (ps->version != SNAPSHOT_DISK_VERSION) {
563 if (ps->version != SNAPSHOT_DISK_VERSION) { 602 DMWARN("unable to handle snapshot disk version %d",
564 DMWARN("unable to handle snapshot disk version %d", 603 ps->version);
565 ps->version); 604 return -EINVAL;
566 return -EINVAL; 605 }
567 }
568 606
569 /* 607 /*
570 * Metadata are valid, but snapshot is invalidated 608 * Metadata are valid, but snapshot is invalidated
571 */ 609 */
572 if (!ps->valid) 610 if (!ps->valid)
573 return 1; 611 return 1;
574 612
575 /* 613 /*
576 * Read the metadata. 614 * Read the metadata.
577 */ 615 */
578 r = read_exceptions(ps, callback, callback_context); 616 r = read_exceptions(ps, callback, callback_context);
579 if (r)
580 return r;
581 }
582 617
583 return 0; 618 return r;
584} 619}
585 620
586static int persistent_prepare_exception(struct dm_exception_store *store, 621static int persistent_prepare_exception(struct dm_exception_store *store,
587 struct dm_snap_exception *e) 622 struct dm_exception *e)
588{ 623{
589 struct pstore *ps = get_info(store); 624 struct pstore *ps = get_info(store);
590 uint32_t stride; 625 uint32_t stride;
591 chunk_t next_free; 626 chunk_t next_free;
592 sector_t size = get_dev_size(store->cow->bdev); 627 sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
593 628
594 /* Is there enough room ? */ 629 /* Is there enough room ? */
595 if (size < ((ps->next_free + 1) * store->chunk_size)) 630 if (size < ((ps->next_free + 1) * store->chunk_size))
@@ -611,7 +646,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
611} 646}
612 647
613static void persistent_commit_exception(struct dm_exception_store *store, 648static void persistent_commit_exception(struct dm_exception_store *store,
614 struct dm_snap_exception *e, 649 struct dm_exception *e,
615 void (*callback) (void *, int success), 650 void (*callback) (void *, int success),
616 void *callback_context) 651 void *callback_context)
617{ 652{
@@ -672,6 +707,85 @@ static void persistent_commit_exception(struct dm_exception_store *store,
672 ps->callback_count = 0; 707 ps->callback_count = 0;
673} 708}
674 709
710static int persistent_prepare_merge(struct dm_exception_store *store,
711 chunk_t *last_old_chunk,
712 chunk_t *last_new_chunk)
713{
714 struct pstore *ps = get_info(store);
715 struct disk_exception de;
716 int nr_consecutive;
717 int r;
718
719 /*
 720 * When the current area is empty, move back to the preceding area.
721 */
722 if (!ps->current_committed) {
723 /*
724 * Have we finished?
725 */
726 if (!ps->current_area)
727 return 0;
728
729 ps->current_area--;
730 r = area_io(ps, READ);
731 if (r < 0)
732 return r;
733 ps->current_committed = ps->exceptions_per_area;
734 }
735
736 read_exception(ps, ps->current_committed - 1, &de);
737 *last_old_chunk = de.old_chunk;
738 *last_new_chunk = de.new_chunk;
739
740 /*
741 * Find number of consecutive chunks within the current area,
742 * working backwards.
743 */
744 for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
745 nr_consecutive++) {
746 read_exception(ps, ps->current_committed - 1 - nr_consecutive,
747 &de);
748 if (de.old_chunk != *last_old_chunk - nr_consecutive ||
749 de.new_chunk != *last_new_chunk - nr_consecutive)
750 break;
751 }
752
753 return nr_consecutive;
754}
755
756static int persistent_commit_merge(struct dm_exception_store *store,
757 int nr_merged)
758{
759 int r, i;
760 struct pstore *ps = get_info(store);
761
762 BUG_ON(nr_merged > ps->current_committed);
763
764 for (i = 0; i < nr_merged; i++)
765 clear_exception(ps, ps->current_committed - 1 - i);
766
767 r = area_io(ps, WRITE);
768 if (r < 0)
769 return r;
770
771 ps->current_committed -= nr_merged;
772
773 /*
774 * At this stage, only persistent_usage() uses ps->next_free, so
775 * we make no attempt to keep ps->next_free strictly accurate
776 * as exceptions may have been committed out-of-order originally.
777 * Once a snapshot has become merging, we set it to the value it
778 * would have held had all the exceptions been committed in order.
779 *
780 * ps->current_area does not get reduced by prepare_merge() until
781 * after commit_merge() has removed the nr_merged previous exceptions.
782 */
783 ps->next_free = (area_location(ps, ps->current_area) - 1) +
784 (ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS;
785
786 return 0;
787}
788
675static void persistent_drop_snapshot(struct dm_exception_store *store) 789static void persistent_drop_snapshot(struct dm_exception_store *store)
676{ 790{
677 struct pstore *ps = get_info(store); 791 struct pstore *ps = get_info(store);
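persistent_prepare_merge() above reads the most recently committed exception and then scans backwards, counting how many predecessors have old and new chunk numbers that both decrease by one, so consecutive chunks can be merged in a single copy. A userspace sketch of that backward scan over a simplified in-memory exception table:

/* Count how many exceptions, ending at the last committed one, form a run
 * whose old and new chunk numbers both decrease by one per step. */
#include <stdio.h>

struct de { unsigned long long old_chunk, new_chunk; };

static int consecutive_from_end(const struct de *e, int committed)
{
    int n;

    for (n = 1; n < committed; n++) {
        const struct de *last = &e[committed - 1];
        const struct de *prev = &e[committed - 1 - n];

        if (prev->old_chunk != last->old_chunk - n ||
            prev->new_chunk != last->new_chunk - n)
            break;
    }
    return n;
}

int main(void)
{
    /* chunks 7,8,9 are consecutive in both columns; 3/2 breaks the run */
    struct de e[] = { {3, 2}, {7, 10}, {8, 11}, {9, 12} };

    printf("%d chunks can be merged together\n",
           consecutive_from_end(e, 4));   /* 3 */
    return 0;
}
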
@@ -697,7 +811,7 @@ static int persistent_ctr(struct dm_exception_store *store,
697 ps->area = NULL; 811 ps->area = NULL;
698 ps->zero_area = NULL; 812 ps->zero_area = NULL;
699 ps->header_area = NULL; 813 ps->header_area = NULL;
700 ps->next_free = 2; /* skipping the header and first area */ 814 ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */
701 ps->current_committed = 0; 815 ps->current_committed = 0;
702 816
703 ps->callback_count = 0; 817 ps->callback_count = 0;
@@ -726,8 +840,7 @@ static unsigned persistent_status(struct dm_exception_store *store,
726 case STATUSTYPE_INFO: 840 case STATUSTYPE_INFO:
727 break; 841 break;
728 case STATUSTYPE_TABLE: 842 case STATUSTYPE_TABLE:
729 DMEMIT(" %s P %llu", store->cow->name, 843 DMEMIT(" P %llu", (unsigned long long)store->chunk_size);
730 (unsigned long long)store->chunk_size);
731 } 844 }
732 845
733 return sz; 846 return sz;
@@ -741,8 +854,10 @@ static struct dm_exception_store_type _persistent_type = {
741 .read_metadata = persistent_read_metadata, 854 .read_metadata = persistent_read_metadata,
742 .prepare_exception = persistent_prepare_exception, 855 .prepare_exception = persistent_prepare_exception,
743 .commit_exception = persistent_commit_exception, 856 .commit_exception = persistent_commit_exception,
857 .prepare_merge = persistent_prepare_merge,
858 .commit_merge = persistent_commit_merge,
744 .drop_snapshot = persistent_drop_snapshot, 859 .drop_snapshot = persistent_drop_snapshot,
745 .fraction_full = persistent_fraction_full, 860 .usage = persistent_usage,
746 .status = persistent_status, 861 .status = persistent_status,
747}; 862};
748 863
@@ -754,8 +869,10 @@ static struct dm_exception_store_type _persistent_compat_type = {
754 .read_metadata = persistent_read_metadata, 869 .read_metadata = persistent_read_metadata,
755 .prepare_exception = persistent_prepare_exception, 870 .prepare_exception = persistent_prepare_exception,
756 .commit_exception = persistent_commit_exception, 871 .commit_exception = persistent_commit_exception,
872 .prepare_merge = persistent_prepare_merge,
873 .commit_merge = persistent_commit_merge,
757 .drop_snapshot = persistent_drop_snapshot, 874 .drop_snapshot = persistent_drop_snapshot,
758 .fraction_full = persistent_fraction_full, 875 .usage = persistent_usage,
759 .status = persistent_status, 876 .status = persistent_status,
760}; 877};
761 878
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index cde5aa558e6d..a0898a66a2f8 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -36,10 +36,10 @@ static int transient_read_metadata(struct dm_exception_store *store,
36} 36}
37 37
38static int transient_prepare_exception(struct dm_exception_store *store, 38static int transient_prepare_exception(struct dm_exception_store *store,
39 struct dm_snap_exception *e) 39 struct dm_exception *e)
40{ 40{
41 struct transient_c *tc = store->context; 41 struct transient_c *tc = store->context;
42 sector_t size = get_dev_size(store->cow->bdev); 42 sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
43 43
44 if (size < (tc->next_free + store->chunk_size)) 44 if (size < (tc->next_free + store->chunk_size))
45 return -1; 45 return -1;
@@ -51,7 +51,7 @@ static int transient_prepare_exception(struct dm_exception_store *store,
51} 51}
52 52
53static void transient_commit_exception(struct dm_exception_store *store, 53static void transient_commit_exception(struct dm_exception_store *store,
54 struct dm_snap_exception *e, 54 struct dm_exception *e,
55 void (*callback) (void *, int success), 55 void (*callback) (void *, int success),
56 void *callback_context) 56 void *callback_context)
57{ 57{
@@ -59,11 +59,14 @@ static void transient_commit_exception(struct dm_exception_store *store,
59 callback(callback_context, 1); 59 callback(callback_context, 1);
60} 60}
61 61
62static void transient_fraction_full(struct dm_exception_store *store, 62static void transient_usage(struct dm_exception_store *store,
63 sector_t *numerator, sector_t *denominator) 63 sector_t *total_sectors,
64 sector_t *sectors_allocated,
65 sector_t *metadata_sectors)
64{ 66{
65 *numerator = ((struct transient_c *) store->context)->next_free; 67 *sectors_allocated = ((struct transient_c *) store->context)->next_free;
66 *denominator = get_dev_size(store->cow->bdev); 68 *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
69 *metadata_sectors = 0;
67} 70}
68 71
69static int transient_ctr(struct dm_exception_store *store, 72static int transient_ctr(struct dm_exception_store *store,
@@ -91,8 +94,7 @@ static unsigned transient_status(struct dm_exception_store *store,
91 case STATUSTYPE_INFO: 94 case STATUSTYPE_INFO:
92 break; 95 break;
93 case STATUSTYPE_TABLE: 96 case STATUSTYPE_TABLE:
94 DMEMIT(" %s N %llu", store->cow->name, 97 DMEMIT(" N %llu", (unsigned long long)store->chunk_size);
95 (unsigned long long)store->chunk_size);
96 } 98 }
97 99
98 return sz; 100 return sz;
@@ -106,7 +108,7 @@ static struct dm_exception_store_type _transient_type = {
106 .read_metadata = transient_read_metadata, 108 .read_metadata = transient_read_metadata,
107 .prepare_exception = transient_prepare_exception, 109 .prepare_exception = transient_prepare_exception,
108 .commit_exception = transient_commit_exception, 110 .commit_exception = transient_commit_exception,
109 .fraction_full = transient_fraction_full, 111 .usage = transient_usage,
110 .status = transient_status, 112 .status = transient_status,
111}; 113};
112 114
@@ -118,7 +120,7 @@ static struct dm_exception_store_type _transient_compat_type = {
118 .read_metadata = transient_read_metadata, 120 .read_metadata = transient_read_metadata,
119 .prepare_exception = transient_prepare_exception, 121 .prepare_exception = transient_prepare_exception,
120 .commit_exception = transient_commit_exception, 122 .commit_exception = transient_commit_exception,
121 .fraction_full = transient_fraction_full, 123 .usage = transient_usage,
122 .status = transient_status, 124 .status = transient_status,
123}; 125};
124 126
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 3a3ba46e6d4b..ee8eb283650d 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -25,6 +25,11 @@
25 25
26#define DM_MSG_PREFIX "snapshots" 26#define DM_MSG_PREFIX "snapshots"
27 27
28static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
29
30#define dm_target_is_snapshot_merge(ti) \
31 ((ti)->type->name == dm_snapshot_merge_target_name)
32
28/* 33/*
29 * The percentage increment we will wake up users at 34 * The percentage increment we will wake up users at
30 */ 35 */
@@ -49,7 +54,7 @@
49#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ 54#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
50 (DM_TRACKED_CHUNK_HASH_SIZE - 1)) 55 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
51 56
52struct exception_table { 57struct dm_exception_table {
53 uint32_t hash_mask; 58 uint32_t hash_mask;
54 unsigned hash_shift; 59 unsigned hash_shift;
55 struct list_head *table; 60 struct list_head *table;
@@ -59,22 +64,31 @@ struct dm_snapshot {
59 struct rw_semaphore lock; 64 struct rw_semaphore lock;
60 65
61 struct dm_dev *origin; 66 struct dm_dev *origin;
67 struct dm_dev *cow;
68
69 struct dm_target *ti;
62 70
63 /* List of snapshots per Origin */ 71 /* List of snapshots per Origin */
64 struct list_head list; 72 struct list_head list;
65 73
66 /* You can't use a snapshot if this is 0 (e.g. if full) */ 74 /*
75 * You can't use a snapshot if this is 0 (e.g. if full).
76 * A snapshot-merge target never clears this.
77 */
67 int valid; 78 int valid;
68 79
69 /* Origin writes don't trigger exceptions until this is set */ 80 /* Origin writes don't trigger exceptions until this is set */
70 int active; 81 int active;
71 82
83 /* Whether or not owning mapped_device is suspended */
84 int suspended;
85
72 mempool_t *pending_pool; 86 mempool_t *pending_pool;
73 87
74 atomic_t pending_exceptions_count; 88 atomic_t pending_exceptions_count;
75 89
76 struct exception_table pending; 90 struct dm_exception_table pending;
77 struct exception_table complete; 91 struct dm_exception_table complete;
78 92
79 /* 93 /*
80 * pe_lock protects all pending_exception operations and access 94 * pe_lock protects all pending_exception operations and access
@@ -95,8 +109,51 @@ struct dm_snapshot {
95 mempool_t *tracked_chunk_pool; 109 mempool_t *tracked_chunk_pool;
96 spinlock_t tracked_chunk_lock; 110 spinlock_t tracked_chunk_lock;
97 struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; 111 struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
112
113 /*
114 * The merge operation failed if this flag is set.
115 * Failure modes are handled as follows:
116 * - I/O error reading the header
117 * => don't load the target; abort.
118 * - Header does not have "valid" flag set
119 * => use the origin; forget about the snapshot.
120 * - I/O error when reading exceptions
121 * => don't load the target; abort.
122 * (We can't use the intermediate origin state.)
123 * - I/O error while merging
124 * => stop merging; set merge_failed; process I/O normally.
125 */
126 int merge_failed;
127
128 /* Wait for events based on state_bits */
129 unsigned long state_bits;
130
131 /* Range of chunks currently being merged. */
132 chunk_t first_merging_chunk;
133 int num_merging_chunks;
134
135 /*
136 * Incoming bios that overlap with chunks being merged must wait
137 * for them to be committed.
138 */
139 struct bio_list bios_queued_during_merge;
98}; 140};
99 141
142/*
143 * state_bits:
144 * RUNNING_MERGE - Merge operation is in progress.
145 * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped;
146 * cleared afterwards.
147 */
148#define RUNNING_MERGE 0
149#define SHUTDOWN_MERGE 1
150
151struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
152{
153 return s->cow;
154}
155EXPORT_SYMBOL(dm_snap_cow);
156
100static struct workqueue_struct *ksnapd; 157static struct workqueue_struct *ksnapd;
101static void flush_queued_bios(struct work_struct *work); 158static void flush_queued_bios(struct work_struct *work);
102 159
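The state_bits added above implement a small protocol: RUNNING_MERGE says a merge is in flight and SHUTDOWN_MERGE asks it to stop, being cleared once the request has been honoured. A minimal model using plain bit operations in place of the kernel's atomic bitops and wait queue:

/* Toy model of the RUNNING_MERGE/SHUTDOWN_MERGE handshake. */
#include <stdio.h>

#define RUNNING_MERGE  0
#define SHUTDOWN_MERGE 1

static unsigned long state_bits;

static void start_merge(void) { state_bits |= 1UL << RUNNING_MERGE; }
static void stop_merge(void)  { state_bits |= 1UL << SHUTDOWN_MERGE; }

static void merge_step(void)
{
    if (state_bits & (1UL << SHUTDOWN_MERGE)) {
        /* Acknowledge the shutdown request and stop running. */
        state_bits &= ~(1UL << SHUTDOWN_MERGE);
        state_bits &= ~(1UL << RUNNING_MERGE);
    }
}

int main(void)
{
    start_merge();
    stop_merge();
    merge_step();
    printf("state_bits=%lu\n", state_bits);  /* 0: merge stopped cleanly */
    return 0;
}
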
@@ -116,7 +173,7 @@ static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
116} 173}
117 174
118struct dm_snap_pending_exception { 175struct dm_snap_pending_exception {
119 struct dm_snap_exception e; 176 struct dm_exception e;
120 177
121 /* 178 /*
122 * Origin buffers waiting for this to complete are held 179 * Origin buffers waiting for this to complete are held
@@ -125,28 +182,6 @@ struct dm_snap_pending_exception {
125 struct bio_list origin_bios; 182 struct bio_list origin_bios;
126 struct bio_list snapshot_bios; 183 struct bio_list snapshot_bios;
127 184
128 /*
129 * Short-term queue of pending exceptions prior to submission.
130 */
131 struct list_head list;
132
133 /*
134 * The primary pending_exception is the one that holds
135 * the ref_count and the list of origin_bios for a
136 * group of pending_exceptions. It is always last to get freed.
137 * These fields get set up when writing to the origin.
138 */
139 struct dm_snap_pending_exception *primary_pe;
140
141 /*
142 * Number of pending_exceptions processing this chunk.
143 * When this drops to zero we must complete the origin bios.
144 * If incrementing or decrementing this, hold pe->snap->lock for
145 * the sibling concerned and not pe->primary_pe->snap->lock unless
146 * they are the same.
147 */
148 atomic_t ref_count;
149
150 /* Pointer back to snapshot context */ 185 /* Pointer back to snapshot context */
151 struct dm_snapshot *snap; 186 struct dm_snapshot *snap;
152 187
@@ -222,6 +257,16 @@ static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
222} 257}
223 258
224/* 259/*
260 * This conflicting I/O is extremely improbable in the caller,
261 * so msleep(1) is sufficient and there is no need for a wait queue.
262 */
263static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
264{
265 while (__chunk_is_tracked(s, chunk))
266 msleep(1);
267}
268
269/*
225 * One of these per registered origin, held in the snapshot_origins hash 270 * One of these per registered origin, held in the snapshot_origins hash
226 */ 271 */
227struct origin { 272struct origin {
@@ -243,6 +288,10 @@ struct origin {
243static struct list_head *_origins; 288static struct list_head *_origins;
244static struct rw_semaphore _origins_lock; 289static struct rw_semaphore _origins_lock;
245 290
291static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
292static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock);
293static uint64_t _pending_exceptions_done_count;
294
246static int init_origin_hash(void) 295static int init_origin_hash(void)
247{ 296{
248 int i; 297 int i;
@@ -291,22 +340,144 @@ static void __insert_origin(struct origin *o)
291} 340}
292 341
293/* 342/*
343 * _origins_lock must be held when calling this function.
344 * Returns number of snapshots registered using the supplied cow device, plus:
345 * snap_src - a snapshot suitable for use as a source of exception handover
346 * snap_dest - a snapshot capable of receiving exception handover.
347 * snap_merge - an existing snapshot-merge target linked to the same origin.
348 * There can be at most one snapshot-merge target. The parameter is optional.
349 *
350 * Possible return values and states of snap_src and snap_dest.
351 * 0: NULL, NULL - first new snapshot
352 * 1: snap_src, NULL - normal snapshot
353 * 2: snap_src, snap_dest - waiting for handover
354 * 2: snap_src, NULL - handed over, waiting for old to be deleted
355 * 1: NULL, snap_dest - source got destroyed without handover
356 */
357static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
358 struct dm_snapshot **snap_src,
359 struct dm_snapshot **snap_dest,
360 struct dm_snapshot **snap_merge)
361{
362 struct dm_snapshot *s;
363 struct origin *o;
364 int count = 0;
365 int active;
366
367 o = __lookup_origin(snap->origin->bdev);
368 if (!o)
369 goto out;
370
371 list_for_each_entry(s, &o->snapshots, list) {
372 if (dm_target_is_snapshot_merge(s->ti) && snap_merge)
373 *snap_merge = s;
374 if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
375 continue;
376
377 down_read(&s->lock);
378 active = s->active;
379 up_read(&s->lock);
380
381 if (active) {
382 if (snap_src)
383 *snap_src = s;
384 } else if (snap_dest)
385 *snap_dest = s;
386
387 count++;
388 }
389
390out:
391 return count;
392}
393
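The return-value table above maps onto a small decision function. The sketch below is a plain user-space illustration, not part of the patch: the handover_state() helper and its state strings are invented here, but the (count, snap_src, snap_dest) combinations follow the comment.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical helper mirroring the comment block above. */
static const char *handover_state(int count, bool have_src, bool have_dest)
{
	if (count == 0 && !have_src && !have_dest)
		return "first new snapshot";
	if (count == 1 && have_src && !have_dest)
		return "normal snapshot";
	if (count == 2 && have_src && have_dest)
		return "waiting for handover";
	if (count == 2 && have_src && !have_dest)
		return "handed over, old snapshot awaiting deletion";
	if (count == 1 && !have_src && have_dest)
		return "source destroyed without handover";
	return "unexpected combination";
}

int main(void)
{
	printf("%s\n", handover_state(2, true, true));   /* waiting for handover */
	printf("%s\n", handover_state(1, true, false));  /* normal snapshot */
	return 0;
}
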
394/*
395 * On success, returns 1 if this snapshot is a handover destination,
396 * otherwise returns 0.
397 */
398static int __validate_exception_handover(struct dm_snapshot *snap)
399{
400 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
401 struct dm_snapshot *snap_merge = NULL;
402
403 /* Does snapshot need exceptions handed over to it? */
404 if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
405 &snap_merge) == 2) ||
406 snap_dest) {
407 snap->ti->error = "Snapshot cow pairing for exception "
408 "table handover failed";
409 return -EINVAL;
410 }
411
412 /*
413 * If no snap_src was found, snap cannot become a handover
414 * destination.
415 */
416 if (!snap_src)
417 return 0;
418
419 /*
420 * Non-snapshot-merge handover?
421 */
422 if (!dm_target_is_snapshot_merge(snap->ti))
423 return 1;
424
425 /*
426 * Do not allow more than one merging snapshot.
427 */
428 if (snap_merge) {
429 snap->ti->error = "A snapshot is already merging.";
430 return -EINVAL;
431 }
432
433 if (!snap_src->store->type->prepare_merge ||
434 !snap_src->store->type->commit_merge) {
435 snap->ti->error = "Snapshot exception store does not "
436 "support snapshot-merge.";
437 return -EINVAL;
438 }
439
440 return 1;
441}
442
443static void __insert_snapshot(struct origin *o, struct dm_snapshot *s)
444{
445 struct dm_snapshot *l;
446
447 /* Sort the list according to chunk size, largest-first smallest-last */
448 list_for_each_entry(l, &o->snapshots, list)
449 if (l->store->chunk_size < s->store->chunk_size)
450 break;
451 list_add_tail(&s->list, &l->list);
452}
453
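__insert_snapshot() keeps each origin's snapshot list ordered by chunk size, largest first. The following is a minimal user-space sketch of the same sorted insert; struct snap, insert_sorted() and the three chunk sizes are made up for the illustration rather than taken from the kernel's list_head API.

#include <stdio.h>

struct snap {
	unsigned chunk_size;
	struct snap *next;
};

/* Insert so the list stays ordered largest chunk size first. */
static void insert_sorted(struct snap **head, struct snap *s)
{
	struct snap **pos = head;

	while (*pos && (*pos)->chunk_size >= s->chunk_size)
		pos = &(*pos)->next;
	s->next = *pos;
	*pos = s;
}

int main(void)
{
	struct snap a = { 16, NULL }, b = { 64, NULL }, c = { 32, NULL };
	struct snap *head = NULL, *s;

	insert_sorted(&head, &a);
	insert_sorted(&head, &b);
	insert_sorted(&head, &c);

	for (s = head; s; s = s->next)
		printf("%u ", s->chunk_size);   /* prints: 64 32 16 */
	printf("\n");
	return 0;
}
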
454/*
294 * Make a note of the snapshot and its origin so we can look it 455 * Make a note of the snapshot and its origin so we can look it
295 * up when the origin has a write on it. 456 * up when the origin has a write on it.
457 *
458 * Also validate snapshot exception store handovers.
459 * On success, returns 1 if this registration is a handover destination,
460 * otherwise returns 0.
296 */ 461 */
297static int register_snapshot(struct dm_snapshot *snap) 462static int register_snapshot(struct dm_snapshot *snap)
298{ 463{
299 struct dm_snapshot *l; 464 struct origin *o, *new_o = NULL;
300 struct origin *o, *new_o;
301 struct block_device *bdev = snap->origin->bdev; 465 struct block_device *bdev = snap->origin->bdev;
466 int r = 0;
302 467
303 new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); 468 new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
304 if (!new_o) 469 if (!new_o)
305 return -ENOMEM; 470 return -ENOMEM;
306 471
307 down_write(&_origins_lock); 472 down_write(&_origins_lock);
308 o = __lookup_origin(bdev);
309 473
474 r = __validate_exception_handover(snap);
475 if (r < 0) {
476 kfree(new_o);
477 goto out;
478 }
479
480 o = __lookup_origin(bdev);
310 if (o) 481 if (o)
311 kfree(new_o); 482 kfree(new_o);
312 else { 483 else {
@@ -320,14 +491,27 @@ static int register_snapshot(struct dm_snapshot *snap)
320 __insert_origin(o); 491 __insert_origin(o);
321 } 492 }
322 493
323 /* Sort the list according to chunk size, largest-first smallest-last */ 494 __insert_snapshot(o, snap);
324 list_for_each_entry(l, &o->snapshots, list) 495
325 if (l->store->chunk_size < snap->store->chunk_size) 496out:
326 break; 497 up_write(&_origins_lock);
327 list_add_tail(&snap->list, &l->list); 498
499 return r;
500}
501
502/*
503 * Move snapshot to correct place in list according to chunk size.
504 */
505static void reregister_snapshot(struct dm_snapshot *s)
506{
507 struct block_device *bdev = s->origin->bdev;
508
509 down_write(&_origins_lock);
510
511 list_del(&s->list);
512 __insert_snapshot(__lookup_origin(bdev), s);
328 513
329 up_write(&_origins_lock); 514 up_write(&_origins_lock);
330 return 0;
331} 515}
332 516
333static void unregister_snapshot(struct dm_snapshot *s) 517static void unregister_snapshot(struct dm_snapshot *s)
@@ -338,7 +522,7 @@ static void unregister_snapshot(struct dm_snapshot *s)
338 o = __lookup_origin(s->origin->bdev); 522 o = __lookup_origin(s->origin->bdev);
339 523
340 list_del(&s->list); 524 list_del(&s->list);
341 if (list_empty(&o->snapshots)) { 525 if (o && list_empty(&o->snapshots)) {
342 list_del(&o->hash_list); 526 list_del(&o->hash_list);
343 kfree(o); 527 kfree(o);
344 } 528 }
@@ -351,8 +535,8 @@ static void unregister_snapshot(struct dm_snapshot *s)
351 * The lowest hash_shift bits of the chunk number are ignored, allowing 535 * The lowest hash_shift bits of the chunk number are ignored, allowing
352 * some consecutive chunks to be grouped together. 536 * some consecutive chunks to be grouped together.
353 */ 537 */
354static int init_exception_table(struct exception_table *et, uint32_t size, 538static int dm_exception_table_init(struct dm_exception_table *et,
355 unsigned hash_shift) 539 uint32_t size, unsigned hash_shift)
356{ 540{
357 unsigned int i; 541 unsigned int i;
358 542
@@ -368,10 +552,11 @@ static int init_exception_table(struct exception_table *et, uint32_t size,
368 return 0; 552 return 0;
369} 553}
370 554
371static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem) 555static void dm_exception_table_exit(struct dm_exception_table *et,
556 struct kmem_cache *mem)
372{ 557{
373 struct list_head *slot; 558 struct list_head *slot;
374 struct dm_snap_exception *ex, *next; 559 struct dm_exception *ex, *next;
375 int i, size; 560 int i, size;
376 561
377 size = et->hash_mask + 1; 562 size = et->hash_mask + 1;
@@ -385,19 +570,12 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache *
385 vfree(et->table); 570 vfree(et->table);
386} 571}
387 572
388static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) 573static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
389{ 574{
390 return (chunk >> et->hash_shift) & et->hash_mask; 575 return (chunk >> et->hash_shift) & et->hash_mask;
391} 576}
392 577
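As the earlier comment notes, the lowest hash_shift bits of the chunk number are ignored so that runs of consecutive chunks land in the same hash bucket. A standalone sketch of that hash follows; the shift and table size are hypothetical values chosen only to show the grouping.

#include <stdio.h>
#include <stdint.h>

static uint32_t exception_hash(uint64_t chunk, unsigned hash_shift,
			       uint32_t hash_mask)
{
	return (uint32_t)(chunk >> hash_shift) & hash_mask;
}

int main(void)
{
	unsigned hash_shift = 4;        /* group runs of 16 consecutive chunks */
	uint32_t hash_mask = 1023;      /* 1024-bucket table */
	uint64_t chunk;

	for (chunk = 30; chunk < 36; chunk++)
		printf("chunk %llu -> bucket %u\n",
		       (unsigned long long)chunk,
		       exception_hash(chunk, hash_shift, hash_mask));
	/* chunks 30..31 hash to bucket 1, 32..35 all hash to bucket 2 */
	return 0;
}
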
393static void insert_exception(struct exception_table *eh, 578static void dm_remove_exception(struct dm_exception *e)
394 struct dm_snap_exception *e)
395{
396 struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
397 list_add(&e->hash_list, l);
398}
399
400static void remove_exception(struct dm_snap_exception *e)
401{ 579{
402 list_del(&e->hash_list); 580 list_del(&e->hash_list);
403} 581}
@@ -406,11 +584,11 @@ static void remove_exception(struct dm_snap_exception *e)
406 * Return the exception data for a sector, or NULL if not 584 * Return the exception data for a sector, or NULL if not
407 * remapped. 585 * remapped.
408 */ 586 */
409static struct dm_snap_exception *lookup_exception(struct exception_table *et, 587static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
410 chunk_t chunk) 588 chunk_t chunk)
411{ 589{
412 struct list_head *slot; 590 struct list_head *slot;
413 struct dm_snap_exception *e; 591 struct dm_exception *e;
414 592
415 slot = &et->table[exception_hash(et, chunk)]; 593 slot = &et->table[exception_hash(et, chunk)];
416 list_for_each_entry (e, slot, hash_list) 594 list_for_each_entry (e, slot, hash_list)
@@ -421,9 +599,9 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et,
421 return NULL; 599 return NULL;
422} 600}
423 601
424static struct dm_snap_exception *alloc_exception(void) 602static struct dm_exception *alloc_completed_exception(void)
425{ 603{
426 struct dm_snap_exception *e; 604 struct dm_exception *e;
427 605
428 e = kmem_cache_alloc(exception_cache, GFP_NOIO); 606 e = kmem_cache_alloc(exception_cache, GFP_NOIO);
429 if (!e) 607 if (!e)
@@ -432,7 +610,7 @@ static struct dm_snap_exception *alloc_exception(void)
432 return e; 610 return e;
433} 611}
434 612
435static void free_exception(struct dm_snap_exception *e) 613static void free_completed_exception(struct dm_exception *e)
436{ 614{
437 kmem_cache_free(exception_cache, e); 615 kmem_cache_free(exception_cache, e);
438} 616}
@@ -457,12 +635,11 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
457 atomic_dec(&s->pending_exceptions_count); 635 atomic_dec(&s->pending_exceptions_count);
458} 636}
459 637
460static void insert_completed_exception(struct dm_snapshot *s, 638static void dm_insert_exception(struct dm_exception_table *eh,
461 struct dm_snap_exception *new_e) 639 struct dm_exception *new_e)
462{ 640{
463 struct exception_table *eh = &s->complete;
464 struct list_head *l; 641 struct list_head *l;
465 struct dm_snap_exception *e = NULL; 642 struct dm_exception *e = NULL;
466 643
467 l = &eh->table[exception_hash(eh, new_e->old_chunk)]; 644 l = &eh->table[exception_hash(eh, new_e->old_chunk)];
468 645
@@ -478,7 +655,7 @@ static void insert_completed_exception(struct dm_snapshot *s,
478 new_e->new_chunk == (dm_chunk_number(e->new_chunk) + 655 new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
479 dm_consecutive_chunk_count(e) + 1)) { 656 dm_consecutive_chunk_count(e) + 1)) {
480 dm_consecutive_chunk_count_inc(e); 657 dm_consecutive_chunk_count_inc(e);
481 free_exception(new_e); 658 free_completed_exception(new_e);
482 return; 659 return;
483 } 660 }
484 661
@@ -488,7 +665,7 @@ static void insert_completed_exception(struct dm_snapshot *s,
488 dm_consecutive_chunk_count_inc(e); 665 dm_consecutive_chunk_count_inc(e);
489 e->old_chunk--; 666 e->old_chunk--;
490 e->new_chunk--; 667 e->new_chunk--;
491 free_exception(new_e); 668 free_completed_exception(new_e);
492 return; 669 return;
493 } 670 }
494 671
@@ -507,9 +684,9 @@ out:
507static int dm_add_exception(void *context, chunk_t old, chunk_t new) 684static int dm_add_exception(void *context, chunk_t old, chunk_t new)
508{ 685{
509 struct dm_snapshot *s = context; 686 struct dm_snapshot *s = context;
510 struct dm_snap_exception *e; 687 struct dm_exception *e;
511 688
512 e = alloc_exception(); 689 e = alloc_completed_exception();
513 if (!e) 690 if (!e)
514 return -ENOMEM; 691 return -ENOMEM;
515 692
@@ -518,11 +695,30 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
518 /* Consecutive_count is implicitly initialised to zero */ 695 /* Consecutive_count is implicitly initialised to zero */
519 e->new_chunk = new; 696 e->new_chunk = new;
520 697
521 insert_completed_exception(s, e); 698 dm_insert_exception(&s->complete, e);
522 699
523 return 0; 700 return 0;
524} 701}
525 702
703#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
704
705/*
706 * Return a minimum chunk size of all snapshots that have the specified origin.
707 * Return zero if the origin has no snapshots.
708 */
709static sector_t __minimum_chunk_size(struct origin *o)
710{
711 struct dm_snapshot *snap;
712 unsigned chunk_size = 0;
713
714 if (o)
715 list_for_each_entry(snap, &o->snapshots, list)
716 chunk_size = min_not_zero(chunk_size,
717 snap->store->chunk_size);
718
719 return chunk_size;
720}
721
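The new min_not_zero() macro picks the smaller of two sizes while treating zero as "not set yet", so __minimum_chunk_size() can fold over the list starting from zero and skip snapshots whose chunk size is still zero (as it is during handover). A quick user-space check of the macro, with made-up chunk sizes:

#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))
#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))

int main(void)
{
	unsigned sizes[] = { 0, 16, 8, 0, 32 };   /* hypothetical chunk sizes */
	unsigned chunk_size = 0, i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		chunk_size = min_not_zero(chunk_size, sizes[i]);

	printf("%u\n", chunk_size);   /* prints 8: zeroes are ignored */
	return 0;
}
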
526/* 722/*
527 * Hard coded magic. 723 * Hard coded magic.
528 */ 724 */
@@ -546,16 +742,18 @@ static int init_hash_tables(struct dm_snapshot *s)
546 * Calculate based on the size of the original volume or 742 * Calculate based on the size of the original volume or
547 * the COW volume... 743 * the COW volume...
548 */ 744 */
549 cow_dev_size = get_dev_size(s->store->cow->bdev); 745 cow_dev_size = get_dev_size(s->cow->bdev);
550 origin_dev_size = get_dev_size(s->origin->bdev); 746 origin_dev_size = get_dev_size(s->origin->bdev);
551 max_buckets = calc_max_buckets(); 747 max_buckets = calc_max_buckets();
552 748
553 hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; 749 hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
554 hash_size = min(hash_size, max_buckets); 750 hash_size = min(hash_size, max_buckets);
555 751
752 if (hash_size < 64)
753 hash_size = 64;
556 hash_size = rounddown_pow_of_two(hash_size); 754 hash_size = rounddown_pow_of_two(hash_size);
557 if (init_exception_table(&s->complete, hash_size, 755 if (dm_exception_table_init(&s->complete, hash_size,
558 DM_CHUNK_CONSECUTIVE_BITS)) 756 DM_CHUNK_CONSECUTIVE_BITS))
559 return -ENOMEM; 757 return -ENOMEM;
560 758
561 /* 759 /*
@@ -566,14 +764,284 @@ static int init_hash_tables(struct dm_snapshot *s)
566 if (hash_size < 64) 764 if (hash_size < 64)
567 hash_size = 64; 765 hash_size = 64;
568 766
569 if (init_exception_table(&s->pending, hash_size, 0)) { 767 if (dm_exception_table_init(&s->pending, hash_size, 0)) {
570 exit_exception_table(&s->complete, exception_cache); 768 dm_exception_table_exit(&s->complete, exception_cache);
571 return -ENOMEM; 769 return -ENOMEM;
572 } 770 }
573 771
574 return 0; 772 return 0;
575} 773}
576 774
775static void merge_shutdown(struct dm_snapshot *s)
776{
777 clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
778 smp_mb__after_clear_bit();
779 wake_up_bit(&s->state_bits, RUNNING_MERGE);
780}
781
782static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s)
783{
784 s->first_merging_chunk = 0;
785 s->num_merging_chunks = 0;
786
787 return bio_list_get(&s->bios_queued_during_merge);
788}
789
790/*
791 * Remove one chunk from the index of completed exceptions.
792 */
793static int __remove_single_exception_chunk(struct dm_snapshot *s,
794 chunk_t old_chunk)
795{
796 struct dm_exception *e;
797
798 e = dm_lookup_exception(&s->complete, old_chunk);
799 if (!e) {
800 DMERR("Corruption detected: exception for block %llu is "
801 "on disk but not in memory",
802 (unsigned long long)old_chunk);
803 return -EINVAL;
804 }
805
806 /*
807 * If this is the only chunk using this exception, remove exception.
808 */
809 if (!dm_consecutive_chunk_count(e)) {
810 dm_remove_exception(e);
811 free_completed_exception(e);
812 return 0;
813 }
814
815 /*
816 * The chunk may be either at the beginning or the end of a
817 * group of consecutive chunks - never in the middle. We are
818 * removing chunks in the opposite order to that in which they
819 * were added, so this should always be true.
820 * Decrement the consecutive chunk counter and adjust the
821 * starting point if necessary.
822 */
823 if (old_chunk == e->old_chunk) {
824 e->old_chunk++;
825 e->new_chunk++;
826 } else if (old_chunk != e->old_chunk +
827 dm_consecutive_chunk_count(e)) {
828 DMERR("Attempt to merge block %llu from the "
829 "middle of a chunk range [%llu - %llu]",
830 (unsigned long long)old_chunk,
831 (unsigned long long)e->old_chunk,
832 (unsigned long long)
833 e->old_chunk + dm_consecutive_chunk_count(e));
834 return -EINVAL;
835 }
836
837 dm_consecutive_chunk_count_dec(e);
838
839 return 0;
840}
841
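The constraint spelled out above, that a chunk is only ever removed from the start or the end of a consecutive run because merging works in the reverse of allocation order, can be shown without kernel types. In this sketch, struct range and remove_chunk() are invented stand-ins for dm_exception and __remove_single_exception_chunk().

#include <stdio.h>
#include <stdint.h>

struct range {
	uint64_t old_chunk;     /* first origin chunk of the run */
	uint64_t new_chunk;     /* first COW chunk of the run */
	unsigned consecutive;   /* extra chunks beyond the first */
};

/* Returns 1 if the run shrank, 0 if the caller should drop the whole
 * entry, -1 if the chunk sits illegally in the middle of the run. */
static int remove_chunk(struct range *r, uint64_t chunk)
{
	if (!r->consecutive)
		return 0;
	if (chunk == r->old_chunk) {
		r->old_chunk++;                  /* run now starts one chunk later */
		r->new_chunk++;
	} else if (chunk != r->old_chunk + r->consecutive) {
		return -1;                       /* middle of the run: refuse */
	}
	r->consecutive--;                        /* one fewer chunk either way */
	return 1;
}

int main(void)
{
	struct range r = { 100, 500, 3 };        /* chunks 100..103 -> 500..503 */

	remove_chunk(&r, 103);                   /* merge order: last chunk first */
	printf("%llu..%llu\n",
	       (unsigned long long)r.old_chunk,
	       (unsigned long long)(r.old_chunk + r.consecutive));  /* 100..102 */
	return 0;
}
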
842static void flush_bios(struct bio *bio);
843
844static int remove_single_exception_chunk(struct dm_snapshot *s)
845{
846 struct bio *b = NULL;
847 int r;
848 chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
849
850 down_write(&s->lock);
851
852 /*
853 * Process chunks (and associated exceptions) in reverse order
854 * so that dm_consecutive_chunk_count_dec() accounting works.
855 */
856 do {
857 r = __remove_single_exception_chunk(s, old_chunk);
858 if (r)
859 goto out;
860 } while (old_chunk-- > s->first_merging_chunk);
861
862 b = __release_queued_bios_after_merge(s);
863
864out:
865 up_write(&s->lock);
866 if (b)
867 flush_bios(b);
868
869 return r;
870}
871
872static int origin_write_extent(struct dm_snapshot *merging_snap,
873 sector_t sector, unsigned chunk_size);
874
875static void merge_callback(int read_err, unsigned long write_err,
876 void *context);
877
878static uint64_t read_pending_exceptions_done_count(void)
879{
880 uint64_t pending_exceptions_done;
881
882 spin_lock(&_pending_exceptions_done_spinlock);
883 pending_exceptions_done = _pending_exceptions_done_count;
884 spin_unlock(&_pending_exceptions_done_spinlock);
885
886 return pending_exceptions_done;
887}
888
889static void increment_pending_exceptions_done_count(void)
890{
891 spin_lock(&_pending_exceptions_done_spinlock);
892 _pending_exceptions_done_count++;
893 spin_unlock(&_pending_exceptions_done_spinlock);
894
895 wake_up_all(&_pending_exceptions_done);
896}
897
898static void snapshot_merge_next_chunks(struct dm_snapshot *s)
899{
900 int i, linear_chunks;
901 chunk_t old_chunk, new_chunk;
902 struct dm_io_region src, dest;
903 sector_t io_size;
904 uint64_t previous_count;
905
906 BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
907 if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits)))
908 goto shut;
909
910 /*
911 * valid flag never changes during merge, so no lock required.
912 */
913 if (!s->valid) {
914 DMERR("Snapshot is invalid: can't merge");
915 goto shut;
916 }
917
918 linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk,
919 &new_chunk);
920 if (linear_chunks <= 0) {
921 if (linear_chunks < 0) {
922 DMERR("Read error in exception store: "
923 "shutting down merge");
924 down_write(&s->lock);
925 s->merge_failed = 1;
926 up_write(&s->lock);
927 }
928 goto shut;
929 }
930
931 /* Adjust old_chunk and new_chunk to reflect start of linear region */
932 old_chunk = old_chunk + 1 - linear_chunks;
933 new_chunk = new_chunk + 1 - linear_chunks;
934
935 /*
936 * Use one (potentially large) I/O to copy all 'linear_chunks'
937 * from the exception store to the origin
938 */
939 io_size = linear_chunks * s->store->chunk_size;
940
941 dest.bdev = s->origin->bdev;
942 dest.sector = chunk_to_sector(s->store, old_chunk);
943 dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector);
944
945 src.bdev = s->cow->bdev;
946 src.sector = chunk_to_sector(s->store, new_chunk);
947 src.count = dest.count;
948
949 /*
950 * Reallocate any exceptions needed in other snapshots then
951 * wait for the pending exceptions to complete.
952 * Each time any pending exception (globally on the system)
953 * completes we are woken and repeat the process to find out
954 * if we can proceed. While this may not seem a particularly
955 * efficient algorithm, it is not expected to have any
956 * significant impact on performance.
957 */
958 previous_count = read_pending_exceptions_done_count();
959 while (origin_write_extent(s, dest.sector, io_size)) {
960 wait_event(_pending_exceptions_done,
961 (read_pending_exceptions_done_count() !=
962 previous_count));
963 /* Retry after the wait, until all exceptions are done. */
964 previous_count = read_pending_exceptions_done_count();
965 }
966
967 down_write(&s->lock);
968 s->first_merging_chunk = old_chunk;
969 s->num_merging_chunks = linear_chunks;
970 up_write(&s->lock);
971
972 /* Wait until writes to all 'linear_chunks' drain */
973 for (i = 0; i < linear_chunks; i++)
974 __check_for_conflicting_io(s, old_chunk + i);
975
976 dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
977 return;
978
979shut:
980 merge_shutdown(s);
981}
982
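The comment above describes a deliberately simple scheme: snapshot the global completion counter, try the work, and if anything is still pending, sleep until the counter moves before retrying. The pthreads sketch below mimics that pattern outside the kernel; the completer thread, the variable names and the three fake outstanding exceptions are all invented for the illustration (build with -pthread).

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;
static uint64_t done_count;                /* completions so far */
static int outstanding = 3;                /* fake pending exceptions */

static uint64_t read_done_count(void)
{
	uint64_t v;

	pthread_mutex_lock(&done_lock);
	v = done_count;
	pthread_mutex_unlock(&done_lock);
	return v;
}

/* Stand-in for pending_complete(): each completion bumps the counter and
 * wakes anyone waiting for progress. */
static void *completer(void *arg)
{
	(void)arg;
	while (1) {
		usleep(1000);
		pthread_mutex_lock(&done_lock);
		if (!outstanding) {
			pthread_mutex_unlock(&done_lock);
			return NULL;
		}
		outstanding--;
		done_count++;
		pthread_cond_broadcast(&done_cond);
		pthread_mutex_unlock(&done_lock);
	}
}

/* Stand-in for origin_write_extent(): nonzero while reallocations remain. */
static int work_pending(void)
{
	int n;

	pthread_mutex_lock(&done_lock);
	n = outstanding;
	pthread_mutex_unlock(&done_lock);
	return n;
}

int main(void)
{
	pthread_t t;
	uint64_t previous = read_done_count();

	pthread_create(&t, NULL, completer, NULL);

	while (work_pending()) {               /* retry until nothing is pending */
		pthread_mutex_lock(&done_lock);
		while (done_count == previous)  /* sleep until the counter moves */
			pthread_cond_wait(&done_cond, &done_lock);
		pthread_mutex_unlock(&done_lock);
		previous = read_done_count();
	}

	pthread_join(t, NULL);
	printf("merge may proceed after %llu completions\n",
	       (unsigned long long)done_count);
	return 0;
}
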
983static void error_bios(struct bio *bio);
984
985static void merge_callback(int read_err, unsigned long write_err, void *context)
986{
987 struct dm_snapshot *s = context;
988 struct bio *b = NULL;
989
990 if (read_err || write_err) {
991 if (read_err)
992 DMERR("Read error: shutting down merge.");
993 else
994 DMERR("Write error: shutting down merge.");
995 goto shut;
996 }
997
998 if (s->store->type->commit_merge(s->store,
999 s->num_merging_chunks) < 0) {
1000 DMERR("Write error in exception store: shutting down merge");
1001 goto shut;
1002 }
1003
1004 if (remove_single_exception_chunk(s) < 0)
1005 goto shut;
1006
1007 snapshot_merge_next_chunks(s);
1008
1009 return;
1010
1011shut:
1012 down_write(&s->lock);
1013 s->merge_failed = 1;
1014 b = __release_queued_bios_after_merge(s);
1015 up_write(&s->lock);
1016 error_bios(b);
1017
1018 merge_shutdown(s);
1019}
1020
1021static void start_merge(struct dm_snapshot *s)
1022{
1023 if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits))
1024 snapshot_merge_next_chunks(s);
1025}
1026
1027static int wait_schedule(void *ptr)
1028{
1029 schedule();
1030
1031 return 0;
1032}
1033
1034/*
1035 * Stop the merging process and wait until it finishes.
1036 */
1037static void stop_merge(struct dm_snapshot *s)
1038{
1039 set_bit(SHUTDOWN_MERGE, &s->state_bits);
1040 wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule,
1041 TASK_UNINTERRUPTIBLE);
1042 clear_bit(SHUTDOWN_MERGE, &s->state_bits);
1043}
1044
577/* 1045/*
578 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> 1046 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
579 */ 1047 */
@@ -582,50 +1050,73 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
582 struct dm_snapshot *s; 1050 struct dm_snapshot *s;
583 int i; 1051 int i;
584 int r = -EINVAL; 1052 int r = -EINVAL;
585 char *origin_path; 1053 char *origin_path, *cow_path;
586 struct dm_exception_store *store; 1054 unsigned args_used, num_flush_requests = 1;
587 unsigned args_used; 1055 fmode_t origin_mode = FMODE_READ;
588 1056
589 if (argc != 4) { 1057 if (argc != 4) {
590 ti->error = "requires exactly 4 arguments"; 1058 ti->error = "requires exactly 4 arguments";
591 r = -EINVAL; 1059 r = -EINVAL;
592 goto bad_args; 1060 goto bad;
1061 }
1062
1063 if (dm_target_is_snapshot_merge(ti)) {
1064 num_flush_requests = 2;
1065 origin_mode = FMODE_WRITE;
593 } 1066 }
594 1067
595 origin_path = argv[0]; 1068 origin_path = argv[0];
596 argv++; 1069 argv++;
597 argc--; 1070 argc--;
598 1071
599 r = dm_exception_store_create(ti, argc, argv, &args_used, &store); 1072 s = kmalloc(sizeof(*s), GFP_KERNEL);
1073 if (!s) {
1074 ti->error = "Cannot allocate snapshot context private "
1075 "structure";
1076 r = -ENOMEM;
1077 goto bad;
1078 }
1079
1080 cow_path = argv[0];
1081 argv++;
1082 argc--;
1083
1084 r = dm_get_device(ti, cow_path, 0, 0,
1085 FMODE_READ | FMODE_WRITE, &s->cow);
1086 if (r) {
1087 ti->error = "Cannot get COW device";
1088 goto bad_cow;
1089 }
1090
1091 r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
600 if (r) { 1092 if (r) {
601 ti->error = "Couldn't create exception store"; 1093 ti->error = "Couldn't create exception store";
602 r = -EINVAL; 1094 r = -EINVAL;
603 goto bad_args; 1095 goto bad_store;
604 } 1096 }
605 1097
606 argv += args_used; 1098 argv += args_used;
607 argc -= args_used; 1099 argc -= args_used;
608 1100
609 s = kmalloc(sizeof(*s), GFP_KERNEL); 1101 r = dm_get_device(ti, origin_path, 0, ti->len, origin_mode, &s->origin);
610 if (!s) {
611 ti->error = "Cannot allocate snapshot context private "
612 "structure";
613 r = -ENOMEM;
614 goto bad_snap;
615 }
616
617 r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
618 if (r) { 1102 if (r) {
619 ti->error = "Cannot get origin device"; 1103 ti->error = "Cannot get origin device";
620 goto bad_origin; 1104 goto bad_origin;
621 } 1105 }
622 1106
623 s->store = store; 1107 s->ti = ti;
624 s->valid = 1; 1108 s->valid = 1;
625 s->active = 0; 1109 s->active = 0;
1110 s->suspended = 0;
626 atomic_set(&s->pending_exceptions_count, 0); 1111 atomic_set(&s->pending_exceptions_count, 0);
627 init_rwsem(&s->lock); 1112 init_rwsem(&s->lock);
1113 INIT_LIST_HEAD(&s->list);
628 spin_lock_init(&s->pe_lock); 1114 spin_lock_init(&s->pe_lock);
1115 s->state_bits = 0;
1116 s->merge_failed = 0;
1117 s->first_merging_chunk = 0;
1118 s->num_merging_chunks = 0;
1119 bio_list_init(&s->bios_queued_during_merge);
629 1120
630 /* Allocate hash table for COW data */ 1121 /* Allocate hash table for COW data */
631 if (init_hash_tables(s)) { 1122 if (init_hash_tables(s)) {
@@ -659,39 +1150,55 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
659 1150
660 spin_lock_init(&s->tracked_chunk_lock); 1151 spin_lock_init(&s->tracked_chunk_lock);
661 1152
662 /* Metadata must only be loaded into one table at once */ 1153 bio_list_init(&s->queued_bios);
1154 INIT_WORK(&s->queued_bios_work, flush_queued_bios);
1155
1156 ti->private = s;
1157 ti->num_flush_requests = num_flush_requests;
1158
1159 /* Add snapshot to the list of snapshots for this origin */
1160 /* Exceptions aren't triggered till snapshot_resume() is called */
1161 r = register_snapshot(s);
1162 if (r == -ENOMEM) {
1163 ti->error = "Snapshot origin struct allocation failed";
1164 goto bad_load_and_register;
1165 } else if (r < 0) {
1166 /* invalid handover, register_snapshot has set ti->error */
1167 goto bad_load_and_register;
1168 }
1169
1170 /*
1171 * Metadata must only be loaded into one table at once, so skip this
1172 * if metadata will be handed over during resume.
1173 * Chunk size will be set during the handover - set it to zero to
1174 * ensure it's ignored.
1175 */
1176 if (r > 0) {
1177 s->store->chunk_size = 0;
1178 return 0;
1179 }
1180
663 r = s->store->type->read_metadata(s->store, dm_add_exception, 1181 r = s->store->type->read_metadata(s->store, dm_add_exception,
664 (void *)s); 1182 (void *)s);
665 if (r < 0) { 1183 if (r < 0) {
666 ti->error = "Failed to read snapshot metadata"; 1184 ti->error = "Failed to read snapshot metadata";
667 goto bad_load_and_register; 1185 goto bad_read_metadata;
668 } else if (r > 0) { 1186 } else if (r > 0) {
669 s->valid = 0; 1187 s->valid = 0;
670 DMWARN("Snapshot is marked invalid."); 1188 DMWARN("Snapshot is marked invalid.");
671 } 1189 }
672 1190
673 bio_list_init(&s->queued_bios);
674 INIT_WORK(&s->queued_bios_work, flush_queued_bios);
675
676 if (!s->store->chunk_size) { 1191 if (!s->store->chunk_size) {
677 ti->error = "Chunk size not set"; 1192 ti->error = "Chunk size not set";
678 goto bad_load_and_register; 1193 goto bad_read_metadata;
679 }
680
681 /* Add snapshot to the list of snapshots for this origin */
682 /* Exceptions aren't triggered till snapshot_resume() is called */
683 if (register_snapshot(s)) {
684 r = -EINVAL;
685 ti->error = "Cannot register snapshot origin";
686 goto bad_load_and_register;
687 } 1194 }
688
689 ti->private = s;
690 ti->split_io = s->store->chunk_size; 1195 ti->split_io = s->store->chunk_size;
691 ti->num_flush_requests = 1;
692 1196
693 return 0; 1197 return 0;
694 1198
1199bad_read_metadata:
1200 unregister_snapshot(s);
1201
695bad_load_and_register: 1202bad_load_and_register:
696 mempool_destroy(s->tracked_chunk_pool); 1203 mempool_destroy(s->tracked_chunk_pool);
697 1204
@@ -702,19 +1209,22 @@ bad_pending_pool:
702 dm_kcopyd_client_destroy(s->kcopyd_client); 1209 dm_kcopyd_client_destroy(s->kcopyd_client);
703 1210
704bad_kcopyd: 1211bad_kcopyd:
705 exit_exception_table(&s->pending, pending_cache); 1212 dm_exception_table_exit(&s->pending, pending_cache);
706 exit_exception_table(&s->complete, exception_cache); 1213 dm_exception_table_exit(&s->complete, exception_cache);
707 1214
708bad_hash_tables: 1215bad_hash_tables:
709 dm_put_device(ti, s->origin); 1216 dm_put_device(ti, s->origin);
710 1217
711bad_origin: 1218bad_origin:
712 kfree(s); 1219 dm_exception_store_destroy(s->store);
713 1220
714bad_snap: 1221bad_store:
715 dm_exception_store_destroy(store); 1222 dm_put_device(ti, s->cow);
1223
1224bad_cow:
1225 kfree(s);
716 1226
717bad_args: 1227bad:
718 return r; 1228 return r;
719} 1229}
720 1230
@@ -723,8 +1233,39 @@ static void __free_exceptions(struct dm_snapshot *s)
723 dm_kcopyd_client_destroy(s->kcopyd_client); 1233 dm_kcopyd_client_destroy(s->kcopyd_client);
724 s->kcopyd_client = NULL; 1234 s->kcopyd_client = NULL;
725 1235
726 exit_exception_table(&s->pending, pending_cache); 1236 dm_exception_table_exit(&s->pending, pending_cache);
727 exit_exception_table(&s->complete, exception_cache); 1237 dm_exception_table_exit(&s->complete, exception_cache);
1238}
1239
1240static void __handover_exceptions(struct dm_snapshot *snap_src,
1241 struct dm_snapshot *snap_dest)
1242{
1243 union {
1244 struct dm_exception_table table_swap;
1245 struct dm_exception_store *store_swap;
1246 } u;
1247
1248 /*
1249 * Swap all snapshot context information between the two instances.
1250 */
1251 u.table_swap = snap_dest->complete;
1252 snap_dest->complete = snap_src->complete;
1253 snap_src->complete = u.table_swap;
1254
1255 u.store_swap = snap_dest->store;
1256 snap_dest->store = snap_src->store;
1257 snap_src->store = u.store_swap;
1258
1259 snap_dest->store->snap = snap_dest;
1260 snap_src->store->snap = snap_src;
1261
1262 snap_dest->ti->split_io = snap_dest->store->chunk_size;
1263 snap_dest->valid = snap_src->valid;
1264
1265 /*
1266 * Set source invalid to ensure it receives no further I/O.
1267 */
1268 snap_src->valid = 0;
728} 1269}
729 1270
730static void snapshot_dtr(struct dm_target *ti) 1271static void snapshot_dtr(struct dm_target *ti)
@@ -733,9 +1274,24 @@ static void snapshot_dtr(struct dm_target *ti)
733 int i; 1274 int i;
734#endif 1275#endif
735 struct dm_snapshot *s = ti->private; 1276 struct dm_snapshot *s = ti->private;
1277 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
736 1278
737 flush_workqueue(ksnapd); 1279 flush_workqueue(ksnapd);
738 1280
1281 down_read(&_origins_lock);
1282 /* Check whether exception handover must be cancelled */
1283 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1284 if (snap_src && snap_dest && (s == snap_src)) {
1285 down_write(&snap_dest->lock);
1286 snap_dest->valid = 0;
1287 up_write(&snap_dest->lock);
1288 DMERR("Cancelling snapshot handover.");
1289 }
1290 up_read(&_origins_lock);
1291
1292 if (dm_target_is_snapshot_merge(ti))
1293 stop_merge(s);
1294
739 /* Prevent further origin writes from using this snapshot. */ 1295 /* Prevent further origin writes from using this snapshot. */
740 /* After this returns there can be no new kcopyd jobs. */ 1296 /* After this returns there can be no new kcopyd jobs. */
741 unregister_snapshot(s); 1297 unregister_snapshot(s);
@@ -763,6 +1319,8 @@ static void snapshot_dtr(struct dm_target *ti)
763 1319
764 dm_exception_store_destroy(s->store); 1320 dm_exception_store_destroy(s->store);
765 1321
1322 dm_put_device(ti, s->cow);
1323
766 kfree(s); 1324 kfree(s);
767} 1325}
768 1326
@@ -795,6 +1353,26 @@ static void flush_queued_bios(struct work_struct *work)
795 flush_bios(queued_bios); 1353 flush_bios(queued_bios);
796} 1354}
797 1355
1356static int do_origin(struct dm_dev *origin, struct bio *bio);
1357
1358/*
1359 * Flush a list of buffers.
1360 */
1361static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
1362{
1363 struct bio *n;
1364 int r;
1365
1366 while (bio) {
1367 n = bio->bi_next;
1368 bio->bi_next = NULL;
1369 r = do_origin(s->origin, bio);
1370 if (r == DM_MAPIO_REMAPPED)
1371 generic_make_request(bio);
1372 bio = n;
1373 }
1374}
1375
798/* 1376/*
799 * Error a list of buffers. 1377 * Error a list of buffers.
800 */ 1378 */
@@ -825,45 +1403,12 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
825 1403
826 s->valid = 0; 1404 s->valid = 0;
827 1405
828 dm_table_event(s->store->ti->table); 1406 dm_table_event(s->ti->table);
829}
830
831static void get_pending_exception(struct dm_snap_pending_exception *pe)
832{
833 atomic_inc(&pe->ref_count);
834}
835
836static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe)
837{
838 struct dm_snap_pending_exception *primary_pe;
839 struct bio *origin_bios = NULL;
840
841 primary_pe = pe->primary_pe;
842
843 /*
844 * If this pe is involved in a write to the origin and
845 * it is the last sibling to complete then release
846 * the bios for the original write to the origin.
847 */
848 if (primary_pe &&
849 atomic_dec_and_test(&primary_pe->ref_count)) {
850 origin_bios = bio_list_get(&primary_pe->origin_bios);
851 free_pending_exception(primary_pe);
852 }
853
854 /*
855 * Free the pe if it's not linked to an origin write or if
856 * it's not itself a primary pe.
857 */
858 if (!primary_pe || primary_pe != pe)
859 free_pending_exception(pe);
860
861 return origin_bios;
862} 1407}
863 1408
864static void pending_complete(struct dm_snap_pending_exception *pe, int success) 1409static void pending_complete(struct dm_snap_pending_exception *pe, int success)
865{ 1410{
866 struct dm_snap_exception *e; 1411 struct dm_exception *e;
867 struct dm_snapshot *s = pe->snap; 1412 struct dm_snapshot *s = pe->snap;
868 struct bio *origin_bios = NULL; 1413 struct bio *origin_bios = NULL;
869 struct bio *snapshot_bios = NULL; 1414 struct bio *snapshot_bios = NULL;
@@ -877,7 +1422,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
877 goto out; 1422 goto out;
878 } 1423 }
879 1424
880 e = alloc_exception(); 1425 e = alloc_completed_exception();
881 if (!e) { 1426 if (!e) {
882 down_write(&s->lock); 1427 down_write(&s->lock);
883 __invalidate_snapshot(s, -ENOMEM); 1428 __invalidate_snapshot(s, -ENOMEM);
@@ -888,28 +1433,27 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
888 1433
889 down_write(&s->lock); 1434 down_write(&s->lock);
890 if (!s->valid) { 1435 if (!s->valid) {
891 free_exception(e); 1436 free_completed_exception(e);
892 error = 1; 1437 error = 1;
893 goto out; 1438 goto out;
894 } 1439 }
895 1440
896 /* 1441 /* Check for conflicting reads */
897 * Check for conflicting reads. This is extremely improbable, 1442 __check_for_conflicting_io(s, pe->e.old_chunk);
898 * so msleep(1) is sufficient and there is no need for a wait queue.
899 */
900 while (__chunk_is_tracked(s, pe->e.old_chunk))
901 msleep(1);
902 1443
903 /* 1444 /*
904 * Add a proper exception, and remove the 1445 * Add a proper exception, and remove the
905 * in-flight exception from the list. 1446 * in-flight exception from the list.
906 */ 1447 */
907 insert_completed_exception(s, e); 1448 dm_insert_exception(&s->complete, e);
908 1449
909 out: 1450 out:
910 remove_exception(&pe->e); 1451 dm_remove_exception(&pe->e);
911 snapshot_bios = bio_list_get(&pe->snapshot_bios); 1452 snapshot_bios = bio_list_get(&pe->snapshot_bios);
912 origin_bios = put_pending_exception(pe); 1453 origin_bios = bio_list_get(&pe->origin_bios);
1454 free_pending_exception(pe);
1455
1456 increment_pending_exceptions_done_count();
913 1457
914 up_write(&s->lock); 1458 up_write(&s->lock);
915 1459
@@ -919,7 +1463,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
919 else 1463 else
920 flush_bios(snapshot_bios); 1464 flush_bios(snapshot_bios);
921 1465
922 flush_bios(origin_bios); 1466 retry_origin_bios(s, origin_bios);
923} 1467}
924 1468
925static void commit_callback(void *context, int success) 1469static void commit_callback(void *context, int success)
@@ -963,7 +1507,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
963 src.sector = chunk_to_sector(s->store, pe->e.old_chunk); 1507 src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
964 src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); 1508 src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
965 1509
966 dest.bdev = s->store->cow->bdev; 1510 dest.bdev = s->cow->bdev;
967 dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); 1511 dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
968 dest.count = src.count; 1512 dest.count = src.count;
969 1513
@@ -975,7 +1519,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
975static struct dm_snap_pending_exception * 1519static struct dm_snap_pending_exception *
976__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) 1520__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
977{ 1521{
978 struct dm_snap_exception *e = lookup_exception(&s->pending, chunk); 1522 struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
979 1523
980 if (!e) 1524 if (!e)
981 return NULL; 1525 return NULL;
@@ -1006,8 +1550,6 @@ __find_pending_exception(struct dm_snapshot *s,
1006 pe->e.old_chunk = chunk; 1550 pe->e.old_chunk = chunk;
1007 bio_list_init(&pe->origin_bios); 1551 bio_list_init(&pe->origin_bios);
1008 bio_list_init(&pe->snapshot_bios); 1552 bio_list_init(&pe->snapshot_bios);
1009 pe->primary_pe = NULL;
1010 atomic_set(&pe->ref_count, 0);
1011 pe->started = 0; 1553 pe->started = 0;
1012 1554
1013 if (s->store->type->prepare_exception(s->store, &pe->e)) { 1555 if (s->store->type->prepare_exception(s->store, &pe->e)) {
@@ -1015,16 +1557,15 @@ __find_pending_exception(struct dm_snapshot *s,
1015 return NULL; 1557 return NULL;
1016 } 1558 }
1017 1559
1018 get_pending_exception(pe); 1560 dm_insert_exception(&s->pending, &pe->e);
1019 insert_exception(&s->pending, &pe->e);
1020 1561
1021 return pe; 1562 return pe;
1022} 1563}
1023 1564
1024static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, 1565static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
1025 struct bio *bio, chunk_t chunk) 1566 struct bio *bio, chunk_t chunk)
1026{ 1567{
1027 bio->bi_bdev = s->store->cow->bdev; 1568 bio->bi_bdev = s->cow->bdev;
1028 bio->bi_sector = chunk_to_sector(s->store, 1569 bio->bi_sector = chunk_to_sector(s->store,
1029 dm_chunk_number(e->new_chunk) + 1570 dm_chunk_number(e->new_chunk) +
1030 (chunk - e->old_chunk)) + 1571 (chunk - e->old_chunk)) +
@@ -1035,14 +1576,14 @@ static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e,
1035static int snapshot_map(struct dm_target *ti, struct bio *bio, 1576static int snapshot_map(struct dm_target *ti, struct bio *bio,
1036 union map_info *map_context) 1577 union map_info *map_context)
1037{ 1578{
1038 struct dm_snap_exception *e; 1579 struct dm_exception *e;
1039 struct dm_snapshot *s = ti->private; 1580 struct dm_snapshot *s = ti->private;
1040 int r = DM_MAPIO_REMAPPED; 1581 int r = DM_MAPIO_REMAPPED;
1041 chunk_t chunk; 1582 chunk_t chunk;
1042 struct dm_snap_pending_exception *pe = NULL; 1583 struct dm_snap_pending_exception *pe = NULL;
1043 1584
1044 if (unlikely(bio_empty_barrier(bio))) { 1585 if (unlikely(bio_empty_barrier(bio))) {
1045 bio->bi_bdev = s->store->cow->bdev; 1586 bio->bi_bdev = s->cow->bdev;
1046 return DM_MAPIO_REMAPPED; 1587 return DM_MAPIO_REMAPPED;
1047 } 1588 }
1048 1589
@@ -1063,7 +1604,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1063 } 1604 }
1064 1605
1065 /* If the block is already remapped - use that, else remap it */ 1606 /* If the block is already remapped - use that, else remap it */
1066 e = lookup_exception(&s->complete, chunk); 1607 e = dm_lookup_exception(&s->complete, chunk);
1067 if (e) { 1608 if (e) {
1068 remap_exception(s, e, bio, chunk); 1609 remap_exception(s, e, bio, chunk);
1069 goto out_unlock; 1610 goto out_unlock;
@@ -1087,7 +1628,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1087 goto out_unlock; 1628 goto out_unlock;
1088 } 1629 }
1089 1630
1090 e = lookup_exception(&s->complete, chunk); 1631 e = dm_lookup_exception(&s->complete, chunk);
1091 if (e) { 1632 if (e) {
1092 free_pending_exception(pe); 1633 free_pending_exception(pe);
1093 remap_exception(s, e, bio, chunk); 1634 remap_exception(s, e, bio, chunk);
@@ -1125,6 +1666,78 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1125 return r; 1666 return r;
1126} 1667}
1127 1668
1669/*
1670 * A snapshot-merge target behaves like a combination of a snapshot
1671 * target and a snapshot-origin target. It only generates new
1672 * exceptions in other snapshots and not in the one that is being
1673 * merged.
1674 *
1675 * For each chunk, if there is an existing exception, it is used to
1676 * redirect I/O to the cow device. Otherwise I/O is sent to the origin,
1677 * which in turn might generate exceptions in other snapshots.
1678 * If merging is currently taking place on the chunk in question, the
1679 * I/O is deferred by adding it to s->bios_queued_during_merge.
1680 */
1681static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
1682 union map_info *map_context)
1683{
1684 struct dm_exception *e;
1685 struct dm_snapshot *s = ti->private;
1686 int r = DM_MAPIO_REMAPPED;
1687 chunk_t chunk;
1688
1689 if (unlikely(bio_empty_barrier(bio))) {
1690 if (!map_context->flush_request)
1691 bio->bi_bdev = s->origin->bdev;
1692 else
1693 bio->bi_bdev = s->cow->bdev;
1694 map_context->ptr = NULL;
1695 return DM_MAPIO_REMAPPED;
1696 }
1697
1698 chunk = sector_to_chunk(s->store, bio->bi_sector);
1699
1700 down_write(&s->lock);
1701
1702 /* Full merging snapshots are redirected to the origin */
1703 if (!s->valid)
1704 goto redirect_to_origin;
1705
1706 /* If the block is already remapped - use that */
1707 e = dm_lookup_exception(&s->complete, chunk);
1708 if (e) {
1709 /* Queue writes overlapping with chunks being merged */
1710 if (bio_rw(bio) == WRITE &&
1711 chunk >= s->first_merging_chunk &&
1712 chunk < (s->first_merging_chunk +
1713 s->num_merging_chunks)) {
1714 bio->bi_bdev = s->origin->bdev;
1715 bio_list_add(&s->bios_queued_during_merge, bio);
1716 r = DM_MAPIO_SUBMITTED;
1717 goto out_unlock;
1718 }
1719
1720 remap_exception(s, e, bio, chunk);
1721
1722 if (bio_rw(bio) == WRITE)
1723 map_context->ptr = track_chunk(s, chunk);
1724 goto out_unlock;
1725 }
1726
1727redirect_to_origin:
1728 bio->bi_bdev = s->origin->bdev;
1729
1730 if (bio_rw(bio) == WRITE) {
1731 up_write(&s->lock);
1732 return do_origin(s->origin, bio);
1733 }
1734
1735out_unlock:
1736 up_write(&s->lock);
1737
1738 return r;
1739}
1740
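The mapping rules in the comment above reduce to a small routing decision: no exception (or an invalid snapshot) goes to the origin, a write that overlaps the chunks currently being merged is queued, and everything else is remapped to the COW device. A pure-function sketch of that decision, with invented names and chunk numbers:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

enum route { ROUTE_COW, ROUTE_ORIGIN, ROUTE_QUEUE_UNTIL_MERGED };

static enum route merge_route(bool valid, bool has_exception, bool is_write,
			      uint64_t chunk, uint64_t first_merging,
			      uint64_t num_merging)
{
	if (!valid || !has_exception)
		return ROUTE_ORIGIN;              /* invalid or unremapped: origin */
	if (is_write && chunk >= first_merging &&
	    chunk < first_merging + num_merging)
		return ROUTE_QUEUE_UNTIL_MERGED;  /* overlaps chunks being merged */
	return ROUTE_COW;                         /* remapped: use the COW device */
}

int main(void)
{
	/* A write to a remapped chunk that is currently being merged. */
	printf("%d\n", merge_route(true, true, true, 12, 10, 4));  /* 2: queued */
	return 0;
}
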
1128static int snapshot_end_io(struct dm_target *ti, struct bio *bio, 1741static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
1129 int error, union map_info *map_context) 1742 int error, union map_info *map_context)
1130{ 1743{
@@ -1137,40 +1750,135 @@ static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
1137 return 0; 1750 return 0;
1138} 1751}
1139 1752
1753static void snapshot_merge_presuspend(struct dm_target *ti)
1754{
1755 struct dm_snapshot *s = ti->private;
1756
1757 stop_merge(s);
1758}
1759
1760static void snapshot_postsuspend(struct dm_target *ti)
1761{
1762 struct dm_snapshot *s = ti->private;
1763
1764 down_write(&s->lock);
1765 s->suspended = 1;
1766 up_write(&s->lock);
1767}
1768
1769static int snapshot_preresume(struct dm_target *ti)
1770{
1771 int r = 0;
1772 struct dm_snapshot *s = ti->private;
1773 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1774
1775 down_read(&_origins_lock);
1776 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1777 if (snap_src && snap_dest) {
1778 down_read(&snap_src->lock);
1779 if (s == snap_src) {
1780 DMERR("Unable to resume snapshot source until "
1781 "handover completes.");
1782 r = -EINVAL;
1783 } else if (!snap_src->suspended) {
1784 DMERR("Unable to perform snapshot handover until "
1785 "source is suspended.");
1786 r = -EINVAL;
1787 }
1788 up_read(&snap_src->lock);
1789 }
1790 up_read(&_origins_lock);
1791
1792 return r;
1793}
1794
1140static void snapshot_resume(struct dm_target *ti) 1795static void snapshot_resume(struct dm_target *ti)
1141{ 1796{
1142 struct dm_snapshot *s = ti->private; 1797 struct dm_snapshot *s = ti->private;
1798 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1799
1800 down_read(&_origins_lock);
1801 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1802 if (snap_src && snap_dest) {
1803 down_write(&snap_src->lock);
1804 down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
1805 __handover_exceptions(snap_src, snap_dest);
1806 up_write(&snap_dest->lock);
1807 up_write(&snap_src->lock);
1808 }
1809 up_read(&_origins_lock);
1810
1811 /* Now we have correct chunk size, reregister */
1812 reregister_snapshot(s);
1143 1813
1144 down_write(&s->lock); 1814 down_write(&s->lock);
1145 s->active = 1; 1815 s->active = 1;
1816 s->suspended = 0;
1146 up_write(&s->lock); 1817 up_write(&s->lock);
1147} 1818}
1148 1819
1820static sector_t get_origin_minimum_chunksize(struct block_device *bdev)
1821{
1822 sector_t min_chunksize;
1823
1824 down_read(&_origins_lock);
1825 min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
1826 up_read(&_origins_lock);
1827
1828 return min_chunksize;
1829}
1830
1831static void snapshot_merge_resume(struct dm_target *ti)
1832{
1833 struct dm_snapshot *s = ti->private;
1834
1835 /*
1836 * Handover exceptions from existing snapshot.
1837 */
1838 snapshot_resume(ti);
1839
1840 /*
1841 * snapshot-merge acts as an origin, so set ti->split_io
1842 */
1843 ti->split_io = get_origin_minimum_chunksize(s->origin->bdev);
1844
1845 start_merge(s);
1846}
1847
1149static int snapshot_status(struct dm_target *ti, status_type_t type, 1848static int snapshot_status(struct dm_target *ti, status_type_t type,
1150 char *result, unsigned int maxlen) 1849 char *result, unsigned int maxlen)
1151{ 1850{
1152 unsigned sz = 0; 1851 unsigned sz = 0;
1153 struct dm_snapshot *snap = ti->private; 1852 struct dm_snapshot *snap = ti->private;
1154 1853
1155 down_write(&snap->lock);
1156
1157 switch (type) { 1854 switch (type) {
1158 case STATUSTYPE_INFO: 1855 case STATUSTYPE_INFO:
1856
1857 down_write(&snap->lock);
1858
1159 if (!snap->valid) 1859 if (!snap->valid)
1160 DMEMIT("Invalid"); 1860 DMEMIT("Invalid");
1861 else if (snap->merge_failed)
1862 DMEMIT("Merge failed");
1161 else { 1863 else {
1162 if (snap->store->type->fraction_full) { 1864 if (snap->store->type->usage) {
1163 sector_t numerator, denominator; 1865 sector_t total_sectors, sectors_allocated,
1164 snap->store->type->fraction_full(snap->store, 1866 metadata_sectors;
1165 &numerator, 1867 snap->store->type->usage(snap->store,
1166 &denominator); 1868 &total_sectors,
1167 DMEMIT("%llu/%llu", 1869 &sectors_allocated,
1168 (unsigned long long)numerator, 1870 &metadata_sectors);
1169 (unsigned long long)denominator); 1871 DMEMIT("%llu/%llu %llu",
1872 (unsigned long long)sectors_allocated,
1873 (unsigned long long)total_sectors,
1874 (unsigned long long)metadata_sectors);
1170 } 1875 }
1171 else 1876 else
1172 DMEMIT("Unknown"); 1877 DMEMIT("Unknown");
1173 } 1878 }
1879
1880 up_write(&snap->lock);
1881
1174 break; 1882 break;
1175 1883
1176 case STATUSTYPE_TABLE: 1884 case STATUSTYPE_TABLE:
@@ -1179,14 +1887,12 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
1179 * to make private copies if the output is to 1887 * to make private copies if the output is to
1180 * make sense. 1888 * make sense.
1181 */ 1889 */
1182 DMEMIT("%s", snap->origin->name); 1890 DMEMIT("%s %s", snap->origin->name, snap->cow->name);
1183 snap->store->type->status(snap->store, type, result + sz, 1891 snap->store->type->status(snap->store, type, result + sz,
1184 maxlen - sz); 1892 maxlen - sz);
1185 break; 1893 break;
1186 } 1894 }
1187 1895
1188 up_write(&snap->lock);
1189
1190 return 0; 1896 return 0;
1191} 1897}
1192 1898
@@ -1202,17 +1908,36 @@ static int snapshot_iterate_devices(struct dm_target *ti,
1202/*----------------------------------------------------------------- 1908/*-----------------------------------------------------------------
1203 * Origin methods 1909 * Origin methods
1204 *---------------------------------------------------------------*/ 1910 *---------------------------------------------------------------*/
1205static int __origin_write(struct list_head *snapshots, struct bio *bio) 1911
1912/*
1913 * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
1914 * supplied bio was ignored. The caller may submit it immediately.
1915 * (No remapping actually occurs as the origin is always a direct linear
1916 * map.)
1917 *
1918 * If further exceptions are required, DM_MAPIO_SUBMITTED is returned
1919 * and any supplied bio is added to a list to be submitted once all
1920 * the necessary exceptions exist.
1921 */
1922static int __origin_write(struct list_head *snapshots, sector_t sector,
1923 struct bio *bio)
1206{ 1924{
1207 int r = DM_MAPIO_REMAPPED, first = 0; 1925 int r = DM_MAPIO_REMAPPED;
1208 struct dm_snapshot *snap; 1926 struct dm_snapshot *snap;
1209 struct dm_snap_exception *e; 1927 struct dm_exception *e;
1210 struct dm_snap_pending_exception *pe, *next_pe, *primary_pe = NULL; 1928 struct dm_snap_pending_exception *pe;
1929 struct dm_snap_pending_exception *pe_to_start_now = NULL;
1930 struct dm_snap_pending_exception *pe_to_start_last = NULL;
1211 chunk_t chunk; 1931 chunk_t chunk;
1212 LIST_HEAD(pe_queue);
1213 1932
1214 /* Do all the snapshots on this origin */ 1933 /* Do all the snapshots on this origin */
1215 list_for_each_entry (snap, snapshots, list) { 1934 list_for_each_entry (snap, snapshots, list) {
1935 /*
1936 * Don't make new exceptions in a merging snapshot
1937 * because it has effectively been deleted
1938 */
1939 if (dm_target_is_snapshot_merge(snap->ti))
1940 continue;
1216 1941
1217 down_write(&snap->lock); 1942 down_write(&snap->lock);
1218 1943
@@ -1221,24 +1946,21 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
1221 goto next_snapshot; 1946 goto next_snapshot;
1222 1947
1223 /* Nothing to do if writing beyond end of snapshot */ 1948 /* Nothing to do if writing beyond end of snapshot */
1224 if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table)) 1949 if (sector >= dm_table_get_size(snap->ti->table))
1225 goto next_snapshot; 1950 goto next_snapshot;
1226 1951
1227 /* 1952 /*
1228 * Remember, different snapshots can have 1953 * Remember, different snapshots can have
1229 * different chunk sizes. 1954 * different chunk sizes.
1230 */ 1955 */
1231 chunk = sector_to_chunk(snap->store, bio->bi_sector); 1956 chunk = sector_to_chunk(snap->store, sector);
1232 1957
1233 /* 1958 /*
1234 * Check exception table to see if block 1959 * Check exception table to see if block
1235 * is already remapped in this snapshot 1960 * is already remapped in this snapshot
1236 * and trigger an exception if not. 1961 * and trigger an exception if not.
1237 *
1238 * ref_count is initialised to 1 so pending_complete()
1239 * won't destroy the primary_pe while we're inside this loop.
1240 */ 1962 */
1241 e = lookup_exception(&snap->complete, chunk); 1963 e = dm_lookup_exception(&snap->complete, chunk);
1242 if (e) 1964 if (e)
1243 goto next_snapshot; 1965 goto next_snapshot;
1244 1966
@@ -1253,7 +1975,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
1253 goto next_snapshot; 1975 goto next_snapshot;
1254 } 1976 }
1255 1977
1256 e = lookup_exception(&snap->complete, chunk); 1978 e = dm_lookup_exception(&snap->complete, chunk);
1257 if (e) { 1979 if (e) {
1258 free_pending_exception(pe); 1980 free_pending_exception(pe);
1259 goto next_snapshot; 1981 goto next_snapshot;
@@ -1266,59 +1988,43 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
1266 } 1988 }
1267 } 1989 }
1268 1990
1269 if (!primary_pe) { 1991 r = DM_MAPIO_SUBMITTED;
1270 /*
1271 * Either every pe here has same
1272 * primary_pe or none has one yet.
1273 */
1274 if (pe->primary_pe)
1275 primary_pe = pe->primary_pe;
1276 else {
1277 primary_pe = pe;
1278 first = 1;
1279 }
1280
1281 bio_list_add(&primary_pe->origin_bios, bio);
1282 1992
1283 r = DM_MAPIO_SUBMITTED; 1993 /*
1284 } 1994 * If an origin bio was supplied, queue it to wait for the
1995 * completion of this exception, and start this one last,
1996 * at the end of the function.
1997 */
1998 if (bio) {
1999 bio_list_add(&pe->origin_bios, bio);
2000 bio = NULL;
1285 2001
1286 if (!pe->primary_pe) { 2002 if (!pe->started) {
1287 pe->primary_pe = primary_pe; 2003 pe->started = 1;
1288 get_pending_exception(primary_pe); 2004 pe_to_start_last = pe;
2005 }
1289 } 2006 }
1290 2007
1291 if (!pe->started) { 2008 if (!pe->started) {
1292 pe->started = 1; 2009 pe->started = 1;
1293 list_add_tail(&pe->list, &pe_queue); 2010 pe_to_start_now = pe;
1294 } 2011 }
1295 2012
1296 next_snapshot: 2013 next_snapshot:
1297 up_write(&snap->lock); 2014 up_write(&snap->lock);
1298 }
1299 2015
1300 if (!primary_pe) 2016 if (pe_to_start_now) {
1301 return r; 2017 start_copy(pe_to_start_now);
1302 2018 pe_to_start_now = NULL;
1303 /* 2019 }
1304 * If this is the first time we're processing this chunk and
1305 * ref_count is now 1 it means all the pending exceptions
1306 * got completed while we were in the loop above, so it falls to
1307 * us here to remove the primary_pe and submit any origin_bios.
1308 */
1309
1310 if (first && atomic_dec_and_test(&primary_pe->ref_count)) {
1311 flush_bios(bio_list_get(&primary_pe->origin_bios));
1312 free_pending_exception(primary_pe);
1313 /* If we got here, pe_queue is necessarily empty. */
1314 return r;
1315 } 2020 }
1316 2021
1317 /* 2022 /*
1318 * Now that we have a complete pe list we can start the copying. 2023 * Submit the exception against which the bio is queued last,
2024 * to give the other exceptions a head start.
1319 */ 2025 */
1320 list_for_each_entry_safe(pe, next_pe, &pe_queue, list) 2026 if (pe_to_start_last)
1321 start_copy(pe); 2027 start_copy(pe_to_start_last);
1322 2028
1323 return r; 2029 return r;
1324} 2030}
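
The reworked loop above starts the copy for every other pending exception as soon as it is created, but defers the one that now holds the queued origin bio until the very end, so the other snapshots get a head start. A toy illustration of that ordering follows; the struct, the snapshot names and the holds_origin_bio flag are invented for the sketch and do not correspond to kernel fields.

#include <stdio.h>

struct pending { const char *name; int holds_origin_bio; int started; };

static void start_copy(struct pending *pe)
{
	pe->started = 1;
	printf("start copy for %s\n", pe->name);
}

int main(void)
{
	struct pending pes[] = {
		{ "snap-a", 0, 0 }, { "snap-b", 1, 0 }, { "snap-c", 0, 0 },
	};
	struct pending *start_last = NULL;
	unsigned i;

	for (i = 0; i < sizeof(pes) / sizeof(pes[0]); i++) {
		if (pes[i].holds_origin_bio) {
			start_last = &pes[i];   /* defer the bio's own exception */
			continue;
		}
		start_copy(&pes[i]);            /* everything else starts now */
	}
	if (start_last)
		start_copy(start_last);         /* ...and it goes last */
	return 0;
}
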
@@ -1334,13 +2040,48 @@ static int do_origin(struct dm_dev *origin, struct bio *bio)
1334 down_read(&_origins_lock); 2040 down_read(&_origins_lock);
1335 o = __lookup_origin(origin->bdev); 2041 o = __lookup_origin(origin->bdev);
1336 if (o) 2042 if (o)
1337 r = __origin_write(&o->snapshots, bio); 2043 r = __origin_write(&o->snapshots, bio->bi_sector, bio);
1338 up_read(&_origins_lock); 2044 up_read(&_origins_lock);
1339 2045
1340 return r; 2046 return r;
1341} 2047}
1342 2048
1343/* 2049/*
2050 * Trigger exceptions in all non-merging snapshots.
2051 *
2052 * The chunk size of the merging snapshot may be larger than the chunk
2053 * size of some other snapshot so we may need to reallocate multiple
2054 * chunks in other snapshots.
2055 *
2056 * We scan all the overlapping exceptions in the other snapshots.
2057 * Returns 1 if anything was reallocated and must be waited for,
2058 * otherwise returns 0.
2059 *
2060 * size must be a multiple of merging_snap's chunk_size.
2061 */
2062static int origin_write_extent(struct dm_snapshot *merging_snap,
2063 sector_t sector, unsigned size)
2064{
2065 int must_wait = 0;
2066 sector_t n;
2067 struct origin *o;
2068
2069 /*
2070 * The origin's __minimum_chunk_size() got stored in split_io
2071 * by snapshot_merge_resume().
2072 */
2073 down_read(&_origins_lock);
2074 o = __lookup_origin(merging_snap->origin->bdev);
2075 for (n = 0; n < size; n += merging_snap->ti->split_io)
2076 if (__origin_write(&o->snapshots, sector + n, NULL) ==
2077 DM_MAPIO_SUBMITTED)
2078 must_wait = 1;
2079 up_read(&_origins_lock);
2080
2081 return must_wait;
2082}
2083
2084/*
1344 * Origin: maps a linear range of a device, with hooks for snapshotting. 2085 * Origin: maps a linear range of a device, with hooks for snapshotting.
1345 */ 2086 */
1346 2087
@@ -1391,8 +2132,6 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
1391 return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; 2132 return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
1392} 2133}
1393 2134
1394#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
1395
1396/* 2135/*
1397 * Set the target "split_io" field to the minimum of all the snapshots' 2136 * Set the target "split_io" field to the minimum of all the snapshots'
1398 * chunk sizes. 2137 * chunk sizes.
@@ -1400,19 +2139,8 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
1400static void origin_resume(struct dm_target *ti) 2139static void origin_resume(struct dm_target *ti)
1401{ 2140{
1402 struct dm_dev *dev = ti->private; 2141 struct dm_dev *dev = ti->private;
1403 struct dm_snapshot *snap;
1404 struct origin *o;
1405 unsigned chunk_size = 0;
1406
1407 down_read(&_origins_lock);
1408 o = __lookup_origin(dev->bdev);
1409 if (o)
1410 list_for_each_entry (snap, &o->snapshots, list)
1411 chunk_size = min_not_zero(chunk_size,
1412 snap->store->chunk_size);
1413 up_read(&_origins_lock);
1414 2142
1415 ti->split_io = chunk_size; 2143 ti->split_io = get_origin_minimum_chunksize(dev->bdev);
1416} 2144}
1417 2145
1418static int origin_status(struct dm_target *ti, status_type_t type, char *result, 2146static int origin_status(struct dm_target *ti, status_type_t type, char *result,
@@ -1455,17 +2183,35 @@ static struct target_type origin_target = {
1455 2183
1456static struct target_type snapshot_target = { 2184static struct target_type snapshot_target = {
1457 .name = "snapshot", 2185 .name = "snapshot",
1458 .version = {1, 7, 0}, 2186 .version = {1, 9, 0},
1459 .module = THIS_MODULE, 2187 .module = THIS_MODULE,
1460 .ctr = snapshot_ctr, 2188 .ctr = snapshot_ctr,
1461 .dtr = snapshot_dtr, 2189 .dtr = snapshot_dtr,
1462 .map = snapshot_map, 2190 .map = snapshot_map,
1463 .end_io = snapshot_end_io, 2191 .end_io = snapshot_end_io,
2192 .postsuspend = snapshot_postsuspend,
2193 .preresume = snapshot_preresume,
1464 .resume = snapshot_resume, 2194 .resume = snapshot_resume,
1465 .status = snapshot_status, 2195 .status = snapshot_status,
1466 .iterate_devices = snapshot_iterate_devices, 2196 .iterate_devices = snapshot_iterate_devices,
1467}; 2197};
1468 2198
2199static struct target_type merge_target = {
2200 .name = dm_snapshot_merge_target_name,
2201 .version = {1, 0, 0},
2202 .module = THIS_MODULE,
2203 .ctr = snapshot_ctr,
2204 .dtr = snapshot_dtr,
2205 .map = snapshot_merge_map,
2206 .end_io = snapshot_end_io,
2207 .presuspend = snapshot_merge_presuspend,
2208 .postsuspend = snapshot_postsuspend,
2209 .preresume = snapshot_preresume,
2210 .resume = snapshot_merge_resume,
2211 .status = snapshot_status,
2212 .iterate_devices = snapshot_iterate_devices,
2213};
2214
1469static int __init dm_snapshot_init(void) 2215static int __init dm_snapshot_init(void)
1470{ 2216{
1471 int r; 2217 int r;
@@ -1477,7 +2223,7 @@ static int __init dm_snapshot_init(void)
1477 } 2223 }
1478 2224
1479 r = dm_register_target(&snapshot_target); 2225 r = dm_register_target(&snapshot_target);
1480 if (r) { 2226 if (r < 0) {
1481 DMERR("snapshot target register failed %d", r); 2227 DMERR("snapshot target register failed %d", r);
1482 goto bad_register_snapshot_target; 2228 goto bad_register_snapshot_target;
1483 } 2229 }
@@ -1485,34 +2231,40 @@ static int __init dm_snapshot_init(void)
1485 r = dm_register_target(&origin_target); 2231 r = dm_register_target(&origin_target);
1486 if (r < 0) { 2232 if (r < 0) {
1487 DMERR("Origin target register failed %d", r); 2233 DMERR("Origin target register failed %d", r);
1488 goto bad1; 2234 goto bad_register_origin_target;
2235 }
2236
2237 r = dm_register_target(&merge_target);
2238 if (r < 0) {
2239 DMERR("Merge target register failed %d", r);
2240 goto bad_register_merge_target;
1489 } 2241 }
1490 2242
1491 r = init_origin_hash(); 2243 r = init_origin_hash();
1492 if (r) { 2244 if (r) {
1493 DMERR("init_origin_hash failed."); 2245 DMERR("init_origin_hash failed.");
1494 goto bad2; 2246 goto bad_origin_hash;
1495 } 2247 }
1496 2248
1497 exception_cache = KMEM_CACHE(dm_snap_exception, 0); 2249 exception_cache = KMEM_CACHE(dm_exception, 0);
1498 if (!exception_cache) { 2250 if (!exception_cache) {
1499 DMERR("Couldn't create exception cache."); 2251 DMERR("Couldn't create exception cache.");
1500 r = -ENOMEM; 2252 r = -ENOMEM;
1501 goto bad3; 2253 goto bad_exception_cache;
1502 } 2254 }
1503 2255
1504 pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); 2256 pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
1505 if (!pending_cache) { 2257 if (!pending_cache) {
1506 DMERR("Couldn't create pending cache."); 2258 DMERR("Couldn't create pending cache.");
1507 r = -ENOMEM; 2259 r = -ENOMEM;
1508 goto bad4; 2260 goto bad_pending_cache;
1509 } 2261 }
1510 2262
1511 tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); 2263 tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0);
1512 if (!tracked_chunk_cache) { 2264 if (!tracked_chunk_cache) {
1513 DMERR("Couldn't create cache to track chunks in use."); 2265 DMERR("Couldn't create cache to track chunks in use.");
1514 r = -ENOMEM; 2266 r = -ENOMEM;
1515 goto bad5; 2267 goto bad_tracked_chunk_cache;
1516 } 2268 }
1517 2269
1518 ksnapd = create_singlethread_workqueue("ksnapd"); 2270 ksnapd = create_singlethread_workqueue("ksnapd");
@@ -1526,19 +2278,21 @@ static int __init dm_snapshot_init(void)
1526 2278
1527bad_pending_pool: 2279bad_pending_pool:
1528 kmem_cache_destroy(tracked_chunk_cache); 2280 kmem_cache_destroy(tracked_chunk_cache);
1529bad5: 2281bad_tracked_chunk_cache:
1530 kmem_cache_destroy(pending_cache); 2282 kmem_cache_destroy(pending_cache);
1531bad4: 2283bad_pending_cache:
1532 kmem_cache_destroy(exception_cache); 2284 kmem_cache_destroy(exception_cache);
1533bad3: 2285bad_exception_cache:
1534 exit_origin_hash(); 2286 exit_origin_hash();
1535bad2: 2287bad_origin_hash:
2288 dm_unregister_target(&merge_target);
2289bad_register_merge_target:
1536 dm_unregister_target(&origin_target); 2290 dm_unregister_target(&origin_target);
1537bad1: 2291bad_register_origin_target:
1538 dm_unregister_target(&snapshot_target); 2292 dm_unregister_target(&snapshot_target);
1539
1540bad_register_snapshot_target: 2293bad_register_snapshot_target:
1541 dm_exception_store_exit(); 2294 dm_exception_store_exit();
2295
1542 return r; 2296 return r;
1543} 2297}
1544 2298
@@ -1548,6 +2302,7 @@ static void __exit dm_snapshot_exit(void)
1548 2302
1549 dm_unregister_target(&snapshot_target); 2303 dm_unregister_target(&snapshot_target);
1550 dm_unregister_target(&origin_target); 2304 dm_unregister_target(&origin_target);
2305 dm_unregister_target(&merge_target);
1551 2306
1552 exit_origin_hash(); 2307 exit_origin_hash();
1553 kmem_cache_destroy(pending_cache); 2308 kmem_cache_destroy(pending_cache);
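The dm_snapshot_init() error path above renames the numbered labels (bad1..bad5) after the step that failed and adds one more unwind step for the new merge target. A compact sketch of that register-then-unwind-in-reverse pattern; register_thing()/unregister_thing() are invented helpers, with the third registration forced to fail so the unwind runs.

#include <stdio.h>

static int register_thing(const char *name, int fail)
{
	if (fail) {
		printf("register %s failed\n", name);
		return -1;
	}
	printf("registered %s\n", name);
	return 0;
}

static void unregister_thing(const char *name)
{
	printf("unregistered %s\n", name);
}

static int init_all(void)
{
	int r;

	r = register_thing("snapshot", 0);
	if (r < 0)
		goto bad_register_snapshot_target;

	r = register_thing("origin", 0);
	if (r < 0)
		goto bad_register_origin_target;

	r = register_thing("merge", 1);		/* simulated failure */
	if (r < 0)
		goto bad_register_merge_target;

	return 0;

	/* unwind in reverse order of setup, one label per failed step */
bad_register_merge_target:
	unregister_thing("origin");
bad_register_origin_target:
	unregister_thing("snapshot");
bad_register_snapshot_target:
	return r;
}

int main(void)
{
	return init_all() ? 1 : 0;
}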
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index 4b045903a4e2..f53392df7b97 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -59,7 +59,7 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
59 59
60static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) 60static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
61{ 61{
62 sprintf(buf, "%d\n", dm_suspended(md)); 62 sprintf(buf, "%d\n", dm_suspended_md(md));
63 63
64 return strlen(buf); 64 return strlen(buf);
65} 65}
@@ -80,12 +80,20 @@ static struct sysfs_ops dm_sysfs_ops = {
80}; 80};
81 81
82/* 82/*
83 * The sysfs structure is embedded in md struct, nothing to do here
84 */
85static void dm_sysfs_release(struct kobject *kobj)
86{
87}
88
89/*
83 * dm kobject is embedded in mapped_device structure 90 * dm kobject is embedded in mapped_device structure
84 * no need to define release function here 91 * no need to define release function here
85 */ 92 */
86static struct kobj_type dm_ktype = { 93static struct kobj_type dm_ktype = {
87 .sysfs_ops = &dm_sysfs_ops, 94 .sysfs_ops = &dm_sysfs_ops,
88 .default_attrs = dm_attrs, 95 .default_attrs = dm_attrs,
96 .release = dm_sysfs_release
89}; 97};
90 98
91/* 99/*
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 91976e8fae5f..be625475cf6d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -238,6 +238,9 @@ void dm_table_destroy(struct dm_table *t)
238{ 238{
239 unsigned int i; 239 unsigned int i;
240 240
241 if (!t)
242 return;
243
241 while (atomic_read(&t->holders)) 244 while (atomic_read(&t->holders))
242 msleep(1); 245 msleep(1);
243 smp_mb(); 246 smp_mb();
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c
index 6f65883aef12..c7c555a8c7b2 100644
--- a/drivers/md/dm-uevent.c
+++ b/drivers/md/dm-uevent.c
@@ -139,14 +139,13 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj)
139 list_del_init(&event->elist); 139 list_del_init(&event->elist);
140 140
141 /* 141 /*
142 * Need to call dm_copy_name_and_uuid from here for now. 142 * When a device is being removed this copy fails and we
143 * Context of previous var adds and locking used for 143 * discard these unsent events.
144 * hash_cell not compatable.
145 */ 144 */
146 if (dm_copy_name_and_uuid(event->md, event->name, 145 if (dm_copy_name_and_uuid(event->md, event->name,
147 event->uuid)) { 146 event->uuid)) {
148 DMERR("%s: dm_copy_name_and_uuid() failed", 147 DMINFO("%s: skipping sending uevent for lost device",
149 __func__); 148 __func__);
150 goto uevent_free; 149 goto uevent_free;
151 } 150 }
152 151
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 724efc63904d..3167480b532c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -143,9 +143,19 @@ struct mapped_device {
143 int barrier_error; 143 int barrier_error;
144 144
145 /* 145 /*
146 * Protect barrier_error from concurrent endio processing
147 * in request-based dm.
148 */
149 spinlock_t barrier_error_lock;
150
151 /*
146 * Processing queue (flush/barriers) 152 * Processing queue (flush/barriers)
147 */ 153 */
148 struct workqueue_struct *wq; 154 struct workqueue_struct *wq;
155 struct work_struct barrier_work;
156
157 /* A pointer to the currently processing pre/post flush request */
158 struct request *flush_request;
149 159
150 /* 160 /*
151 * The current mapping. 161 * The current mapping.
@@ -178,9 +188,6 @@ struct mapped_device {
178 /* forced geometry settings */ 188 /* forced geometry settings */
179 struct hd_geometry geometry; 189 struct hd_geometry geometry;
180 190
181 /* marker of flush suspend for request-based dm */
182 struct request suspend_rq;
183
184 /* For saving the address of __make_request for request based dm */ 191 /* For saving the address of __make_request for request based dm */
185 make_request_fn *saved_make_request_fn; 192 make_request_fn *saved_make_request_fn;
186 193
@@ -275,6 +282,7 @@ static int (*_inits[])(void) __initdata = {
275 dm_target_init, 282 dm_target_init,
276 dm_linear_init, 283 dm_linear_init,
277 dm_stripe_init, 284 dm_stripe_init,
285 dm_io_init,
278 dm_kcopyd_init, 286 dm_kcopyd_init,
279 dm_interface_init, 287 dm_interface_init,
280}; 288};
@@ -284,6 +292,7 @@ static void (*_exits[])(void) = {
284 dm_target_exit, 292 dm_target_exit,
285 dm_linear_exit, 293 dm_linear_exit,
286 dm_stripe_exit, 294 dm_stripe_exit,
295 dm_io_exit,
287 dm_kcopyd_exit, 296 dm_kcopyd_exit,
288 dm_interface_exit, 297 dm_interface_exit,
289}; 298};
@@ -320,6 +329,11 @@ static void __exit dm_exit(void)
320/* 329/*
321 * Block device functions 330 * Block device functions
322 */ 331 */
332int dm_deleting_md(struct mapped_device *md)
333{
334 return test_bit(DMF_DELETING, &md->flags);
335}
336
323static int dm_blk_open(struct block_device *bdev, fmode_t mode) 337static int dm_blk_open(struct block_device *bdev, fmode_t mode)
324{ 338{
325 struct mapped_device *md; 339 struct mapped_device *md;
@@ -331,7 +345,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
331 goto out; 345 goto out;
332 346
333 if (test_bit(DMF_FREEING, &md->flags) || 347 if (test_bit(DMF_FREEING, &md->flags) ||
334 test_bit(DMF_DELETING, &md->flags)) { 348 dm_deleting_md(md)) {
335 md = NULL; 349 md = NULL;
336 goto out; 350 goto out;
337 } 351 }
@@ -388,7 +402,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
388 unsigned int cmd, unsigned long arg) 402 unsigned int cmd, unsigned long arg)
389{ 403{
390 struct mapped_device *md = bdev->bd_disk->private_data; 404 struct mapped_device *md = bdev->bd_disk->private_data;
391 struct dm_table *map = dm_get_table(md); 405 struct dm_table *map = dm_get_live_table(md);
392 struct dm_target *tgt; 406 struct dm_target *tgt;
393 int r = -ENOTTY; 407 int r = -ENOTTY;
394 408
@@ -401,7 +415,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
401 415
402 tgt = dm_table_get_target(map, 0); 416 tgt = dm_table_get_target(map, 0);
403 417
404 if (dm_suspended(md)) { 418 if (dm_suspended_md(md)) {
405 r = -EAGAIN; 419 r = -EAGAIN;
406 goto out; 420 goto out;
407 } 421 }
@@ -430,9 +444,10 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
430 mempool_free(tio, md->tio_pool); 444 mempool_free(tio, md->tio_pool);
431} 445}
432 446
433static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) 447static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
448 gfp_t gfp_mask)
434{ 449{
435 return mempool_alloc(md->tio_pool, GFP_ATOMIC); 450 return mempool_alloc(md->tio_pool, gfp_mask);
436} 451}
437 452
438static void free_rq_tio(struct dm_rq_target_io *tio) 453static void free_rq_tio(struct dm_rq_target_io *tio)
@@ -450,6 +465,12 @@ static void free_bio_info(struct dm_rq_clone_bio_info *info)
450 mempool_free(info, info->tio->md->io_pool); 465 mempool_free(info, info->tio->md->io_pool);
451} 466}
452 467
468static int md_in_flight(struct mapped_device *md)
469{
470 return atomic_read(&md->pending[READ]) +
471 atomic_read(&md->pending[WRITE]);
472}
473
453static void start_io_acct(struct dm_io *io) 474static void start_io_acct(struct dm_io *io)
454{ 475{
455 struct mapped_device *md = io->md; 476 struct mapped_device *md = io->md;
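With md_in_flight() added, request-based dm stops consulting the queue's in_flight counter: the dispatch path bumps md->pending[] for the clone's data direction, rq_completed() drops it, and the suspend path simply waits for both counters to reach zero. A toy, single-threaded sketch of that bookkeeping (plain ints instead of atomics, a printf instead of wake_up()):

#include <stdio.h>

enum { READ = 0, WRITE = 1 };

struct md_sketch {
	int pending[2];
};

static int md_in_flight(struct md_sketch *md)
{
	return md->pending[READ] + md->pending[WRITE];
}

static void dispatch_clone(struct md_sketch *md, int rw)
{
	md->pending[rw]++;	/* done under the queue lock in dm_request_fn() */
}

static void rq_completed(struct md_sketch *md, int rw)
{
	md->pending[rw]--;
	if (!md_in_flight(md))
		printf("no I/O in flight, wake suspend waiter\n");
}

int main(void)
{
	struct md_sketch md = { { 0, 0 } };

	dispatch_clone(&md, READ);
	dispatch_clone(&md, WRITE);
	rq_completed(&md, READ);
	rq_completed(&md, WRITE);	/* prints the wake-up message */
	return 0;
}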
@@ -512,7 +533,7 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
512 * function to access the md->map field, and make sure they call 533 * function to access the md->map field, and make sure they call
513 * dm_table_put() when finished. 534 * dm_table_put() when finished.
514 */ 535 */
515struct dm_table *dm_get_table(struct mapped_device *md) 536struct dm_table *dm_get_live_table(struct mapped_device *md)
516{ 537{
517 struct dm_table *t; 538 struct dm_table *t;
518 unsigned long flags; 539 unsigned long flags;
@@ -716,28 +737,38 @@ static void end_clone_bio(struct bio *clone, int error)
716 blk_update_request(tio->orig, 0, nr_bytes); 737 blk_update_request(tio->orig, 0, nr_bytes);
717} 738}
718 739
740static void store_barrier_error(struct mapped_device *md, int error)
741{
742 unsigned long flags;
743
744 spin_lock_irqsave(&md->barrier_error_lock, flags);
745 /*
746 * Basically, the first error is taken, but:
747 * -EOPNOTSUPP supersedes any I/O error.
748 * Requeue request supersedes any I/O error but -EOPNOTSUPP.
749 */
750 if (!md->barrier_error || error == -EOPNOTSUPP ||
751 (md->barrier_error != -EOPNOTSUPP &&
752 error == DM_ENDIO_REQUEUE))
753 md->barrier_error = error;
754 spin_unlock_irqrestore(&md->barrier_error_lock, flags);
755}
756
719/* 757/*
720 * Don't touch any member of the md after calling this function because 758 * Don't touch any member of the md after calling this function because
721 * the md may be freed in dm_put() at the end of this function. 759 * the md may be freed in dm_put() at the end of this function.
722 * Or do dm_get() before calling this function and dm_put() later. 760 * Or do dm_get() before calling this function and dm_put() later.
723 */ 761 */
724static void rq_completed(struct mapped_device *md, int run_queue) 762static void rq_completed(struct mapped_device *md, int rw, int run_queue)
725{ 763{
726 int wakeup_waiters = 0; 764 atomic_dec(&md->pending[rw]);
727 struct request_queue *q = md->queue;
728 unsigned long flags;
729
730 spin_lock_irqsave(q->queue_lock, flags);
731 if (!queue_in_flight(q))
732 wakeup_waiters = 1;
733 spin_unlock_irqrestore(q->queue_lock, flags);
734 765
735 /* nudge anyone waiting on suspend queue */ 766 /* nudge anyone waiting on suspend queue */
736 if (wakeup_waiters) 767 if (!md_in_flight(md))
737 wake_up(&md->wait); 768 wake_up(&md->wait);
738 769
739 if (run_queue) 770 if (run_queue)
740 blk_run_queue(q); 771 blk_run_queue(md->queue);
741 772
742 /* 773 /*
743 * dm_put() must be at the end of this function. See the comment above 774 * dm_put() must be at the end of this function. See the comment above
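store_barrier_error() encodes a precedence rule rather than plain "first error wins": -EOPNOTSUPP overrides anything already stored, a requeue result overrides ordinary I/O errors but not -EOPNOTSUPP, and otherwise only the first error sticks. A standalone sketch of just that rule; DM_ENDIO_REQUEUE is redefined locally as a stand-in for the kernel's positive requeue code.

#include <stdio.h>
#include <errno.h>

#define DM_ENDIO_REQUEUE	2	/* stand-in value for the sketch */

static void store_error(int *stored, int error)
{
	if (!*stored || error == -EOPNOTSUPP ||
	    (*stored != -EOPNOTSUPP && error == DM_ENDIO_REQUEUE))
		*stored = error;
}

int main(void)
{
	int barrier_error = 0;

	store_error(&barrier_error, -EIO);		/* first error sticks */
	store_error(&barrier_error, DM_ENDIO_REQUEUE);	/* requeue beats -EIO */
	store_error(&barrier_error, -EOPNOTSUPP);	/* -EOPNOTSUPP beats all */
	store_error(&barrier_error, -EIO);		/* ignored */

	printf("barrier_error=%d\n", barrier_error);	/* the -EOPNOTSUPP value */
	return 0;
}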
@@ -753,6 +784,44 @@ static void free_rq_clone(struct request *clone)
753 free_rq_tio(tio); 784 free_rq_tio(tio);
754} 785}
755 786
787/*
788 * Complete the clone and the original request.
789 * Must be called without queue lock.
790 */
791static void dm_end_request(struct request *clone, int error)
792{
793 int rw = rq_data_dir(clone);
794 int run_queue = 1;
795 bool is_barrier = blk_barrier_rq(clone);
796 struct dm_rq_target_io *tio = clone->end_io_data;
797 struct mapped_device *md = tio->md;
798 struct request *rq = tio->orig;
799
800 if (blk_pc_request(rq) && !is_barrier) {
801 rq->errors = clone->errors;
802 rq->resid_len = clone->resid_len;
803
804 if (rq->sense)
805 /*
806 * We are using the sense buffer of the original
807 * request.
808 * So setting the length of the sense data is enough.
809 */
810 rq->sense_len = clone->sense_len;
811 }
812
813 free_rq_clone(clone);
814
815 if (unlikely(is_barrier)) {
816 if (unlikely(error))
817 store_barrier_error(md, error);
818 run_queue = 0;
819 } else
820 blk_end_request_all(rq, error);
821
822 rq_completed(md, rw, run_queue);
823}
824
756static void dm_unprep_request(struct request *rq) 825static void dm_unprep_request(struct request *rq)
757{ 826{
758 struct request *clone = rq->special; 827 struct request *clone = rq->special;
@@ -768,12 +837,23 @@ static void dm_unprep_request(struct request *rq)
768 */ 837 */
769void dm_requeue_unmapped_request(struct request *clone) 838void dm_requeue_unmapped_request(struct request *clone)
770{ 839{
840 int rw = rq_data_dir(clone);
771 struct dm_rq_target_io *tio = clone->end_io_data; 841 struct dm_rq_target_io *tio = clone->end_io_data;
772 struct mapped_device *md = tio->md; 842 struct mapped_device *md = tio->md;
773 struct request *rq = tio->orig; 843 struct request *rq = tio->orig;
774 struct request_queue *q = rq->q; 844 struct request_queue *q = rq->q;
775 unsigned long flags; 845 unsigned long flags;
776 846
847 if (unlikely(blk_barrier_rq(clone))) {
848 /*
849 * Barrier clones share an original request.
850 * Leave it to dm_end_request(), which handles this special
851 * case.
852 */
853 dm_end_request(clone, DM_ENDIO_REQUEUE);
854 return;
855 }
856
777 dm_unprep_request(rq); 857 dm_unprep_request(rq);
778 858
779 spin_lock_irqsave(q->queue_lock, flags); 859 spin_lock_irqsave(q->queue_lock, flags);
@@ -782,7 +862,7 @@ void dm_requeue_unmapped_request(struct request *clone)
782 blk_requeue_request(q, rq); 862 blk_requeue_request(q, rq);
783 spin_unlock_irqrestore(q->queue_lock, flags); 863 spin_unlock_irqrestore(q->queue_lock, flags);
784 864
785 rq_completed(md, 0); 865 rq_completed(md, rw, 0);
786} 866}
787EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 867EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
788 868
@@ -815,34 +895,28 @@ static void start_queue(struct request_queue *q)
815 spin_unlock_irqrestore(q->queue_lock, flags); 895 spin_unlock_irqrestore(q->queue_lock, flags);
816} 896}
817 897
818/* 898static void dm_done(struct request *clone, int error, bool mapped)
819 * Complete the clone and the original request.
820 * Must be called without queue lock.
821 */
822static void dm_end_request(struct request *clone, int error)
823{ 899{
900 int r = error;
824 struct dm_rq_target_io *tio = clone->end_io_data; 901 struct dm_rq_target_io *tio = clone->end_io_data;
825 struct mapped_device *md = tio->md; 902 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
826 struct request *rq = tio->orig;
827 903
828 if (blk_pc_request(rq)) { 904 if (mapped && rq_end_io)
829 rq->errors = clone->errors; 905 r = rq_end_io(tio->ti, clone, error, &tio->info);
830 rq->resid_len = clone->resid_len;
831 906
832 if (rq->sense) 907 if (r <= 0)
833 /* 908 /* The target wants to complete the I/O */
834 * We are using the sense buffer of the original 909 dm_end_request(clone, r);
835 * request. 910 else if (r == DM_ENDIO_INCOMPLETE)
836 * So setting the length of the sense data is enough. 911 /* The target will handle the I/O */
837 */ 912 return;
838 rq->sense_len = clone->sense_len; 913 else if (r == DM_ENDIO_REQUEUE)
914 /* The target wants to requeue the I/O */
915 dm_requeue_unmapped_request(clone);
916 else {
917 DMWARN("unimplemented target endio return value: %d", r);
918 BUG();
839 } 919 }
840
841 free_rq_clone(clone);
842
843 blk_end_request_all(rq, error);
844
845 rq_completed(md, 1);
846} 920}
847 921
848/* 922/*
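The completion logic formerly split across dm_end_request() and dm_softirq_done() is folded into dm_done(), whose job is to interpret the target's rq_end_io return value: anything <= 0 completes the clone, DM_ENDIO_INCOMPLETE means the target finishes the I/O itself, DM_ENDIO_REQUEUE requeues the original request, and anything else is a bug. A userspace sketch of just that dispatch, with printf standing in for the real actions and locally defined stand-in codes:

#include <stdio.h>
#include <errno.h>

#define DM_ENDIO_INCOMPLETE	1	/* stand-in values for the sketch */
#define DM_ENDIO_REQUEUE	2

static void done_dispatch(int r)
{
	if (r <= 0)
		printf("complete the I/O, error %d\n", r);	/* dm_end_request() */
	else if (r == DM_ENDIO_INCOMPLETE)
		printf("target will finish the I/O itself\n");
	else if (r == DM_ENDIO_REQUEUE)
		printf("requeue the original request\n");	/* dm_requeue_unmapped_request() */
	else
		printf("unimplemented endio value %d\n", r);	/* BUG() in the kernel */
}

int main(void)
{
	done_dispatch(0);
	done_dispatch(-EIO);
	done_dispatch(DM_ENDIO_INCOMPLETE);
	done_dispatch(DM_ENDIO_REQUEUE);
	return 0;
}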
@@ -850,27 +924,14 @@ static void dm_end_request(struct request *clone, int error)
850 */ 924 */
851static void dm_softirq_done(struct request *rq) 925static void dm_softirq_done(struct request *rq)
852{ 926{
927 bool mapped = true;
853 struct request *clone = rq->completion_data; 928 struct request *clone = rq->completion_data;
854 struct dm_rq_target_io *tio = clone->end_io_data; 929 struct dm_rq_target_io *tio = clone->end_io_data;
855 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
856 int error = tio->error;
857 930
858 if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) 931 if (rq->cmd_flags & REQ_FAILED)
859 error = rq_end_io(tio->ti, clone, error, &tio->info); 932 mapped = false;
860 933
861 if (error <= 0) 934 dm_done(clone, tio->error, mapped);
862 /* The target wants to complete the I/O */
863 dm_end_request(clone, error);
864 else if (error == DM_ENDIO_INCOMPLETE)
865 /* The target will handle the I/O */
866 return;
867 else if (error == DM_ENDIO_REQUEUE)
868 /* The target wants to requeue the I/O */
869 dm_requeue_unmapped_request(clone);
870 else {
871 DMWARN("unimplemented target endio return value: %d", error);
872 BUG();
873 }
874} 935}
875 936
876/* 937/*
@@ -882,6 +943,19 @@ static void dm_complete_request(struct request *clone, int error)
882 struct dm_rq_target_io *tio = clone->end_io_data; 943 struct dm_rq_target_io *tio = clone->end_io_data;
883 struct request *rq = tio->orig; 944 struct request *rq = tio->orig;
884 945
946 if (unlikely(blk_barrier_rq(clone))) {
947 /*
948 * Barrier clones share an original request. So can't use
949 * softirq_done with the original.
950 * Pass the clone to dm_done() directly in this special case.
951 * It is safe (even if clone->q->queue_lock is held here)
952 * because there is no I/O dispatching during the completion
953 * of barrier clone.
954 */
955 dm_done(clone, error, true);
956 return;
957 }
958
885 tio->error = error; 959 tio->error = error;
886 rq->completion_data = clone; 960 rq->completion_data = clone;
887 blk_complete_request(rq); 961 blk_complete_request(rq);
@@ -898,6 +972,17 @@ void dm_kill_unmapped_request(struct request *clone, int error)
898 struct dm_rq_target_io *tio = clone->end_io_data; 972 struct dm_rq_target_io *tio = clone->end_io_data;
899 struct request *rq = tio->orig; 973 struct request *rq = tio->orig;
900 974
975 if (unlikely(blk_barrier_rq(clone))) {
976 /*
977 * Barrier clones share an original request.
978 * Leave it to dm_end_request(), which handles this special
979 * case.
980 */
981 BUG_ON(error > 0);
982 dm_end_request(clone, error);
983 return;
984 }
985
901 rq->cmd_flags |= REQ_FAILED; 986 rq->cmd_flags |= REQ_FAILED;
902 dm_complete_request(clone, error); 987 dm_complete_request(clone, error);
903} 988}
@@ -1214,7 +1299,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1214 struct clone_info ci; 1299 struct clone_info ci;
1215 int error = 0; 1300 int error = 0;
1216 1301
1217 ci.map = dm_get_table(md); 1302 ci.map = dm_get_live_table(md);
1218 if (unlikely(!ci.map)) { 1303 if (unlikely(!ci.map)) {
1219 if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) 1304 if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
1220 bio_io_error(bio); 1305 bio_io_error(bio);
@@ -1255,7 +1340,7 @@ static int dm_merge_bvec(struct request_queue *q,
1255 struct bio_vec *biovec) 1340 struct bio_vec *biovec)
1256{ 1341{
1257 struct mapped_device *md = q->queuedata; 1342 struct mapped_device *md = q->queuedata;
1258 struct dm_table *map = dm_get_table(md); 1343 struct dm_table *map = dm_get_live_table(md);
1259 struct dm_target *ti; 1344 struct dm_target *ti;
1260 sector_t max_sectors; 1345 sector_t max_sectors;
1261 int max_size = 0; 1346 int max_size = 0;
@@ -1352,11 +1437,6 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
1352{ 1437{
1353 struct mapped_device *md = q->queuedata; 1438 struct mapped_device *md = q->queuedata;
1354 1439
1355 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
1356 bio_endio(bio, -EOPNOTSUPP);
1357 return 0;
1358 }
1359
1360 return md->saved_make_request_fn(q, bio); /* call __make_request() */ 1440 return md->saved_make_request_fn(q, bio); /* call __make_request() */
1361} 1441}
1362 1442
@@ -1375,6 +1455,25 @@ static int dm_request(struct request_queue *q, struct bio *bio)
1375 return _dm_request(q, bio); 1455 return _dm_request(q, bio);
1376} 1456}
1377 1457
1458/*
1459 * Mark this request as flush request, so that dm_request_fn() can
1460 * recognize.
1461 */
1462static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq)
1463{
1464 rq->cmd_type = REQ_TYPE_LINUX_BLOCK;
1465 rq->cmd[0] = REQ_LB_OP_FLUSH;
1466}
1467
1468static bool dm_rq_is_flush_request(struct request *rq)
1469{
1470 if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK &&
1471 rq->cmd[0] == REQ_LB_OP_FLUSH)
1472 return true;
1473 else
1474 return false;
1475}
1476
1378void dm_dispatch_request(struct request *rq) 1477void dm_dispatch_request(struct request *rq)
1379{ 1478{
1380 int r; 1479 int r;
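dm_rq_prepare_flush() and dm_rq_is_flush_request() rely on tagging: the queue's prepare_flush hook stamps the request with a special command type and opcode, and dm_prep_fn()/dm_request_fn() recognize that stamp later instead of comparing against a dedicated marker request. A minimal stand-alone sketch of the tag-and-test idea; the types and constants are invented for the sketch, not the real block-layer definitions.

#include <stdio.h>
#include <stdbool.h>

enum cmd_type { CMD_TYPE_FS, CMD_TYPE_LINUX_BLOCK };	/* invented */
enum { LB_OP_FLUSH = 0x41 };				/* invented opcode */

struct request_sketch {
	enum cmd_type cmd_type;
	unsigned char cmd[16];
};

static void prepare_flush(struct request_sketch *rq)
{
	rq->cmd_type = CMD_TYPE_LINUX_BLOCK;
	rq->cmd[0] = LB_OP_FLUSH;
}

static bool is_flush_request(const struct request_sketch *rq)
{
	return rq->cmd_type == CMD_TYPE_LINUX_BLOCK &&
	       rq->cmd[0] == LB_OP_FLUSH;
}

int main(void)
{
	struct request_sketch rq = { CMD_TYPE_FS, { 0 } };

	printf("before tagging: %d\n", is_flush_request(&rq));
	prepare_flush(&rq);
	printf("after tagging:  %d\n", is_flush_request(&rq));
	return 0;
}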
@@ -1420,25 +1519,54 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1420static int setup_clone(struct request *clone, struct request *rq, 1519static int setup_clone(struct request *clone, struct request *rq,
1421 struct dm_rq_target_io *tio) 1520 struct dm_rq_target_io *tio)
1422{ 1521{
1423 int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1522 int r;
1424 dm_rq_bio_constructor, tio);
1425 1523
1426 if (r) 1524 if (dm_rq_is_flush_request(rq)) {
1427 return r; 1525 blk_rq_init(NULL, clone);
1526 clone->cmd_type = REQ_TYPE_FS;
1527 clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
1528 } else {
1529 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1530 dm_rq_bio_constructor, tio);
1531 if (r)
1532 return r;
1533
1534 clone->cmd = rq->cmd;
1535 clone->cmd_len = rq->cmd_len;
1536 clone->sense = rq->sense;
1537 clone->buffer = rq->buffer;
1538 }
1428 1539
1429 clone->cmd = rq->cmd;
1430 clone->cmd_len = rq->cmd_len;
1431 clone->sense = rq->sense;
1432 clone->buffer = rq->buffer;
1433 clone->end_io = end_clone_request; 1540 clone->end_io = end_clone_request;
1434 clone->end_io_data = tio; 1541 clone->end_io_data = tio;
1435 1542
1436 return 0; 1543 return 0;
1437} 1544}
1438 1545
1439static int dm_rq_flush_suspending(struct mapped_device *md) 1546static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1547 gfp_t gfp_mask)
1440{ 1548{
1441 return !md->suspend_rq.special; 1549 struct request *clone;
1550 struct dm_rq_target_io *tio;
1551
1552 tio = alloc_rq_tio(md, gfp_mask);
1553 if (!tio)
1554 return NULL;
1555
1556 tio->md = md;
1557 tio->ti = NULL;
1558 tio->orig = rq;
1559 tio->error = 0;
1560 memset(&tio->info, 0, sizeof(tio->info));
1561
1562 clone = &tio->clone;
1563 if (setup_clone(clone, rq, tio)) {
1564 /* -ENOMEM */
1565 free_rq_tio(tio);
1566 return NULL;
1567 }
1568
1569 return clone;
1442} 1570}
1443 1571
1444/* 1572/*
@@ -1447,39 +1575,19 @@ static int dm_rq_flush_suspending(struct mapped_device *md)
1447static int dm_prep_fn(struct request_queue *q, struct request *rq) 1575static int dm_prep_fn(struct request_queue *q, struct request *rq)
1448{ 1576{
1449 struct mapped_device *md = q->queuedata; 1577 struct mapped_device *md = q->queuedata;
1450 struct dm_rq_target_io *tio;
1451 struct request *clone; 1578 struct request *clone;
1452 1579
1453 if (unlikely(rq == &md->suspend_rq)) { 1580 if (unlikely(dm_rq_is_flush_request(rq)))
1454 if (dm_rq_flush_suspending(md)) 1581 return BLKPREP_OK;
1455 return BLKPREP_OK;
1456 else
1457 /* The flush suspend was interrupted */
1458 return BLKPREP_KILL;
1459 }
1460 1582
1461 if (unlikely(rq->special)) { 1583 if (unlikely(rq->special)) {
1462 DMWARN("Already has something in rq->special."); 1584 DMWARN("Already has something in rq->special.");
1463 return BLKPREP_KILL; 1585 return BLKPREP_KILL;
1464 } 1586 }
1465 1587
1466 tio = alloc_rq_tio(md); /* Only one for each original request */ 1588 clone = clone_rq(rq, md, GFP_ATOMIC);
1467 if (!tio) 1589 if (!clone)
1468 /* -ENOMEM */
1469 return BLKPREP_DEFER;
1470
1471 tio->md = md;
1472 tio->ti = NULL;
1473 tio->orig = rq;
1474 tio->error = 0;
1475 memset(&tio->info, 0, sizeof(tio->info));
1476
1477 clone = &tio->clone;
1478 if (setup_clone(clone, rq, tio)) {
1479 /* -ENOMEM */
1480 free_rq_tio(tio);
1481 return BLKPREP_DEFER; 1590 return BLKPREP_DEFER;
1482 }
1483 1591
1484 rq->special = clone; 1592 rq->special = clone;
1485 rq->cmd_flags |= REQ_DONTPREP; 1593 rq->cmd_flags |= REQ_DONTPREP;
@@ -1487,11 +1595,10 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
1487 return BLKPREP_OK; 1595 return BLKPREP_OK;
1488} 1596}
1489 1597
1490static void map_request(struct dm_target *ti, struct request *rq, 1598static void map_request(struct dm_target *ti, struct request *clone,
1491 struct mapped_device *md) 1599 struct mapped_device *md)
1492{ 1600{
1493 int r; 1601 int r;
1494 struct request *clone = rq->special;
1495 struct dm_rq_target_io *tio = clone->end_io_data; 1602 struct dm_rq_target_io *tio = clone->end_io_data;
1496 1603
1497 /* 1604 /*
@@ -1511,6 +1618,8 @@ static void map_request(struct dm_target *ti, struct request *rq,
1511 break; 1618 break;
1512 case DM_MAPIO_REMAPPED: 1619 case DM_MAPIO_REMAPPED:
1513 /* The target has remapped the I/O so dispatch it */ 1620 /* The target has remapped the I/O so dispatch it */
1621 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1622 blk_rq_pos(tio->orig));
1514 dm_dispatch_request(clone); 1623 dm_dispatch_request(clone);
1515 break; 1624 break;
1516 case DM_MAPIO_REQUEUE: 1625 case DM_MAPIO_REQUEUE:
@@ -1536,29 +1645,26 @@ static void map_request(struct dm_target *ti, struct request *rq,
1536static void dm_request_fn(struct request_queue *q) 1645static void dm_request_fn(struct request_queue *q)
1537{ 1646{
1538 struct mapped_device *md = q->queuedata; 1647 struct mapped_device *md = q->queuedata;
1539 struct dm_table *map = dm_get_table(md); 1648 struct dm_table *map = dm_get_live_table(md);
1540 struct dm_target *ti; 1649 struct dm_target *ti;
1541 struct request *rq; 1650 struct request *rq, *clone;
1542 1651
1543 /* 1652 /*
1544 * For noflush suspend, check blk_queue_stopped() to immediately 1653 * For suspend, check blk_queue_stopped() and increment
1545 * quit I/O dispatching. 1654 * ->pending within a single queue_lock not to increment the
1655 * number of in-flight I/Os after the queue is stopped in
1656 * dm_suspend().
1546 */ 1657 */
1547 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { 1658 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
1548 rq = blk_peek_request(q); 1659 rq = blk_peek_request(q);
1549 if (!rq) 1660 if (!rq)
1550 goto plug_and_out; 1661 goto plug_and_out;
1551 1662
1552 if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */ 1663 if (unlikely(dm_rq_is_flush_request(rq))) {
1553 if (queue_in_flight(q)) 1664 BUG_ON(md->flush_request);
1554 /* Not quiet yet. Wait more */ 1665 md->flush_request = rq;
1555 goto plug_and_out;
1556
1557 /* This device should be quiet now */
1558 __stop_queue(q);
1559 blk_start_request(rq); 1666 blk_start_request(rq);
1560 __blk_end_request_all(rq, 0); 1667 queue_work(md->wq, &md->barrier_work);
1561 wake_up(&md->wait);
1562 goto out; 1668 goto out;
1563 } 1669 }
1564 1670
@@ -1567,8 +1673,11 @@ static void dm_request_fn(struct request_queue *q)
1567 goto plug_and_out; 1673 goto plug_and_out;
1568 1674
1569 blk_start_request(rq); 1675 blk_start_request(rq);
1676 clone = rq->special;
1677 atomic_inc(&md->pending[rq_data_dir(clone)]);
1678
1570 spin_unlock(q->queue_lock); 1679 spin_unlock(q->queue_lock);
1571 map_request(ti, rq, md); 1680 map_request(ti, clone, md);
1572 spin_lock_irq(q->queue_lock); 1681 spin_lock_irq(q->queue_lock);
1573 } 1682 }
1574 1683
@@ -1595,7 +1704,7 @@ static int dm_lld_busy(struct request_queue *q)
1595{ 1704{
1596 int r; 1705 int r;
1597 struct mapped_device *md = q->queuedata; 1706 struct mapped_device *md = q->queuedata;
1598 struct dm_table *map = dm_get_table(md); 1707 struct dm_table *map = dm_get_live_table(md);
1599 1708
1600 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1709 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1601 r = 1; 1710 r = 1;
@@ -1610,7 +1719,7 @@ static int dm_lld_busy(struct request_queue *q)
1610static void dm_unplug_all(struct request_queue *q) 1719static void dm_unplug_all(struct request_queue *q)
1611{ 1720{
1612 struct mapped_device *md = q->queuedata; 1721 struct mapped_device *md = q->queuedata;
1613 struct dm_table *map = dm_get_table(md); 1722 struct dm_table *map = dm_get_live_table(md);
1614 1723
1615 if (map) { 1724 if (map) {
1616 if (dm_request_based(md)) 1725 if (dm_request_based(md))
@@ -1628,7 +1737,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
1628 struct dm_table *map; 1737 struct dm_table *map;
1629 1738
1630 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1739 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1631 map = dm_get_table(md); 1740 map = dm_get_live_table(md);
1632 if (map) { 1741 if (map) {
1633 /* 1742 /*
1634 * Request-based dm cares about only own queue for 1743 * Request-based dm cares about only own queue for
@@ -1725,6 +1834,7 @@ out:
1725static const struct block_device_operations dm_blk_dops; 1834static const struct block_device_operations dm_blk_dops;
1726 1835
1727static void dm_wq_work(struct work_struct *work); 1836static void dm_wq_work(struct work_struct *work);
1837static void dm_rq_barrier_work(struct work_struct *work);
1728 1838
1729/* 1839/*
1730 * Allocate and initialise a blank device with a given minor. 1840 * Allocate and initialise a blank device with a given minor.
@@ -1754,6 +1864,7 @@ static struct mapped_device *alloc_dev(int minor)
1754 init_rwsem(&md->io_lock); 1864 init_rwsem(&md->io_lock);
1755 mutex_init(&md->suspend_lock); 1865 mutex_init(&md->suspend_lock);
1756 spin_lock_init(&md->deferred_lock); 1866 spin_lock_init(&md->deferred_lock);
1867 spin_lock_init(&md->barrier_error_lock);
1757 rwlock_init(&md->map_lock); 1868 rwlock_init(&md->map_lock);
1758 atomic_set(&md->holders, 1); 1869 atomic_set(&md->holders, 1);
1759 atomic_set(&md->open_count, 0); 1870 atomic_set(&md->open_count, 0);
@@ -1788,6 +1899,8 @@ static struct mapped_device *alloc_dev(int minor)
1788 blk_queue_softirq_done(md->queue, dm_softirq_done); 1899 blk_queue_softirq_done(md->queue, dm_softirq_done);
1789 blk_queue_prep_rq(md->queue, dm_prep_fn); 1900 blk_queue_prep_rq(md->queue, dm_prep_fn);
1790 blk_queue_lld_busy(md->queue, dm_lld_busy); 1901 blk_queue_lld_busy(md->queue, dm_lld_busy);
1902 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH,
1903 dm_rq_prepare_flush);
1791 1904
1792 md->disk = alloc_disk(1); 1905 md->disk = alloc_disk(1);
1793 if (!md->disk) 1906 if (!md->disk)
@@ -1797,6 +1910,7 @@ static struct mapped_device *alloc_dev(int minor)
1797 atomic_set(&md->pending[1], 0); 1910 atomic_set(&md->pending[1], 0);
1798 init_waitqueue_head(&md->wait); 1911 init_waitqueue_head(&md->wait);
1799 INIT_WORK(&md->work, dm_wq_work); 1912 INIT_WORK(&md->work, dm_wq_work);
1913 INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
1800 init_waitqueue_head(&md->eventq); 1914 init_waitqueue_head(&md->eventq);
1801 1915
1802 md->disk->major = _major; 1916 md->disk->major = _major;
@@ -1921,9 +2035,13 @@ static void __set_size(struct mapped_device *md, sector_t size)
1921 mutex_unlock(&md->bdev->bd_inode->i_mutex); 2035 mutex_unlock(&md->bdev->bd_inode->i_mutex);
1922} 2036}
1923 2037
1924static int __bind(struct mapped_device *md, struct dm_table *t, 2038/*
1925 struct queue_limits *limits) 2039 * Returns old map, which caller must destroy.
2040 */
2041static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2042 struct queue_limits *limits)
1926{ 2043{
2044 struct dm_table *old_map;
1927 struct request_queue *q = md->queue; 2045 struct request_queue *q = md->queue;
1928 sector_t size; 2046 sector_t size;
1929 unsigned long flags; 2047 unsigned long flags;
@@ -1938,11 +2056,6 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
1938 2056
1939 __set_size(md, size); 2057 __set_size(md, size);
1940 2058
1941 if (!size) {
1942 dm_table_destroy(t);
1943 return 0;
1944 }
1945
1946 dm_table_event_callback(t, event_callback, md); 2059 dm_table_event_callback(t, event_callback, md);
1947 2060
1948 /* 2061 /*
@@ -1958,26 +2071,31 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
1958 __bind_mempools(md, t); 2071 __bind_mempools(md, t);
1959 2072
1960 write_lock_irqsave(&md->map_lock, flags); 2073 write_lock_irqsave(&md->map_lock, flags);
2074 old_map = md->map;
1961 md->map = t; 2075 md->map = t;
1962 dm_table_set_restrictions(t, q, limits); 2076 dm_table_set_restrictions(t, q, limits);
1963 write_unlock_irqrestore(&md->map_lock, flags); 2077 write_unlock_irqrestore(&md->map_lock, flags);
1964 2078
1965 return 0; 2079 return old_map;
1966} 2080}
1967 2081
1968static void __unbind(struct mapped_device *md) 2082/*
2083 * Returns unbound table for the caller to free.
2084 */
2085static struct dm_table *__unbind(struct mapped_device *md)
1969{ 2086{
1970 struct dm_table *map = md->map; 2087 struct dm_table *map = md->map;
1971 unsigned long flags; 2088 unsigned long flags;
1972 2089
1973 if (!map) 2090 if (!map)
1974 return; 2091 return NULL;
1975 2092
1976 dm_table_event_callback(map, NULL, NULL); 2093 dm_table_event_callback(map, NULL, NULL);
1977 write_lock_irqsave(&md->map_lock, flags); 2094 write_lock_irqsave(&md->map_lock, flags);
1978 md->map = NULL; 2095 md->map = NULL;
1979 write_unlock_irqrestore(&md->map_lock, flags); 2096 write_unlock_irqrestore(&md->map_lock, flags);
1980 dm_table_destroy(map); 2097
2098 return map;
1981} 2099}
1982 2100
1983/* 2101/*
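__bind() and __unbind() no longer destroy tables themselves; each hands the displaced table back so the caller can destroy it outside the map lock, which is why dm_put() now does dm_table_destroy(__unbind(md)). A tiny ownership sketch of that contract with invented toy types:

#include <stdio.h>
#include <stdlib.h>

struct table_sketch {
	int id;
};

struct md_sketch {
	struct table_sketch *map;	/* the live table */
};

/* Install a new table and hand back the one it replaces (may be NULL);
 * destroying the returned table is the caller's job. */
static struct table_sketch *bind_table(struct md_sketch *md,
				       struct table_sketch *t)
{
	struct table_sketch *old_map = md->map;

	md->map = t;
	return old_map;
}

static struct table_sketch *make_table(int id)
{
	struct table_sketch *t = malloc(sizeof(*t));

	t->id = id;
	return t;
}

int main(void)
{
	struct md_sketch md = { NULL };
	struct table_sketch *old;

	free(bind_table(&md, make_table(1)));	/* nothing to destroy yet */
	old = bind_table(&md, make_table(2));	/* swap, as dm_swap_table() does */
	printf("caller destroys old table %d\n", old->id);
	free(old);
	free(md.map);
	return 0;
}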
@@ -2059,18 +2177,18 @@ void dm_put(struct mapped_device *md)
2059 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2177 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2060 2178
2061 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { 2179 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
2062 map = dm_get_table(md); 2180 map = dm_get_live_table(md);
2063 idr_replace(&_minor_idr, MINOR_ALLOCED, 2181 idr_replace(&_minor_idr, MINOR_ALLOCED,
2064 MINOR(disk_devt(dm_disk(md)))); 2182 MINOR(disk_devt(dm_disk(md))));
2065 set_bit(DMF_FREEING, &md->flags); 2183 set_bit(DMF_FREEING, &md->flags);
2066 spin_unlock(&_minor_lock); 2184 spin_unlock(&_minor_lock);
2067 if (!dm_suspended(md)) { 2185 if (!dm_suspended_md(md)) {
2068 dm_table_presuspend_targets(map); 2186 dm_table_presuspend_targets(map);
2069 dm_table_postsuspend_targets(map); 2187 dm_table_postsuspend_targets(map);
2070 } 2188 }
2071 dm_sysfs_exit(md); 2189 dm_sysfs_exit(md);
2072 dm_table_put(map); 2190 dm_table_put(map);
2073 __unbind(md); 2191 dm_table_destroy(__unbind(md));
2074 free_dev(md); 2192 free_dev(md);
2075 } 2193 }
2076} 2194}
@@ -2080,8 +2198,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2080{ 2198{
2081 int r = 0; 2199 int r = 0;
2082 DECLARE_WAITQUEUE(wait, current); 2200 DECLARE_WAITQUEUE(wait, current);
2083 struct request_queue *q = md->queue;
2084 unsigned long flags;
2085 2201
2086 dm_unplug_all(md->queue); 2202 dm_unplug_all(md->queue);
2087 2203
@@ -2091,15 +2207,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2091 set_current_state(interruptible); 2207 set_current_state(interruptible);
2092 2208
2093 smp_mb(); 2209 smp_mb();
2094 if (dm_request_based(md)) { 2210 if (!md_in_flight(md))
2095 spin_lock_irqsave(q->queue_lock, flags);
2096 if (!queue_in_flight(q) && blk_queue_stopped(q)) {
2097 spin_unlock_irqrestore(q->queue_lock, flags);
2098 break;
2099 }
2100 spin_unlock_irqrestore(q->queue_lock, flags);
2101 } else if (!atomic_read(&md->pending[0]) &&
2102 !atomic_read(&md->pending[1]))
2103 break; 2211 break;
2104 2212
2105 if (interruptible == TASK_INTERRUPTIBLE && 2213 if (interruptible == TASK_INTERRUPTIBLE &&
@@ -2194,98 +2302,106 @@ static void dm_queue_flush(struct mapped_device *md)
2194 queue_work(md->wq, &md->work); 2302 queue_work(md->wq, &md->work);
2195} 2303}
2196 2304
2197/* 2305static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
2198 * Swap in a new table (destroying old one).
2199 */
2200int dm_swap_table(struct mapped_device *md, struct dm_table *table)
2201{ 2306{
2202 struct queue_limits limits; 2307 struct dm_rq_target_io *tio = clone->end_io_data;
2203 int r = -EINVAL;
2204 2308
2205 mutex_lock(&md->suspend_lock); 2309 tio->info.flush_request = flush_nr;
2310}
2206 2311
2207 /* device must be suspended */ 2312/* Issue barrier requests to targets and wait for their completion. */
2208 if (!dm_suspended(md)) 2313static int dm_rq_barrier(struct mapped_device *md)
2209 goto out; 2314{
2315 int i, j;
2316 struct dm_table *map = dm_get_live_table(md);
2317 unsigned num_targets = dm_table_get_num_targets(map);
2318 struct dm_target *ti;
2319 struct request *clone;
2210 2320
2211 r = dm_calculate_queue_limits(table, &limits); 2321 md->barrier_error = 0;
2212 if (r)
2213 goto out;
2214 2322
2215 /* cannot change the device type, once a table is bound */ 2323 for (i = 0; i < num_targets; i++) {
2216 if (md->map && 2324 ti = dm_table_get_target(map, i);
2217 (dm_table_get_type(md->map) != dm_table_get_type(table))) { 2325 for (j = 0; j < ti->num_flush_requests; j++) {
2218 DMWARN("can't change the device type after a table is bound"); 2326 clone = clone_rq(md->flush_request, md, GFP_NOIO);
2219 goto out; 2327 dm_rq_set_flush_nr(clone, j);
2328 atomic_inc(&md->pending[rq_data_dir(clone)]);
2329 map_request(ti, clone, md);
2330 }
2220 } 2331 }
2221 2332
2222 __unbind(md); 2333 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2223 r = __bind(md, table, &limits); 2334 dm_table_put(map);
2224
2225out:
2226 mutex_unlock(&md->suspend_lock);
2227 return r;
2228}
2229 2335
2230static void dm_rq_invalidate_suspend_marker(struct mapped_device *md) 2336 return md->barrier_error;
2231{
2232 md->suspend_rq.special = (void *)0x1;
2233} 2337}
2234 2338
2235static void dm_rq_abort_suspend(struct mapped_device *md, int noflush) 2339static void dm_rq_barrier_work(struct work_struct *work)
2236{ 2340{
2341 int error;
2342 struct mapped_device *md = container_of(work, struct mapped_device,
2343 barrier_work);
2237 struct request_queue *q = md->queue; 2344 struct request_queue *q = md->queue;
2345 struct request *rq;
2238 unsigned long flags; 2346 unsigned long flags;
2239 2347
2240 spin_lock_irqsave(q->queue_lock, flags); 2348 /*
2241 if (!noflush) 2349 * Hold the md reference here and leave it at the last part so that
2242 dm_rq_invalidate_suspend_marker(md); 2350 * the md can't be deleted by device opener when the barrier request
2243 __start_queue(q); 2351 * completes.
2244 spin_unlock_irqrestore(q->queue_lock, flags); 2352 */
2245} 2353 dm_get(md);
2246 2354
2247static void dm_rq_start_suspend(struct mapped_device *md, int noflush) 2355 error = dm_rq_barrier(md);
2248{
2249 struct request *rq = &md->suspend_rq;
2250 struct request_queue *q = md->queue;
2251 2356
2252 if (noflush) 2357 rq = md->flush_request;
2253 stop_queue(q); 2358 md->flush_request = NULL;
2254 else { 2359
2255 blk_rq_init(q, rq); 2360 if (error == DM_ENDIO_REQUEUE) {
2256 blk_insert_request(q, rq, 0, NULL); 2361 spin_lock_irqsave(q->queue_lock, flags);
2257 } 2362 blk_requeue_request(q, rq);
2363 spin_unlock_irqrestore(q->queue_lock, flags);
2364 } else
2365 blk_end_request_all(rq, error);
2366
2367 blk_run_queue(q);
2368
2369 dm_put(md);
2258} 2370}
2259 2371
2260static int dm_rq_suspend_available(struct mapped_device *md, int noflush) 2372/*
2373 * Swap in a new table, returning the old one for the caller to destroy.
2374 */
2375struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2261{ 2376{
2262 int r = 1; 2377 struct dm_table *map = ERR_PTR(-EINVAL);
2263 struct request *rq = &md->suspend_rq; 2378 struct queue_limits limits;
2264 struct request_queue *q = md->queue; 2379 int r;
2265 unsigned long flags;
2266 2380
2267 if (noflush) 2381 mutex_lock(&md->suspend_lock);
2268 return r;
2269 2382
2270 /* The marker must be protected by queue lock if it is in use */ 2383 /* device must be suspended */
2271 spin_lock_irqsave(q->queue_lock, flags); 2384 if (!dm_suspended_md(md))
2272 if (unlikely(rq->ref_count)) { 2385 goto out;
2273 /* 2386
2274 * This can happen, when the previous flush suspend was 2387 r = dm_calculate_queue_limits(table, &limits);
2275 * interrupted, the marker is still in the queue and 2388 if (r) {
2276 * this flush suspend has been invoked, because we don't 2389 map = ERR_PTR(r);
2277 * remove the marker at the time of suspend interruption. 2390 goto out;
2278 * We have only one marker per mapped_device, so we can't
2279 * start another flush suspend while it is in use.
2280 */
2281 BUG_ON(!rq->special); /* The marker should be invalidated */
2282 DMWARN("Invalidating the previous flush suspend is still in"
2283 " progress. Please retry later.");
2284 r = 0;
2285 } 2391 }
2286 spin_unlock_irqrestore(q->queue_lock, flags);
2287 2392
2288 return r; 2393 /* cannot change the device type, once a table is bound */
2394 if (md->map &&
2395 (dm_table_get_type(md->map) != dm_table_get_type(table))) {
2396 DMWARN("can't change the device type after a table is bound");
2397 goto out;
2398 }
2399
2400 map = __bind(md, table, &limits);
2401
2402out:
2403 mutex_unlock(&md->suspend_lock);
2404 return map;
2289} 2405}
2290 2406
2291/* 2407/*
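dm_rq_barrier() fans one flush clone out to every target, repeated num_flush_requests times per target, and records the flush number so the target can tell the copies apart. A simplified sketch of that fan-out loop with invented stand-in structures; printf replaces clone_rq()/map_request().

#include <stdio.h>

struct target_sketch {
	const char *name;
	unsigned num_flush_requests;
};

static void map_flush_clone(const struct target_sketch *ti, unsigned flush_nr)
{
	printf("dispatch flush %u to target %s\n", flush_nr, ti->name);
}

int main(void)
{
	struct target_sketch table[] = {
		{ "linear0", 1 },
		{ "mpath0", 2 },	/* a target may want several flushes */
	};
	unsigned num_targets = sizeof(table) / sizeof(table[0]);
	unsigned i, j;

	for (i = 0; i < num_targets; i++)
		for (j = 0; j < table[i].num_flush_requests; j++)
			map_flush_clone(&table[i], j);

	return 0;
}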
@@ -2330,49 +2446,11 @@ static void unlock_fs(struct mapped_device *md)
2330/* 2446/*
2331 * Suspend mechanism in request-based dm. 2447 * Suspend mechanism in request-based dm.
2332 * 2448 *
2333 * After the suspend starts, further incoming requests are kept in 2449 * 1. Flush all I/Os by lock_fs() if needed.
2334 * the request_queue and deferred. 2450 * 2. Stop dispatching any I/O by stopping the request_queue.
2335 * Remaining requests in the request_queue at the start of suspend are flushed 2451 * 3. Wait for all in-flight I/Os to be completed or requeued.
2336 * if it is flush suspend.
2337 * The suspend completes when the following conditions have been satisfied,
2338 * so wait for it:
2339 * 1. q->in_flight is 0 (which means no in_flight request)
2340 * 2. queue has been stopped (which means no request dispatching)
2341 *
2342 * 2452 *
2343 * Noflush suspend 2453 * To abort suspend, start the request_queue.
2344 * ---------------
2345 * Noflush suspend doesn't need to dispatch remaining requests.
2346 * So stop the queue immediately. Then, wait for all in_flight requests
2347 * to be completed or requeued.
2348 *
2349 * To abort noflush suspend, start the queue.
2350 *
2351 *
2352 * Flush suspend
2353 * -------------
2354 * Flush suspend needs to dispatch remaining requests. So stop the queue
2355 * after the remaining requests are completed. (Requeued request must be also
2356 * re-dispatched and completed. Until then, we can't stop the queue.)
2357 *
2358 * During flushing the remaining requests, further incoming requests are also
2359 * inserted to the same queue. To distinguish which requests are to be
2360 * flushed, we insert a marker request to the queue at the time of starting
2361 * flush suspend, like a barrier.
2362 * The dispatching is blocked when the marker is found on the top of the queue.
2363 * And the queue is stopped when all in_flight requests are completed, since
2364 * that means the remaining requests are completely flushed.
2365 * Then, the marker is removed from the queue.
2366 *
2367 * To abort flush suspend, we also need to take care of the marker, not only
2368 * starting the queue.
2369 * We don't remove the marker forcibly from the queue since it's against
2370 * the block-layer manner. Instead, we put a invalidated mark on the marker.
2371 * When the invalidated marker is found on the top of the queue, it is
2372 * immediately removed from the queue, so it doesn't block dispatching.
2373 * Because we have only one marker per mapped_device, we can't start another
2374 * flush suspend until the invalidated marker is removed from the queue.
2375 * So fail and return with -EBUSY in such a case.
2376 */ 2454 */
2377int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2455int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2378{ 2456{
@@ -2383,17 +2461,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2383 2461
2384 mutex_lock(&md->suspend_lock); 2462 mutex_lock(&md->suspend_lock);
2385 2463
2386 if (dm_suspended(md)) { 2464 if (dm_suspended_md(md)) {
2387 r = -EINVAL; 2465 r = -EINVAL;
2388 goto out_unlock; 2466 goto out_unlock;
2389 } 2467 }
2390 2468
2391 if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) { 2469 map = dm_get_live_table(md);
2392 r = -EBUSY;
2393 goto out_unlock;
2394 }
2395
2396 map = dm_get_table(md);
2397 2470
2398 /* 2471 /*
2399 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2472 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2406,8 +2479,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2406 dm_table_presuspend_targets(map); 2479 dm_table_presuspend_targets(map);
2407 2480
2408 /* 2481 /*
2409 * Flush I/O to the device. noflush supersedes do_lockfs, 2482 * Flush I/O to the device.
2410 * because lock_fs() needs to flush I/Os. 2483 * Any I/O submitted after lock_fs() may not be flushed.
2484 * noflush takes precedence over do_lockfs.
2485 * (lock_fs() flushes I/Os and waits for them to complete.)
2411 */ 2486 */
2412 if (!noflush && do_lockfs) { 2487 if (!noflush && do_lockfs) {
2413 r = lock_fs(md); 2488 r = lock_fs(md);
@@ -2436,10 +2511,15 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2436 set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); 2511 set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2437 up_write(&md->io_lock); 2512 up_write(&md->io_lock);
2438 2513
2439 flush_workqueue(md->wq); 2514 /*
2440 2515 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
2516 * can be kicked until md->queue is stopped. So stop md->queue before
2517 * flushing md->wq.
2518 */
2441 if (dm_request_based(md)) 2519 if (dm_request_based(md))
2442 dm_rq_start_suspend(md, noflush); 2520 stop_queue(md->queue);
2521
2522 flush_workqueue(md->wq);
2443 2523
2444 /* 2524 /*
2445 * At this point no more requests are entering target request routines. 2525 * At this point no more requests are entering target request routines.
@@ -2458,7 +2538,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2458 dm_queue_flush(md); 2538 dm_queue_flush(md);
2459 2539
2460 if (dm_request_based(md)) 2540 if (dm_request_based(md))
2461 dm_rq_abort_suspend(md, noflush); 2541 start_queue(md->queue);
2462 2542
2463 unlock_fs(md); 2543 unlock_fs(md);
2464 goto out; /* pushback list is already flushed, so skip flush */ 2544 goto out; /* pushback list is already flushed, so skip flush */
@@ -2470,10 +2550,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2470 * requests are being added to md->deferred list. 2550 * requests are being added to md->deferred list.
2471 */ 2551 */
2472 2552
2473 dm_table_postsuspend_targets(map);
2474
2475 set_bit(DMF_SUSPENDED, &md->flags); 2553 set_bit(DMF_SUSPENDED, &md->flags);
2476 2554
2555 dm_table_postsuspend_targets(map);
2556
2477out: 2557out:
2478 dm_table_put(map); 2558 dm_table_put(map);
2479 2559
@@ -2488,10 +2568,10 @@ int dm_resume(struct mapped_device *md)
2488 struct dm_table *map = NULL; 2568 struct dm_table *map = NULL;
2489 2569
2490 mutex_lock(&md->suspend_lock); 2570 mutex_lock(&md->suspend_lock);
2491 if (!dm_suspended(md)) 2571 if (!dm_suspended_md(md))
2492 goto out; 2572 goto out;
2493 2573
2494 map = dm_get_table(md); 2574 map = dm_get_live_table(md);
2495 if (!map || !dm_table_get_size(map)) 2575 if (!map || !dm_table_get_size(map))
2496 goto out; 2576 goto out;
2497 2577
@@ -2592,18 +2672,29 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2592 return NULL; 2672 return NULL;
2593 2673
2594 if (test_bit(DMF_FREEING, &md->flags) || 2674 if (test_bit(DMF_FREEING, &md->flags) ||
2595 test_bit(DMF_DELETING, &md->flags)) 2675 dm_deleting_md(md))
2596 return NULL; 2676 return NULL;
2597 2677
2598 dm_get(md); 2678 dm_get(md);
2599 return md; 2679 return md;
2600} 2680}
2601 2681
2602int dm_suspended(struct mapped_device *md) 2682int dm_suspended_md(struct mapped_device *md)
2603{ 2683{
2604 return test_bit(DMF_SUSPENDED, &md->flags); 2684 return test_bit(DMF_SUSPENDED, &md->flags);
2605} 2685}
2606 2686
2687int dm_suspended(struct dm_target *ti)
2688{
2689 struct mapped_device *md = dm_table_get_md(ti->table);
2690 int r = dm_suspended_md(md);
2691
2692 dm_put(md);
2693
2694 return r;
2695}
2696EXPORT_SYMBOL_GPL(dm_suspended);
2697
2607int dm_noflush_suspending(struct dm_target *ti) 2698int dm_noflush_suspending(struct dm_target *ti)
2608{ 2699{
2609 struct mapped_device *md = dm_table_get_md(ti->table); 2700 struct mapped_device *md = dm_table_get_md(ti->table);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a7663eba17e2..8dadaa5bc396 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -89,6 +89,16 @@ int dm_target_iterate(void (*iter_func)(struct target_type *tt,
89int dm_split_args(int *argc, char ***argvp, char *input); 89int dm_split_args(int *argc, char ***argvp, char *input);
90 90
91/* 91/*
92 * Is this mapped_device being deleted?
93 */
94int dm_deleting_md(struct mapped_device *md);
95
96/*
97 * Is this mapped_device suspended?
98 */
99int dm_suspended_md(struct mapped_device *md);
100
101/*
92 * The device-mapper can be driven through one of two interfaces; 102 * The device-mapper can be driven through one of two interfaces;
93 * ioctl or filesystem, depending which patch you have applied. 103 * ioctl or filesystem, depending which patch you have applied.
94 */ 104 */
@@ -118,6 +128,9 @@ int dm_lock_for_deletion(struct mapped_device *md);
118void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 128void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
119 unsigned cookie); 129 unsigned cookie);
120 130
131int dm_io_init(void);
132void dm_io_exit(void);
133
121int dm_kcopyd_init(void); 134int dm_kcopyd_init(void);
122void dm_kcopyd_exit(void); 135void dm_kcopyd_exit(void);
123 136
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index df7607e6dce8..d4c9c0b88adc 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -235,7 +235,7 @@ void dm_uevent_add(struct mapped_device *md, struct list_head *elist);
235const char *dm_device_name(struct mapped_device *md); 235const char *dm_device_name(struct mapped_device *md);
236int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid); 236int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid);
237struct gendisk *dm_disk(struct mapped_device *md); 237struct gendisk *dm_disk(struct mapped_device *md);
238int dm_suspended(struct mapped_device *md); 238int dm_suspended(struct dm_target *ti);
239int dm_noflush_suspending(struct dm_target *ti); 239int dm_noflush_suspending(struct dm_target *ti);
240union map_info *dm_get_mapinfo(struct bio *bio); 240union map_info *dm_get_mapinfo(struct bio *bio);
241union map_info *dm_get_rq_mapinfo(struct request *rq); 241union map_info *dm_get_rq_mapinfo(struct request *rq);
@@ -276,7 +276,7 @@ void dm_table_unplug_all(struct dm_table *t);
276/* 276/*
277 * Table reference counting. 277 * Table reference counting.
278 */ 278 */
279struct dm_table *dm_get_table(struct mapped_device *md); 279struct dm_table *dm_get_live_table(struct mapped_device *md);
280void dm_table_get(struct dm_table *t); 280void dm_table_get(struct dm_table *t);
281void dm_table_put(struct dm_table *t); 281void dm_table_put(struct dm_table *t);
282 282
@@ -295,8 +295,10 @@ void dm_table_event(struct dm_table *t);
295 295
296/* 296/*
297 * The device must be suspended before calling this method. 297 * The device must be suspended before calling this method.
298 * Returns the previous table, which the caller must destroy.
298 */ 299 */
299int dm_swap_table(struct mapped_device *md, struct dm_table *t); 300struct dm_table *dm_swap_table(struct mapped_device *md,
301 struct dm_table *t);
300 302
301/* 303/*
302 * A wrapper around vmalloc. 304 * A wrapper around vmalloc.
diff --git a/include/linux/dm-dirty-log.h b/include/linux/dm-dirty-log.h
index 5e8b11d88f6f..7084503c3405 100644
--- a/include/linux/dm-dirty-log.h
+++ b/include/linux/dm-dirty-log.h
@@ -21,6 +21,7 @@ struct dm_dirty_log_type;
21 21
22struct dm_dirty_log { 22struct dm_dirty_log {
23 struct dm_dirty_log_type *type; 23 struct dm_dirty_log_type *type;
24 int (*flush_callback_fn)(struct dm_target *ti);
24 void *context; 25 void *context;
25}; 26};
26 27
@@ -136,8 +137,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type);
136 * type->constructor/destructor() directly. 137 * type->constructor/destructor() directly.
137 */ 138 */
138struct dm_dirty_log *dm_dirty_log_create(const char *type_name, 139struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
139 struct dm_target *ti, 140 struct dm_target *ti,
140 unsigned argc, char **argv); 141 int (*flush_callback_fn)(struct dm_target *ti),
142 unsigned argc, char **argv);
141void dm_dirty_log_destroy(struct dm_dirty_log *log); 143void dm_dirty_log_destroy(struct dm_dirty_log *log);
142 144
143#endif /* __KERNEL__ */ 145#endif /* __KERNEL__ */
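The dm_dirty_log_create() signature change threads a per-target flush callback into the log at creation time; struct dm_dirty_log simply stores the function pointer alongside its context so the log code can invoke it later. A minimal sketch of that constructor-takes-a-callback shape, using invented stand-in types for dm_dirty_log and dm_target:

#include <stdio.h>
#include <stdlib.h>

struct target_sketch {
	const char *name;
};

/* Stand-in for struct dm_dirty_log: keeps the callback and its target. */
struct dirty_log_sketch {
	int (*flush_callback_fn)(struct target_sketch *ti);
	struct target_sketch *ti;
};

static int mirror_flush(struct target_sketch *ti)
{
	printf("flush callback invoked for %s\n", ti->name);
	return 0;
}

static struct dirty_log_sketch *
dirty_log_create(struct target_sketch *ti,
		 int (*flush_callback_fn)(struct target_sketch *ti))
{
	struct dirty_log_sketch *log = malloc(sizeof(*log));

	log->flush_callback_fn = flush_callback_fn;
	log->ti = ti;
	return log;
}

static int dirty_log_flush(struct dirty_log_sketch *log)
{
	return log->flush_callback_fn(log->ti);	/* call back into the target */
}

int main(void)
{
	struct target_sketch ti = { "mirror0" };
	struct dirty_log_sketch *log = dirty_log_create(&ti, mirror_flush);
	int r = dirty_log_flush(log);

	free(log);
	return r;
}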
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 2ab84c83c31a..aa95508d2f95 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited. 2 * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited.
3 * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004 - 2009 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the LGPL. 5 * This file is released under the LGPL.
6 */ 6 */
@@ -266,9 +266,9 @@ enum {
266#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) 266#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
267 267
268#define DM_VERSION_MAJOR 4 268#define DM_VERSION_MAJOR 4
269#define DM_VERSION_MINOR 15 269#define DM_VERSION_MINOR 16
270#define DM_VERSION_PATCHLEVEL 0 270#define DM_VERSION_PATCHLEVEL 0
271#define DM_VERSION_EXTRA "-ioctl (2009-04-01)" 271#define DM_VERSION_EXTRA "-ioctl (2009-11-05)"
272 272
273/* Status bits */ 273/* Status bits */
274#define DM_READONLY_FLAG (1 << 0) /* In/Out */ 274#define DM_READONLY_FLAG (1 << 0) /* In/Out */
@@ -309,4 +309,11 @@ enum {
309 */ 309 */
310#define DM_NOFLUSH_FLAG (1 << 11) /* In */ 310#define DM_NOFLUSH_FLAG (1 << 11) /* In */
311 311
312/*
313 * If set, any table information returned will relate to the inactive
314 * table instead of the live one. Always check DM_INACTIVE_PRESENT_FLAG
315 * is set before using the data returned.
316 */
317#define DM_QUERY_INACTIVE_TABLE_FLAG (1 << 12) /* In */
318
312#endif /* _LINUX_DM_IOCTL_H */ 319#endif /* _LINUX_DM_IOCTL_H */
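The new flag's contract, per the comment above: set DM_QUERY_INACTIVE_TABLE_FLAG in the ioctl flags to ask about the inactive table, and only trust the returned table data if DM_INACTIVE_PRESENT_FLAG comes back set. A small userspace sketch of that handshake; the bit positions mirror this header version and the kernel reply is faked, so it is an illustration only.

#include <stdio.h>
#include <stdint.h>

#define DM_INACTIVE_PRESENT_FLAG	(1 << 6)	/* Out */
#define DM_QUERY_INACTIVE_TABLE_FLAG	(1 << 12)	/* In */

/* Fake "kernel reply": pretend an inactive table has been loaded. */
static uint32_t fake_table_status_reply(uint32_t request_flags)
{
	uint32_t reply = request_flags;

	if (request_flags & DM_QUERY_INACTIVE_TABLE_FLAG)
		reply |= DM_INACTIVE_PRESENT_FLAG;
	return reply;
}

int main(void)
{
	uint32_t flags = DM_QUERY_INACTIVE_TABLE_FLAG;	/* ask about the inactive table */
	uint32_t reply = fake_table_status_reply(flags);

	if (reply & DM_INACTIVE_PRESENT_FLAG)
		printf("inactive table info is valid\n");
	else
		printf("no inactive table loaded, ignore table info\n");

	return 0;
}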
diff --git a/include/linux/dm-region-hash.h b/include/linux/dm-region-hash.h
index a9e652a41373..9e2a7a401df5 100644
--- a/include/linux/dm-region-hash.h
+++ b/include/linux/dm-region-hash.h
@@ -78,8 +78,7 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region);
78/* Delay bios on regions. */ 78/* Delay bios on regions. */
79void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio); 79void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio);
80 80
81void dm_rh_mark_nosync(struct dm_region_hash *rh, 81void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio);
82 struct bio *bio, unsigned done, int error);
83 82
84/* 83/*
85 * Region recovery control. 84 * Region recovery control.