author     Linus Torvalds <torvalds@linux-foundation.org>    2009-12-15 12:12:01 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2009-12-15 12:12:01 -0500
commit     53365383c4667aba55385cd1858582c19a7a8a36 (patch)
tree       b290d003534b3947834762c2fb492d9d0beb985f /drivers/md
parent     51b736b85155a56543fda8aeca5f8592795d7983 (diff)
parent     d2fdb776e08d4231d7e86a879cc663a93913c202 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm
* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (80 commits)
dm snapshot: use merge origin if snapshot invalid
dm snapshot: report merge failure in status
dm snapshot: merge consecutive chunks together
dm snapshot: trigger exceptions in remaining snapshots during merge
dm snapshot: delay merging a chunk until writes to it complete
dm snapshot: queue writes to chunks being merged
dm snapshot: add merging
dm snapshot: permit only one merge at once
dm snapshot: support barriers in snapshot merge target
dm snapshot: avoid allocating exceptions in merge
dm snapshot: rework writing to origin
dm snapshot: add merge target
dm exception store: add merge specific methods
dm snapshot: create function for chunk_is_tracked wait
dm snapshot: make bio optional in __origin_write
dm mpath: reject messages when device is suspended
dm: export suspended state to targets
dm: rename dm_suspended to dm_suspended_md
dm: swap target postsuspend call and setting suspended flag
dm crypt: add plain64 iv
...
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm-crypt.c            |  207
-rw-r--r--  drivers/md/dm-exception-store.c  |   33
-rw-r--r--  drivers/md/dm-exception-store.h  |   62
-rw-r--r--  drivers/md/dm-io.c               |  120
-rw-r--r--  drivers/md/dm-ioctl.c            |  123
-rw-r--r--  drivers/md/dm-kcopyd.c           |    5
-rw-r--r--  drivers/md/dm-log.c              |   77
-rw-r--r--  drivers/md/dm-mpath.c            |   95
-rw-r--r--  drivers/md/dm-raid1.c            |  219
-rw-r--r--  drivers/md/dm-region-hash.c      |   31
-rw-r--r--  drivers/md/dm-snap-persistent.c  |  195
-rw-r--r--  drivers/md/dm-snap-transient.c   |   24
-rw-r--r--  drivers/md/dm-snap.c             | 1279
-rw-r--r--  drivers/md/dm-sysfs.c            |   10
-rw-r--r--  drivers/md/dm-table.c            |    3
-rw-r--r--  drivers/md/dm-uevent.c           |    9
-rw-r--r--  drivers/md/dm.c                  |  643
-rw-r--r--  drivers/md/dm.h                  |   13
18 files changed, 2274 insertions, 874 deletions
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index e412980763bd..a93637223c8d 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@
1 | /* | 1 | /* |
2 | * Copyright (C) 2003 Christophe Saout <christophe@saout.de> | 2 | * Copyright (C) 2003 Christophe Saout <christophe@saout.de> |
3 | * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> | 3 | * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> |
4 | * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. | 4 | * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. |
5 | * | 5 | * |
6 | * This file is released under the GPL. | 6 | * This file is released under the GPL. |
7 | */ | 7 | */ |
@@ -71,10 +71,21 @@ struct crypt_iv_operations {
71 | int (*ctr)(struct crypt_config *cc, struct dm_target *ti, | 71 | int (*ctr)(struct crypt_config *cc, struct dm_target *ti, |
72 | const char *opts); | 72 | const char *opts); |
73 | void (*dtr)(struct crypt_config *cc); | 73 | void (*dtr)(struct crypt_config *cc); |
74 | const char *(*status)(struct crypt_config *cc); | 74 | int (*init)(struct crypt_config *cc); |
75 | int (*wipe)(struct crypt_config *cc); | ||
75 | int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); | 76 | int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); |
76 | }; | 77 | }; |
77 | 78 | ||
79 | struct iv_essiv_private { | ||
80 | struct crypto_cipher *tfm; | ||
81 | struct crypto_hash *hash_tfm; | ||
82 | u8 *salt; | ||
83 | }; | ||
84 | |||
85 | struct iv_benbi_private { | ||
86 | int shift; | ||
87 | }; | ||
88 | |||
78 | /* | 89 | /* |
79 | * Crypt: maps a linear range of a block device | 90 | * Crypt: maps a linear range of a block device |
80 | * and encrypts / decrypts at the same time. | 91 | * and encrypts / decrypts at the same time. |
@@ -102,8 +113,8 @@ struct crypt_config {
102 | struct crypt_iv_operations *iv_gen_ops; | 113 | struct crypt_iv_operations *iv_gen_ops; |
103 | char *iv_mode; | 114 | char *iv_mode; |
104 | union { | 115 | union { |
105 | struct crypto_cipher *essiv_tfm; | 116 | struct iv_essiv_private essiv; |
106 | int benbi_shift; | 117 | struct iv_benbi_private benbi; |
107 | } iv_gen_private; | 118 | } iv_gen_private; |
108 | sector_t iv_offset; | 119 | sector_t iv_offset; |
109 | unsigned int iv_size; | 120 | unsigned int iv_size; |
@@ -147,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
147 | * plain: the initial vector is the 32-bit little-endian version of the sector | 158 | * plain: the initial vector is the 32-bit little-endian version of the sector |
148 | * number, padded with zeros if necessary. | 159 | * number, padded with zeros if necessary. |
149 | * | 160 | * |
161 | * plain64: the initial vector is the 64-bit little-endian version of the sector | ||
162 | * number, padded with zeros if necessary. | ||
163 | * | ||
150 | * essiv: "encrypted sector|salt initial vector", the sector number is | 164 | * essiv: "encrypted sector|salt initial vector", the sector number is |
151 | * encrypted with the bulk cipher using a salt as key. The salt | 165 | * encrypted with the bulk cipher using a salt as key. The salt |
152 | * should be derived from the bulk cipher's key via hashing. | 166 | * should be derived from the bulk cipher's key via hashing. |
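An aside on the comment above: the only difference between the plain and plain64 generators is whether the sector number is truncated to 32 bits before being written into the IV, which starts to matter once a device exceeds 2^32 sectors. A minimal userspace sketch of the two layouts (not the kernel code; the 16-byte IV size is an assumption, and native byte order stands in for the explicit cpu_to_le32/64 conversions):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* "plain": 32-bit little-endian sector number, zero padded */
static void plain_iv(uint8_t *iv, size_t iv_size, uint64_t sector)
{
	uint32_t s32 = (uint32_t)sector;   /* truncated to 32 bits */

	memset(iv, 0, iv_size);
	memcpy(iv, &s32, sizeof(s32));
}

/* "plain64": full 64-bit little-endian sector number, zero padded */
static void plain64_iv(uint8_t *iv, size_t iv_size, uint64_t sector)
{
	memset(iv, 0, iv_size);
	memcpy(iv, &sector, sizeof(sector));
}

int main(void)
{
	uint8_t a[16], b[16];
	uint64_t sector = 1ULL << 33;      /* beyond the 32-bit range */

	plain_iv(a, sizeof(a), sector);
	plain64_iv(b, sizeof(b), sector);
	printf("IVs differ for large sectors: %s\n",
	       memcmp(a, b, sizeof(a)) ? "yes" : "no");
	return 0;
}
```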
@@ -169,88 +183,123 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
169 | return 0; | 183 | return 0; |
170 | } | 184 | } |
171 | 185 | ||
172 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | 186 | static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, |
173 | const char *opts) | 187 | sector_t sector) |
174 | { | 188 | { |
175 | struct crypto_cipher *essiv_tfm; | 189 | memset(iv, 0, cc->iv_size); |
176 | struct crypto_hash *hash_tfm; | 190 | *(u64 *)iv = cpu_to_le64(sector); |
191 | |||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | /* Initialise ESSIV - compute salt but no local memory allocations */ | ||
196 | static int crypt_iv_essiv_init(struct crypt_config *cc) | ||
197 | { | ||
198 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | ||
177 | struct hash_desc desc; | 199 | struct hash_desc desc; |
178 | struct scatterlist sg; | 200 | struct scatterlist sg; |
179 | unsigned int saltsize; | ||
180 | u8 *salt; | ||
181 | int err; | 201 | int err; |
182 | 202 | ||
183 | if (opts == NULL) { | 203 | sg_init_one(&sg, cc->key, cc->key_size); |
204 | desc.tfm = essiv->hash_tfm; | ||
205 | desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
206 | |||
207 | err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt); | ||
208 | if (err) | ||
209 | return err; | ||
210 | |||
211 | return crypto_cipher_setkey(essiv->tfm, essiv->salt, | ||
212 | crypto_hash_digestsize(essiv->hash_tfm)); | ||
213 | } | ||
214 | |||
215 | /* Wipe salt and reset key derived from volume key */ | ||
216 | static int crypt_iv_essiv_wipe(struct crypt_config *cc) | ||
217 | { | ||
218 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | ||
219 | unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); | ||
220 | |||
221 | memset(essiv->salt, 0, salt_size); | ||
222 | |||
223 | return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); | ||
224 | } | ||
225 | |||
226 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) | ||
227 | { | ||
228 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | ||
229 | |||
230 | crypto_free_cipher(essiv->tfm); | ||
231 | essiv->tfm = NULL; | ||
232 | |||
233 | crypto_free_hash(essiv->hash_tfm); | ||
234 | essiv->hash_tfm = NULL; | ||
235 | |||
236 | kzfree(essiv->salt); | ||
237 | essiv->salt = NULL; | ||
238 | } | ||
239 | |||
240 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | ||
241 | const char *opts) | ||
242 | { | ||
243 | struct crypto_cipher *essiv_tfm = NULL; | ||
244 | struct crypto_hash *hash_tfm = NULL; | ||
245 | u8 *salt = NULL; | ||
246 | int err; | ||
247 | |||
248 | if (!opts) { | ||
184 | ti->error = "Digest algorithm missing for ESSIV mode"; | 249 | ti->error = "Digest algorithm missing for ESSIV mode"; |
185 | return -EINVAL; | 250 | return -EINVAL; |
186 | } | 251 | } |
187 | 252 | ||
188 | /* Hash the cipher key with the given hash algorithm */ | 253 | /* Allocate hash algorithm */ |
189 | hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); | 254 | hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); |
190 | if (IS_ERR(hash_tfm)) { | 255 | if (IS_ERR(hash_tfm)) { |
191 | ti->error = "Error initializing ESSIV hash"; | 256 | ti->error = "Error initializing ESSIV hash"; |
192 | return PTR_ERR(hash_tfm); | 257 | err = PTR_ERR(hash_tfm); |
258 | goto bad; | ||
193 | } | 259 | } |
194 | 260 | ||
195 | saltsize = crypto_hash_digestsize(hash_tfm); | 261 | salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL); |
196 | salt = kmalloc(saltsize, GFP_KERNEL); | 262 | if (!salt) { |
197 | if (salt == NULL) { | ||
198 | ti->error = "Error kmallocing salt storage in ESSIV"; | 263 | ti->error = "Error kmallocing salt storage in ESSIV"; |
199 | crypto_free_hash(hash_tfm); | 264 | err = -ENOMEM; |
200 | return -ENOMEM; | 265 | goto bad; |
201 | } | 266 | } |
202 | 267 | ||
203 | sg_init_one(&sg, cc->key, cc->key_size); | 268 | /* Allocate essiv_tfm */ |
204 | desc.tfm = hash_tfm; | ||
205 | desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
206 | err = crypto_hash_digest(&desc, &sg, cc->key_size, salt); | ||
207 | crypto_free_hash(hash_tfm); | ||
208 | |||
209 | if (err) { | ||
210 | ti->error = "Error calculating hash in ESSIV"; | ||
211 | kfree(salt); | ||
212 | return err; | ||
213 | } | ||
214 | |||
215 | /* Setup the essiv_tfm with the given salt */ | ||
216 | essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); | 269 | essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); |
217 | if (IS_ERR(essiv_tfm)) { | 270 | if (IS_ERR(essiv_tfm)) { |
218 | ti->error = "Error allocating crypto tfm for ESSIV"; | 271 | ti->error = "Error allocating crypto tfm for ESSIV"; |
219 | kfree(salt); | 272 | err = PTR_ERR(essiv_tfm); |
220 | return PTR_ERR(essiv_tfm); | 273 | goto bad; |
221 | } | 274 | } |
222 | if (crypto_cipher_blocksize(essiv_tfm) != | 275 | if (crypto_cipher_blocksize(essiv_tfm) != |
223 | crypto_ablkcipher_ivsize(cc->tfm)) { | 276 | crypto_ablkcipher_ivsize(cc->tfm)) { |
224 | ti->error = "Block size of ESSIV cipher does " | 277 | ti->error = "Block size of ESSIV cipher does " |
225 | "not match IV size of block cipher"; | 278 | "not match IV size of block cipher"; |
226 | crypto_free_cipher(essiv_tfm); | 279 | err = -EINVAL; |
227 | kfree(salt); | 280 | goto bad; |
228 | return -EINVAL; | ||
229 | } | 281 | } |
230 | err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); | ||
231 | if (err) { | ||
232 | ti->error = "Failed to set key for ESSIV cipher"; | ||
233 | crypto_free_cipher(essiv_tfm); | ||
234 | kfree(salt); | ||
235 | return err; | ||
236 | } | ||
237 | kfree(salt); | ||
238 | 282 | ||
239 | cc->iv_gen_private.essiv_tfm = essiv_tfm; | 283 | cc->iv_gen_private.essiv.salt = salt; |
284 | cc->iv_gen_private.essiv.tfm = essiv_tfm; | ||
285 | cc->iv_gen_private.essiv.hash_tfm = hash_tfm; | ||
286 | |||
240 | return 0; | 287 | return 0; |
241 | } | ||
242 | 288 | ||
243 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) | 289 | bad: |
244 | { | 290 | if (essiv_tfm && !IS_ERR(essiv_tfm)) |
245 | crypto_free_cipher(cc->iv_gen_private.essiv_tfm); | 291 | crypto_free_cipher(essiv_tfm); |
246 | cc->iv_gen_private.essiv_tfm = NULL; | 292 | if (hash_tfm && !IS_ERR(hash_tfm)) |
293 | crypto_free_hash(hash_tfm); | ||
294 | kfree(salt); | ||
295 | return err; | ||
247 | } | 296 | } |
248 | 297 | ||
249 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | 298 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) |
250 | { | 299 | { |
251 | memset(iv, 0, cc->iv_size); | 300 | memset(iv, 0, cc->iv_size); |
252 | *(u64 *)iv = cpu_to_le64(sector); | 301 | *(u64 *)iv = cpu_to_le64(sector); |
253 | crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv); | 302 | crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); |
254 | return 0; | 303 | return 0; |
255 | } | 304 | } |
256 | 305 | ||
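For readers unfamiliar with ESSIV, the init/gen split above amounts to: hash the volume key once to get a salt, key an auxiliary cipher with that salt, then encrypt the little-endian sector number to produce each IV. A rough userspace equivalent is sketched below; OpenSSL, SHA-256 and AES-256-ECB are stand-ins chosen purely for illustration, not what the kernel crypto API or a given dm table line necessarily uses:

```c
#include <openssl/evp.h>
#include <openssl/sha.h>
#include <stdint.h>
#include <string.h>

/* ESSIV sketch: salt = H(key); IV = E_salt(little-endian sector), one block */
static int essiv_iv(const uint8_t *key, size_t key_len,
		    uint64_t sector, uint8_t iv[16])
{
	uint8_t salt[SHA256_DIGEST_LENGTH];
	uint8_t block[16] = { 0 };
	EVP_CIPHER_CTX *ctx;
	int outl = 0, ok = 0;

	SHA256(key, key_len, salt);                 /* salt derived from key */

	for (int i = 0; i < 8; i++)                 /* LE sector, zero padded */
		block[i] = (uint8_t)(sector >> (8 * i));

	ctx = EVP_CIPHER_CTX_new();
	if (!ctx)
		return 0;
	if (EVP_EncryptInit_ex(ctx, EVP_aes_256_ecb(), NULL, salt, NULL) &&
	    EVP_CIPHER_CTX_set_padding(ctx, 0) &&
	    EVP_EncryptUpdate(ctx, iv, &outl, block, sizeof(block)))
		ok = (outl == (int)sizeof(block));
	EVP_CIPHER_CTX_free(ctx);
	return ok;
}

int main(void)
{
	const uint8_t key[32] = { 1, 2, 3 };        /* toy key, link with -lcrypto */
	uint8_t iv[16];

	return essiv_iv(key, sizeof(key), 12345, iv) ? 0 : 1;
}
```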
@@ -273,7 +322,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
273 | return -EINVAL; | 322 | return -EINVAL; |
274 | } | 323 | } |
275 | 324 | ||
276 | cc->iv_gen_private.benbi_shift = 9 - log; | 325 | cc->iv_gen_private.benbi.shift = 9 - log; |
277 | 326 | ||
278 | return 0; | 327 | return 0; |
279 | } | 328 | } |
@@ -288,7 +337,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
288 | 337 | ||
289 | memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ | 338 | memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ |
290 | 339 | ||
291 | val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1); | 340 | val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); |
292 | put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); | 341 | put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); |
293 | 342 | ||
294 | return 0; | 343 | return 0; |
@@ -305,9 +354,15 @@ static struct crypt_iv_operations crypt_iv_plain_ops = {
305 | .generator = crypt_iv_plain_gen | 354 | .generator = crypt_iv_plain_gen |
306 | }; | 355 | }; |
307 | 356 | ||
357 | static struct crypt_iv_operations crypt_iv_plain64_ops = { | ||
358 | .generator = crypt_iv_plain64_gen | ||
359 | }; | ||
360 | |||
308 | static struct crypt_iv_operations crypt_iv_essiv_ops = { | 361 | static struct crypt_iv_operations crypt_iv_essiv_ops = { |
309 | .ctr = crypt_iv_essiv_ctr, | 362 | .ctr = crypt_iv_essiv_ctr, |
310 | .dtr = crypt_iv_essiv_dtr, | 363 | .dtr = crypt_iv_essiv_dtr, |
364 | .init = crypt_iv_essiv_init, | ||
365 | .wipe = crypt_iv_essiv_wipe, | ||
311 | .generator = crypt_iv_essiv_gen | 366 | .generator = crypt_iv_essiv_gen |
312 | }; | 367 | }; |
313 | 368 | ||
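Note that init and wipe are optional members of the ops table: only the essiv scheme fills them in, so callers (crypt_ctr and crypt_message further down) always test the pointers before calling. A toy, non-kernel illustration of that convention, with made-up op names:

```c
#include <stdio.h>

struct iv_ops {
	int (*init)(void *cc);                    /* optional */
	int (*wipe)(void *cc);                    /* optional */
	int (*generator)(void *cc, int sector);   /* required */
};

static int noop_gen(void *cc, int sector) { (void)cc; return sector; }

static const struct iv_ops plain_like_ops = {
	.generator = noop_gen,                    /* .init/.wipe left NULL */
};

static int setup(const struct iv_ops *ops, void *cc)
{
	/* only call the hook if this IV scheme provides one */
	if (ops && ops->init)
		return ops->init(cc);
	return 0;
}

int main(void)
{
	printf("setup: %d, iv: %d\n", setup(&plain_like_ops, NULL),
	       plain_like_ops.generator(NULL, 42));
	return 0;
}
```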
@@ -934,14 +989,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
934 | 989 | ||
935 | set_bit(DM_CRYPT_KEY_VALID, &cc->flags); | 990 | set_bit(DM_CRYPT_KEY_VALID, &cc->flags); |
936 | 991 | ||
937 | return 0; | 992 | return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); |
938 | } | 993 | } |
939 | 994 | ||
940 | static int crypt_wipe_key(struct crypt_config *cc) | 995 | static int crypt_wipe_key(struct crypt_config *cc) |
941 | { | 996 | { |
942 | clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); | 997 | clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); |
943 | memset(&cc->key, 0, cc->key_size * sizeof(u8)); | 998 | memset(&cc->key, 0, cc->key_size * sizeof(u8)); |
944 | return 0; | 999 | return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); |
945 | } | 1000 | } |
946 | 1001 | ||
947 | /* | 1002 | /* |
@@ -983,11 +1038,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
983 | return -ENOMEM; | 1038 | return -ENOMEM; |
984 | } | 1039 | } |
985 | 1040 | ||
986 | if (crypt_set_key(cc, argv[1])) { | ||
987 | ti->error = "Error decoding key"; | ||
988 | goto bad_cipher; | ||
989 | } | ||
990 | |||
991 | /* Compatibility mode for old dm-crypt cipher strings */ | 1041 | /* Compatibility mode for old dm-crypt cipher strings */ |
992 | if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { | 1042 | if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { |
993 | chainmode = "cbc"; | 1043 | chainmode = "cbc"; |
@@ -1015,6 +1065,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1015 | strcpy(cc->chainmode, chainmode); | 1065 | strcpy(cc->chainmode, chainmode); |
1016 | cc->tfm = tfm; | 1066 | cc->tfm = tfm; |
1017 | 1067 | ||
1068 | if (crypt_set_key(cc, argv[1]) < 0) { | ||
1069 | ti->error = "Error decoding and setting key"; | ||
1070 | goto bad_ivmode; | ||
1071 | } | ||
1072 | |||
1018 | /* | 1073 | /* |
1019 | * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". | 1074 | * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". |
1020 | * See comments at iv code | 1075 | * See comments at iv code |
@@ -1024,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1024 | cc->iv_gen_ops = NULL; | 1079 | cc->iv_gen_ops = NULL; |
1025 | else if (strcmp(ivmode, "plain") == 0) | 1080 | else if (strcmp(ivmode, "plain") == 0) |
1026 | cc->iv_gen_ops = &crypt_iv_plain_ops; | 1081 | cc->iv_gen_ops = &crypt_iv_plain_ops; |
1082 | else if (strcmp(ivmode, "plain64") == 0) | ||
1083 | cc->iv_gen_ops = &crypt_iv_plain64_ops; | ||
1027 | else if (strcmp(ivmode, "essiv") == 0) | 1084 | else if (strcmp(ivmode, "essiv") == 0) |
1028 | cc->iv_gen_ops = &crypt_iv_essiv_ops; | 1085 | cc->iv_gen_ops = &crypt_iv_essiv_ops; |
1029 | else if (strcmp(ivmode, "benbi") == 0) | 1086 | else if (strcmp(ivmode, "benbi") == 0) |
@@ -1039,6 +1096,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1039 | cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) | 1096 | cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) |
1040 | goto bad_ivmode; | 1097 | goto bad_ivmode; |
1041 | 1098 | ||
1099 | if (cc->iv_gen_ops && cc->iv_gen_ops->init && | ||
1100 | cc->iv_gen_ops->init(cc) < 0) { | ||
1101 | ti->error = "Error initialising IV"; | ||
1102 | goto bad_slab_pool; | ||
1103 | } | ||
1104 | |||
1042 | cc->iv_size = crypto_ablkcipher_ivsize(tfm); | 1105 | cc->iv_size = crypto_ablkcipher_ivsize(tfm); |
1043 | if (cc->iv_size) | 1106 | if (cc->iv_size) |
1044 | /* at least a 64 bit sector number should fit in our buffer */ | 1107 | /* at least a 64 bit sector number should fit in our buffer */ |
@@ -1085,11 +1148,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1085 | goto bad_bs; | 1148 | goto bad_bs; |
1086 | } | 1149 | } |
1087 | 1150 | ||
1088 | if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) { | ||
1089 | ti->error = "Error setting key"; | ||
1090 | goto bad_device; | ||
1091 | } | ||
1092 | |||
1093 | if (sscanf(argv[2], "%llu", &tmpll) != 1) { | 1151 | if (sscanf(argv[2], "%llu", &tmpll) != 1) { |
1094 | ti->error = "Invalid iv_offset sector"; | 1152 | ti->error = "Invalid iv_offset sector"; |
1095 | goto bad_device; | 1153 | goto bad_device; |
@@ -1278,6 +1336,7 @@ static void crypt_resume(struct dm_target *ti)
1278 | static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) | 1336 | static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) |
1279 | { | 1337 | { |
1280 | struct crypt_config *cc = ti->private; | 1338 | struct crypt_config *cc = ti->private; |
1339 | int ret = -EINVAL; | ||
1281 | 1340 | ||
1282 | if (argc < 2) | 1341 | if (argc < 2) |
1283 | goto error; | 1342 | goto error; |
@@ -1287,10 +1346,22 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
1287 | DMWARN("not suspended during key manipulation."); | 1346 | DMWARN("not suspended during key manipulation."); |
1288 | return -EINVAL; | 1347 | return -EINVAL; |
1289 | } | 1348 | } |
1290 | if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) | 1349 | if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) { |
1291 | return crypt_set_key(cc, argv[2]); | 1350 | ret = crypt_set_key(cc, argv[2]); |
1292 | if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) | 1351 | if (ret) |
1352 | return ret; | ||
1353 | if (cc->iv_gen_ops && cc->iv_gen_ops->init) | ||
1354 | ret = cc->iv_gen_ops->init(cc); | ||
1355 | return ret; | ||
1356 | } | ||
1357 | if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) { | ||
1358 | if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { | ||
1359 | ret = cc->iv_gen_ops->wipe(cc); | ||
1360 | if (ret) | ||
1361 | return ret; | ||
1362 | } | ||
1293 | return crypt_wipe_key(cc); | 1363 | return crypt_wipe_key(cc); |
1364 | } | ||
1294 | } | 1365 | } |
1295 | 1366 | ||
1296 | error: | 1367 | error: |
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 7dbe652efb5a..2b7907b6dd09 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -172,7 +172,8 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
172 | } | 172 | } |
173 | 173 | ||
174 | /* Validate the chunk size against the device block size */ | 174 | /* Validate the chunk size against the device block size */ |
175 | if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) { | 175 | if (chunk_size % |
176 | (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) { | ||
176 | *error = "Chunk size is not a multiple of device blocksize"; | 177 | *error = "Chunk size is not a multiple of device blocksize"; |
177 | return -EINVAL; | 178 | return -EINVAL; |
178 | } | 179 | } |
@@ -190,6 +191,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
190 | } | 191 | } |
191 | 192 | ||
192 | int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | 193 | int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, |
194 | struct dm_snapshot *snap, | ||
193 | unsigned *args_used, | 195 | unsigned *args_used, |
194 | struct dm_exception_store **store) | 196 | struct dm_exception_store **store) |
195 | { | 197 | { |
@@ -198,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
198 | struct dm_exception_store *tmp_store; | 200 | struct dm_exception_store *tmp_store; |
199 | char persistent; | 201 | char persistent; |
200 | 202 | ||
201 | if (argc < 3) { | 203 | if (argc < 2) { |
202 | ti->error = "Insufficient exception store arguments"; | 204 | ti->error = "Insufficient exception store arguments"; |
203 | return -EINVAL; | 205 | return -EINVAL; |
204 | } | 206 | } |
@@ -209,14 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
209 | return -ENOMEM; | 211 | return -ENOMEM; |
210 | } | 212 | } |
211 | 213 | ||
212 | persistent = toupper(*argv[1]); | 214 | persistent = toupper(*argv[0]); |
213 | if (persistent == 'P') | 215 | if (persistent == 'P') |
214 | type = get_type("P"); | 216 | type = get_type("P"); |
215 | else if (persistent == 'N') | 217 | else if (persistent == 'N') |
216 | type = get_type("N"); | 218 | type = get_type("N"); |
217 | else { | 219 | else { |
218 | ti->error = "Persistent flag is not P or N"; | 220 | ti->error = "Persistent flag is not P or N"; |
219 | return -EINVAL; | 221 | r = -EINVAL; |
222 | goto bad_type; | ||
220 | } | 223 | } |
221 | 224 | ||
222 | if (!type) { | 225 | if (!type) { |
@@ -226,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
226 | } | 229 | } |
227 | 230 | ||
228 | tmp_store->type = type; | 231 | tmp_store->type = type; |
229 | tmp_store->ti = ti; | 232 | tmp_store->snap = snap; |
230 | |||
231 | r = dm_get_device(ti, argv[0], 0, 0, | ||
232 | FMODE_READ | FMODE_WRITE, &tmp_store->cow); | ||
233 | if (r) { | ||
234 | ti->error = "Cannot get COW device"; | ||
235 | goto bad_cow; | ||
236 | } | ||
237 | 233 | ||
238 | r = set_chunk_size(tmp_store, argv[2], &ti->error); | 234 | r = set_chunk_size(tmp_store, argv[1], &ti->error); |
239 | if (r) | 235 | if (r) |
240 | goto bad_ctr; | 236 | goto bad; |
241 | 237 | ||
242 | r = type->ctr(tmp_store, 0, NULL); | 238 | r = type->ctr(tmp_store, 0, NULL); |
243 | if (r) { | 239 | if (r) { |
244 | ti->error = "Exception store type constructor failed"; | 240 | ti->error = "Exception store type constructor failed"; |
245 | goto bad_ctr; | 241 | goto bad; |
246 | } | 242 | } |
247 | 243 | ||
248 | *args_used = 3; | 244 | *args_used = 2; |
249 | *store = tmp_store; | 245 | *store = tmp_store; |
250 | return 0; | 246 | return 0; |
251 | 247 | ||
252 | bad_ctr: | 248 | bad: |
253 | dm_put_device(ti, tmp_store->cow); | ||
254 | bad_cow: | ||
255 | put_type(type); | 249 | put_type(type); |
256 | bad_type: | 250 | bad_type: |
257 | kfree(tmp_store); | 251 | kfree(tmp_store); |
@@ -262,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create);
262 | void dm_exception_store_destroy(struct dm_exception_store *store) | 256 | void dm_exception_store_destroy(struct dm_exception_store *store) |
263 | { | 257 | { |
264 | store->type->dtr(store); | 258 | store->type->dtr(store); |
265 | dm_put_device(store->ti, store->cow); | ||
266 | put_type(store->type); | 259 | put_type(store->type); |
267 | kfree(store); | 260 | kfree(store); |
268 | } | 261 | } |
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 8a223a48802c..e8dfa06af3ba 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -26,7 +26,7 @@ typedef sector_t chunk_t;
26 | * of chunks that follow contiguously. Remaining bits hold the number of the | 26 | * of chunks that follow contiguously. Remaining bits hold the number of the |
27 | * chunk within the device. | 27 | * chunk within the device. |
28 | */ | 28 | */ |
29 | struct dm_snap_exception { | 29 | struct dm_exception { |
30 | struct list_head hash_list; | 30 | struct list_head hash_list; |
31 | 31 | ||
32 | chunk_t old_chunk; | 32 | chunk_t old_chunk; |
@@ -64,17 +64,34 @@ struct dm_exception_store_type {
64 | * Find somewhere to store the next exception. | 64 | * Find somewhere to store the next exception. |
65 | */ | 65 | */ |
66 | int (*prepare_exception) (struct dm_exception_store *store, | 66 | int (*prepare_exception) (struct dm_exception_store *store, |
67 | struct dm_snap_exception *e); | 67 | struct dm_exception *e); |
68 | 68 | ||
69 | /* | 69 | /* |
70 | * Update the metadata with this exception. | 70 | * Update the metadata with this exception. |
71 | */ | 71 | */ |
72 | void (*commit_exception) (struct dm_exception_store *store, | 72 | void (*commit_exception) (struct dm_exception_store *store, |
73 | struct dm_snap_exception *e, | 73 | struct dm_exception *e, |
74 | void (*callback) (void *, int success), | 74 | void (*callback) (void *, int success), |
75 | void *callback_context); | 75 | void *callback_context); |
76 | 76 | ||
77 | /* | 77 | /* |
78 | * Returns 0 if the exception store is empty. | ||
79 | * | ||
80 | * If there are exceptions still to be merged, sets | ||
81 | * *last_old_chunk and *last_new_chunk to the most recent | ||
82 | * still-to-be-merged chunk and returns the number of | ||
83 | * consecutive previous ones. | ||
84 | */ | ||
85 | int (*prepare_merge) (struct dm_exception_store *store, | ||
86 | chunk_t *last_old_chunk, chunk_t *last_new_chunk); | ||
87 | |||
88 | /* | ||
89 | * Clear the last n exceptions. | ||
90 | * nr_merged must be <= the value returned by prepare_merge. | ||
91 | */ | ||
92 | int (*commit_merge) (struct dm_exception_store *store, int nr_merged); | ||
93 | |||
94 | /* | ||
78 | * The snapshot is invalid, note this in the metadata. | 95 | * The snapshot is invalid, note this in the metadata. |
79 | */ | 96 | */ |
80 | void (*drop_snapshot) (struct dm_exception_store *store); | 97 | void (*drop_snapshot) (struct dm_exception_store *store); |
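The two comments above spell out the contract used by the new snapshot-merge support: prepare_merge reports the most recent still-to-be-merged chunk pair and a count of consecutive chunks that can be merged in one go, and commit_merge then retires at most that many exceptions. The standalone mock below only exercises that contract; mock_store and its chunk numbers are invented for illustration and are not the dm-snap implementation:

```c
#include <stdio.h>

typedef unsigned long long chunk_t;

struct mock_store {
	int nr_exceptions;                      /* pretend exceptions, newest last */
};

static int prepare_merge(struct mock_store *s, chunk_t *old_c, chunk_t *new_c)
{
	if (!s->nr_exceptions)
		return 0;                       /* store is empty */
	*old_c = 100 + s->nr_exceptions - 1;    /* made-up chunk numbers */
	*new_c = 200 + s->nr_exceptions - 1;
	return s->nr_exceptions < 3 ? s->nr_exceptions : 3;  /* consecutive run */
}

static int commit_merge(struct mock_store *s, int nr_merged)
{
	s->nr_exceptions -= nr_merged;          /* clear the last nr_merged */
	return 0;
}

int main(void)
{
	struct mock_store s = { .nr_exceptions = 7 };
	chunk_t old_c, new_c;
	int nr;

	while ((nr = prepare_merge(&s, &old_c, &new_c)) > 0) {
		printf("merge %d chunk(s) ending at %llu -> %llu\n",
		       nr, old_c, new_c);
		commit_merge(&s, nr);           /* nr <= prepare_merge() result */
	}
	return 0;
}
```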
@@ -86,19 +103,19 @@ struct dm_exception_store_type {
86 | /* | 103 | /* |
87 | * Return how full the snapshot is. | 104 | * Return how full the snapshot is. |
88 | */ | 105 | */ |
89 | void (*fraction_full) (struct dm_exception_store *store, | 106 | void (*usage) (struct dm_exception_store *store, |
90 | sector_t *numerator, | 107 | sector_t *total_sectors, sector_t *sectors_allocated, |
91 | sector_t *denominator); | 108 | sector_t *metadata_sectors); |
92 | 109 | ||
93 | /* For internal device-mapper use only. */ | 110 | /* For internal device-mapper use only. */ |
94 | struct list_head list; | 111 | struct list_head list; |
95 | }; | 112 | }; |
96 | 113 | ||
114 | struct dm_snapshot; | ||
115 | |||
97 | struct dm_exception_store { | 116 | struct dm_exception_store { |
98 | struct dm_exception_store_type *type; | 117 | struct dm_exception_store_type *type; |
99 | struct dm_target *ti; | 118 | struct dm_snapshot *snap; |
100 | |||
101 | struct dm_dev *cow; | ||
102 | 119 | ||
103 | /* Size of data blocks saved - must be a power of 2 */ | 120 | /* Size of data blocks saved - must be a power of 2 */ |
104 | unsigned chunk_size; | 121 | unsigned chunk_size; |
@@ -109,6 +126,11 @@ struct dm_exception_store {
109 | }; | 126 | }; |
110 | 127 | ||
111 | /* | 128 | /* |
129 | * Obtain the cow device used by a given snapshot. | ||
130 | */ | ||
131 | struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); | ||
132 | |||
133 | /* | ||
112 | * Funtions to manipulate consecutive chunks | 134 | * Funtions to manipulate consecutive chunks |
113 | */ | 135 | */ |
114 | # if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) | 136 | # if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) |
@@ -120,18 +142,25 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
120 | return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); | 142 | return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); |
121 | } | 143 | } |
122 | 144 | ||
123 | static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) | 145 | static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) |
124 | { | 146 | { |
125 | return e->new_chunk >> DM_CHUNK_NUMBER_BITS; | 147 | return e->new_chunk >> DM_CHUNK_NUMBER_BITS; |
126 | } | 148 | } |
127 | 149 | ||
128 | static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) | 150 | static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) |
129 | { | 151 | { |
130 | e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); | 152 | e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); |
131 | 153 | ||
132 | BUG_ON(!dm_consecutive_chunk_count(e)); | 154 | BUG_ON(!dm_consecutive_chunk_count(e)); |
133 | } | 155 | } |
134 | 156 | ||
157 | static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) | ||
158 | { | ||
159 | BUG_ON(!dm_consecutive_chunk_count(e)); | ||
160 | |||
161 | e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS); | ||
162 | } | ||
163 | |||
135 | # else | 164 | # else |
136 | # define DM_CHUNK_CONSECUTIVE_BITS 0 | 165 | # define DM_CHUNK_CONSECUTIVE_BITS 0 |
137 | 166 | ||
@@ -140,12 +169,16 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
140 | return chunk; | 169 | return chunk; |
141 | } | 170 | } |
142 | 171 | ||
143 | static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) | 172 | static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) |
144 | { | 173 | { |
145 | return 0; | 174 | return 0; |
146 | } | 175 | } |
147 | 176 | ||
148 | static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) | 177 | static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) |
178 | { | ||
179 | } | ||
180 | |||
181 | static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) | ||
149 | { | 182 | { |
150 | } | 183 | } |
151 | 184 | ||
@@ -162,7 +195,7 @@ static inline sector_t get_dev_size(struct block_device *bdev)
162 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, | 195 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, |
163 | sector_t sector) | 196 | sector_t sector) |
164 | { | 197 | { |
165 | return (sector & ~store->chunk_mask) >> store->chunk_shift; | 198 | return sector >> store->chunk_shift; |
166 | } | 199 | } |
167 | 200 | ||
168 | int dm_exception_store_type_register(struct dm_exception_store_type *type); | 201 | int dm_exception_store_type_register(struct dm_exception_store_type *type); |
@@ -173,6 +206,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
173 | char **error); | 206 | char **error); |
174 | 207 | ||
175 | int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | 208 | int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, |
209 | struct dm_snapshot *snap, | ||
176 | unsigned *args_used, | 210 | unsigned *args_used, |
177 | struct dm_exception_store **store); | 211 | struct dm_exception_store **store); |
178 | void dm_exception_store_destroy(struct dm_exception_store *store); | 212 | void dm_exception_store_destroy(struct dm_exception_store *store); |
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 3a2e6a2f8bdd..10f457ca6af2 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -5,6 +5,8 @@
5 | * This file is released under the GPL. | 5 | * This file is released under the GPL. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include "dm.h" | ||
9 | |||
8 | #include <linux/device-mapper.h> | 10 | #include <linux/device-mapper.h> |
9 | 11 | ||
10 | #include <linux/bio.h> | 12 | #include <linux/bio.h> |
@@ -14,12 +16,19 @@
14 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
15 | #include <linux/dm-io.h> | 17 | #include <linux/dm-io.h> |
16 | 18 | ||
19 | #define DM_MSG_PREFIX "io" | ||
20 | |||
21 | #define DM_IO_MAX_REGIONS BITS_PER_LONG | ||
22 | |||
17 | struct dm_io_client { | 23 | struct dm_io_client { |
18 | mempool_t *pool; | 24 | mempool_t *pool; |
19 | struct bio_set *bios; | 25 | struct bio_set *bios; |
20 | }; | 26 | }; |
21 | 27 | ||
22 | /* FIXME: can we shrink this ? */ | 28 | /* |
29 | * Aligning 'struct io' reduces the number of bits required to store | ||
30 | * its address. Refer to store_io_and_region_in_bio() below. | ||
31 | */ | ||
23 | struct io { | 32 | struct io { |
24 | unsigned long error_bits; | 33 | unsigned long error_bits; |
25 | unsigned long eopnotsupp_bits; | 34 | unsigned long eopnotsupp_bits; |
@@ -28,7 +37,9 @@ struct io {
28 | struct dm_io_client *client; | 37 | struct dm_io_client *client; |
29 | io_notify_fn callback; | 38 | io_notify_fn callback; |
30 | void *context; | 39 | void *context; |
31 | }; | 40 | } __attribute__((aligned(DM_IO_MAX_REGIONS))); |
41 | |||
42 | static struct kmem_cache *_dm_io_cache; | ||
32 | 43 | ||
33 | /* | 44 | /* |
34 | * io contexts are only dynamically allocated for asynchronous | 45 | * io contexts are only dynamically allocated for asynchronous |
@@ -53,7 +64,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
53 | if (!client) | 64 | if (!client) |
54 | return ERR_PTR(-ENOMEM); | 65 | return ERR_PTR(-ENOMEM); |
55 | 66 | ||
56 | client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io)); | 67 | client->pool = mempool_create_slab_pool(ios, _dm_io_cache); |
57 | if (!client->pool) | 68 | if (!client->pool) |
58 | goto bad; | 69 | goto bad; |
59 | 70 | ||
@@ -88,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy);
88 | 99 | ||
89 | /*----------------------------------------------------------------- | 100 | /*----------------------------------------------------------------- |
90 | * We need to keep track of which region a bio is doing io for. | 101 | * We need to keep track of which region a bio is doing io for. |
91 | * In order to save a memory allocation we store this the last | 102 | * To avoid a memory allocation to store just 5 or 6 bits, we |
92 | * bvec which we know is unused (blech). | 103 | * ensure the 'struct io' pointer is aligned so enough low bits are |
93 | * XXX This is ugly and can OOPS with some configs... find another way. | 104 | * always zero and then combine it with the region number directly in |
105 | * bi_private. | ||
94 | *---------------------------------------------------------------*/ | 106 | *---------------------------------------------------------------*/ |
95 | static inline void bio_set_region(struct bio *bio, unsigned region) | 107 | static void store_io_and_region_in_bio(struct bio *bio, struct io *io, |
108 | unsigned region) | ||
96 | { | 109 | { |
97 | bio->bi_io_vec[bio->bi_max_vecs].bv_len = region; | 110 | if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) { |
111 | DMCRIT("Unaligned struct io pointer %p", io); | ||
112 | BUG(); | ||
113 | } | ||
114 | |||
115 | bio->bi_private = (void *)((unsigned long)io | region); | ||
98 | } | 116 | } |
99 | 117 | ||
100 | static inline unsigned bio_get_region(struct bio *bio) | 118 | static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, |
119 | unsigned *region) | ||
101 | { | 120 | { |
102 | return bio->bi_io_vec[bio->bi_max_vecs].bv_len; | 121 | unsigned long val = (unsigned long)bio->bi_private; |
122 | |||
123 | *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS); | ||
124 | *region = val & (DM_IO_MAX_REGIONS - 1); | ||
103 | } | 125 | } |
104 | 126 | ||
105 | /*----------------------------------------------------------------- | 127 | /*----------------------------------------------------------------- |
@@ -140,10 +162,8 @@ static void endio(struct bio *bio, int error)
140 | /* | 162 | /* |
141 | * The bio destructor in bio_put() may use the io object. | 163 | * The bio destructor in bio_put() may use the io object. |
142 | */ | 164 | */ |
143 | io = bio->bi_private; | 165 | retrieve_io_and_region_from_bio(bio, &io, ®ion); |
144 | region = bio_get_region(bio); | ||
145 | 166 | ||
146 | bio->bi_max_vecs++; | ||
147 | bio_put(bio); | 167 | bio_put(bio); |
148 | 168 | ||
149 | dec_count(io, region, error); | 169 | dec_count(io, region, error); |
@@ -243,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data)
243 | 263 | ||
244 | static void dm_bio_destructor(struct bio *bio) | 264 | static void dm_bio_destructor(struct bio *bio) |
245 | { | 265 | { |
246 | struct io *io = bio->bi_private; | 266 | unsigned region; |
267 | struct io *io; | ||
268 | |||
269 | retrieve_io_and_region_from_bio(bio, &io, ®ion); | ||
247 | 270 | ||
248 | bio_free(bio, io->client->bios); | 271 | bio_free(bio, io->client->bios); |
249 | } | 272 | } |
@@ -286,26 +309,23 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
286 | unsigned num_bvecs; | 309 | unsigned num_bvecs; |
287 | sector_t remaining = where->count; | 310 | sector_t remaining = where->count; |
288 | 311 | ||
289 | while (remaining) { | 312 | /* |
313 | * where->count may be zero if rw holds a write barrier and we | ||
314 | * need to send a zero-sized barrier. | ||
315 | */ | ||
316 | do { | ||
290 | /* | 317 | /* |
291 | * Allocate a suitably sized-bio: we add an extra | 318 | * Allocate a suitably sized-bio. |
292 | * bvec for bio_get/set_region() and decrement bi_max_vecs | ||
293 | * to hide it from bio_add_page(). | ||
294 | */ | 319 | */ |
295 | num_bvecs = dm_sector_div_up(remaining, | 320 | num_bvecs = dm_sector_div_up(remaining, |
296 | (PAGE_SIZE >> SECTOR_SHIFT)); | 321 | (PAGE_SIZE >> SECTOR_SHIFT)); |
297 | num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev), | 322 | num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs); |
298 | num_bvecs); | ||
299 | if (unlikely(num_bvecs > BIO_MAX_PAGES)) | ||
300 | num_bvecs = BIO_MAX_PAGES; | ||
301 | bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); | 323 | bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); |
302 | bio->bi_sector = where->sector + (where->count - remaining); | 324 | bio->bi_sector = where->sector + (where->count - remaining); |
303 | bio->bi_bdev = where->bdev; | 325 | bio->bi_bdev = where->bdev; |
304 | bio->bi_end_io = endio; | 326 | bio->bi_end_io = endio; |
305 | bio->bi_private = io; | ||
306 | bio->bi_destructor = dm_bio_destructor; | 327 | bio->bi_destructor = dm_bio_destructor; |
307 | bio->bi_max_vecs--; | 328 | store_io_and_region_in_bio(bio, io, region); |
308 | bio_set_region(bio, region); | ||
309 | 329 | ||
310 | /* | 330 | /* |
311 | * Try and add as many pages as possible. | 331 | * Try and add as many pages as possible. |
@@ -323,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
323 | 343 | ||
324 | atomic_inc(&io->count); | 344 | atomic_inc(&io->count); |
325 | submit_bio(rw, bio); | 345 | submit_bio(rw, bio); |
326 | } | 346 | } while (remaining); |
327 | } | 347 | } |
328 | 348 | ||
329 | static void dispatch_io(int rw, unsigned int num_regions, | 349 | static void dispatch_io(int rw, unsigned int num_regions, |
@@ -333,6 +353,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
333 | int i; | 353 | int i; |
334 | struct dpages old_pages = *dp; | 354 | struct dpages old_pages = *dp; |
335 | 355 | ||
356 | BUG_ON(num_regions > DM_IO_MAX_REGIONS); | ||
357 | |||
336 | if (sync) | 358 | if (sync) |
337 | rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); | 359 | rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); |
338 | 360 | ||
@@ -342,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
342 | */ | 364 | */ |
343 | for (i = 0; i < num_regions; i++) { | 365 | for (i = 0; i < num_regions; i++) { |
344 | *dp = old_pages; | 366 | *dp = old_pages; |
345 | if (where[i].count) | 367 | if (where[i].count || (rw & (1 << BIO_RW_BARRIER))) |
346 | do_region(rw, i, where + i, dp, io); | 368 | do_region(rw, i, where + i, dp, io); |
347 | } | 369 | } |
348 | 370 | ||
@@ -357,7 +379,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
357 | struct dm_io_region *where, int rw, struct dpages *dp, | 379 | struct dm_io_region *where, int rw, struct dpages *dp, |
358 | unsigned long *error_bits) | 380 | unsigned long *error_bits) |
359 | { | 381 | { |
360 | struct io io; | 382 | /* |
383 | * gcc <= 4.3 can't do the alignment for stack variables, so we must | ||
384 | * align it on our own. | ||
385 | * volatile prevents the optimizer from removing or reusing | ||
386 | * "io_" field from the stack frame (allowed in ANSI C). | ||
387 | */ | ||
388 | volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; | ||
389 | struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io)); | ||
361 | 390 | ||
362 | if (num_regions > 1 && (rw & RW_MASK) != WRITE) { | 391 | if (num_regions > 1 && (rw & RW_MASK) != WRITE) { |
363 | WARN_ON(1); | 392 | WARN_ON(1); |
@@ -365,33 +394,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
365 | } | 394 | } |
366 | 395 | ||
367 | retry: | 396 | retry: |
368 | io.error_bits = 0; | 397 | io->error_bits = 0; |
369 | io.eopnotsupp_bits = 0; | 398 | io->eopnotsupp_bits = 0; |
370 | atomic_set(&io.count, 1); /* see dispatch_io() */ | 399 | atomic_set(&io->count, 1); /* see dispatch_io() */ |
371 | io.sleeper = current; | 400 | io->sleeper = current; |
372 | io.client = client; | 401 | io->client = client; |
373 | 402 | ||
374 | dispatch_io(rw, num_regions, where, dp, &io, 1); | 403 | dispatch_io(rw, num_regions, where, dp, io, 1); |
375 | 404 | ||
376 | while (1) { | 405 | while (1) { |
377 | set_current_state(TASK_UNINTERRUPTIBLE); | 406 | set_current_state(TASK_UNINTERRUPTIBLE); |
378 | 407 | ||
379 | if (!atomic_read(&io.count)) | 408 | if (!atomic_read(&io->count)) |
380 | break; | 409 | break; |
381 | 410 | ||
382 | io_schedule(); | 411 | io_schedule(); |
383 | } | 412 | } |
384 | set_current_state(TASK_RUNNING); | 413 | set_current_state(TASK_RUNNING); |
385 | 414 | ||
386 | if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { | 415 | if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { |
387 | rw &= ~(1 << BIO_RW_BARRIER); | 416 | rw &= ~(1 << BIO_RW_BARRIER); |
388 | goto retry; | 417 | goto retry; |
389 | } | 418 | } |
390 | 419 | ||
391 | if (error_bits) | 420 | if (error_bits) |
392 | *error_bits = io.error_bits; | 421 | *error_bits = io->error_bits; |
393 | 422 | ||
394 | return io.error_bits ? -EIO : 0; | 423 | return io->error_bits ? -EIO : 0; |
395 | } | 424 | } |
396 | 425 | ||
397 | static int async_io(struct dm_io_client *client, unsigned int num_regions, | 426 | static int async_io(struct dm_io_client *client, unsigned int num_regions, |
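The comment in sync_io above explains the odd-looking declaration: older gcc could not guarantee the alignment attribute for stack variables, so the code over-allocates a char array and rounds the pointer up by hand (what PTR_ALIGN does). The same rounding spelled out in plain C, with a 64-byte alignment picked purely as an example:

```c
#include <stdint.h>
#include <stdio.h>

struct io {
	unsigned long error_bits;
	unsigned long count;
} __attribute__((aligned(64)));

int main(void)
{
	/* room for one struct io plus worst-case misalignment */
	char io_[sizeof(struct io) + __alignof__(struct io) - 1];
	uintptr_t p = (uintptr_t)io_;
	struct io *io;

	/* round up to the next multiple of the required alignment */
	p = (p + __alignof__(struct io) - 1) &
	    ~((uintptr_t)__alignof__(struct io) - 1);
	io = (struct io *)p;

	printf("aligned: %d\n", ((uintptr_t)io % __alignof__(struct io)) == 0);
	return 0;
}
```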
@@ -472,3 +501,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
472 | &dp, io_req->notify.fn, io_req->notify.context); | 501 | &dp, io_req->notify.fn, io_req->notify.context); |
473 | } | 502 | } |
474 | EXPORT_SYMBOL(dm_io); | 503 | EXPORT_SYMBOL(dm_io); |
504 | |||
505 | int __init dm_io_init(void) | ||
506 | { | ||
507 | _dm_io_cache = KMEM_CACHE(io, 0); | ||
508 | if (!_dm_io_cache) | ||
509 | return -ENOMEM; | ||
510 | |||
511 | return 0; | ||
512 | } | ||
513 | |||
514 | void dm_io_exit(void) | ||
515 | { | ||
516 | kmem_cache_destroy(_dm_io_cache); | ||
517 | _dm_io_cache = NULL; | ||
518 | } | ||
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index a67942931582..1d669322b27c 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -56,6 +56,11 @@ static void dm_hash_remove_all(int keep_open_devices);
56 | */ | 56 | */ |
57 | static DECLARE_RWSEM(_hash_lock); | 57 | static DECLARE_RWSEM(_hash_lock); |
58 | 58 | ||
59 | /* | ||
60 | * Protects use of mdptr to obtain hash cell name and uuid from mapped device. | ||
61 | */ | ||
62 | static DEFINE_MUTEX(dm_hash_cells_mutex); | ||
63 | |||
59 | static void init_buckets(struct list_head *buckets) | 64 | static void init_buckets(struct list_head *buckets) |
60 | { | 65 | { |
61 | unsigned int i; | 66 | unsigned int i; |
@@ -206,7 +211,9 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
206 | list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); | 211 | list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); |
207 | } | 212 | } |
208 | dm_get(md); | 213 | dm_get(md); |
214 | mutex_lock(&dm_hash_cells_mutex); | ||
209 | dm_set_mdptr(md, cell); | 215 | dm_set_mdptr(md, cell); |
216 | mutex_unlock(&dm_hash_cells_mutex); | ||
210 | up_write(&_hash_lock); | 217 | up_write(&_hash_lock); |
211 | 218 | ||
212 | return 0; | 219 | return 0; |
@@ -224,9 +231,11 @@ static void __hash_remove(struct hash_cell *hc)
224 | /* remove from the dev hash */ | 231 | /* remove from the dev hash */ |
225 | list_del(&hc->uuid_list); | 232 | list_del(&hc->uuid_list); |
226 | list_del(&hc->name_list); | 233 | list_del(&hc->name_list); |
234 | mutex_lock(&dm_hash_cells_mutex); | ||
227 | dm_set_mdptr(hc->md, NULL); | 235 | dm_set_mdptr(hc->md, NULL); |
236 | mutex_unlock(&dm_hash_cells_mutex); | ||
228 | 237 | ||
229 | table = dm_get_table(hc->md); | 238 | table = dm_get_live_table(hc->md); |
230 | if (table) { | 239 | if (table) { |
231 | dm_table_event(table); | 240 | dm_table_event(table); |
232 | dm_table_put(table); | 241 | dm_table_put(table); |
@@ -321,13 +330,15 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
321 | */ | 330 | */ |
322 | list_del(&hc->name_list); | 331 | list_del(&hc->name_list); |
323 | old_name = hc->name; | 332 | old_name = hc->name; |
333 | mutex_lock(&dm_hash_cells_mutex); | ||
324 | hc->name = new_name; | 334 | hc->name = new_name; |
335 | mutex_unlock(&dm_hash_cells_mutex); | ||
325 | list_add(&hc->name_list, _name_buckets + hash_str(new_name)); | 336 | list_add(&hc->name_list, _name_buckets + hash_str(new_name)); |
326 | 337 | ||
327 | /* | 338 | /* |
328 | * Wake up any dm event waiters. | 339 | * Wake up any dm event waiters. |
329 | */ | 340 | */ |
330 | table = dm_get_table(hc->md); | 341 | table = dm_get_live_table(hc->md); |
331 | if (table) { | 342 | if (table) { |
332 | dm_table_event(table); | 343 | dm_table_event(table); |
333 | dm_table_put(table); | 344 | dm_table_put(table); |
@@ -512,8 +523,6 @@ static int list_versions(struct dm_ioctl *param, size_t param_size)
512 | return 0; | 523 | return 0; |
513 | } | 524 | } |
514 | 525 | ||
515 | |||
516 | |||
517 | static int check_name(const char *name) | 526 | static int check_name(const char *name) |
518 | { | 527 | { |
519 | if (strchr(name, '/')) { | 528 | if (strchr(name, '/')) { |
@@ -525,6 +534,40 @@ static int check_name(const char *name)
525 | } | 534 | } |
526 | 535 | ||
527 | /* | 536 | /* |
537 | * On successful return, the caller must not attempt to acquire | ||
538 | * _hash_lock without first calling dm_table_put, because dm_table_destroy | ||
539 | * waits for this dm_table_put and could be called under this lock. | ||
540 | */ | ||
541 | static struct dm_table *dm_get_inactive_table(struct mapped_device *md) | ||
542 | { | ||
543 | struct hash_cell *hc; | ||
544 | struct dm_table *table = NULL; | ||
545 | |||
546 | down_read(&_hash_lock); | ||
547 | hc = dm_get_mdptr(md); | ||
548 | if (!hc || hc->md != md) { | ||
549 | DMWARN("device has been removed from the dev hash table."); | ||
550 | goto out; | ||
551 | } | ||
552 | |||
553 | table = hc->new_map; | ||
554 | if (table) | ||
555 | dm_table_get(table); | ||
556 | |||
557 | out: | ||
558 | up_read(&_hash_lock); | ||
559 | |||
560 | return table; | ||
561 | } | ||
562 | |||
563 | static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, | ||
564 | struct dm_ioctl *param) | ||
565 | { | ||
566 | return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ? | ||
567 | dm_get_inactive_table(md) : dm_get_live_table(md); | ||
568 | } | ||
569 | |||
570 | /* | ||
528 | * Fills in a dm_ioctl structure, ready for sending back to | 571 | * Fills in a dm_ioctl structure, ready for sending back to |
529 | * userland. | 572 | * userland. |
530 | */ | 573 | */ |
@@ -536,7 +579,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
536 | param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | | 579 | param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | |
537 | DM_ACTIVE_PRESENT_FLAG); | 580 | DM_ACTIVE_PRESENT_FLAG); |
538 | 581 | ||
539 | if (dm_suspended(md)) | 582 | if (dm_suspended_md(md)) |
540 | param->flags |= DM_SUSPEND_FLAG; | 583 | param->flags |= DM_SUSPEND_FLAG; |
541 | 584 | ||
542 | param->dev = huge_encode_dev(disk_devt(disk)); | 585 | param->dev = huge_encode_dev(disk_devt(disk)); |
@@ -548,18 +591,30 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
548 | */ | 591 | */ |
549 | param->open_count = dm_open_count(md); | 592 | param->open_count = dm_open_count(md); |
550 | 593 | ||
551 | if (get_disk_ro(disk)) | ||
552 | param->flags |= DM_READONLY_FLAG; | ||
553 | |||
554 | param->event_nr = dm_get_event_nr(md); | 594 | param->event_nr = dm_get_event_nr(md); |
595 | param->target_count = 0; | ||
555 | 596 | ||
556 | table = dm_get_table(md); | 597 | table = dm_get_live_table(md); |
557 | if (table) { | 598 | if (table) { |
558 | param->flags |= DM_ACTIVE_PRESENT_FLAG; | 599 | if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { |
559 | param->target_count = dm_table_get_num_targets(table); | 600 | if (get_disk_ro(disk)) |
601 | param->flags |= DM_READONLY_FLAG; | ||
602 | param->target_count = dm_table_get_num_targets(table); | ||
603 | } | ||
560 | dm_table_put(table); | 604 | dm_table_put(table); |
561 | } else | 605 | |
562 | param->target_count = 0; | 606 | param->flags |= DM_ACTIVE_PRESENT_FLAG; |
607 | } | ||
608 | |||
609 | if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) { | ||
610 | table = dm_get_inactive_table(md); | ||
611 | if (table) { | ||
612 | if (!(dm_table_get_mode(table) & FMODE_WRITE)) | ||
613 | param->flags |= DM_READONLY_FLAG; | ||
614 | param->target_count = dm_table_get_num_targets(table); | ||
615 | dm_table_put(table); | ||
616 | } | ||
617 | } | ||
563 | 618 | ||
564 | return 0; | 619 | return 0; |
565 | } | 620 | } |
@@ -634,9 +689,9 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
634 | * Sneakily write in both the name and the uuid | 689 | * Sneakily write in both the name and the uuid |
635 | * while we have the cell. | 690 | * while we have the cell. |
636 | */ | 691 | */ |
637 | strncpy(param->name, hc->name, sizeof(param->name)); | 692 | strlcpy(param->name, hc->name, sizeof(param->name)); |
638 | if (hc->uuid) | 693 | if (hc->uuid) |
639 | strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1); | 694 | strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); |
640 | else | 695 | else |
641 | param->uuid[0] = '\0'; | 696 | param->uuid[0] = '\0'; |
642 | 697 | ||
@@ -784,7 +839,7 @@ static int do_suspend(struct dm_ioctl *param)
784 | if (param->flags & DM_NOFLUSH_FLAG) | 839 | if (param->flags & DM_NOFLUSH_FLAG) |
785 | suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; | 840 | suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; |
786 | 841 | ||
787 | if (!dm_suspended(md)) | 842 | if (!dm_suspended_md(md)) |
788 | r = dm_suspend(md, suspend_flags); | 843 | r = dm_suspend(md, suspend_flags); |
789 | 844 | ||
790 | if (!r) | 845 | if (!r) |
@@ -800,7 +855,7 @@ static int do_resume(struct dm_ioctl *param)
800 | unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; | 855 | unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; |
801 | struct hash_cell *hc; | 856 | struct hash_cell *hc; |
802 | struct mapped_device *md; | 857 | struct mapped_device *md; |
803 | struct dm_table *new_map; | 858 | struct dm_table *new_map, *old_map = NULL; |
804 | 859 | ||
805 | down_write(&_hash_lock); | 860 | down_write(&_hash_lock); |
806 | 861 | ||
@@ -826,14 +881,14 @@ static int do_resume(struct dm_ioctl *param)
826 | suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; | 881 | suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; |
827 | if (param->flags & DM_NOFLUSH_FLAG) | 882 | if (param->flags & DM_NOFLUSH_FLAG) |
828 | suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; | 883 | suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; |
829 | if (!dm_suspended(md)) | 884 | if (!dm_suspended_md(md)) |
830 | dm_suspend(md, suspend_flags); | 885 | dm_suspend(md, suspend_flags); |
831 | 886 | ||
832 | r = dm_swap_table(md, new_map); | 887 | old_map = dm_swap_table(md, new_map); |
833 | if (r) { | 888 | if (IS_ERR(old_map)) { |
834 | dm_table_destroy(new_map); | 889 | dm_table_destroy(new_map); |
835 | dm_put(md); | 890 | dm_put(md); |
836 | return r; | 891 | return PTR_ERR(old_map); |
837 | } | 892 | } |
838 | 893 | ||
839 | if (dm_table_get_mode(new_map) & FMODE_WRITE) | 894 | if (dm_table_get_mode(new_map) & FMODE_WRITE) |
@@ -842,9 +897,11 @@ static int do_resume(struct dm_ioctl *param)
842 | set_disk_ro(dm_disk(md), 1); | 897 | set_disk_ro(dm_disk(md), 1); |
843 | } | 898 | } |
844 | 899 | ||
845 | if (dm_suspended(md)) | 900 | if (dm_suspended_md(md)) |
846 | r = dm_resume(md); | 901 | r = dm_resume(md); |
847 | 902 | ||
903 | if (old_map) | ||
904 | dm_table_destroy(old_map); | ||
848 | 905 | ||
849 | if (!r) { | 906 | if (!r) { |
850 | dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); | 907 | dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); |
@@ -982,7 +1039,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
982 | if (r) | 1039 | if (r) |
983 | goto out; | 1040 | goto out; |
984 | 1041 | ||
985 | table = dm_get_table(md); | 1042 | table = dm_get_live_or_inactive_table(md, param); |
986 | if (table) { | 1043 | if (table) { |
987 | retrieve_status(table, param, param_size); | 1044 | retrieve_status(table, param, param_size); |
988 | dm_table_put(table); | 1045 | dm_table_put(table); |
@@ -1215,7 +1272,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
1215 | if (r) | 1272 | if (r) |
1216 | goto out; | 1273 | goto out; |
1217 | 1274 | ||
1218 | table = dm_get_table(md); | 1275 | table = dm_get_live_or_inactive_table(md, param); |
1219 | if (table) { | 1276 | if (table) { |
1220 | retrieve_deps(table, param, param_size); | 1277 | retrieve_deps(table, param, param_size); |
1221 | dm_table_put(table); | 1278 | dm_table_put(table); |
@@ -1244,13 +1301,13 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
1244 | if (r) | 1301 | if (r) |
1245 | goto out; | 1302 | goto out; |
1246 | 1303 | ||
1247 | table = dm_get_table(md); | 1304 | table = dm_get_live_or_inactive_table(md, param); |
1248 | if (table) { | 1305 | if (table) { |
1249 | retrieve_status(table, param, param_size); | 1306 | retrieve_status(table, param, param_size); |
1250 | dm_table_put(table); | 1307 | dm_table_put(table); |
1251 | } | 1308 | } |
1252 | 1309 | ||
1253 | out: | 1310 | out: |
1254 | dm_put(md); | 1311 | dm_put(md); |
1255 | return r; | 1312 | return r; |
1256 | } | 1313 | } |
@@ -1288,10 +1345,15 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1288 | goto out; | 1345 | goto out; |
1289 | } | 1346 | } |
1290 | 1347 | ||
1291 | table = dm_get_table(md); | 1348 | table = dm_get_live_table(md); |
1292 | if (!table) | 1349 | if (!table) |
1293 | goto out_argv; | 1350 | goto out_argv; |
1294 | 1351 | ||
1352 | if (dm_deleting_md(md)) { | ||
1353 | r = -ENXIO; | ||
1354 | goto out_table; | ||
1355 | } | ||
1356 | |||
1295 | ti = dm_table_find_target(table, tmsg->sector); | 1357 | ti = dm_table_find_target(table, tmsg->sector); |
1296 | if (!dm_target_is_valid(ti)) { | 1358 | if (!dm_target_is_valid(ti)) { |
1297 | DMWARN("Target message sector outside device."); | 1359 | DMWARN("Target message sector outside device."); |
@@ -1303,6 +1365,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) | |||
1303 | r = -EINVAL; | 1365 | r = -EINVAL; |
1304 | } | 1366 | } |
1305 | 1367 | ||
1368 | out_table: | ||
1306 | dm_table_put(table); | 1369 | dm_table_put(table); |
1307 | out_argv: | 1370 | out_argv: |
1308 | kfree(argv); | 1371 | kfree(argv); |
@@ -1582,8 +1645,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid) | |||
1582 | if (!md) | 1645 | if (!md) |
1583 | return -ENXIO; | 1646 | return -ENXIO; |
1584 | 1647 | ||
1585 | dm_get(md); | 1648 | mutex_lock(&dm_hash_cells_mutex); |
1586 | down_read(&_hash_lock); | ||
1587 | hc = dm_get_mdptr(md); | 1649 | hc = dm_get_mdptr(md); |
1588 | if (!hc || hc->md != md) { | 1650 | if (!hc || hc->md != md) { |
1589 | r = -ENXIO; | 1651 | r = -ENXIO; |
@@ -1596,8 +1658,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid) | |||
1596 | strcpy(uuid, hc->uuid ? : ""); | 1658 | strcpy(uuid, hc->uuid ? : ""); |
1597 | 1659 | ||
1598 | out: | 1660 | out: |
1599 | up_read(&_hash_lock); | 1661 | mutex_unlock(&dm_hash_cells_mutex); |
1600 | dm_put(md); | ||
1601 | 1662 | ||
1602 | return r; | 1663 | return r; |
1603 | } | 1664 | } |
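The do_resume() change relies on the kernel convention of encoding a small negative errno inside a pointer return (ERR_PTR/IS_ERR/PTR_ERR), so dm_swap_table() can hand back either the previous table or an error, and the caller destroys the old table only after the swap and resume have succeeded. A minimal user-space sketch of that convention with simplified local stand-ins for the err.h macros and a hypothetical swap_table() caller:

```c
#include <stdio.h>
#include <stdint.h>

#define MAX_ERRNO 4095

/* Encode a small negative errno in an otherwise invalid pointer value. */
static inline void *ERR_PTR(long error)      { return (void *)error; }
static inline long  PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int   IS_ERR(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

struct table { const char *name; };

/* Hypothetical swap: install @new_map and hand back the previous table,
 * or an ERR_PTR() if the swap cannot be done. */
static struct table *swap_table(struct table **live, struct table *new_map)
{
	struct table *old = *live;

	if (!new_map)
		return ERR_PTR(-22 /* -EINVAL */);
	*live = new_map;
	return old;	/* may be NULL if nothing was loaded before */
}

int main(void)
{
	struct table a = { "old" }, b = { "new" };
	struct table *live = &a;

	struct table *old_map = swap_table(&live, &b);
	if (IS_ERR(old_map)) {
		printf("swap failed: %ld\n", PTR_ERR(old_map));
		return 1;
	}
	/* Only destroy the old table once the swap has succeeded. */
	if (old_map)
		printf("destroying previous table '%s'\n", old_map->name);
	return 0;
}
```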
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 3e3fc06cb861..addf83475040 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c | |||
@@ -450,7 +450,10 @@ static void dispatch_job(struct kcopyd_job *job) | |||
450 | { | 450 | { |
451 | struct dm_kcopyd_client *kc = job->kc; | 451 | struct dm_kcopyd_client *kc = job->kc; |
452 | atomic_inc(&kc->nr_jobs); | 452 | atomic_inc(&kc->nr_jobs); |
453 | push(&kc->pages_jobs, job); | 453 | if (unlikely(!job->source.count)) |
454 | push(&kc->complete_jobs, job); | ||
455 | else | ||
456 | push(&kc->pages_jobs, job); | ||
454 | wake(kc); | 457 | wake(kc); |
455 | } | 458 | } |
456 | 459 | ||
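The kcopyd hunk routes jobs with a zero-length source straight to the completion queue, since there is nothing to allocate pages for or copy. A stand-alone sketch of that dispatch decision; the job and queue types here are simplified stand-ins invented for illustration:

```c
#include <stdio.h>

struct job {
	unsigned long long count;	/* sectors to copy; 0 means nothing to do */
	const char *name;
	struct job *next;
};

struct queue { struct job *head; };

static void push(struct queue *q, struct job *j)
{
	j->next = q->head;
	q->head = j;
}

/* Jobs with nothing to copy skip page allocation and complete immediately;
 * everything else waits on the pages queue first. */
static void dispatch_job(struct queue *complete_jobs, struct queue *pages_jobs,
			 struct job *j)
{
	if (!j->count)
		push(complete_jobs, j);
	else
		push(pages_jobs, j);
}

int main(void)
{
	struct queue complete = { 0 }, pages = { 0 };
	struct job empty = { 0, "empty", 0 };
	struct job copy = { 128, "copy", 0 };

	dispatch_job(&complete, &pages, &empty);
	dispatch_job(&complete, &pages, &copy);

	printf("complete head: %s, pages head: %s\n",
	       complete.head->name, pages.head->name);
	return 0;
}
```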
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 9443896ede07..7035582786fb 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c | |||
@@ -145,8 +145,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type) | |||
145 | EXPORT_SYMBOL(dm_dirty_log_type_unregister); | 145 | EXPORT_SYMBOL(dm_dirty_log_type_unregister); |
146 | 146 | ||
147 | struct dm_dirty_log *dm_dirty_log_create(const char *type_name, | 147 | struct dm_dirty_log *dm_dirty_log_create(const char *type_name, |
148 | struct dm_target *ti, | 148 | struct dm_target *ti, |
149 | unsigned int argc, char **argv) | 149 | int (*flush_callback_fn)(struct dm_target *ti), |
150 | unsigned int argc, char **argv) | ||
150 | { | 151 | { |
151 | struct dm_dirty_log_type *type; | 152 | struct dm_dirty_log_type *type; |
152 | struct dm_dirty_log *log; | 153 | struct dm_dirty_log *log; |
@@ -161,6 +162,7 @@ struct dm_dirty_log *dm_dirty_log_create(const char *type_name, | |||
161 | return NULL; | 162 | return NULL; |
162 | } | 163 | } |
163 | 164 | ||
165 | log->flush_callback_fn = flush_callback_fn; | ||
164 | log->type = type; | 166 | log->type = type; |
165 | if (type->ctr(log, ti, argc, argv)) { | 167 | if (type->ctr(log, ti, argc, argv)) { |
166 | kfree(log); | 168 | kfree(log); |
@@ -208,7 +210,9 @@ struct log_header { | |||
208 | 210 | ||
209 | struct log_c { | 211 | struct log_c { |
210 | struct dm_target *ti; | 212 | struct dm_target *ti; |
211 | int touched; | 213 | int touched_dirtied; |
214 | int touched_cleaned; | ||
215 | int flush_failed; | ||
212 | uint32_t region_size; | 216 | uint32_t region_size; |
213 | unsigned int region_count; | 217 | unsigned int region_count; |
214 | region_t sync_count; | 218 | region_t sync_count; |
@@ -233,6 +237,7 @@ struct log_c { | |||
233 | * Disk log fields | 237 | * Disk log fields |
234 | */ | 238 | */ |
235 | int log_dev_failed; | 239 | int log_dev_failed; |
240 | int log_dev_flush_failed; | ||
236 | struct dm_dev *log_dev; | 241 | struct dm_dev *log_dev; |
237 | struct log_header header; | 242 | struct log_header header; |
238 | 243 | ||
@@ -253,14 +258,14 @@ static inline void log_set_bit(struct log_c *l, | |||
253 | uint32_t *bs, unsigned bit) | 258 | uint32_t *bs, unsigned bit) |
254 | { | 259 | { |
255 | ext2_set_bit(bit, (unsigned long *) bs); | 260 | ext2_set_bit(bit, (unsigned long *) bs); |
256 | l->touched = 1; | 261 | l->touched_cleaned = 1; |
257 | } | 262 | } |
258 | 263 | ||
259 | static inline void log_clear_bit(struct log_c *l, | 264 | static inline void log_clear_bit(struct log_c *l, |
260 | uint32_t *bs, unsigned bit) | 265 | uint32_t *bs, unsigned bit) |
261 | { | 266 | { |
262 | ext2_clear_bit(bit, (unsigned long *) bs); | 267 | ext2_clear_bit(bit, (unsigned long *) bs); |
263 | l->touched = 1; | 268 | l->touched_dirtied = 1; |
264 | } | 269 | } |
265 | 270 | ||
266 | /*---------------------------------------------------------------- | 271 | /*---------------------------------------------------------------- |
@@ -287,6 +292,19 @@ static int rw_header(struct log_c *lc, int rw) | |||
287 | return dm_io(&lc->io_req, 1, &lc->header_location, NULL); | 292 | return dm_io(&lc->io_req, 1, &lc->header_location, NULL); |
288 | } | 293 | } |
289 | 294 | ||
295 | static int flush_header(struct log_c *lc) | ||
296 | { | ||
297 | struct dm_io_region null_location = { | ||
298 | .bdev = lc->header_location.bdev, | ||
299 | .sector = 0, | ||
300 | .count = 0, | ||
301 | }; | ||
302 | |||
303 | lc->io_req.bi_rw = WRITE_BARRIER; | ||
304 | |||
305 | return dm_io(&lc->io_req, 1, &null_location, NULL); | ||
306 | } | ||
307 | |||
290 | static int read_header(struct log_c *log) | 308 | static int read_header(struct log_c *log) |
291 | { | 309 | { |
292 | int r; | 310 | int r; |
@@ -378,7 +396,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
378 | } | 396 | } |
379 | 397 | ||
380 | lc->ti = ti; | 398 | lc->ti = ti; |
381 | lc->touched = 0; | 399 | lc->touched_dirtied = 0; |
400 | lc->touched_cleaned = 0; | ||
401 | lc->flush_failed = 0; | ||
382 | lc->region_size = region_size; | 402 | lc->region_size = region_size; |
383 | lc->region_count = region_count; | 403 | lc->region_count = region_count; |
384 | lc->sync = sync; | 404 | lc->sync = sync; |
@@ -406,6 +426,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
406 | } else { | 426 | } else { |
407 | lc->log_dev = dev; | 427 | lc->log_dev = dev; |
408 | lc->log_dev_failed = 0; | 428 | lc->log_dev_failed = 0; |
429 | lc->log_dev_flush_failed = 0; | ||
409 | lc->header_location.bdev = lc->log_dev->bdev; | 430 | lc->header_location.bdev = lc->log_dev->bdev; |
410 | lc->header_location.sector = 0; | 431 | lc->header_location.sector = 0; |
411 | 432 | ||
@@ -614,6 +635,11 @@ static int disk_resume(struct dm_dirty_log *log) | |||
614 | 635 | ||
615 | /* write the new header */ | 636 | /* write the new header */ |
616 | r = rw_header(lc, WRITE); | 637 | r = rw_header(lc, WRITE); |
638 | if (!r) { | ||
639 | r = flush_header(lc); | ||
640 | if (r) | ||
641 | lc->log_dev_flush_failed = 1; | ||
642 | } | ||
617 | if (r) { | 643 | if (r) { |
618 | DMWARN("%s: Failed to write header on dirty region log device", | 644 | DMWARN("%s: Failed to write header on dirty region log device", |
619 | lc->log_dev->name); | 645 | lc->log_dev->name); |
@@ -656,18 +682,40 @@ static int core_flush(struct dm_dirty_log *log) | |||
656 | 682 | ||
657 | static int disk_flush(struct dm_dirty_log *log) | 683 | static int disk_flush(struct dm_dirty_log *log) |
658 | { | 684 | { |
659 | int r; | 685 | int r, i; |
660 | struct log_c *lc = (struct log_c *) log->context; | 686 | struct log_c *lc = log->context; |
661 | 687 | ||
662 | /* only write if the log has changed */ | 688 | /* only write if the log has changed */ |
663 | if (!lc->touched) | 689 | if (!lc->touched_cleaned && !lc->touched_dirtied) |
664 | return 0; | 690 | return 0; |
665 | 691 | ||
692 | if (lc->touched_cleaned && log->flush_callback_fn && | ||
693 | log->flush_callback_fn(lc->ti)) { | ||
694 | /* | ||
695 | * At this point it is impossible to determine which | ||
696 | * regions are clean and which are dirty (without | ||
697 | * re-reading the log off disk). So mark all of them | ||
698 | * dirty. | ||
699 | */ | ||
700 | lc->flush_failed = 1; | ||
701 | for (i = 0; i < lc->region_count; i++) | ||
702 | log_clear_bit(lc, lc->clean_bits, i); | ||
703 | } | ||
704 | |||
666 | r = rw_header(lc, WRITE); | 705 | r = rw_header(lc, WRITE); |
667 | if (r) | 706 | if (r) |
668 | fail_log_device(lc); | 707 | fail_log_device(lc); |
669 | else | 708 | else { |
670 | lc->touched = 0; | 709 | if (lc->touched_dirtied) { |
710 | r = flush_header(lc); | ||
711 | if (r) { | ||
712 | lc->log_dev_flush_failed = 1; | ||
713 | fail_log_device(lc); | ||
714 | } else | ||
715 | lc->touched_dirtied = 0; | ||
716 | } | ||
717 | lc->touched_cleaned = 0; | ||
718 | } | ||
671 | 719 | ||
672 | return r; | 720 | return r; |
673 | } | 721 | } |
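The reworked disk_flush() above distinguishes regions that were marked dirty from regions that were marked clean: the registered flush callback (the mirror target passes mirror_flush(), see the dm-raid1.c hunks below) must succeed before clean bits may be persisted, and a barrier flush of the header is issued only when dirty bits were written. A condensed sketch of that decision logic, assuming stub I/O helpers in place of the real header writes and simplifying the failure handling:

```c
#include <stdio.h>

struct log_state {
	int touched_dirtied;	/* a region was marked dirty since last flush */
	int touched_cleaned;	/* a region was marked clean since last flush */
	int flush_failed;
};

/* Stubs standing in for writing and barrier-flushing the on-disk header. */
static int write_header(void) { return 0; }
static int flush_header(void) { return 0; }

/* Stand-in for the flush callback the mirror target registers: data must
 * be on disk before clean bits may be recorded. */
static int flush_callback(void) { return 0; }

static int disk_flush_sketch(struct log_state *lc)
{
	int r;

	if (!lc->touched_cleaned && !lc->touched_dirtied)
		return 0;	/* nothing changed, nothing to write */

	if (lc->touched_cleaned && flush_callback()) {
		/* Data may not be durable: forget the clean bits and treat
		 * everything as dirty (simplified from the real bit loop). */
		lc->flush_failed = 1;
		lc->touched_dirtied = 1;
	}

	r = write_header();
	if (r)
		return r;

	if (lc->touched_dirtied) {
		r = flush_header();	/* barrier so dirty bits are durable */
		if (r)
			return r;
		lc->touched_dirtied = 0;
	}
	lc->touched_cleaned = 0;
	return 0;
}

int main(void)
{
	struct log_state lc = { .touched_dirtied = 1 };

	printf("flush result: %d\n", disk_flush_sketch(&lc));
	return 0;
}
```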
@@ -681,7 +729,8 @@ static void core_mark_region(struct dm_dirty_log *log, region_t region) | |||
681 | static void core_clear_region(struct dm_dirty_log *log, region_t region) | 729 | static void core_clear_region(struct dm_dirty_log *log, region_t region) |
682 | { | 730 | { |
683 | struct log_c *lc = (struct log_c *) log->context; | 731 | struct log_c *lc = (struct log_c *) log->context; |
684 | log_set_bit(lc, lc->clean_bits, region); | 732 | if (likely(!lc->flush_failed)) |
733 | log_set_bit(lc, lc->clean_bits, region); | ||
685 | } | 734 | } |
686 | 735 | ||
687 | static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) | 736 | static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) |
@@ -762,7 +811,9 @@ static int disk_status(struct dm_dirty_log *log, status_type_t status, | |||
762 | switch(status) { | 811 | switch(status) { |
763 | case STATUSTYPE_INFO: | 812 | case STATUSTYPE_INFO: |
764 | DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, | 813 | DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, |
765 | lc->log_dev_failed ? 'D' : 'A'); | 814 | lc->log_dev_flush_failed ? 'F' : |
815 | lc->log_dev_failed ? 'D' : | ||
816 | 'A'); | ||
766 | break; | 817 | break; |
767 | 818 | ||
768 | case STATUSTYPE_TABLE: | 819 | case STATUSTYPE_TABLE: |
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index dce971dbdfa3..e81345a1d08f 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
@@ -93,6 +93,10 @@ struct multipath { | |||
93 | * can resubmit bios on error. | 93 | * can resubmit bios on error. |
94 | */ | 94 | */ |
95 | mempool_t *mpio_pool; | 95 | mempool_t *mpio_pool; |
96 | |||
97 | struct mutex work_mutex; | ||
98 | |||
99 | unsigned suspended; /* Don't create new I/O internally when set. */ | ||
96 | }; | 100 | }; |
97 | 101 | ||
98 | /* | 102 | /* |
@@ -198,6 +202,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) | |||
198 | m->queue_io = 1; | 202 | m->queue_io = 1; |
199 | INIT_WORK(&m->process_queued_ios, process_queued_ios); | 203 | INIT_WORK(&m->process_queued_ios, process_queued_ios); |
200 | INIT_WORK(&m->trigger_event, trigger_event); | 204 | INIT_WORK(&m->trigger_event, trigger_event); |
205 | mutex_init(&m->work_mutex); | ||
201 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); | 206 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); |
202 | if (!m->mpio_pool) { | 207 | if (!m->mpio_pool) { |
203 | kfree(m); | 208 | kfree(m); |
@@ -885,13 +890,18 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, | |||
885 | return r; | 890 | return r; |
886 | } | 891 | } |
887 | 892 | ||
888 | static void multipath_dtr(struct dm_target *ti) | 893 | static void flush_multipath_work(void) |
889 | { | 894 | { |
890 | struct multipath *m = (struct multipath *) ti->private; | ||
891 | |||
892 | flush_workqueue(kmpath_handlerd); | 895 | flush_workqueue(kmpath_handlerd); |
893 | flush_workqueue(kmultipathd); | 896 | flush_workqueue(kmultipathd); |
894 | flush_scheduled_work(); | 897 | flush_scheduled_work(); |
898 | } | ||
899 | |||
900 | static void multipath_dtr(struct dm_target *ti) | ||
901 | { | ||
902 | struct multipath *m = ti->private; | ||
903 | |||
904 | flush_multipath_work(); | ||
895 | free_multipath(m); | 905 | free_multipath(m); |
896 | } | 906 | } |
897 | 907 | ||
@@ -1261,6 +1271,16 @@ static void multipath_presuspend(struct dm_target *ti) | |||
1261 | queue_if_no_path(m, 0, 1); | 1271 | queue_if_no_path(m, 0, 1); |
1262 | } | 1272 | } |
1263 | 1273 | ||
1274 | static void multipath_postsuspend(struct dm_target *ti) | ||
1275 | { | ||
1276 | struct multipath *m = ti->private; | ||
1277 | |||
1278 | mutex_lock(&m->work_mutex); | ||
1279 | m->suspended = 1; | ||
1280 | flush_multipath_work(); | ||
1281 | mutex_unlock(&m->work_mutex); | ||
1282 | } | ||
1283 | |||
1264 | /* | 1284 | /* |
1265 | * Restore the queue_if_no_path setting. | 1285 | * Restore the queue_if_no_path setting. |
1266 | */ | 1286 | */ |
@@ -1269,6 +1289,10 @@ static void multipath_resume(struct dm_target *ti) | |||
1269 | struct multipath *m = (struct multipath *) ti->private; | 1289 | struct multipath *m = (struct multipath *) ti->private; |
1270 | unsigned long flags; | 1290 | unsigned long flags; |
1271 | 1291 | ||
1292 | mutex_lock(&m->work_mutex); | ||
1293 | m->suspended = 0; | ||
1294 | mutex_unlock(&m->work_mutex); | ||
1295 | |||
1272 | spin_lock_irqsave(&m->lock, flags); | 1296 | spin_lock_irqsave(&m->lock, flags); |
1273 | m->queue_if_no_path = m->saved_queue_if_no_path; | 1297 | m->queue_if_no_path = m->saved_queue_if_no_path; |
1274 | spin_unlock_irqrestore(&m->lock, flags); | 1298 | spin_unlock_irqrestore(&m->lock, flags); |
@@ -1397,51 +1421,71 @@ static int multipath_status(struct dm_target *ti, status_type_t type, | |||
1397 | 1421 | ||
1398 | static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) | 1422 | static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) |
1399 | { | 1423 | { |
1400 | int r; | 1424 | int r = -EINVAL; |
1401 | struct dm_dev *dev; | 1425 | struct dm_dev *dev; |
1402 | struct multipath *m = (struct multipath *) ti->private; | 1426 | struct multipath *m = (struct multipath *) ti->private; |
1403 | action_fn action; | 1427 | action_fn action; |
1404 | 1428 | ||
1429 | mutex_lock(&m->work_mutex); | ||
1430 | |||
1431 | if (m->suspended) { | ||
1432 | r = -EBUSY; | ||
1433 | goto out; | ||
1434 | } | ||
1435 | |||
1436 | if (dm_suspended(ti)) { | ||
1437 | r = -EBUSY; | ||
1438 | goto out; | ||
1439 | } | ||
1440 | |||
1405 | if (argc == 1) { | 1441 | if (argc == 1) { |
1406 | if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) | 1442 | if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) { |
1407 | return queue_if_no_path(m, 1, 0); | 1443 | r = queue_if_no_path(m, 1, 0); |
1408 | else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) | 1444 | goto out; |
1409 | return queue_if_no_path(m, 0, 0); | 1445 | } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) { |
1446 | r = queue_if_no_path(m, 0, 0); | ||
1447 | goto out; | ||
1448 | } | ||
1410 | } | 1449 | } |
1411 | 1450 | ||
1412 | if (argc != 2) | 1451 | if (argc != 2) { |
1413 | goto error; | 1452 | DMWARN("Unrecognised multipath message received."); |
1453 | goto out; | ||
1454 | } | ||
1414 | 1455 | ||
1415 | if (!strnicmp(argv[0], MESG_STR("disable_group"))) | 1456 | if (!strnicmp(argv[0], MESG_STR("disable_group"))) { |
1416 | return bypass_pg_num(m, argv[1], 1); | 1457 | r = bypass_pg_num(m, argv[1], 1); |
1417 | else if (!strnicmp(argv[0], MESG_STR("enable_group"))) | 1458 | goto out; |
1418 | return bypass_pg_num(m, argv[1], 0); | 1459 | } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) { |
1419 | else if (!strnicmp(argv[0], MESG_STR("switch_group"))) | 1460 | r = bypass_pg_num(m, argv[1], 0); |
1420 | return switch_pg_num(m, argv[1]); | 1461 | goto out; |
1421 | else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) | 1462 | } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) { |
1463 | r = switch_pg_num(m, argv[1]); | ||
1464 | goto out; | ||
1465 | } else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) | ||
1422 | action = reinstate_path; | 1466 | action = reinstate_path; |
1423 | else if (!strnicmp(argv[0], MESG_STR("fail_path"))) | 1467 | else if (!strnicmp(argv[0], MESG_STR("fail_path"))) |
1424 | action = fail_path; | 1468 | action = fail_path; |
1425 | else | 1469 | else { |
1426 | goto error; | 1470 | DMWARN("Unrecognised multipath message received."); |
1471 | goto out; | ||
1472 | } | ||
1427 | 1473 | ||
1428 | r = dm_get_device(ti, argv[1], ti->begin, ti->len, | 1474 | r = dm_get_device(ti, argv[1], ti->begin, ti->len, |
1429 | dm_table_get_mode(ti->table), &dev); | 1475 | dm_table_get_mode(ti->table), &dev); |
1430 | if (r) { | 1476 | if (r) { |
1431 | DMWARN("message: error getting device %s", | 1477 | DMWARN("message: error getting device %s", |
1432 | argv[1]); | 1478 | argv[1]); |
1433 | return -EINVAL; | 1479 | goto out; |
1434 | } | 1480 | } |
1435 | 1481 | ||
1436 | r = action_dev(m, dev, action); | 1482 | r = action_dev(m, dev, action); |
1437 | 1483 | ||
1438 | dm_put_device(ti, dev); | 1484 | dm_put_device(ti, dev); |
1439 | 1485 | ||
1486 | out: | ||
1487 | mutex_unlock(&m->work_mutex); | ||
1440 | return r; | 1488 | return r; |
1441 | |||
1442 | error: | ||
1443 | DMWARN("Unrecognised multipath message received."); | ||
1444 | return -EINVAL; | ||
1445 | } | 1489 | } |
1446 | 1490 | ||
1447 | static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | 1491 | static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, |
@@ -1567,13 +1611,14 @@ out: | |||
1567 | *---------------------------------------------------------------*/ | 1611 | *---------------------------------------------------------------*/ |
1568 | static struct target_type multipath_target = { | 1612 | static struct target_type multipath_target = { |
1569 | .name = "multipath", | 1613 | .name = "multipath", |
1570 | .version = {1, 1, 0}, | 1614 | .version = {1, 1, 1}, |
1571 | .module = THIS_MODULE, | 1615 | .module = THIS_MODULE, |
1572 | .ctr = multipath_ctr, | 1616 | .ctr = multipath_ctr, |
1573 | .dtr = multipath_dtr, | 1617 | .dtr = multipath_dtr, |
1574 | .map_rq = multipath_map, | 1618 | .map_rq = multipath_map, |
1575 | .rq_end_io = multipath_end_io, | 1619 | .rq_end_io = multipath_end_io, |
1576 | .presuspend = multipath_presuspend, | 1620 | .presuspend = multipath_presuspend, |
1621 | .postsuspend = multipath_postsuspend, | ||
1577 | .resume = multipath_resume, | 1622 | .resume = multipath_resume, |
1578 | .status = multipath_status, | 1623 | .status = multipath_status, |
1579 | .message = multipath_message, | 1624 | .message = multipath_message, |
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index cc9dc79b0784..ad779bd13aec 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
@@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); | |||
35 | *---------------------------------------------------------------*/ | 35 | *---------------------------------------------------------------*/ |
36 | enum dm_raid1_error { | 36 | enum dm_raid1_error { |
37 | DM_RAID1_WRITE_ERROR, | 37 | DM_RAID1_WRITE_ERROR, |
38 | DM_RAID1_FLUSH_ERROR, | ||
38 | DM_RAID1_SYNC_ERROR, | 39 | DM_RAID1_SYNC_ERROR, |
39 | DM_RAID1_READ_ERROR | 40 | DM_RAID1_READ_ERROR |
40 | }; | 41 | }; |
@@ -57,6 +58,7 @@ struct mirror_set { | |||
57 | struct bio_list reads; | 58 | struct bio_list reads; |
58 | struct bio_list writes; | 59 | struct bio_list writes; |
59 | struct bio_list failures; | 60 | struct bio_list failures; |
61 | struct bio_list holds; /* bios are waiting until suspend */ | ||
60 | 62 | ||
61 | struct dm_region_hash *rh; | 63 | struct dm_region_hash *rh; |
62 | struct dm_kcopyd_client *kcopyd_client; | 64 | struct dm_kcopyd_client *kcopyd_client; |
@@ -67,6 +69,7 @@ struct mirror_set { | |||
67 | region_t nr_regions; | 69 | region_t nr_regions; |
68 | int in_sync; | 70 | int in_sync; |
69 | int log_failure; | 71 | int log_failure; |
72 | int leg_failure; | ||
70 | atomic_t suspend; | 73 | atomic_t suspend; |
71 | 74 | ||
72 | atomic_t default_mirror; /* Default mirror */ | 75 | atomic_t default_mirror; /* Default mirror */ |
@@ -179,6 +182,17 @@ static void set_default_mirror(struct mirror *m) | |||
179 | atomic_set(&ms->default_mirror, m - m0); | 182 | atomic_set(&ms->default_mirror, m - m0); |
180 | } | 183 | } |
181 | 184 | ||
185 | static struct mirror *get_valid_mirror(struct mirror_set *ms) | ||
186 | { | ||
187 | struct mirror *m; | ||
188 | |||
189 | for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++) | ||
190 | if (!atomic_read(&m->error_count)) | ||
191 | return m; | ||
192 | |||
193 | return NULL; | ||
194 | } | ||
195 | |||
182 | /* fail_mirror | 196 | /* fail_mirror |
183 | * @m: mirror device to fail | 197 | * @m: mirror device to fail |
184 | * @error_type: one of the enum's, DM_RAID1_*_ERROR | 198 | * @error_type: one of the enum's, DM_RAID1_*_ERROR |
@@ -198,6 +212,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) | |||
198 | struct mirror_set *ms = m->ms; | 212 | struct mirror_set *ms = m->ms; |
199 | struct mirror *new; | 213 | struct mirror *new; |
200 | 214 | ||
215 | ms->leg_failure = 1; | ||
216 | |||
201 | /* | 217 | /* |
202 | * error_count is used for nothing more than a | 218 | * error_count is used for nothing more than a |
203 | * simple way to tell if a device has encountered | 219 | * simple way to tell if a device has encountered |
@@ -224,19 +240,50 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) | |||
224 | goto out; | 240 | goto out; |
225 | } | 241 | } |
226 | 242 | ||
227 | for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) | 243 | new = get_valid_mirror(ms); |
228 | if (!atomic_read(&new->error_count)) { | 244 | if (new) |
229 | set_default_mirror(new); | 245 | set_default_mirror(new); |
230 | break; | 246 | else |
231 | } | ||
232 | |||
233 | if (unlikely(new == ms->mirror + ms->nr_mirrors)) | ||
234 | DMWARN("All sides of mirror have failed."); | 247 | DMWARN("All sides of mirror have failed."); |
235 | 248 | ||
236 | out: | 249 | out: |
237 | schedule_work(&ms->trigger_event); | 250 | schedule_work(&ms->trigger_event); |
238 | } | 251 | } |
239 | 252 | ||
253 | static int mirror_flush(struct dm_target *ti) | ||
254 | { | ||
255 | struct mirror_set *ms = ti->private; | ||
256 | unsigned long error_bits; | ||
257 | |||
258 | unsigned int i; | ||
259 | struct dm_io_region io[ms->nr_mirrors]; | ||
260 | struct mirror *m; | ||
261 | struct dm_io_request io_req = { | ||
262 | .bi_rw = WRITE_BARRIER, | ||
263 | .mem.type = DM_IO_KMEM, | ||
264 | .mem.ptr.bvec = NULL, | ||
265 | .client = ms->io_client, | ||
266 | }; | ||
267 | |||
268 | for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) { | ||
269 | io[i].bdev = m->dev->bdev; | ||
270 | io[i].sector = 0; | ||
271 | io[i].count = 0; | ||
272 | } | ||
273 | |||
274 | error_bits = -1; | ||
275 | dm_io(&io_req, ms->nr_mirrors, io, &error_bits); | ||
276 | if (unlikely(error_bits != 0)) { | ||
277 | for (i = 0; i < ms->nr_mirrors; i++) | ||
278 | if (test_bit(i, &error_bits)) | ||
279 | fail_mirror(ms->mirror + i, | ||
280 | DM_RAID1_FLUSH_ERROR); | ||
281 | return -EIO; | ||
282 | } | ||
283 | |||
284 | return 0; | ||
285 | } | ||
286 | |||
240 | /*----------------------------------------------------------------- | 287 | /*----------------------------------------------------------------- |
241 | * Recovery. | 288 | * Recovery. |
242 | * | 289 | * |
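mirror_flush() above issues one empty-barrier write per leg and then decodes the per-leg error bitmap dm_io() hands back, failing exactly the legs whose bit is set. A user-space sketch of that bitmap decoding, with a fake submit function standing in for dm_io():

```c
#include <stdio.h>

#define NR_MIRRORS 3

struct mirror { const char *name; int failed; };

/* Pretend to flush all legs; bit i of the result is set if leg i failed.
 * Here leg 1 is hard-coded to fail, standing in for a real I/O error. */
static unsigned long submit_flushes(struct mirror *m, unsigned nr)
{
	(void)m; (void)nr;
	return 1UL << 1;
}

static int flush_all_legs(struct mirror *mirrors, unsigned nr)
{
	unsigned long error_bits = submit_flushes(mirrors, nr);
	unsigned i;

	if (!error_bits)
		return 0;

	for (i = 0; i < nr; i++)
		if (error_bits & (1UL << i)) {
			mirrors[i].failed = 1;
			printf("failing leg %s (flush error)\n",
			       mirrors[i].name);
		}
	return -5;	/* -EIO */
}

int main(void)
{
	struct mirror legs[NR_MIRRORS] = {
		{ "leg0", 0 }, { "leg1", 0 }, { "leg2", 0 }
	};

	printf("flush_all_legs: %d\n", flush_all_legs(legs, NR_MIRRORS));
	return 0;
}
```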
@@ -396,6 +443,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio) | |||
396 | */ | 443 | */ |
397 | static sector_t map_sector(struct mirror *m, struct bio *bio) | 444 | static sector_t map_sector(struct mirror *m, struct bio *bio) |
398 | { | 445 | { |
446 | if (unlikely(!bio->bi_size)) | ||
447 | return 0; | ||
399 | return m->offset + (bio->bi_sector - m->ms->ti->begin); | 448 | return m->offset + (bio->bi_sector - m->ms->ti->begin); |
400 | } | 449 | } |
401 | 450 | ||
@@ -413,6 +462,27 @@ static void map_region(struct dm_io_region *io, struct mirror *m, | |||
413 | io->count = bio->bi_size >> 9; | 462 | io->count = bio->bi_size >> 9; |
414 | } | 463 | } |
415 | 464 | ||
465 | static void hold_bio(struct mirror_set *ms, struct bio *bio) | ||
466 | { | ||
467 | /* | ||
468 | * If the device is suspended, complete the bio. | ||

469 | */ | ||
470 | if (atomic_read(&ms->suspend)) { | ||
471 | if (dm_noflush_suspending(ms->ti)) | ||
472 | bio_endio(bio, DM_ENDIO_REQUEUE); | ||
473 | else | ||
474 | bio_endio(bio, -EIO); | ||
475 | return; | ||
476 | } | ||
477 | |||
478 | /* | ||
479 | * Hold bio until the suspend is complete. | ||
480 | */ | ||
481 | spin_lock_irq(&ms->lock); | ||
482 | bio_list_add(&ms->holds, bio); | ||
483 | spin_unlock_irq(&ms->lock); | ||
484 | } | ||
485 | |||
416 | /*----------------------------------------------------------------- | 486 | /*----------------------------------------------------------------- |
417 | * Reads | 487 | * Reads |
418 | *---------------------------------------------------------------*/ | 488 | *---------------------------------------------------------------*/ |
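hold_bio() above either parks a bio on the holds list or, if the device is already suspending, completes it immediately: requeue when a noflush suspend is in progress, -EIO otherwise. A stand-alone sketch of that decision; the list type and return codes are simplified stand-ins for the real bio_list and DM endio values:

```c
#include <stdio.h>

enum bio_fate { BIO_HELD, BIO_REQUEUED, BIO_FAILED };

struct fake_bio { const char *id; struct fake_bio *next; };

struct hold_list { struct fake_bio *head; };

static enum bio_fate hold_bio_sketch(struct hold_list *holds,
				     struct fake_bio *bio,
				     int suspending, int noflush)
{
	if (suspending) {
		/* Too late to hold it: finish the bio one way or another. */
		return noflush ? BIO_REQUEUED : BIO_FAILED;
	}

	/* Park the bio until the suspend completes and userspace acts. */
	bio->next = holds->head;
	holds->head = bio;
	return BIO_HELD;
}

int main(void)
{
	struct hold_list holds = { 0 };
	struct fake_bio b1 = { "b1", 0 }, b2 = { "b2", 0 }, b3 = { "b3", 0 };

	printf("b1 -> %d\n", hold_bio_sketch(&holds, &b1, 0, 0)); /* held */
	printf("b2 -> %d\n", hold_bio_sketch(&holds, &b2, 1, 1)); /* requeued */
	printf("b3 -> %d\n", hold_bio_sketch(&holds, &b3, 1, 0)); /* failed */
	return 0;
}
```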
@@ -511,7 +581,6 @@ static void write_callback(unsigned long error, void *context) | |||
511 | unsigned i, ret = 0; | 581 | unsigned i, ret = 0; |
512 | struct bio *bio = (struct bio *) context; | 582 | struct bio *bio = (struct bio *) context; |
513 | struct mirror_set *ms; | 583 | struct mirror_set *ms; |
514 | int uptodate = 0; | ||
515 | int should_wake = 0; | 584 | int should_wake = 0; |
516 | unsigned long flags; | 585 | unsigned long flags; |
517 | 586 | ||
@@ -524,36 +593,27 @@ static void write_callback(unsigned long error, void *context) | |||
524 | * This way we handle both writes to SYNC and NOSYNC | 593 | * This way we handle both writes to SYNC and NOSYNC |
525 | * regions with the same code. | 594 | * regions with the same code. |
526 | */ | 595 | */ |
527 | if (likely(!error)) | 596 | if (likely(!error)) { |
528 | goto out; | 597 | bio_endio(bio, ret); |
598 | return; | ||
599 | } | ||
529 | 600 | ||
530 | for (i = 0; i < ms->nr_mirrors; i++) | 601 | for (i = 0; i < ms->nr_mirrors; i++) |
531 | if (test_bit(i, &error)) | 602 | if (test_bit(i, &error)) |
532 | fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); | 603 | fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); |
533 | else | ||
534 | uptodate = 1; | ||
535 | 604 | ||
536 | if (unlikely(!uptodate)) { | 605 | /* |
537 | DMERR("All replicated volumes dead, failing I/O"); | 606 | * Need to raise event. Since raising |
538 | /* None of the writes succeeded, fail the I/O. */ | 607 | * events can block, we need to do it in |
539 | ret = -EIO; | 608 | * the main thread. |
540 | } else if (errors_handled(ms)) { | 609 | */ |
541 | /* | 610 | spin_lock_irqsave(&ms->lock, flags); |
542 | * Need to raise event. Since raising | 611 | if (!ms->failures.head) |
543 | * events can block, we need to do it in | 612 | should_wake = 1; |
544 | * the main thread. | 613 | bio_list_add(&ms->failures, bio); |
545 | */ | 614 | spin_unlock_irqrestore(&ms->lock, flags); |
546 | spin_lock_irqsave(&ms->lock, flags); | 615 | if (should_wake) |
547 | if (!ms->failures.head) | 616 | wakeup_mirrord(ms); |
548 | should_wake = 1; | ||
549 | bio_list_add(&ms->failures, bio); | ||
550 | spin_unlock_irqrestore(&ms->lock, flags); | ||
551 | if (should_wake) | ||
552 | wakeup_mirrord(ms); | ||
553 | return; | ||
554 | } | ||
555 | out: | ||
556 | bio_endio(bio, ret); | ||
557 | } | 617 | } |
558 | 618 | ||
559 | static void do_write(struct mirror_set *ms, struct bio *bio) | 619 | static void do_write(struct mirror_set *ms, struct bio *bio) |
@@ -562,7 +622,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) | |||
562 | struct dm_io_region io[ms->nr_mirrors], *dest = io; | 622 | struct dm_io_region io[ms->nr_mirrors], *dest = io; |
563 | struct mirror *m; | 623 | struct mirror *m; |
564 | struct dm_io_request io_req = { | 624 | struct dm_io_request io_req = { |
565 | .bi_rw = WRITE, | 625 | .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER), |
566 | .mem.type = DM_IO_BVEC, | 626 | .mem.type = DM_IO_BVEC, |
567 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, | 627 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, |
568 | .notify.fn = write_callback, | 628 | .notify.fn = write_callback, |
@@ -603,6 +663,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) | |||
603 | bio_list_init(&requeue); | 663 | bio_list_init(&requeue); |
604 | 664 | ||
605 | while ((bio = bio_list_pop(writes))) { | 665 | while ((bio = bio_list_pop(writes))) { |
666 | if (unlikely(bio_empty_barrier(bio))) { | ||
667 | bio_list_add(&sync, bio); | ||
668 | continue; | ||
669 | } | ||
670 | |||
606 | region = dm_rh_bio_to_region(ms->rh, bio); | 671 | region = dm_rh_bio_to_region(ms->rh, bio); |
607 | 672 | ||
608 | if (log->type->is_remote_recovering && | 673 | if (log->type->is_remote_recovering && |
@@ -672,8 +737,12 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) | |||
672 | dm_rh_delay(ms->rh, bio); | 737 | dm_rh_delay(ms->rh, bio); |
673 | 738 | ||
674 | while ((bio = bio_list_pop(&nosync))) { | 739 | while ((bio = bio_list_pop(&nosync))) { |
675 | map_bio(get_default_mirror(ms), bio); | 740 | if (unlikely(ms->leg_failure) && errors_handled(ms)) |
676 | generic_make_request(bio); | 741 | hold_bio(ms, bio); |
742 | else { | ||
743 | map_bio(get_default_mirror(ms), bio); | ||
744 | generic_make_request(bio); | ||
745 | } | ||
677 | } | 746 | } |
678 | } | 747 | } |
679 | 748 | ||
@@ -681,20 +750,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) | |||
681 | { | 750 | { |
682 | struct bio *bio; | 751 | struct bio *bio; |
683 | 752 | ||
684 | if (!failures->head) | 753 | if (likely(!failures->head)) |
685 | return; | ||
686 | |||
687 | if (!ms->log_failure) { | ||
688 | while ((bio = bio_list_pop(failures))) { | ||
689 | ms->in_sync = 0; | ||
690 | dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); | ||
691 | } | ||
692 | return; | 754 | return; |
693 | } | ||
694 | 755 | ||
695 | /* | 756 | /* |
696 | * If the log has failed, unattempted writes are being | 757 | * If the log has failed, unattempted writes are being |
697 | * put on the failures list. We can't issue those writes | 758 | * put on the holds list. We can't issue those writes |
698 | * until a log has been marked, so we must store them. | 759 | * until a log has been marked, so we must store them. |
699 | * | 760 | * |
700 | * If a 'noflush' suspend is in progress, we can requeue | 761 | * If a 'noflush' suspend is in progress, we can requeue |
@@ -709,23 +770,27 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) | |||
709 | * for us to treat them the same and requeue them | 770 | * for us to treat them the same and requeue them |
710 | * as well. | 771 | * as well. |
711 | */ | 772 | */ |
712 | if (dm_noflush_suspending(ms->ti)) { | 773 | while ((bio = bio_list_pop(failures))) { |
713 | while ((bio = bio_list_pop(failures))) | 774 | if (!ms->log_failure) { |
714 | bio_endio(bio, DM_ENDIO_REQUEUE); | 775 | ms->in_sync = 0; |
715 | return; | 776 | dm_rh_mark_nosync(ms->rh, bio); |
716 | } | 777 | } |
717 | 778 | ||
718 | if (atomic_read(&ms->suspend)) { | 779 | /* |
719 | while ((bio = bio_list_pop(failures))) | 780 | * If all the legs are dead, fail the I/O. |
781 | * If we have been told to handle errors, hold the bio | ||
782 | * and wait for userspace to deal with the problem. | ||
783 | * Otherwise pretend that the I/O succeeded. (This would | ||
784 | * be wrong if the failed leg returned after reboot and | ||
785 | * got replicated back to the good legs.) | ||
786 | */ | ||
787 | if (!get_valid_mirror(ms)) | ||
720 | bio_endio(bio, -EIO); | 788 | bio_endio(bio, -EIO); |
721 | return; | 789 | else if (errors_handled(ms)) |
790 | hold_bio(ms, bio); | ||
791 | else | ||
792 | bio_endio(bio, 0); | ||
722 | } | 793 | } |
723 | |||
724 | spin_lock_irq(&ms->lock); | ||
725 | bio_list_merge(&ms->failures, failures); | ||
726 | spin_unlock_irq(&ms->lock); | ||
727 | |||
728 | delayed_wake(ms); | ||
729 | } | 794 | } |
730 | 795 | ||
731 | static void trigger_event(struct work_struct *work) | 796 | static void trigger_event(struct work_struct *work) |
@@ -784,12 +849,17 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, | |||
784 | } | 849 | } |
785 | 850 | ||
786 | spin_lock_init(&ms->lock); | 851 | spin_lock_init(&ms->lock); |
852 | bio_list_init(&ms->reads); | ||
853 | bio_list_init(&ms->writes); | ||
854 | bio_list_init(&ms->failures); | ||
855 | bio_list_init(&ms->holds); | ||
787 | 856 | ||
788 | ms->ti = ti; | 857 | ms->ti = ti; |
789 | ms->nr_mirrors = nr_mirrors; | 858 | ms->nr_mirrors = nr_mirrors; |
790 | ms->nr_regions = dm_sector_div_up(ti->len, region_size); | 859 | ms->nr_regions = dm_sector_div_up(ti->len, region_size); |
791 | ms->in_sync = 0; | 860 | ms->in_sync = 0; |
792 | ms->log_failure = 0; | 861 | ms->log_failure = 0; |
862 | ms->leg_failure = 0; | ||
793 | atomic_set(&ms->suspend, 0); | 863 | atomic_set(&ms->suspend, 0); |
794 | atomic_set(&ms->default_mirror, DEFAULT_MIRROR); | 864 | atomic_set(&ms->default_mirror, DEFAULT_MIRROR); |
795 | 865 | ||
@@ -889,7 +959,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, | |||
889 | return NULL; | 959 | return NULL; |
890 | } | 960 | } |
891 | 961 | ||
892 | dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2); | 962 | dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count, |
963 | argv + 2); | ||
893 | if (!dl) { | 964 | if (!dl) { |
894 | ti->error = "Error creating mirror dirty log"; | 965 | ti->error = "Error creating mirror dirty log"; |
895 | return NULL; | 966 | return NULL; |
@@ -995,6 +1066,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
995 | 1066 | ||
996 | ti->private = ms; | 1067 | ti->private = ms; |
997 | ti->split_io = dm_rh_get_region_size(ms->rh); | 1068 | ti->split_io = dm_rh_get_region_size(ms->rh); |
1069 | ti->num_flush_requests = 1; | ||
998 | 1070 | ||
999 | ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); | 1071 | ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); |
1000 | if (!ms->kmirrord_wq) { | 1072 | if (!ms->kmirrord_wq) { |
@@ -1122,7 +1194,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, | |||
1122 | * We need to dec pending if this was a write. | 1194 | * We need to dec pending if this was a write. |
1123 | */ | 1195 | */ |
1124 | if (rw == WRITE) { | 1196 | if (rw == WRITE) { |
1125 | dm_rh_dec(ms->rh, map_context->ll); | 1197 | if (likely(!bio_empty_barrier(bio))) |
1198 | dm_rh_dec(ms->rh, map_context->ll); | ||
1126 | return error; | 1199 | return error; |
1127 | } | 1200 | } |
1128 | 1201 | ||
@@ -1180,6 +1253,9 @@ static void mirror_presuspend(struct dm_target *ti) | |||
1180 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1253 | struct mirror_set *ms = (struct mirror_set *) ti->private; |
1181 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); | 1254 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
1182 | 1255 | ||
1256 | struct bio_list holds; | ||
1257 | struct bio *bio; | ||
1258 | |||
1183 | atomic_set(&ms->suspend, 1); | 1259 | atomic_set(&ms->suspend, 1); |
1184 | 1260 | ||
1185 | /* | 1261 | /* |
@@ -1202,6 +1278,22 @@ static void mirror_presuspend(struct dm_target *ti) | |||
1202 | * we know that all of our I/O has been pushed. | 1278 | * we know that all of our I/O has been pushed. |
1203 | */ | 1279 | */ |
1204 | flush_workqueue(ms->kmirrord_wq); | 1280 | flush_workqueue(ms->kmirrord_wq); |
1281 | |||
1282 | /* | ||
1283 | * Now that ms->suspend is set and the workqueue flushed, no more | ||
1284 | * entries can be added to the ms->holds list, so process it. | ||
1285 | * | ||
1286 | * Bios can still arrive concurrently with or after this | ||
1287 | * presuspend function, but they cannot join the hold list | ||
1288 | * because ms->suspend is set. | ||
1289 | */ | ||
1290 | spin_lock_irq(&ms->lock); | ||
1291 | holds = ms->holds; | ||
1292 | bio_list_init(&ms->holds); | ||
1293 | spin_unlock_irq(&ms->lock); | ||
1294 | |||
1295 | while ((bio = bio_list_pop(&holds))) | ||
1296 | hold_bio(ms, bio); | ||
1205 | } | 1297 | } |
1206 | 1298 | ||
1207 | static void mirror_postsuspend(struct dm_target *ti) | 1299 | static void mirror_postsuspend(struct dm_target *ti) |
@@ -1244,7 +1336,8 @@ static char device_status_char(struct mirror *m) | |||
1244 | if (!atomic_read(&(m->error_count))) | 1336 | if (!atomic_read(&(m->error_count))) |
1245 | return 'A'; | 1337 | return 'A'; |
1246 | 1338 | ||
1247 | return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : | 1339 | return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' : |
1340 | (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : | ||
1248 | (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : | 1341 | (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : |
1249 | (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; | 1342 | (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; |
1250 | } | 1343 | } |
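The reworked do_failures() picks one of three outcomes for every failed write: error the bio when no valid leg remains, hold it for userspace when error handling is enabled, or complete it as successful otherwise; the held bios are then drained in mirror_presuspend() once ms->suspend is set and the workqueue has been flushed. A compact sketch of that three-way policy with a stand-in bio type:

```c
#include <stdio.h>

struct fake_bio { const char *id; };

enum outcome { OUT_EIO, OUT_HELD, OUT_OK };

/* Decide what to do with a write that failed on at least one leg. */
static enum outcome handle_failed_write(int any_leg_alive, int errors_handled)
{
	if (!any_leg_alive)
		return OUT_EIO;		/* every leg is dead: fail the I/O */
	if (errors_handled)
		return OUT_HELD;	/* park it until userspace reacts */
	return OUT_OK;			/* pretend the write succeeded */
}

int main(void)
{
	struct fake_bio bio = { "bio0" };
	static const char *names[] = { "-EIO", "held", "success" };

	printf("%s: %s\n", bio.id,
	       names[handle_failed_write(1, 1)]);	/* held */
	printf("%s: %s\n", bio.id,
	       names[handle_failed_write(0, 1)]);	/* -EIO */
	printf("%s: %s\n", bio.id,
	       names[handle_failed_write(1, 0)]);	/* success */
	return 0;
}
```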
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 36dbe29f2fd6..5f19ceb6fe91 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c | |||
@@ -79,6 +79,11 @@ struct dm_region_hash { | |||
79 | struct list_head recovered_regions; | 79 | struct list_head recovered_regions; |
80 | struct list_head failed_recovered_regions; | 80 | struct list_head failed_recovered_regions; |
81 | 81 | ||
82 | /* | ||
83 | * If there was a barrier failure no regions can be marked clean. | ||
84 | */ | ||
85 | int barrier_failure; | ||
86 | |||
82 | void *context; | 87 | void *context; |
83 | sector_t target_begin; | 88 | sector_t target_begin; |
84 | 89 | ||
@@ -211,6 +216,7 @@ struct dm_region_hash *dm_region_hash_create( | |||
211 | INIT_LIST_HEAD(&rh->quiesced_regions); | 216 | INIT_LIST_HEAD(&rh->quiesced_regions); |
212 | INIT_LIST_HEAD(&rh->recovered_regions); | 217 | INIT_LIST_HEAD(&rh->recovered_regions); |
213 | INIT_LIST_HEAD(&rh->failed_recovered_regions); | 218 | INIT_LIST_HEAD(&rh->failed_recovered_regions); |
219 | rh->barrier_failure = 0; | ||
214 | 220 | ||
215 | rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, | 221 | rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, |
216 | sizeof(struct dm_region)); | 222 | sizeof(struct dm_region)); |
@@ -377,8 +383,6 @@ static void complete_resync_work(struct dm_region *reg, int success) | |||
377 | /* dm_rh_mark_nosync | 383 | /* dm_rh_mark_nosync |
378 | * @ms | 384 | * @ms |
379 | * @bio | 385 | * @bio |
380 | * @done | ||
381 | * @error | ||
382 | * | 386 | * |
383 | * The bio was written on some mirror(s) but failed on other mirror(s). | 387 | * The bio was written on some mirror(s) but failed on other mirror(s). |
384 | * We can successfully endio the bio but should avoid the region being | 388 | * We can successfully endio the bio but should avoid the region being |
@@ -386,8 +390,7 @@ static void complete_resync_work(struct dm_region *reg, int success) | |||
386 | * | 390 | * |
387 | * This function is _not_ safe in interrupt context! | 391 | * This function is _not_ safe in interrupt context! |
388 | */ | 392 | */ |
389 | void dm_rh_mark_nosync(struct dm_region_hash *rh, | 393 | void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) |
390 | struct bio *bio, unsigned done, int error) | ||
391 | { | 394 | { |
392 | unsigned long flags; | 395 | unsigned long flags; |
393 | struct dm_dirty_log *log = rh->log; | 396 | struct dm_dirty_log *log = rh->log; |
@@ -395,6 +398,11 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, | |||
395 | region_t region = dm_rh_bio_to_region(rh, bio); | 398 | region_t region = dm_rh_bio_to_region(rh, bio); |
396 | int recovering = 0; | 399 | int recovering = 0; |
397 | 400 | ||
401 | if (bio_empty_barrier(bio)) { | ||
402 | rh->barrier_failure = 1; | ||
403 | return; | ||
404 | } | ||
405 | |||
398 | /* We must inform the log that the sync count has changed. */ | 406 | /* We must inform the log that the sync count has changed. */ |
399 | log->type->set_region_sync(log, region, 0); | 407 | log->type->set_region_sync(log, region, 0); |
400 | 408 | ||
@@ -419,7 +427,6 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, | |||
419 | BUG_ON(!list_empty(®->list)); | 427 | BUG_ON(!list_empty(®->list)); |
420 | spin_unlock_irqrestore(&rh->region_lock, flags); | 428 | spin_unlock_irqrestore(&rh->region_lock, flags); |
421 | 429 | ||
422 | bio_endio(bio, error); | ||
423 | if (recovering) | 430 | if (recovering) |
424 | complete_resync_work(reg, 0); | 431 | complete_resync_work(reg, 0); |
425 | } | 432 | } |
@@ -515,8 +522,11 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) | |||
515 | { | 522 | { |
516 | struct bio *bio; | 523 | struct bio *bio; |
517 | 524 | ||
518 | for (bio = bios->head; bio; bio = bio->bi_next) | 525 | for (bio = bios->head; bio; bio = bio->bi_next) { |
526 | if (bio_empty_barrier(bio)) | ||
527 | continue; | ||
519 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); | 528 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); |
529 | } | ||
520 | } | 530 | } |
521 | EXPORT_SYMBOL_GPL(dm_rh_inc_pending); | 531 | EXPORT_SYMBOL_GPL(dm_rh_inc_pending); |
522 | 532 | ||
@@ -544,7 +554,14 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region) | |||
544 | */ | 554 | */ |
545 | 555 | ||
546 | /* do nothing for DM_RH_NOSYNC */ | 556 | /* do nothing for DM_RH_NOSYNC */ |
547 | if (reg->state == DM_RH_RECOVERING) { | 557 | if (unlikely(rh->barrier_failure)) { |
558 | /* | ||
559 | * If a write barrier failed some time ago, we | ||
560 | * don't know whether or not this write made it | ||
561 | * to the disk, so we must resync the device. | ||
562 | */ | ||
563 | reg->state = DM_RH_NOSYNC; | ||
564 | } else if (reg->state == DM_RH_RECOVERING) { | ||
548 | list_add_tail(®->list, &rh->quiesced_regions); | 565 | list_add_tail(®->list, &rh->quiesced_regions); |
549 | } else if (reg->state == DM_RH_DIRTY) { | 566 | } else if (reg->state == DM_RH_DIRTY) { |
550 | reg->state = DM_RH_CLEAN; | 567 | reg->state = DM_RH_CLEAN; |
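The region-hash hunks record a failed empty barrier in rh->barrier_failure and, from then on, demote every completing write's region to NOSYNC so it will be resynchronised rather than marked clean. A small sketch of that state decision; the states are simplified stand-ins for the real DM_RH_* values and the RECOVERING path is omitted:

```c
#include <stdio.h>

enum rh_state { RH_CLEAN, RH_DIRTY, RH_NOSYNC, RH_RECOVERING };

struct region { enum rh_state state; };

/* Called when the last pending write in a region completes. */
static void region_write_done(struct region *reg, int barrier_failure)
{
	if (barrier_failure) {
		/* A barrier failed earlier: we cannot know whether this
		 * write reached the platter, so force a resync. */
		reg->state = RH_NOSYNC;
	} else if (reg->state == RH_DIRTY) {
		reg->state = RH_CLEAN;
	}
}

int main(void)
{
	struct region a = { RH_DIRTY }, b = { RH_DIRTY };

	region_write_done(&a, 0);
	region_write_done(&b, 1);
	printf("a=%d (clean=%d), b=%d (nosync=%d)\n",
	       a.state, RH_CLEAN, b.state, RH_NOSYNC);
	return 0;
}
```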
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 0c746420c008..7d08879689ac 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c | |||
@@ -55,6 +55,8 @@ | |||
55 | */ | 55 | */ |
56 | #define SNAPSHOT_DISK_VERSION 1 | 56 | #define SNAPSHOT_DISK_VERSION 1 |
57 | 57 | ||
58 | #define NUM_SNAPSHOT_HDR_CHUNKS 1 | ||
59 | |||
58 | struct disk_header { | 60 | struct disk_header { |
59 | uint32_t magic; | 61 | uint32_t magic; |
60 | 62 | ||
@@ -120,7 +122,22 @@ struct pstore { | |||
120 | 122 | ||
121 | /* | 123 | /* |
122 | * The next free chunk for an exception. | 124 | * The next free chunk for an exception. |
125 | * | ||
126 | * When creating exceptions, all the chunks here and above are | ||
127 | * free. It holds the next chunk to be allocated. On rare | ||
128 | * occasions (e.g. after a system crash) holes can be left in | ||
129 | * the exception store because chunks can be committed out of | ||
130 | * order. | ||
131 | * | ||
132 | * When merging exceptions, it does not necessarily mean all the | ||
133 | * chunks here and above are free. It holds the value it would | ||
134 | * have held if all chunks had been committed in order of | ||
135 | * allocation. Consequently the value may occasionally be | ||
136 | * slightly too low, but since it's only used for 'status' and | ||
137 | * it can never reach its minimum value too early this doesn't | ||
138 | * matter. | ||
123 | */ | 139 | */ |
140 | |||
124 | chunk_t next_free; | 141 | chunk_t next_free; |
125 | 142 | ||
126 | /* | 143 | /* |
@@ -214,7 +231,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, | |||
214 | int metadata) | 231 | int metadata) |
215 | { | 232 | { |
216 | struct dm_io_region where = { | 233 | struct dm_io_region where = { |
217 | .bdev = ps->store->cow->bdev, | 234 | .bdev = dm_snap_cow(ps->store->snap)->bdev, |
218 | .sector = ps->store->chunk_size * chunk, | 235 | .sector = ps->store->chunk_size * chunk, |
219 | .count = ps->store->chunk_size, | 236 | .count = ps->store->chunk_size, |
220 | }; | 237 | }; |
@@ -294,7 +311,8 @@ static int read_header(struct pstore *ps, int *new_snapshot) | |||
294 | */ | 311 | */ |
295 | if (!ps->store->chunk_size) { | 312 | if (!ps->store->chunk_size) { |
296 | ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, | 313 | ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, |
297 | bdev_logical_block_size(ps->store->cow->bdev) >> 9); | 314 | bdev_logical_block_size(dm_snap_cow(ps->store->snap)-> |
315 | bdev) >> 9); | ||
298 | ps->store->chunk_mask = ps->store->chunk_size - 1; | 316 | ps->store->chunk_mask = ps->store->chunk_size - 1; |
299 | ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; | 317 | ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; |
300 | chunk_size_supplied = 0; | 318 | chunk_size_supplied = 0; |
@@ -408,6 +426,15 @@ static void write_exception(struct pstore *ps, | |||
408 | e->new_chunk = cpu_to_le64(de->new_chunk); | 426 | e->new_chunk = cpu_to_le64(de->new_chunk); |
409 | } | 427 | } |
410 | 428 | ||
429 | static void clear_exception(struct pstore *ps, uint32_t index) | ||
430 | { | ||
431 | struct disk_exception *e = get_exception(ps, index); | ||
432 | |||
433 | /* clear it */ | ||
434 | e->old_chunk = 0; | ||
435 | e->new_chunk = 0; | ||
436 | } | ||
437 | |||
411 | /* | 438 | /* |
412 | * Registers the exceptions that are present in the current area. | 439 | * Registers the exceptions that are present in the current area. |
413 | * 'full' is filled in to indicate if the area has been | 440 | * 'full' is filled in to indicate if the area has been |
@@ -489,11 +516,23 @@ static struct pstore *get_info(struct dm_exception_store *store) | |||
489 | return (struct pstore *) store->context; | 516 | return (struct pstore *) store->context; |
490 | } | 517 | } |
491 | 518 | ||
492 | static void persistent_fraction_full(struct dm_exception_store *store, | 519 | static void persistent_usage(struct dm_exception_store *store, |
493 | sector_t *numerator, sector_t *denominator) | 520 | sector_t *total_sectors, |
521 | sector_t *sectors_allocated, | ||
522 | sector_t *metadata_sectors) | ||
494 | { | 523 | { |
495 | *numerator = get_info(store)->next_free * store->chunk_size; | 524 | struct pstore *ps = get_info(store); |
496 | *denominator = get_dev_size(store->cow->bdev); | 525 | |
526 | *sectors_allocated = ps->next_free * store->chunk_size; | ||
527 | *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); | ||
528 | |||
529 | /* | ||
530 | * First chunk is the fixed header. | ||
531 | * Then there are (ps->current_area + 1) metadata chunks, each one | ||
532 | * separated from the next by ps->exceptions_per_area data chunks. | ||
533 | */ | ||
534 | *metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * | ||
535 | store->chunk_size; | ||
497 | } | 536 | } |
498 | 537 | ||
499 | static void persistent_dtr(struct dm_exception_store *store) | 538 | static void persistent_dtr(struct dm_exception_store *store) |
@@ -552,44 +591,40 @@ static int persistent_read_metadata(struct dm_exception_store *store, | |||
552 | ps->current_area = 0; | 591 | ps->current_area = 0; |
553 | zero_memory_area(ps); | 592 | zero_memory_area(ps); |
554 | r = zero_disk_area(ps, 0); | 593 | r = zero_disk_area(ps, 0); |
555 | if (r) { | 594 | if (r) |
556 | DMWARN("zero_disk_area(0) failed"); | 595 | DMWARN("zero_disk_area(0) failed"); |
557 | return r; | 596 | return r; |
558 | } | 597 | } |
559 | } else { | 598 | /* |
560 | /* | 599 | * Sanity checks. |
561 | * Sanity checks. | 600 | */ |
562 | */ | 601 | if (ps->version != SNAPSHOT_DISK_VERSION) { |
563 | if (ps->version != SNAPSHOT_DISK_VERSION) { | 602 | DMWARN("unable to handle snapshot disk version %d", |
564 | DMWARN("unable to handle snapshot disk version %d", | 603 | ps->version); |
565 | ps->version); | 604 | return -EINVAL; |
566 | return -EINVAL; | 605 | } |
567 | } | ||
568 | 606 | ||
569 | /* | 607 | /* |
570 | * Metadata are valid, but snapshot is invalidated | 608 | * Metadata are valid, but snapshot is invalidated |
571 | */ | 609 | */ |
572 | if (!ps->valid) | 610 | if (!ps->valid) |
573 | return 1; | 611 | return 1; |
574 | 612 | ||
575 | /* | 613 | /* |
576 | * Read the metadata. | 614 | * Read the metadata. |
577 | */ | 615 | */ |
578 | r = read_exceptions(ps, callback, callback_context); | 616 | r = read_exceptions(ps, callback, callback_context); |
579 | if (r) | ||
580 | return r; | ||
581 | } | ||
582 | 617 | ||
583 | return 0; | 618 | return r; |
584 | } | 619 | } |
585 | 620 | ||
586 | static int persistent_prepare_exception(struct dm_exception_store *store, | 621 | static int persistent_prepare_exception(struct dm_exception_store *store, |
587 | struct dm_snap_exception *e) | 622 | struct dm_exception *e) |
588 | { | 623 | { |
589 | struct pstore *ps = get_info(store); | 624 | struct pstore *ps = get_info(store); |
590 | uint32_t stride; | 625 | uint32_t stride; |
591 | chunk_t next_free; | 626 | chunk_t next_free; |
592 | sector_t size = get_dev_size(store->cow->bdev); | 627 | sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); |
593 | 628 | ||
594 | /* Is there enough room ? */ | 629 | /* Is there enough room ? */ |
595 | if (size < ((ps->next_free + 1) * store->chunk_size)) | 630 | if (size < ((ps->next_free + 1) * store->chunk_size)) |
@@ -611,7 +646,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store, | |||
611 | } | 646 | } |
612 | 647 | ||
613 | static void persistent_commit_exception(struct dm_exception_store *store, | 648 | static void persistent_commit_exception(struct dm_exception_store *store, |
614 | struct dm_snap_exception *e, | 649 | struct dm_exception *e, |
615 | void (*callback) (void *, int success), | 650 | void (*callback) (void *, int success), |
616 | void *callback_context) | 651 | void *callback_context) |
617 | { | 652 | { |
@@ -672,6 +707,85 @@ static void persistent_commit_exception(struct dm_exception_store *store, | |||
672 | ps->callback_count = 0; | 707 | ps->callback_count = 0; |
673 | } | 708 | } |
674 | 709 | ||
710 | static int persistent_prepare_merge(struct dm_exception_store *store, | ||
711 | chunk_t *last_old_chunk, | ||
712 | chunk_t *last_new_chunk) | ||
713 | { | ||
714 | struct pstore *ps = get_info(store); | ||
715 | struct disk_exception de; | ||
716 | int nr_consecutive; | ||
717 | int r; | ||
718 | |||
719 | /* | ||
720 | * When current area is empty, move back to preceding area. | ||
721 | */ | ||
722 | if (!ps->current_committed) { | ||
723 | /* | ||
724 | * Have we finished? | ||
725 | */ | ||
726 | if (!ps->current_area) | ||
727 | return 0; | ||
728 | |||
729 | ps->current_area--; | ||
730 | r = area_io(ps, READ); | ||
731 | if (r < 0) | ||
732 | return r; | ||
733 | ps->current_committed = ps->exceptions_per_area; | ||
734 | } | ||
735 | |||
736 | read_exception(ps, ps->current_committed - 1, &de); | ||
737 | *last_old_chunk = de.old_chunk; | ||
738 | *last_new_chunk = de.new_chunk; | ||
739 | |||
740 | /* | ||
741 | * Find number of consecutive chunks within the current area, | ||
742 | * working backwards. | ||
743 | */ | ||
744 | for (nr_consecutive = 1; nr_consecutive < ps->current_committed; | ||
745 | nr_consecutive++) { | ||
746 | read_exception(ps, ps->current_committed - 1 - nr_consecutive, | ||
747 | &de); | ||
748 | if (de.old_chunk != *last_old_chunk - nr_consecutive || | ||
749 | de.new_chunk != *last_new_chunk - nr_consecutive) | ||
750 | break; | ||
751 | } | ||
752 | |||
753 | return nr_consecutive; | ||
754 | } | ||
755 | |||
756 | static int persistent_commit_merge(struct dm_exception_store *store, | ||
757 | int nr_merged) | ||
758 | { | ||
759 | int r, i; | ||
760 | struct pstore *ps = get_info(store); | ||
761 | |||
762 | BUG_ON(nr_merged > ps->current_committed); | ||
763 | |||
764 | for (i = 0; i < nr_merged; i++) | ||
765 | clear_exception(ps, ps->current_committed - 1 - i); | ||
766 | |||
767 | r = area_io(ps, WRITE); | ||
768 | if (r < 0) | ||
769 | return r; | ||
770 | |||
771 | ps->current_committed -= nr_merged; | ||
772 | |||
773 | /* | ||
774 | * At this stage, only persistent_usage() uses ps->next_free, so | ||
775 | * we make no attempt to keep ps->next_free strictly accurate | ||
776 | * as exceptions may have been committed out-of-order originally. | ||
777 | * Once a snapshot has become merging, we set it to the value it | ||
778 | * would have held had all the exceptions been committed in order. | ||
779 | * | ||
780 | * ps->current_area does not get reduced by prepare_merge() until | ||
781 | * after commit_merge() has removed the nr_merged previous exceptions. | ||
782 | */ | ||
783 | ps->next_free = (area_location(ps, ps->current_area) - 1) + | ||
784 | (ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS; | ||
785 | |||
786 | return 0; | ||
787 | } | ||
788 | |||
675 | static void persistent_drop_snapshot(struct dm_exception_store *store) | 789 | static void persistent_drop_snapshot(struct dm_exception_store *store) |
676 | { | 790 | { |
677 | struct pstore *ps = get_info(store); | 791 | struct pstore *ps = get_info(store); |
@@ -697,7 +811,7 @@ static int persistent_ctr(struct dm_exception_store *store, | |||
697 | ps->area = NULL; | 811 | ps->area = NULL; |
698 | ps->zero_area = NULL; | 812 | ps->zero_area = NULL; |
699 | ps->header_area = NULL; | 813 | ps->header_area = NULL; |
700 | ps->next_free = 2; /* skipping the header and first area */ | 814 | ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */ |
701 | ps->current_committed = 0; | 815 | ps->current_committed = 0; |
702 | 816 | ||
703 | ps->callback_count = 0; | 817 | ps->callback_count = 0; |
@@ -726,8 +840,7 @@ static unsigned persistent_status(struct dm_exception_store *store, | |||
726 | case STATUSTYPE_INFO: | 840 | case STATUSTYPE_INFO: |
727 | break; | 841 | break; |
728 | case STATUSTYPE_TABLE: | 842 | case STATUSTYPE_TABLE: |
729 | DMEMIT(" %s P %llu", store->cow->name, | 843 | DMEMIT(" P %llu", (unsigned long long)store->chunk_size); |
730 | (unsigned long long)store->chunk_size); | ||
731 | } | 844 | } |
732 | 845 | ||
733 | return sz; | 846 | return sz; |
@@ -741,8 +854,10 @@ static struct dm_exception_store_type _persistent_type = { | |||
741 | .read_metadata = persistent_read_metadata, | 854 | .read_metadata = persistent_read_metadata, |
742 | .prepare_exception = persistent_prepare_exception, | 855 | .prepare_exception = persistent_prepare_exception, |
743 | .commit_exception = persistent_commit_exception, | 856 | .commit_exception = persistent_commit_exception, |
857 | .prepare_merge = persistent_prepare_merge, | ||
858 | .commit_merge = persistent_commit_merge, | ||
744 | .drop_snapshot = persistent_drop_snapshot, | 859 | .drop_snapshot = persistent_drop_snapshot, |
745 | .fraction_full = persistent_fraction_full, | 860 | .usage = persistent_usage, |
746 | .status = persistent_status, | 861 | .status = persistent_status, |
747 | }; | 862 | }; |
748 | 863 | ||
@@ -754,8 +869,10 @@ static struct dm_exception_store_type _persistent_compat_type = { | |||
754 | .read_metadata = persistent_read_metadata, | 869 | .read_metadata = persistent_read_metadata, |
755 | .prepare_exception = persistent_prepare_exception, | 870 | .prepare_exception = persistent_prepare_exception, |
756 | .commit_exception = persistent_commit_exception, | 871 | .commit_exception = persistent_commit_exception, |
872 | .prepare_merge = persistent_prepare_merge, | ||
873 | .commit_merge = persistent_commit_merge, | ||
757 | .drop_snapshot = persistent_drop_snapshot, | 874 | .drop_snapshot = persistent_drop_snapshot, |
758 | .fraction_full = persistent_fraction_full, | 875 | .usage = persistent_usage, |
759 | .status = persistent_status, | 876 | .status = persistent_status, |
760 | }; | 877 | }; |
761 | 878 | ||
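persistent_prepare_merge() above walks the committed exceptions backwards and counts how many form a run of consecutive (old_chunk, new_chunk) pairs, so the merge code can process a whole run at once before commit_merge() clears those entries. A self-contained sketch of that backward scan over an in-memory array; the exception layout is a simplified stand-in for the on-disk area:

```c
#include <stdio.h>

struct exception { unsigned long long old_chunk, new_chunk; };

/* Starting from the most recently committed exception, count how many
 * immediately preceding entries continue the same consecutive run. */
static int count_consecutive(const struct exception *e, int committed,
			     unsigned long long *last_old,
			     unsigned long long *last_new)
{
	int n;

	if (!committed)
		return 0;	/* current area exhausted */

	*last_old = e[committed - 1].old_chunk;
	*last_new = e[committed - 1].new_chunk;

	for (n = 1; n < committed; n++) {
		const struct exception *p = &e[committed - 1 - n];

		if (p->old_chunk != *last_old - n ||
		    p->new_chunk != *last_new - n)
			break;
	}
	return n;
}

int main(void)
{
	/* Chunks 10..13 were committed consecutively; 20 breaks the run. */
	struct exception area[] = {
		{ 20, 100 }, { 10, 5 }, { 11, 6 }, { 12, 7 }, { 13, 8 },
	};
	unsigned long long old, new;
	int n = count_consecutive(area, 5, &old, &new);

	printf("merge %d exceptions ending at old=%llu new=%llu\n",
	       n, old, new);
	return 0;
}
```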
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c index cde5aa558e6d..a0898a66a2f8 100644 --- a/drivers/md/dm-snap-transient.c +++ b/drivers/md/dm-snap-transient.c | |||
@@ -36,10 +36,10 @@ static int transient_read_metadata(struct dm_exception_store *store, | |||
36 | } | 36 | } |
37 | 37 | ||
38 | static int transient_prepare_exception(struct dm_exception_store *store, | 38 | static int transient_prepare_exception(struct dm_exception_store *store, |
39 | struct dm_snap_exception *e) | 39 | struct dm_exception *e) |
40 | { | 40 | { |
41 | struct transient_c *tc = store->context; | 41 | struct transient_c *tc = store->context; |
42 | sector_t size = get_dev_size(store->cow->bdev); | 42 | sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); |
43 | 43 | ||
44 | if (size < (tc->next_free + store->chunk_size)) | 44 | if (size < (tc->next_free + store->chunk_size)) |
45 | return -1; | 45 | return -1; |
@@ -51,7 +51,7 @@ static int transient_prepare_exception(struct dm_exception_store *store, | |||
51 | } | 51 | } |
52 | 52 | ||
53 | static void transient_commit_exception(struct dm_exception_store *store, | 53 | static void transient_commit_exception(struct dm_exception_store *store, |
54 | struct dm_snap_exception *e, | 54 | struct dm_exception *e, |
55 | void (*callback) (void *, int success), | 55 | void (*callback) (void *, int success), |
56 | void *callback_context) | 56 | void *callback_context) |
57 | { | 57 | { |
@@ -59,11 +59,14 @@ static void transient_commit_exception(struct dm_exception_store *store, | |||
59 | callback(callback_context, 1); | 59 | callback(callback_context, 1); |
60 | } | 60 | } |
61 | 61 | ||
62 | static void transient_fraction_full(struct dm_exception_store *store, | 62 | static void transient_usage(struct dm_exception_store *store, |
63 | sector_t *numerator, sector_t *denominator) | 63 | sector_t *total_sectors, |
64 | sector_t *sectors_allocated, | ||
65 | sector_t *metadata_sectors) | ||
64 | { | 66 | { |
65 | *numerator = ((struct transient_c *) store->context)->next_free; | 67 | *sectors_allocated = ((struct transient_c *) store->context)->next_free; |
66 | *denominator = get_dev_size(store->cow->bdev); | 68 | *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); |
69 | *metadata_sectors = 0; | ||
67 | } | 70 | } |
68 | 71 | ||
69 | static int transient_ctr(struct dm_exception_store *store, | 72 | static int transient_ctr(struct dm_exception_store *store, |
@@ -91,8 +94,7 @@ static unsigned transient_status(struct dm_exception_store *store, | |||
91 | case STATUSTYPE_INFO: | 94 | case STATUSTYPE_INFO: |
92 | break; | 95 | break; |
93 | case STATUSTYPE_TABLE: | 96 | case STATUSTYPE_TABLE: |
94 | DMEMIT(" %s N %llu", store->cow->name, | 97 | DMEMIT(" N %llu", (unsigned long long)store->chunk_size); |
95 | (unsigned long long)store->chunk_size); | ||
96 | } | 98 | } |
97 | 99 | ||
98 | return sz; | 100 | return sz; |
@@ -106,7 +108,7 @@ static struct dm_exception_store_type _transient_type = { | |||
106 | .read_metadata = transient_read_metadata, | 108 | .read_metadata = transient_read_metadata, |
107 | .prepare_exception = transient_prepare_exception, | 109 | .prepare_exception = transient_prepare_exception, |
108 | .commit_exception = transient_commit_exception, | 110 | .commit_exception = transient_commit_exception, |
109 | .fraction_full = transient_fraction_full, | 111 | .usage = transient_usage, |
110 | .status = transient_status, | 112 | .status = transient_status, |
111 | }; | 113 | }; |
112 | 114 | ||
@@ -118,7 +120,7 @@ static struct dm_exception_store_type _transient_compat_type = { | |||
118 | .read_metadata = transient_read_metadata, | 120 | .read_metadata = transient_read_metadata, |
119 | .prepare_exception = transient_prepare_exception, | 121 | .prepare_exception = transient_prepare_exception, |
120 | .commit_exception = transient_commit_exception, | 122 | .commit_exception = transient_commit_exception, |
121 | .fraction_full = transient_fraction_full, | 123 | .usage = transient_usage, |
122 | .status = transient_status, | 124 | .status = transient_status, |
123 | }; | 125 | }; |
124 | 126 | ||
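Both exception stores now report usage through a three-value callback (total_sectors, sectors_allocated, metadata_sectors) instead of the old fraction_full numerator/denominator pair. The sketch below shows how a caller could turn that triple into the familiar allocated/total report; the demo_store type, the sample numbers and the output format are invented for illustration and are not taken from snapshot_status(), which is outside these hunks.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;

    /* Stand-in for the store context; transient_usage() reads next_free and
     * the COW device size, and reports no metadata overhead. */
    struct demo_store {
        sector_t dev_size;   /* size of the COW device, in sectors */
        sector_t next_free;  /* first unused sector                */
    };

    /* Same shape as the new ->usage callback (cf. transient_usage above). */
    static void demo_usage(struct demo_store *s, sector_t *total_sectors,
                           sector_t *sectors_allocated, sector_t *metadata_sectors)
    {
        *sectors_allocated = s->next_free;
        *total_sectors = s->dev_size;
        *metadata_sectors = 0;
    }

    int main(void)
    {
        struct demo_store s = { .dev_size = 2097152, .next_free = 524288 };
        sector_t total, allocated, metadata;

        demo_usage(&s, &total, &allocated, &metadata);

        /* Illustrative report: a quarter of the COW space used, none of it
         * metadata for the transient store. */
        printf("%llu/%llu sectors allocated, %llu metadata\n",
               (unsigned long long)allocated,
               (unsigned long long)total,
               (unsigned long long)metadata);
        return 0;
    }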
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 3a3ba46e6d4b..ee8eb283650d 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
@@ -25,6 +25,11 @@ | |||
25 | 25 | ||
26 | #define DM_MSG_PREFIX "snapshots" | 26 | #define DM_MSG_PREFIX "snapshots" |
27 | 27 | ||
28 | static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; | ||
29 | |||
30 | #define dm_target_is_snapshot_merge(ti) \ | ||
31 | ((ti)->type->name == dm_snapshot_merge_target_name) | ||
32 | |||
28 | /* | 33 | /* |
29 | * The percentage increment we will wake up users at | 34 | * The percentage increment we will wake up users at |
30 | */ | 35 | */ |
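dm_target_is_snapshot_merge() compares the name pointer, not the string contents. That is only safe if the snapshot-merge target_type is registered with its .name pointing at this same dm_snapshot_merge_target_name array; the registration is further down the file, outside this hunk, so that is an assumption here. A small stand-alone sketch of why the distinction matters, with the structs trimmed to the one field used:

    #include <stdio.h>

    static const char dm_snapshot_merge_target_name[] = "snapshot-merge";

    struct target_type { const char *name; };
    struct dm_target { struct target_type *type; };

    /* Same test as the macro at new lines 30-31: pointer equality. */
    #define dm_target_is_snapshot_merge(ti) \
            ((ti)->type->name == dm_snapshot_merge_target_name)

    int main(void)
    {
        struct target_type merge = { .name = dm_snapshot_merge_target_name };
        struct target_type copy  = { .name = "snapshot-merge" };
        struct dm_target t1 = { .type = &merge }, t2 = { .type = &copy };

        /* Prints "1 0": identical text behind a different pointer is not
         * treated as the merge target, which is why the driver keeps one
         * named array and compares addresses instead of calling strcmp(). */
        printf("%d %d\n", dm_target_is_snapshot_merge(&t1),
                          dm_target_is_snapshot_merge(&t2));
        return 0;
    }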
@@ -49,7 +54,7 @@ | |||
49 | #define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ | 54 | #define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ |
50 | (DM_TRACKED_CHUNK_HASH_SIZE - 1)) | 55 | (DM_TRACKED_CHUNK_HASH_SIZE - 1)) |
51 | 56 | ||
52 | struct exception_table { | 57 | struct dm_exception_table { |
53 | uint32_t hash_mask; | 58 | uint32_t hash_mask; |
54 | unsigned hash_shift; | 59 | unsigned hash_shift; |
55 | struct list_head *table; | 60 | struct list_head *table; |
@@ -59,22 +64,31 @@ struct dm_snapshot { | |||
59 | struct rw_semaphore lock; | 64 | struct rw_semaphore lock; |
60 | 65 | ||
61 | struct dm_dev *origin; | 66 | struct dm_dev *origin; |
67 | struct dm_dev *cow; | ||
68 | |||
69 | struct dm_target *ti; | ||
62 | 70 | ||
63 | /* List of snapshots per Origin */ | 71 | /* List of snapshots per Origin */ |
64 | struct list_head list; | 72 | struct list_head list; |
65 | 73 | ||
66 | /* You can't use a snapshot if this is 0 (e.g. if full) */ | 74 | /* |
75 | * You can't use a snapshot if this is 0 (e.g. if full). | ||
76 | * A snapshot-merge target never clears this. | ||
77 | */ | ||
67 | int valid; | 78 | int valid; |
68 | 79 | ||
69 | /* Origin writes don't trigger exceptions until this is set */ | 80 | /* Origin writes don't trigger exceptions until this is set */ |
70 | int active; | 81 | int active; |
71 | 82 | ||
83 | /* Whether or not owning mapped_device is suspended */ | ||
84 | int suspended; | ||
85 | |||
72 | mempool_t *pending_pool; | 86 | mempool_t *pending_pool; |
73 | 87 | ||
74 | atomic_t pending_exceptions_count; | 88 | atomic_t pending_exceptions_count; |
75 | 89 | ||
76 | struct exception_table pending; | 90 | struct dm_exception_table pending; |
77 | struct exception_table complete; | 91 | struct dm_exception_table complete; |
78 | 92 | ||
79 | /* | 93 | /* |
80 | * pe_lock protects all pending_exception operations and access | 94 | * pe_lock protects all pending_exception operations and access |
@@ -95,8 +109,51 @@ struct dm_snapshot { | |||
95 | mempool_t *tracked_chunk_pool; | 109 | mempool_t *tracked_chunk_pool; |
96 | spinlock_t tracked_chunk_lock; | 110 | spinlock_t tracked_chunk_lock; |
97 | struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; | 111 | struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; |
112 | |||
113 | /* | ||
114 | * The merge operation failed if this flag is set. | ||
115 | * Failure modes are handled as follows: | ||
116 | * - I/O error reading the header | ||
117 | * => don't load the target; abort. | ||
118 | * - Header does not have "valid" flag set | ||
119 | * => use the origin; forget about the snapshot. | ||
120 | * - I/O error when reading exceptions | ||
121 | * => don't load the target; abort. | ||
122 | * (We can't use the intermediate origin state.) | ||
123 | * - I/O error while merging | ||
124 | * => stop merging; set merge_failed; process I/O normally. | ||
125 | */ | ||
126 | int merge_failed; | ||
127 | |||
128 | /* Wait for events based on state_bits */ | ||
129 | unsigned long state_bits; | ||
130 | |||
131 | /* Range of chunks currently being merged. */ | ||
132 | chunk_t first_merging_chunk; | ||
133 | int num_merging_chunks; | ||
134 | |||
135 | /* | ||
136 | * Incoming bios that overlap with chunks being merged must wait | ||
137 | * for them to be committed. | ||
138 | */ | ||
139 | struct bio_list bios_queued_during_merge; | ||
98 | }; | 140 | }; |
99 | 141 | ||
142 | /* | ||
143 | * state_bits: | ||
144 | * RUNNING_MERGE - Merge operation is in progress. | ||
145 | * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped; | ||
146 | * cleared afterwards. | ||
147 | */ | ||
148 | #define RUNNING_MERGE 0 | ||
149 | #define SHUTDOWN_MERGE 1 | ||
150 | |||
151 | struct dm_dev *dm_snap_cow(struct dm_snapshot *s) | ||
152 | { | ||
153 | return s->cow; | ||
154 | } | ||
155 | EXPORT_SYMBOL(dm_snap_cow); | ||
156 | |||
100 | static struct workqueue_struct *ksnapd; | 157 | static struct workqueue_struct *ksnapd; |
101 | static void flush_queued_bios(struct work_struct *work); | 158 | static void flush_queued_bios(struct work_struct *work); |
102 | 159 | ||
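first_merging_chunk and num_merging_chunks describe the window of chunks currently being copied back to the origin, and the comment above bios_queued_during_merge says bios landing in that window must wait until the merge of those chunks commits. The map-path test that enforces this is not part of this hunk, so the helper below is only a guess at its shape, written as a stand-alone program.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t chunk_t;

    /* Subset of struct dm_snapshot relevant to the merging window. */
    struct merge_window {
        chunk_t first_merging_chunk;
        int num_merging_chunks;
    };

    /* Would a bio touching 'chunk' conflict with the range currently being
     * copied back to the origin?  Sketch of the check presumably done before
     * queueing a bio on bios_queued_during_merge. */
    static bool chunk_is_being_merged(const struct merge_window *w, chunk_t chunk)
    {
        return w->num_merging_chunks &&
               chunk >= w->first_merging_chunk &&
               chunk < w->first_merging_chunk + w->num_merging_chunks;
    }

    int main(void)
    {
        struct merge_window w = { .first_merging_chunk = 100,
                                  .num_merging_chunks = 8 };

        printf("%d %d %d\n",
               chunk_is_being_merged(&w, 99),   /* 0: below the window  */
               chunk_is_being_merged(&w, 103),  /* 1: inside, must wait */
               chunk_is_being_merged(&w, 108)); /* 0: just past the end */
        return 0;
    }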
@@ -116,7 +173,7 @@ static int bdev_equal(struct block_device *lhs, struct block_device *rhs) | |||
116 | } | 173 | } |
117 | 174 | ||
118 | struct dm_snap_pending_exception { | 175 | struct dm_snap_pending_exception { |
119 | struct dm_snap_exception e; | 176 | struct dm_exception e; |
120 | 177 | ||
121 | /* | 178 | /* |
122 | * Origin buffers waiting for this to complete are held | 179 | * Origin buffers waiting for this to complete are held |
@@ -125,28 +182,6 @@ struct dm_snap_pending_exception { | |||
125 | struct bio_list origin_bios; | 182 | struct bio_list origin_bios; |
126 | struct bio_list snapshot_bios; | 183 | struct bio_list snapshot_bios; |
127 | 184 | ||
128 | /* | ||
129 | * Short-term queue of pending exceptions prior to submission. | ||
130 | */ | ||
131 | struct list_head list; | ||
132 | |||
133 | /* | ||
134 | * The primary pending_exception is the one that holds | ||
135 | * the ref_count and the list of origin_bios for a | ||
136 | * group of pending_exceptions. It is always last to get freed. | ||
137 | * These fields get set up when writing to the origin. | ||
138 | */ | ||
139 | struct dm_snap_pending_exception *primary_pe; | ||
140 | |||
141 | /* | ||
142 | * Number of pending_exceptions processing this chunk. | ||
143 | * When this drops to zero we must complete the origin bios. | ||
144 | * If incrementing or decrementing this, hold pe->snap->lock for | ||
145 | * the sibling concerned and not pe->primary_pe->snap->lock unless | ||
146 | * they are the same. | ||
147 | */ | ||
148 | atomic_t ref_count; | ||
149 | |||
150 | /* Pointer back to snapshot context */ | 185 | /* Pointer back to snapshot context */ |
151 | struct dm_snapshot *snap; | 186 | struct dm_snapshot *snap; |
152 | 187 | ||
@@ -222,6 +257,16 @@ static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) | |||
222 | } | 257 | } |
223 | 258 | ||
224 | /* | 259 | /* |
260 | * This conflicting I/O is extremely improbable in the caller, | ||
261 | * so msleep(1) is sufficient and there is no need for a wait queue. | ||
262 | */ | ||
263 | static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk) | ||
264 | { | ||
265 | while (__chunk_is_tracked(s, chunk)) | ||
266 | msleep(1); | ||
267 | } | ||
268 | |||
269 | /* | ||
225 | * One of these per registered origin, held in the snapshot_origins hash | 270 | * One of these per registered origin, held in the snapshot_origins hash |
226 | */ | 271 | */ |
227 | struct origin { | 272 | struct origin { |
@@ -243,6 +288,10 @@ struct origin { | |||
243 | static struct list_head *_origins; | 288 | static struct list_head *_origins; |
244 | static struct rw_semaphore _origins_lock; | 289 | static struct rw_semaphore _origins_lock; |
245 | 290 | ||
291 | static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done); | ||
292 | static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock); | ||
293 | static uint64_t _pending_exceptions_done_count; | ||
294 | |||
246 | static int init_origin_hash(void) | 295 | static int init_origin_hash(void) |
247 | { | 296 | { |
248 | int i; | 297 | int i; |
@@ -291,22 +340,144 @@ static void __insert_origin(struct origin *o) | |||
291 | } | 340 | } |
292 | 341 | ||
293 | /* | 342 | /* |
343 | * _origins_lock must be held when calling this function. | ||
344 | * Returns number of snapshots registered using the supplied cow device, plus: | ||
345 | * snap_src - a snapshot suitable for use as a source of exception handover | ||
346 | * snap_dest - a snapshot capable of receiving exception handover. | ||
347 | * snap_merge - an existing snapshot-merge target linked to the same origin. | ||
348 | * There can be at most one snapshot-merge target. The parameter is optional. | ||
349 | * | ||
350 | * Possible return values and states of snap_src and snap_dest. | ||
351 | * 0: NULL, NULL - first new snapshot | ||
352 | * 1: snap_src, NULL - normal snapshot | ||
353 | * 2: snap_src, snap_dest - waiting for handover | ||
354 | * 2: snap_src, NULL - handed over, waiting for old to be deleted | ||
355 | * 1: NULL, snap_dest - source got destroyed without handover | ||
356 | */ | ||
357 | static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, | ||
358 | struct dm_snapshot **snap_src, | ||
359 | struct dm_snapshot **snap_dest, | ||
360 | struct dm_snapshot **snap_merge) | ||
361 | { | ||
362 | struct dm_snapshot *s; | ||
363 | struct origin *o; | ||
364 | int count = 0; | ||
365 | int active; | ||
366 | |||
367 | o = __lookup_origin(snap->origin->bdev); | ||
368 | if (!o) | ||
369 | goto out; | ||
370 | |||
371 | list_for_each_entry(s, &o->snapshots, list) { | ||
372 | if (dm_target_is_snapshot_merge(s->ti) && snap_merge) | ||
373 | *snap_merge = s; | ||
374 | if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) | ||
375 | continue; | ||
376 | |||
377 | down_read(&s->lock); | ||
378 | active = s->active; | ||
379 | up_read(&s->lock); | ||
380 | |||
381 | if (active) { | ||
382 | if (snap_src) | ||
383 | *snap_src = s; | ||
384 | } else if (snap_dest) | ||
385 | *snap_dest = s; | ||
386 | |||
387 | count++; | ||
388 | } | ||
389 | |||
390 | out: | ||
391 | return count; | ||
392 | } | ||
393 | |||
394 | /* | ||
395 | * On success, returns 1 if this snapshot is a handover destination, | ||
396 | * otherwise returns 0. | ||
397 | */ | ||
398 | static int __validate_exception_handover(struct dm_snapshot *snap) | ||
399 | { | ||
400 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; | ||
401 | struct dm_snapshot *snap_merge = NULL; | ||
402 | |||
403 | /* Does snapshot need exceptions handed over to it? */ | ||
404 | if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, | ||
405 | &snap_merge) == 2) || | ||
406 | snap_dest) { | ||
407 | snap->ti->error = "Snapshot cow pairing for exception " | ||
408 | "table handover failed"; | ||
409 | return -EINVAL; | ||
410 | } | ||
411 | |||
412 | /* | ||
413 | * If no snap_src was found, snap cannot become a handover | ||
414 | * destination. | ||
415 | */ | ||
416 | if (!snap_src) | ||
417 | return 0; | ||
418 | |||
419 | /* | ||
420 | * Non-snapshot-merge handover? | ||
421 | */ | ||
422 | if (!dm_target_is_snapshot_merge(snap->ti)) | ||
423 | return 1; | ||
424 | |||
425 | /* | ||
426 | * Do not allow more than one merging snapshot. | ||
427 | */ | ||
428 | if (snap_merge) { | ||
429 | snap->ti->error = "A snapshot is already merging."; | ||
430 | return -EINVAL; | ||
431 | } | ||
432 | |||
433 | if (!snap_src->store->type->prepare_merge || | ||
434 | !snap_src->store->type->commit_merge) { | ||
435 | snap->ti->error = "Snapshot exception store does not " | ||
436 | "support snapshot-merge."; | ||
437 | return -EINVAL; | ||
438 | } | ||
439 | |||
440 | return 1; | ||
441 | } | ||
442 | |||
443 | static void __insert_snapshot(struct origin *o, struct dm_snapshot *s) | ||
444 | { | ||
445 | struct dm_snapshot *l; | ||
446 | |||
447 | /* Sort the list according to chunk size, largest-first smallest-last */ | ||
448 | list_for_each_entry(l, &o->snapshots, list) | ||
449 | if (l->store->chunk_size < s->store->chunk_size) | ||
450 | break; | ||
451 | list_add_tail(&s->list, &l->list); | ||
452 | } | ||
453 | |||
454 | /* | ||
294 | * Make a note of the snapshot and its origin so we can look it | 455 | * Make a note of the snapshot and its origin so we can look it |
295 | * up when the origin has a write on it. | 456 | * up when the origin has a write on it. |
457 | * | ||
458 | * Also validate snapshot exception store handovers. | ||
459 | * On success, returns 1 if this registration is a handover destination, | ||
460 | * otherwise returns 0. | ||
296 | */ | 461 | */ |
297 | static int register_snapshot(struct dm_snapshot *snap) | 462 | static int register_snapshot(struct dm_snapshot *snap) |
298 | { | 463 | { |
299 | struct dm_snapshot *l; | 464 | struct origin *o, *new_o = NULL; |
300 | struct origin *o, *new_o; | ||
301 | struct block_device *bdev = snap->origin->bdev; | 465 | struct block_device *bdev = snap->origin->bdev; |
466 | int r = 0; | ||
302 | 467 | ||
303 | new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); | 468 | new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); |
304 | if (!new_o) | 469 | if (!new_o) |
305 | return -ENOMEM; | 470 | return -ENOMEM; |
306 | 471 | ||
307 | down_write(&_origins_lock); | 472 | down_write(&_origins_lock); |
308 | o = __lookup_origin(bdev); | ||
309 | 473 | ||
474 | r = __validate_exception_handover(snap); | ||
475 | if (r < 0) { | ||
476 | kfree(new_o); | ||
477 | goto out; | ||
478 | } | ||
479 | |||
480 | o = __lookup_origin(bdev); | ||
310 | if (o) | 481 | if (o) |
311 | kfree(new_o); | 482 | kfree(new_o); |
312 | else { | 483 | else { |
@@ -320,14 +491,27 @@ static int register_snapshot(struct dm_snapshot *snap) | |||
320 | __insert_origin(o); | 491 | __insert_origin(o); |
321 | } | 492 | } |
322 | 493 | ||
323 | /* Sort the list according to chunk size, largest-first smallest-last */ | 494 | __insert_snapshot(o, snap); |
324 | list_for_each_entry(l, &o->snapshots, list) | 495 | |
325 | if (l->store->chunk_size < snap->store->chunk_size) | 496 | out: |
326 | break; | 497 | up_write(&_origins_lock); |
327 | list_add_tail(&snap->list, &l->list); | 498 | |
499 | return r; | ||
500 | } | ||
501 | |||
502 | /* | ||
503 | * Move snapshot to correct place in list according to chunk size. | ||
504 | */ | ||
505 | static void reregister_snapshot(struct dm_snapshot *s) | ||
506 | { | ||
507 | struct block_device *bdev = s->origin->bdev; | ||
508 | |||
509 | down_write(&_origins_lock); | ||
510 | |||
511 | list_del(&s->list); | ||
512 | __insert_snapshot(__lookup_origin(bdev), s); | ||
328 | 513 | ||
329 | up_write(&_origins_lock); | 514 | up_write(&_origins_lock); |
330 | return 0; | ||
331 | } | 515 | } |
332 | 516 | ||
333 | static void unregister_snapshot(struct dm_snapshot *s) | 517 | static void unregister_snapshot(struct dm_snapshot *s) |
@@ -338,7 +522,7 @@ static void unregister_snapshot(struct dm_snapshot *s) | |||
338 | o = __lookup_origin(s->origin->bdev); | 522 | o = __lookup_origin(s->origin->bdev); |
339 | 523 | ||
340 | list_del(&s->list); | 524 | list_del(&s->list); |
341 | if (list_empty(&o->snapshots)) { | 525 | if (o && list_empty(&o->snapshots)) { |
342 | list_del(&o->hash_list); | 526 | list_del(&o->hash_list); |
343 | kfree(o); | 527 | kfree(o); |
344 | } | 528 | } |
@@ -351,8 +535,8 @@ static void unregister_snapshot(struct dm_snapshot *s) | |||
351 | * The lowest hash_shift bits of the chunk number are ignored, allowing | 535 | * The lowest hash_shift bits of the chunk number are ignored, allowing |
352 | * some consecutive chunks to be grouped together. | 536 | * some consecutive chunks to be grouped together. |
353 | */ | 537 | */ |
354 | static int init_exception_table(struct exception_table *et, uint32_t size, | 538 | static int dm_exception_table_init(struct dm_exception_table *et, |
355 | unsigned hash_shift) | 539 | uint32_t size, unsigned hash_shift) |
356 | { | 540 | { |
357 | unsigned int i; | 541 | unsigned int i; |
358 | 542 | ||
@@ -368,10 +552,11 @@ static int init_exception_table(struct exception_table *et, uint32_t size, | |||
368 | return 0; | 552 | return 0; |
369 | } | 553 | } |
370 | 554 | ||
371 | static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem) | 555 | static void dm_exception_table_exit(struct dm_exception_table *et, |
556 | struct kmem_cache *mem) | ||
372 | { | 557 | { |
373 | struct list_head *slot; | 558 | struct list_head *slot; |
374 | struct dm_snap_exception *ex, *next; | 559 | struct dm_exception *ex, *next; |
375 | int i, size; | 560 | int i, size; |
376 | 561 | ||
377 | size = et->hash_mask + 1; | 562 | size = et->hash_mask + 1; |
@@ -385,19 +570,12 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache * | |||
385 | vfree(et->table); | 570 | vfree(et->table); |
386 | } | 571 | } |
387 | 572 | ||
388 | static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) | 573 | static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) |
389 | { | 574 | { |
390 | return (chunk >> et->hash_shift) & et->hash_mask; | 575 | return (chunk >> et->hash_shift) & et->hash_mask; |
391 | } | 576 | } |
392 | 577 | ||
393 | static void insert_exception(struct exception_table *eh, | 578 | static void dm_remove_exception(struct dm_exception *e) |
394 | struct dm_snap_exception *e) | ||
395 | { | ||
396 | struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; | ||
397 | list_add(&e->hash_list, l); | ||
398 | } | ||
399 | |||
400 | static void remove_exception(struct dm_snap_exception *e) | ||
401 | { | 579 | { |
402 | list_del(&e->hash_list); | 580 | list_del(&e->hash_list); |
403 | } | 581 | } |
@@ -406,11 +584,11 @@ static void remove_exception(struct dm_snap_exception *e) | |||
406 | * Return the exception data for a sector, or NULL if not | 584 | * Return the exception data for a sector, or NULL if not |
407 | * remapped. | 585 | * remapped. |
408 | */ | 586 | */ |
409 | static struct dm_snap_exception *lookup_exception(struct exception_table *et, | 587 | static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, |
410 | chunk_t chunk) | 588 | chunk_t chunk) |
411 | { | 589 | { |
412 | struct list_head *slot; | 590 | struct list_head *slot; |
413 | struct dm_snap_exception *e; | 591 | struct dm_exception *e; |
414 | 592 | ||
415 | slot = &et->table[exception_hash(et, chunk)]; | 593 | slot = &et->table[exception_hash(et, chunk)]; |
416 | list_for_each_entry (e, slot, hash_list) | 594 | list_for_each_entry (e, slot, hash_list) |
@@ -421,9 +599,9 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et, | |||
421 | return NULL; | 599 | return NULL; |
422 | } | 600 | } |
423 | 601 | ||
424 | static struct dm_snap_exception *alloc_exception(void) | 602 | static struct dm_exception *alloc_completed_exception(void) |
425 | { | 603 | { |
426 | struct dm_snap_exception *e; | 604 | struct dm_exception *e; |
427 | 605 | ||
428 | e = kmem_cache_alloc(exception_cache, GFP_NOIO); | 606 | e = kmem_cache_alloc(exception_cache, GFP_NOIO); |
429 | if (!e) | 607 | if (!e) |
@@ -432,7 +610,7 @@ static struct dm_snap_exception *alloc_exception(void) | |||
432 | return e; | 610 | return e; |
433 | } | 611 | } |
434 | 612 | ||
435 | static void free_exception(struct dm_snap_exception *e) | 613 | static void free_completed_exception(struct dm_exception *e) |
436 | { | 614 | { |
437 | kmem_cache_free(exception_cache, e); | 615 | kmem_cache_free(exception_cache, e); |
438 | } | 616 | } |
@@ -457,12 +635,11 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) | |||
457 | atomic_dec(&s->pending_exceptions_count); | 635 | atomic_dec(&s->pending_exceptions_count); |
458 | } | 636 | } |
459 | 637 | ||
460 | static void insert_completed_exception(struct dm_snapshot *s, | 638 | static void dm_insert_exception(struct dm_exception_table *eh, |
461 | struct dm_snap_exception *new_e) | 639 | struct dm_exception *new_e) |
462 | { | 640 | { |
463 | struct exception_table *eh = &s->complete; | ||
464 | struct list_head *l; | 641 | struct list_head *l; |
465 | struct dm_snap_exception *e = NULL; | 642 | struct dm_exception *e = NULL; |
466 | 643 | ||
467 | l = &eh->table[exception_hash(eh, new_e->old_chunk)]; | 644 | l = &eh->table[exception_hash(eh, new_e->old_chunk)]; |
468 | 645 | ||
@@ -478,7 +655,7 @@ static void insert_completed_exception(struct dm_snapshot *s, | |||
478 | new_e->new_chunk == (dm_chunk_number(e->new_chunk) + | 655 | new_e->new_chunk == (dm_chunk_number(e->new_chunk) + |
479 | dm_consecutive_chunk_count(e) + 1)) { | 656 | dm_consecutive_chunk_count(e) + 1)) { |
480 | dm_consecutive_chunk_count_inc(e); | 657 | dm_consecutive_chunk_count_inc(e); |
481 | free_exception(new_e); | 658 | free_completed_exception(new_e); |
482 | return; | 659 | return; |
483 | } | 660 | } |
484 | 661 | ||
@@ -488,7 +665,7 @@ static void insert_completed_exception(struct dm_snapshot *s, | |||
488 | dm_consecutive_chunk_count_inc(e); | 665 | dm_consecutive_chunk_count_inc(e); |
489 | e->old_chunk--; | 666 | e->old_chunk--; |
490 | e->new_chunk--; | 667 | e->new_chunk--; |
491 | free_exception(new_e); | 668 | free_completed_exception(new_e); |
492 | return; | 669 | return; |
493 | } | 670 | } |
494 | 671 | ||
@@ -507,9 +684,9 @@ out: | |||
507 | static int dm_add_exception(void *context, chunk_t old, chunk_t new) | 684 | static int dm_add_exception(void *context, chunk_t old, chunk_t new) |
508 | { | 685 | { |
509 | struct dm_snapshot *s = context; | 686 | struct dm_snapshot *s = context; |
510 | struct dm_snap_exception *e; | 687 | struct dm_exception *e; |
511 | 688 | ||
512 | e = alloc_exception(); | 689 | e = alloc_completed_exception(); |
513 | if (!e) | 690 | if (!e) |
514 | return -ENOMEM; | 691 | return -ENOMEM; |
515 | 692 | ||
@@ -518,11 +695,30 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) | |||
518 | /* Consecutive_count is implicitly initialised to zero */ | 695 | /* Consecutive_count is implicitly initialised to zero */ |
519 | e->new_chunk = new; | 696 | e->new_chunk = new; |
520 | 697 | ||
521 | insert_completed_exception(s, e); | 698 | dm_insert_exception(&s->complete, e); |
522 | 699 | ||
523 | return 0; | 700 | return 0; |
524 | } | 701 | } |
525 | 702 | ||
703 | #define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r))) | ||
704 | |||
705 | /* | ||
706 | * Return a minimum chunk size of all snapshots that have the specified origin. | ||
707 | * Return zero if the origin has no snapshots. | ||
708 | */ | ||
709 | static sector_t __minimum_chunk_size(struct origin *o) | ||
710 | { | ||
711 | struct dm_snapshot *snap; | ||
712 | unsigned chunk_size = 0; | ||
713 | |||
714 | if (o) | ||
715 | list_for_each_entry(snap, &o->snapshots, list) | ||
716 | chunk_size = min_not_zero(chunk_size, | ||
717 | snap->store->chunk_size); | ||
718 | |||
719 | return chunk_size; | ||
720 | } | ||
721 | |||
526 | /* | 722 | /* |
527 | * Hard coded magic. | 723 | * Hard coded magic. |
528 | */ | 724 | */ |
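min_not_zero() treats zero as "no value yet" rather than as a genuine minimum, which is what lets __minimum_chunk_size() start its scan from chunk_size = 0. A stand-alone rendering of that loop with made-up chunk sizes (the real values come from each snapshot's exception store):

    #include <stdio.h>

    #define min(a, b) ((a) < (b) ? (a) : (b))
    /* Same shape as the macro added at new line 703. */
    #define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))

    int main(void)
    {
        unsigned chunk_size = 0;            /* "no snapshots seen yet"  */
        unsigned snaps[] = { 16, 8, 32 };   /* chunk sizes, in sectors  */

        /* What __minimum_chunk_size() does while walking o->snapshots:
         * the zero seed never wins, so the result is the true minimum. */
        for (unsigned i = 0; i < sizeof(snaps) / sizeof(snaps[0]); i++)
            chunk_size = min_not_zero(chunk_size, snaps[i]);

        printf("minimum chunk size: %u\n", chunk_size); /* prints 8 */
        return 0;
    }

With a plain min() the zero seed would always win and the function would report a minimum chunk size of zero for any origin that has snapshots.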
@@ -546,16 +742,18 @@ static int init_hash_tables(struct dm_snapshot *s) | |||
546 | * Calculate based on the size of the original volume or | 742 | * Calculate based on the size of the original volume or |
547 | * the COW volume... | 743 | * the COW volume... |
548 | */ | 744 | */ |
549 | cow_dev_size = get_dev_size(s->store->cow->bdev); | 745 | cow_dev_size = get_dev_size(s->cow->bdev); |
550 | origin_dev_size = get_dev_size(s->origin->bdev); | 746 | origin_dev_size = get_dev_size(s->origin->bdev); |
551 | max_buckets = calc_max_buckets(); | 747 | max_buckets = calc_max_buckets(); |
552 | 748 | ||
553 | hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; | 749 | hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; |
554 | hash_size = min(hash_size, max_buckets); | 750 | hash_size = min(hash_size, max_buckets); |
555 | 751 | ||
752 | if (hash_size < 64) | ||
753 | hash_size = 64; | ||
556 | hash_size = rounddown_pow_of_two(hash_size); | 754 | hash_size = rounddown_pow_of_two(hash_size); |
557 | if (init_exception_table(&s->complete, hash_size, | 755 | if (dm_exception_table_init(&s->complete, hash_size, |
558 | DM_CHUNK_CONSECUTIVE_BITS)) | 756 | DM_CHUNK_CONSECUTIVE_BITS)) |
559 | return -ENOMEM; | 757 | return -ENOMEM; |
560 | 758 | ||
561 | /* | 759 | /* |
@@ -566,14 +764,284 @@ static int init_hash_tables(struct dm_snapshot *s) | |||
566 | if (hash_size < 64) | 764 | if (hash_size < 64) |
567 | hash_size = 64; | 765 | hash_size = 64; |
568 | 766 | ||
569 | if (init_exception_table(&s->pending, hash_size, 0)) { | 767 | if (dm_exception_table_init(&s->pending, hash_size, 0)) { |
570 | exit_exception_table(&s->complete, exception_cache); | 768 | dm_exception_table_exit(&s->complete, exception_cache); |
571 | return -ENOMEM; | 769 | return -ENOMEM; |
572 | } | 770 | } |
573 | 771 | ||
574 | return 0; | 772 | return 0; |
575 | } | 773 | } |
576 | 774 | ||
775 | static void merge_shutdown(struct dm_snapshot *s) | ||
776 | { | ||
777 | clear_bit_unlock(RUNNING_MERGE, &s->state_bits); | ||
778 | smp_mb__after_clear_bit(); | ||
779 | wake_up_bit(&s->state_bits, RUNNING_MERGE); | ||
780 | } | ||
781 | |||
782 | static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s) | ||
783 | { | ||
784 | s->first_merging_chunk = 0; | ||
785 | s->num_merging_chunks = 0; | ||
786 | |||
787 | return bio_list_get(&s->bios_queued_during_merge); | ||
788 | } | ||
789 | |||
790 | /* | ||
791 | * Remove one chunk from the index of completed exceptions. | ||
792 | */ | ||
793 | static int __remove_single_exception_chunk(struct dm_snapshot *s, | ||
794 | chunk_t old_chunk) | ||
795 | { | ||
796 | struct dm_exception *e; | ||
797 | |||
798 | e = dm_lookup_exception(&s->complete, old_chunk); | ||
799 | if (!e) { | ||
800 | DMERR("Corruption detected: exception for block %llu is " | ||
801 | "on disk but not in memory", | ||
802 | (unsigned long long)old_chunk); | ||
803 | return -EINVAL; | ||
804 | } | ||
805 | |||
806 | /* | ||
807 | * If this is the only chunk using this exception, remove exception. | ||
808 | */ | ||
809 | if (!dm_consecutive_chunk_count(e)) { | ||
810 | dm_remove_exception(e); | ||
811 | free_completed_exception(e); | ||
812 | return 0; | ||
813 | } | ||
814 | |||
815 | /* | ||
816 | * The chunk may be either at the beginning or the end of a | ||
817 | * group of consecutive chunks - never in the middle. We are | ||
818 | * removing chunks in the opposite order to that in which they | ||
819 | * were added, so this should always be true. | ||
820 | * Decrement the consecutive chunk counter and adjust the | ||
821 | * starting point if necessary. | ||
822 | */ | ||
823 | if (old_chunk == e->old_chunk) { | ||
824 | e->old_chunk++; | ||
825 | e->new_chunk++; | ||
826 | } else if (old_chunk != e->old_chunk + | ||
827 | dm_consecutive_chunk_count(e)) { | ||
828 | DMERR("Attempt to merge block %llu from the " | ||
829 | "middle of a chunk range [%llu - %llu]", | ||
830 | (unsigned long long)old_chunk, | ||
831 | (unsigned long long)e->old_chunk, | ||
832 | (unsigned long long) | ||
833 | e->old_chunk + dm_consecutive_chunk_count(e)); | ||
834 | return -EINVAL; | ||
835 | } | ||
836 | |||
837 | dm_consecutive_chunk_count_dec(e); | ||
838 | |||
839 | return 0; | ||
840 | } | ||
841 | |||
842 | static void flush_bios(struct bio *bio); | ||
843 | |||
844 | static int remove_single_exception_chunk(struct dm_snapshot *s) | ||
845 | { | ||
846 | struct bio *b = NULL; | ||
847 | int r; | ||
848 | chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; | ||
849 | |||
850 | down_write(&s->lock); | ||
851 | |||
852 | /* | ||
853 | * Process chunks (and associated exceptions) in reverse order | ||
854 | * so that dm_consecutive_chunk_count_dec() accounting works. | ||
855 | */ | ||
856 | do { | ||
857 | r = __remove_single_exception_chunk(s, old_chunk); | ||
858 | if (r) | ||
859 | goto out; | ||
860 | } while (old_chunk-- > s->first_merging_chunk); | ||
861 | |||
862 | b = __release_queued_bios_after_merge(s); | ||
863 | |||
864 | out: | ||
865 | up_write(&s->lock); | ||
866 | if (b) | ||
867 | flush_bios(b); | ||
868 | |||
869 | return r; | ||
870 | } | ||
871 | |||
872 | static int origin_write_extent(struct dm_snapshot *merging_snap, | ||
873 | sector_t sector, unsigned chunk_size); | ||
874 | |||
875 | static void merge_callback(int read_err, unsigned long write_err, | ||
876 | void *context); | ||
877 | |||
878 | static uint64_t read_pending_exceptions_done_count(void) | ||
879 | { | ||
880 | uint64_t pending_exceptions_done; | ||
881 | |||
882 | spin_lock(&_pending_exceptions_done_spinlock); | ||
883 | pending_exceptions_done = _pending_exceptions_done_count; | ||
884 | spin_unlock(&_pending_exceptions_done_spinlock); | ||
885 | |||
886 | return pending_exceptions_done; | ||
887 | } | ||
888 | |||
889 | static void increment_pending_exceptions_done_count(void) | ||
890 | { | ||
891 | spin_lock(&_pending_exceptions_done_spinlock); | ||
892 | _pending_exceptions_done_count++; | ||
893 | spin_unlock(&_pending_exceptions_done_spinlock); | ||
894 | |||
895 | wake_up_all(&_pending_exceptions_done); | ||
896 | } | ||
897 | |||
898 | static void snapshot_merge_next_chunks(struct dm_snapshot *s) | ||
899 | { | ||
900 | int i, linear_chunks; | ||
901 | chunk_t old_chunk, new_chunk; | ||
902 | struct dm_io_region src, dest; | ||
903 | sector_t io_size; | ||
904 | uint64_t previous_count; | ||
905 | |||
906 | BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits)); | ||
907 | if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits))) | ||
908 | goto shut; | ||
909 | |||
910 | /* | ||
911 | * valid flag never changes during merge, so no lock required. | ||
912 | */ | ||
913 | if (!s->valid) { | ||
914 | DMERR("Snapshot is invalid: can't merge"); | ||
915 | goto shut; | ||
916 | } | ||
917 | |||
918 | linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk, | ||
919 | &new_chunk); | ||
920 | if (linear_chunks <= 0) { | ||
921 | if (linear_chunks < 0) { | ||
922 | DMERR("Read error in exception store: " | ||
923 | "shutting down merge"); | ||
924 | down_write(&s->lock); | ||
925 | s->merge_failed = 1; | ||
926 | up_write(&s->lock); | ||
927 | } | ||
928 | goto shut; | ||
929 | } | ||
930 | |||
931 | /* Adjust old_chunk and new_chunk to reflect start of linear region */ | ||
932 | old_chunk = old_chunk + 1 - linear_chunks; | ||
933 | new_chunk = new_chunk + 1 - linear_chunks; | ||
934 | |||
935 | /* | ||
936 | * Use one (potentially large) I/O to copy all 'linear_chunks' | ||
937 | * from the exception store to the origin | ||
938 | */ | ||
939 | io_size = linear_chunks * s->store->chunk_size; | ||
940 | |||
941 | dest.bdev = s->origin->bdev; | ||
942 | dest.sector = chunk_to_sector(s->store, old_chunk); | ||
943 | dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector); | ||
944 | |||
945 | src.bdev = s->cow->bdev; | ||
946 | src.sector = chunk_to_sector(s->store, new_chunk); | ||
947 | src.count = dest.count; | ||
948 | |||
949 | /* | ||
950 | * Reallocate any exceptions needed in other snapshots then | ||
951 | * wait for the pending exceptions to complete. | ||
952 | * Each time any pending exception (globally on the system) | ||
953 | * completes we are woken and repeat the process to find out | ||
954 | * if we can proceed. While this may not seem a particularly | ||
955 | * efficient algorithm, it is not expected to have any | ||
956 | * significant impact on performance. | ||
957 | */ | ||
958 | previous_count = read_pending_exceptions_done_count(); | ||
959 | while (origin_write_extent(s, dest.sector, io_size)) { | ||
960 | wait_event(_pending_exceptions_done, | ||
961 | (read_pending_exceptions_done_count() != | ||
962 | previous_count)); | ||
963 | /* Retry after the wait, until all exceptions are done. */ | ||
964 | previous_count = read_pending_exceptions_done_count(); | ||
965 | } | ||
966 | |||
967 | down_write(&s->lock); | ||
968 | s->first_merging_chunk = old_chunk; | ||
969 | s->num_merging_chunks = linear_chunks; | ||
970 | up_write(&s->lock); | ||
971 | |||
972 | /* Wait until writes to all 'linear_chunks' drain */ | ||
973 | for (i = 0; i < linear_chunks; i++) | ||
974 | __check_for_conflicting_io(s, old_chunk + i); | ||
975 | |||
976 | dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s); | ||
977 | return; | ||
978 | |||
979 | shut: | ||
980 | merge_shutdown(s); | ||
981 | } | ||
982 | |||
983 | static void error_bios(struct bio *bio); | ||
984 | |||
985 | static void merge_callback(int read_err, unsigned long write_err, void *context) | ||
986 | { | ||
987 | struct dm_snapshot *s = context; | ||
988 | struct bio *b = NULL; | ||
989 | |||
990 | if (read_err || write_err) { | ||
991 | if (read_err) | ||
992 | DMERR("Read error: shutting down merge."); | ||
993 | else | ||
994 | DMERR("Write error: shutting down merge."); | ||
995 | goto shut; | ||
996 | } | ||
997 | |||
998 | if (s->store->type->commit_merge(s->store, | ||
999 | s->num_merging_chunks) < 0) { | ||
1000 | DMERR("Write error in exception store: shutting down merge"); | ||
1001 | goto shut; | ||
1002 | } | ||
1003 | |||
1004 | if (remove_single_exception_chunk(s) < 0) | ||
1005 | goto shut; | ||
1006 | |||
1007 | snapshot_merge_next_chunks(s); | ||
1008 | |||
1009 | return; | ||
1010 | |||
1011 | shut: | ||
1012 | down_write(&s->lock); | ||
1013 | s->merge_failed = 1; | ||
1014 | b = __release_queued_bios_after_merge(s); | ||
1015 | up_write(&s->lock); | ||
1016 | error_bios(b); | ||
1017 | |||
1018 | merge_shutdown(s); | ||
1019 | } | ||
1020 | |||
1021 | static void start_merge(struct dm_snapshot *s) | ||
1022 | { | ||
1023 | if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits)) | ||
1024 | snapshot_merge_next_chunks(s); | ||
1025 | } | ||
1026 | |||
1027 | static int wait_schedule(void *ptr) | ||
1028 | { | ||
1029 | schedule(); | ||
1030 | |||
1031 | return 0; | ||
1032 | } | ||
1033 | |||
1034 | /* | ||
1035 | * Stop the merging process and wait until it finishes. | ||
1036 | */ | ||
1037 | static void stop_merge(struct dm_snapshot *s) | ||
1038 | { | ||
1039 | set_bit(SHUTDOWN_MERGE, &s->state_bits); | ||
1040 | wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule, | ||
1041 | TASK_UNINTERRUPTIBLE); | ||
1042 | clear_bit(SHUTDOWN_MERGE, &s->state_bits); | ||
1043 | } | ||
1044 | |||
577 | /* | 1045 | /* |
578 | * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> | 1046 | * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> |
579 | */ | 1047 | */ |
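snapshot_merge_next_chunks() receives from prepare_merge() the last old/new chunk of a run of linear_chunks consecutive exceptions and then rewinds to the start of the run before issuing one kcopyd copy ("Adjust old_chunk and new_chunk to reflect start of linear region", new lines 931-933). The arithmetic is easy to get backwards, so here it is as a stand-alone program with invented numbers, treating chunk_to_sector() as a plain multiply by the chunk size in sectors.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t chunk_t;
    typedef uint64_t sector_t;

    #define min(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
        /* Values a prepare_merge() call might return: the last chunk of a
         * run of consecutive exceptions, plus the length of the run.
         * (Illustrative numbers, not taken from a real store.) */
        int linear_chunks = 3;
        chunk_t old_chunk = 57, new_chunk = 12;

        sector_t chunk_size = 16;            /* sectors per chunk         */
        sector_t origin_size = 1048576;      /* origin device, in sectors */

        /* Rewind to the start of the linear region */
        old_chunk = old_chunk + 1 - linear_chunks;   /* 55 */
        new_chunk = new_chunk + 1 - linear_chunks;   /* 10 */

        /* One kcopyd I/O covers the whole run, clamped to the device end */
        sector_t io_size = linear_chunks * chunk_size;
        sector_t dest_sector = old_chunk * chunk_size;
        sector_t dest_count = min(io_size, origin_size - dest_sector);

        printf("copy %llu sectors from COW chunk %llu to origin chunk %llu\n",
               (unsigned long long)dest_count,
               (unsigned long long)new_chunk,
               (unsigned long long)old_chunk);
        return 0;
    }

After the copy, merge_callback() commits the merge in the exception store and remove_single_exception_chunk() trims the same run from the in-memory table, working backwards so the consecutive-chunk accounting stays valid.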
@@ -582,50 +1050,73 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
582 | struct dm_snapshot *s; | 1050 | struct dm_snapshot *s; |
583 | int i; | 1051 | int i; |
584 | int r = -EINVAL; | 1052 | int r = -EINVAL; |
585 | char *origin_path; | 1053 | char *origin_path, *cow_path; |
586 | struct dm_exception_store *store; | 1054 | unsigned args_used, num_flush_requests = 1; |
587 | unsigned args_used; | 1055 | fmode_t origin_mode = FMODE_READ; |
588 | 1056 | ||
589 | if (argc != 4) { | 1057 | if (argc != 4) { |
590 | ti->error = "requires exactly 4 arguments"; | 1058 | ti->error = "requires exactly 4 arguments"; |
591 | r = -EINVAL; | 1059 | r = -EINVAL; |
592 | goto bad_args; | 1060 | goto bad; |
1061 | } | ||
1062 | |||
1063 | if (dm_target_is_snapshot_merge(ti)) { | ||
1064 | num_flush_requests = 2; | ||
1065 | origin_mode = FMODE_WRITE; | ||
593 | } | 1066 | } |
594 | 1067 | ||
595 | origin_path = argv[0]; | 1068 | origin_path = argv[0]; |
596 | argv++; | 1069 | argv++; |
597 | argc--; | 1070 | argc--; |
598 | 1071 | ||
599 | r = dm_exception_store_create(ti, argc, argv, &args_used, &store); | 1072 | s = kmalloc(sizeof(*s), GFP_KERNEL); |
1073 | if (!s) { | ||
1074 | ti->error = "Cannot allocate snapshot context private " | ||
1075 | "structure"; | ||
1076 | r = -ENOMEM; | ||
1077 | goto bad; | ||
1078 | } | ||
1079 | |||
1080 | cow_path = argv[0]; | ||
1081 | argv++; | ||
1082 | argc--; | ||
1083 | |||
1084 | r = dm_get_device(ti, cow_path, 0, 0, | ||
1085 | FMODE_READ | FMODE_WRITE, &s->cow); | ||
1086 | if (r) { | ||
1087 | ti->error = "Cannot get COW device"; | ||
1088 | goto bad_cow; | ||
1089 | } | ||
1090 | |||
1091 | r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store); | ||
600 | if (r) { | 1092 | if (r) { |
601 | ti->error = "Couldn't create exception store"; | 1093 | ti->error = "Couldn't create exception store"; |
602 | r = -EINVAL; | 1094 | r = -EINVAL; |
603 | goto bad_args; | 1095 | goto bad_store; |
604 | } | 1096 | } |
605 | 1097 | ||
606 | argv += args_used; | 1098 | argv += args_used; |
607 | argc -= args_used; | 1099 | argc -= args_used; |
608 | 1100 | ||
609 | s = kmalloc(sizeof(*s), GFP_KERNEL); | 1101 | r = dm_get_device(ti, origin_path, 0, ti->len, origin_mode, &s->origin); |
610 | if (!s) { | ||
611 | ti->error = "Cannot allocate snapshot context private " | ||
612 | "structure"; | ||
613 | r = -ENOMEM; | ||
614 | goto bad_snap; | ||
615 | } | ||
616 | |||
617 | r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); | ||
618 | if (r) { | 1102 | if (r) { |
619 | ti->error = "Cannot get origin device"; | 1103 | ti->error = "Cannot get origin device"; |
620 | goto bad_origin; | 1104 | goto bad_origin; |
621 | } | 1105 | } |
622 | 1106 | ||
623 | s->store = store; | 1107 | s->ti = ti; |
624 | s->valid = 1; | 1108 | s->valid = 1; |
625 | s->active = 0; | 1109 | s->active = 0; |
1110 | s->suspended = 0; | ||
626 | atomic_set(&s->pending_exceptions_count, 0); | 1111 | atomic_set(&s->pending_exceptions_count, 0); |
627 | init_rwsem(&s->lock); | 1112 | init_rwsem(&s->lock); |
1113 | INIT_LIST_HEAD(&s->list); | ||
628 | spin_lock_init(&s->pe_lock); | 1114 | spin_lock_init(&s->pe_lock); |
1115 | s->state_bits = 0; | ||
1116 | s->merge_failed = 0; | ||
1117 | s->first_merging_chunk = 0; | ||
1118 | s->num_merging_chunks = 0; | ||
1119 | bio_list_init(&s->bios_queued_during_merge); | ||
629 | 1120 | ||
630 | /* Allocate hash table for COW data */ | 1121 | /* Allocate hash table for COW data */ |
631 | if (init_hash_tables(s)) { | 1122 | if (init_hash_tables(s)) { |
@@ -659,39 +1150,55 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
659 | 1150 | ||
660 | spin_lock_init(&s->tracked_chunk_lock); | 1151 | spin_lock_init(&s->tracked_chunk_lock); |
661 | 1152 | ||
662 | /* Metadata must only be loaded into one table at once */ | 1153 | bio_list_init(&s->queued_bios); |
1154 | INIT_WORK(&s->queued_bios_work, flush_queued_bios); | ||
1155 | |||
1156 | ti->private = s; | ||
1157 | ti->num_flush_requests = num_flush_requests; | ||
1158 | |||
1159 | /* Add snapshot to the list of snapshots for this origin */ | ||
1160 | /* Exceptions aren't triggered till snapshot_resume() is called */ | ||
1161 | r = register_snapshot(s); | ||
1162 | if (r == -ENOMEM) { | ||
1163 | ti->error = "Snapshot origin struct allocation failed"; | ||
1164 | goto bad_load_and_register; | ||
1165 | } else if (r < 0) { | ||
1166 | /* invalid handover, register_snapshot has set ti->error */ | ||
1167 | goto bad_load_and_register; | ||
1168 | } | ||
1169 | |||
1170 | /* | ||
1171 | * Metadata must only be loaded into one table at once, so skip this | ||
1172 | * if metadata will be handed over during resume. | ||
1173 | * Chunk size will be set during the handover - set it to zero to | ||
1174 | * ensure it's ignored. | ||
1175 | */ | ||
1176 | if (r > 0) { | ||
1177 | s->store->chunk_size = 0; | ||
1178 | return 0; | ||
1179 | } | ||
1180 | |||
663 | r = s->store->type->read_metadata(s->store, dm_add_exception, | 1181 | r = s->store->type->read_metadata(s->store, dm_add_exception, |
664 | (void *)s); | 1182 | (void *)s); |
665 | if (r < 0) { | 1183 | if (r < 0) { |
666 | ti->error = "Failed to read snapshot metadata"; | 1184 | ti->error = "Failed to read snapshot metadata"; |
667 | goto bad_load_and_register; | 1185 | goto bad_read_metadata; |
668 | } else if (r > 0) { | 1186 | } else if (r > 0) { |
669 | s->valid = 0; | 1187 | s->valid = 0; |
670 | DMWARN("Snapshot is marked invalid."); | 1188 | DMWARN("Snapshot is marked invalid."); |
671 | } | 1189 | } |
672 | 1190 | ||
673 | bio_list_init(&s->queued_bios); | ||
674 | INIT_WORK(&s->queued_bios_work, flush_queued_bios); | ||
675 | |||
676 | if (!s->store->chunk_size) { | 1191 | if (!s->store->chunk_size) { |
677 | ti->error = "Chunk size not set"; | 1192 | ti->error = "Chunk size not set"; |
678 | goto bad_load_and_register; | 1193 | goto bad_read_metadata; |
679 | } | ||
680 | |||
681 | /* Add snapshot to the list of snapshots for this origin */ | ||
682 | /* Exceptions aren't triggered till snapshot_resume() is called */ | ||
683 | if (register_snapshot(s)) { | ||
684 | r = -EINVAL; | ||
685 | ti->error = "Cannot register snapshot origin"; | ||
686 | goto bad_load_and_register; | ||
687 | } | 1194 | } |
688 | |||
689 | ti->private = s; | ||
690 | ti->split_io = s->store->chunk_size; | 1195 | ti->split_io = s->store->chunk_size; |
691 | ti->num_flush_requests = 1; | ||
692 | 1196 | ||
693 | return 0; | 1197 | return 0; |
694 | 1198 | ||
1199 | bad_read_metadata: | ||
1200 | unregister_snapshot(s); | ||
1201 | |||
695 | bad_load_and_register: | 1202 | bad_load_and_register: |
696 | mempool_destroy(s->tracked_chunk_pool); | 1203 | mempool_destroy(s->tracked_chunk_pool); |
697 | 1204 | ||
@@ -702,19 +1209,22 @@ bad_pending_pool: | |||
702 | dm_kcopyd_client_destroy(s->kcopyd_client); | 1209 | dm_kcopyd_client_destroy(s->kcopyd_client); |
703 | 1210 | ||
704 | bad_kcopyd: | 1211 | bad_kcopyd: |
705 | exit_exception_table(&s->pending, pending_cache); | 1212 | dm_exception_table_exit(&s->pending, pending_cache); |
706 | exit_exception_table(&s->complete, exception_cache); | 1213 | dm_exception_table_exit(&s->complete, exception_cache); |
707 | 1214 | ||
708 | bad_hash_tables: | 1215 | bad_hash_tables: |
709 | dm_put_device(ti, s->origin); | 1216 | dm_put_device(ti, s->origin); |
710 | 1217 | ||
711 | bad_origin: | 1218 | bad_origin: |
712 | kfree(s); | 1219 | dm_exception_store_destroy(s->store); |
713 | 1220 | ||
714 | bad_snap: | 1221 | bad_store: |
715 | dm_exception_store_destroy(store); | 1222 | dm_put_device(ti, s->cow); |
1223 | |||
1224 | bad_cow: | ||
1225 | kfree(s); | ||
716 | 1226 | ||
717 | bad_args: | 1227 | bad: |
718 | return r; | 1228 | return r; |
719 | } | 1229 | } |
720 | 1230 | ||
@@ -723,8 +1233,39 @@ static void __free_exceptions(struct dm_snapshot *s) | |||
723 | dm_kcopyd_client_destroy(s->kcopyd_client); | 1233 | dm_kcopyd_client_destroy(s->kcopyd_client); |
724 | s->kcopyd_client = NULL; | 1234 | s->kcopyd_client = NULL; |
725 | 1235 | ||
726 | exit_exception_table(&s->pending, pending_cache); | 1236 | dm_exception_table_exit(&s->pending, pending_cache); |
727 | exit_exception_table(&s->complete, exception_cache); | 1237 | dm_exception_table_exit(&s->complete, exception_cache); |
1238 | } | ||
1239 | |||
1240 | static void __handover_exceptions(struct dm_snapshot *snap_src, | ||
1241 | struct dm_snapshot *snap_dest) | ||
1242 | { | ||
1243 | union { | ||
1244 | struct dm_exception_table table_swap; | ||
1245 | struct dm_exception_store *store_swap; | ||
1246 | } u; | ||
1247 | |||
1248 | /* | ||
1249 | * Swap all snapshot context information between the two instances. | ||
1250 | */ | ||
1251 | u.table_swap = snap_dest->complete; | ||
1252 | snap_dest->complete = snap_src->complete; | ||
1253 | snap_src->complete = u.table_swap; | ||
1254 | |||
1255 | u.store_swap = snap_dest->store; | ||
1256 | snap_dest->store = snap_src->store; | ||
1257 | snap_src->store = u.store_swap; | ||
1258 | |||
1259 | snap_dest->store->snap = snap_dest; | ||
1260 | snap_src->store->snap = snap_src; | ||
1261 | |||
1262 | snap_dest->ti->split_io = snap_dest->store->chunk_size; | ||
1263 | snap_dest->valid = snap_src->valid; | ||
1264 | |||
1265 | /* | ||
1266 | * Set source invalid to ensure it receives no further I/O. | ||
1267 | */ | ||
1268 | snap_src->valid = 0; | ||
728 | } | 1269 | } |
729 | 1270 | ||
730 | static void snapshot_dtr(struct dm_target *ti) | 1271 | static void snapshot_dtr(struct dm_target *ti) |
@@ -733,9 +1274,24 @@ static void snapshot_dtr(struct dm_target *ti) | |||
733 | int i; | 1274 | int i; |
734 | #endif | 1275 | #endif |
735 | struct dm_snapshot *s = ti->private; | 1276 | struct dm_snapshot *s = ti->private; |
1277 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; | ||
736 | 1278 | ||
737 | flush_workqueue(ksnapd); | 1279 | flush_workqueue(ksnapd); |
738 | 1280 | ||
1281 | down_read(&_origins_lock); | ||
1282 | /* Check whether exception handover must be cancelled */ | ||
1283 | (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); | ||
1284 | if (snap_src && snap_dest && (s == snap_src)) { | ||
1285 | down_write(&snap_dest->lock); | ||
1286 | snap_dest->valid = 0; | ||
1287 | up_write(&snap_dest->lock); | ||
1288 | DMERR("Cancelling snapshot handover."); | ||
1289 | } | ||
1290 | up_read(&_origins_lock); | ||
1291 | |||
1292 | if (dm_target_is_snapshot_merge(ti)) | ||
1293 | stop_merge(s); | ||
1294 | |||
739 | /* Prevent further origin writes from using this snapshot. */ | 1295 | /* Prevent further origin writes from using this snapshot. */ |
740 | /* After this returns there can be no new kcopyd jobs. */ | 1296 | /* After this returns there can be no new kcopyd jobs. */ |
741 | unregister_snapshot(s); | 1297 | unregister_snapshot(s); |
@@ -763,6 +1319,8 @@ static void snapshot_dtr(struct dm_target *ti) | |||
763 | 1319 | ||
764 | dm_exception_store_destroy(s->store); | 1320 | dm_exception_store_destroy(s->store); |
765 | 1321 | ||
1322 | dm_put_device(ti, s->cow); | ||
1323 | |||
766 | kfree(s); | 1324 | kfree(s); |
767 | } | 1325 | } |
768 | 1326 | ||
@@ -795,6 +1353,26 @@ static void flush_queued_bios(struct work_struct *work) | |||
795 | flush_bios(queued_bios); | 1353 | flush_bios(queued_bios); |
796 | } | 1354 | } |
797 | 1355 | ||
1356 | static int do_origin(struct dm_dev *origin, struct bio *bio); | ||
1357 | |||
1358 | /* | ||
1359 | * Flush a list of buffers. | ||
1360 | */ | ||
1361 | static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) | ||
1362 | { | ||
1363 | struct bio *n; | ||
1364 | int r; | ||
1365 | |||
1366 | while (bio) { | ||
1367 | n = bio->bi_next; | ||
1368 | bio->bi_next = NULL; | ||
1369 | r = do_origin(s->origin, bio); | ||
1370 | if (r == DM_MAPIO_REMAPPED) | ||
1371 | generic_make_request(bio); | ||
1372 | bio = n; | ||
1373 | } | ||
1374 | } | ||
1375 | |||
798 | /* | 1376 | /* |
799 | * Error a list of buffers. | 1377 | * Error a list of buffers. |
800 | */ | 1378 | */ |
@@ -825,45 +1403,12 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err) | |||
825 | 1403 | ||
826 | s->valid = 0; | 1404 | s->valid = 0; |
827 | 1405 | ||
828 | dm_table_event(s->store->ti->table); | 1406 | dm_table_event(s->ti->table); |
829 | } | ||
830 | |||
831 | static void get_pending_exception(struct dm_snap_pending_exception *pe) | ||
832 | { | ||
833 | atomic_inc(&pe->ref_count); | ||
834 | } | ||
835 | |||
836 | static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe) | ||
837 | { | ||
838 | struct dm_snap_pending_exception *primary_pe; | ||
839 | struct bio *origin_bios = NULL; | ||
840 | |||
841 | primary_pe = pe->primary_pe; | ||
842 | |||
843 | /* | ||
844 | * If this pe is involved in a write to the origin and | ||
845 | * it is the last sibling to complete then release | ||
846 | * the bios for the original write to the origin. | ||
847 | */ | ||
848 | if (primary_pe && | ||
849 | atomic_dec_and_test(&primary_pe->ref_count)) { | ||
850 | origin_bios = bio_list_get(&primary_pe->origin_bios); | ||
851 | free_pending_exception(primary_pe); | ||
852 | } | ||
853 | |||
854 | /* | ||
855 | * Free the pe if it's not linked to an origin write or if | ||
856 | * it's not itself a primary pe. | ||
857 | */ | ||
858 | if (!primary_pe || primary_pe != pe) | ||
859 | free_pending_exception(pe); | ||
860 | |||
861 | return origin_bios; | ||
862 | } | 1407 | } |
863 | 1408 | ||
864 | static void pending_complete(struct dm_snap_pending_exception *pe, int success) | 1409 | static void pending_complete(struct dm_snap_pending_exception *pe, int success) |
865 | { | 1410 | { |
866 | struct dm_snap_exception *e; | 1411 | struct dm_exception *e; |
867 | struct dm_snapshot *s = pe->snap; | 1412 | struct dm_snapshot *s = pe->snap; |
868 | struct bio *origin_bios = NULL; | 1413 | struct bio *origin_bios = NULL; |
869 | struct bio *snapshot_bios = NULL; | 1414 | struct bio *snapshot_bios = NULL; |
@@ -877,7 +1422,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) | |||
877 | goto out; | 1422 | goto out; |
878 | } | 1423 | } |
879 | 1424 | ||
880 | e = alloc_exception(); | 1425 | e = alloc_completed_exception(); |
881 | if (!e) { | 1426 | if (!e) { |
882 | down_write(&s->lock); | 1427 | down_write(&s->lock); |
883 | __invalidate_snapshot(s, -ENOMEM); | 1428 | __invalidate_snapshot(s, -ENOMEM); |
@@ -888,28 +1433,27 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) | |||
888 | 1433 | ||
889 | down_write(&s->lock); | 1434 | down_write(&s->lock); |
890 | if (!s->valid) { | 1435 | if (!s->valid) { |
891 | free_exception(e); | 1436 | free_completed_exception(e); |
892 | error = 1; | 1437 | error = 1; |
893 | goto out; | 1438 | goto out; |
894 | } | 1439 | } |
895 | 1440 | ||
896 | /* | 1441 | /* Check for conflicting reads */ |
897 | * Check for conflicting reads. This is extremely improbable, | 1442 | __check_for_conflicting_io(s, pe->e.old_chunk); |
898 | * so msleep(1) is sufficient and there is no need for a wait queue. | ||
899 | */ | ||
900 | while (__chunk_is_tracked(s, pe->e.old_chunk)) | ||
901 | msleep(1); | ||
902 | 1443 | ||
903 | /* | 1444 | /* |
904 | * Add a proper exception, and remove the | 1445 | * Add a proper exception, and remove the |
905 | * in-flight exception from the list. | 1446 | * in-flight exception from the list. |
906 | */ | 1447 | */ |
907 | insert_completed_exception(s, e); | 1448 | dm_insert_exception(&s->complete, e); |
908 | 1449 | ||
909 | out: | 1450 | out: |
910 | remove_exception(&pe->e); | 1451 | dm_remove_exception(&pe->e); |
911 | snapshot_bios = bio_list_get(&pe->snapshot_bios); | 1452 | snapshot_bios = bio_list_get(&pe->snapshot_bios); |
912 | origin_bios = put_pending_exception(pe); | 1453 | origin_bios = bio_list_get(&pe->origin_bios); |
1454 | free_pending_exception(pe); | ||
1455 | |||
1456 | increment_pending_exceptions_done_count(); | ||
913 | 1457 | ||
914 | up_write(&s->lock); | 1458 | up_write(&s->lock); |
915 | 1459 | ||
@@ -919,7 +1463,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) | |||
919 | else | 1463 | else |
920 | flush_bios(snapshot_bios); | 1464 | flush_bios(snapshot_bios); |
921 | 1465 | ||
922 | flush_bios(origin_bios); | 1466 | retry_origin_bios(s, origin_bios); |
923 | } | 1467 | } |
924 | 1468 | ||
925 | static void commit_callback(void *context, int success) | 1469 | static void commit_callback(void *context, int success) |
@@ -963,7 +1507,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) | |||
963 | src.sector = chunk_to_sector(s->store, pe->e.old_chunk); | 1507 | src.sector = chunk_to_sector(s->store, pe->e.old_chunk); |
964 | src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); | 1508 | src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); |
965 | 1509 | ||
966 | dest.bdev = s->store->cow->bdev; | 1510 | dest.bdev = s->cow->bdev; |
967 | dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); | 1511 | dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); |
968 | dest.count = src.count; | 1512 | dest.count = src.count; |
969 | 1513 | ||
@@ -975,7 +1519,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) | |||
975 | static struct dm_snap_pending_exception * | 1519 | static struct dm_snap_pending_exception * |
976 | __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) | 1520 | __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) |
977 | { | 1521 | { |
978 | struct dm_snap_exception *e = lookup_exception(&s->pending, chunk); | 1522 | struct dm_exception *e = dm_lookup_exception(&s->pending, chunk); |
979 | 1523 | ||
980 | if (!e) | 1524 | if (!e) |
981 | return NULL; | 1525 | return NULL; |
@@ -1006,8 +1550,6 @@ __find_pending_exception(struct dm_snapshot *s, | |||
1006 | pe->e.old_chunk = chunk; | 1550 | pe->e.old_chunk = chunk; |
1007 | bio_list_init(&pe->origin_bios); | 1551 | bio_list_init(&pe->origin_bios); |
1008 | bio_list_init(&pe->snapshot_bios); | 1552 | bio_list_init(&pe->snapshot_bios); |
1009 | pe->primary_pe = NULL; | ||
1010 | atomic_set(&pe->ref_count, 0); | ||
1011 | pe->started = 0; | 1553 | pe->started = 0; |
1012 | 1554 | ||
1013 | if (s->store->type->prepare_exception(s->store, &pe->e)) { | 1555 | if (s->store->type->prepare_exception(s->store, &pe->e)) { |
@@ -1015,16 +1557,15 @@ __find_pending_exception(struct dm_snapshot *s, | |||
1015 | return NULL; | 1557 | return NULL; |
1016 | } | 1558 | } |
1017 | 1559 | ||
1018 | get_pending_exception(pe); | 1560 | dm_insert_exception(&s->pending, &pe->e); |
1019 | insert_exception(&s->pending, &pe->e); | ||
1020 | 1561 | ||
1021 | return pe; | 1562 | return pe; |
1022 | } | 1563 | } |
1023 | 1564 | ||
1024 | static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, | 1565 | static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, |
1025 | struct bio *bio, chunk_t chunk) | 1566 | struct bio *bio, chunk_t chunk) |
1026 | { | 1567 | { |
1027 | bio->bi_bdev = s->store->cow->bdev; | 1568 | bio->bi_bdev = s->cow->bdev; |
1028 | bio->bi_sector = chunk_to_sector(s->store, | 1569 | bio->bi_sector = chunk_to_sector(s->store, |
1029 | dm_chunk_number(e->new_chunk) + | 1570 | dm_chunk_number(e->new_chunk) + |
1030 | (chunk - e->old_chunk)) + | 1571 | (chunk - e->old_chunk)) + |
@@ -1035,14 +1576,14 @@ static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, | |||
1035 | static int snapshot_map(struct dm_target *ti, struct bio *bio, | 1576 | static int snapshot_map(struct dm_target *ti, struct bio *bio, |
1036 | union map_info *map_context) | 1577 | union map_info *map_context) |
1037 | { | 1578 | { |
1038 | struct dm_snap_exception *e; | 1579 | struct dm_exception *e; |
1039 | struct dm_snapshot *s = ti->private; | 1580 | struct dm_snapshot *s = ti->private; |
1040 | int r = DM_MAPIO_REMAPPED; | 1581 | int r = DM_MAPIO_REMAPPED; |
1041 | chunk_t chunk; | 1582 | chunk_t chunk; |
1042 | struct dm_snap_pending_exception *pe = NULL; | 1583 | struct dm_snap_pending_exception *pe = NULL; |
1043 | 1584 | ||
1044 | if (unlikely(bio_empty_barrier(bio))) { | 1585 | if (unlikely(bio_empty_barrier(bio))) { |
1045 | bio->bi_bdev = s->store->cow->bdev; | 1586 | bio->bi_bdev = s->cow->bdev; |
1046 | return DM_MAPIO_REMAPPED; | 1587 | return DM_MAPIO_REMAPPED; |
1047 | } | 1588 | } |
1048 | 1589 | ||
@@ -1063,7 +1604,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1063 | } | 1604 | } |
1064 | 1605 | ||
1065 | /* If the block is already remapped - use that, else remap it */ | 1606 | /* If the block is already remapped - use that, else remap it */ |
1066 | e = lookup_exception(&s->complete, chunk); | 1607 | e = dm_lookup_exception(&s->complete, chunk); |
1067 | if (e) { | 1608 | if (e) { |
1068 | remap_exception(s, e, bio, chunk); | 1609 | remap_exception(s, e, bio, chunk); |
1069 | goto out_unlock; | 1610 | goto out_unlock; |
@@ -1087,7 +1628,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1087 | goto out_unlock; | 1628 | goto out_unlock; |
1088 | } | 1629 | } |
1089 | 1630 | ||
1090 | e = lookup_exception(&s->complete, chunk); | 1631 | e = dm_lookup_exception(&s->complete, chunk); |
1091 | if (e) { | 1632 | if (e) { |
1092 | free_pending_exception(pe); | 1633 | free_pending_exception(pe); |
1093 | remap_exception(s, e, bio, chunk); | 1634 | remap_exception(s, e, bio, chunk); |
@@ -1125,6 +1666,78 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1125 | return r; | 1666 | return r; |
1126 | } | 1667 | } |
1127 | 1668 | ||
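As an illustration (not part of the patch): the remapping done by remap_exception() above is plain chunk arithmetic. The sector keeps its offset within the chunk, while the chunk number is translated to new_chunk plus its distance from old_chunk, which is how a run of consecutive chunks is encoded in a single exception. A minimal user-space sketch, assuming a hypothetical 16-sector chunk size and ignoring dm_chunk_number()'s masking of the consecutive-count bits:

#include <assert.h>
#include <stdint.h>

typedef uint64_t sector_t;
typedef uint64_t chunk_t;

#define CHUNK_SIZE 16ULL              /* sectors per chunk (hypothetical) */
#define CHUNK_MASK (CHUNK_SIZE - 1)

/* Sketch of remap_exception()'s sector calculation. */
static sector_t remap_sector(sector_t sector, chunk_t old_chunk, chunk_t new_chunk)
{
	chunk_t chunk = sector / CHUNK_SIZE;                    /* sector_to_chunk() */

	return (new_chunk + (chunk - old_chunk)) * CHUNK_SIZE   /* chunk_to_sector() */
	       + (sector & CHUNK_MASK);                         /* offset inside the chunk */
}

int main(void)
{
	/* Exception covering consecutive origin chunks 100.. -> COW chunks 7.. */
	assert(remap_sector(100 * CHUNK_SIZE + 5, 100, 7) == 7 * CHUNK_SIZE + 5);
	assert(remap_sector(101 * CHUNK_SIZE + 3, 100, 7) == 8 * CHUNK_SIZE + 3);
	return 0;
}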
1669 | /* | ||
1670 | * A snapshot-merge target behaves like a combination of a snapshot | ||
1671 | * target and a snapshot-origin target. It only generates new | ||
1672 | * exceptions in other snapshots and not in the one that is being | ||
1673 | * merged. | ||
1674 | * | ||
1675 | * For each chunk, if there is an existing exception, it is used to | ||
1676 | * redirect I/O to the cow device. Otherwise I/O is sent to the origin, | ||
1677 | * which in turn might generate exceptions in other snapshots. | ||
1678 | * If merging is currently taking place on the chunk in question, the | ||
1679 | * I/O is deferred by adding it to s->bios_queued_during_merge. | ||
1680 | */ | ||
1681 | static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, | ||
1682 | union map_info *map_context) | ||
1683 | { | ||
1684 | struct dm_exception *e; | ||
1685 | struct dm_snapshot *s = ti->private; | ||
1686 | int r = DM_MAPIO_REMAPPED; | ||
1687 | chunk_t chunk; | ||
1688 | |||
1689 | if (unlikely(bio_empty_barrier(bio))) { | ||
1690 | if (!map_context->flush_request) | ||
1691 | bio->bi_bdev = s->origin->bdev; | ||
1692 | else | ||
1693 | bio->bi_bdev = s->cow->bdev; | ||
1694 | map_context->ptr = NULL; | ||
1695 | return DM_MAPIO_REMAPPED; | ||
1696 | } | ||
1697 | |||
1698 | chunk = sector_to_chunk(s->store, bio->bi_sector); | ||
1699 | |||
1700 | down_write(&s->lock); | ||
1701 | |||
1702 | /* Full merging snapshots are redirected to the origin */ | ||
1703 | if (!s->valid) | ||
1704 | goto redirect_to_origin; | ||
1705 | |||
1706 | /* If the block is already remapped - use that */ | ||
1707 | e = dm_lookup_exception(&s->complete, chunk); | ||
1708 | if (e) { | ||
1709 | /* Queue writes overlapping with chunks being merged */ | ||
1710 | if (bio_rw(bio) == WRITE && | ||
1711 | chunk >= s->first_merging_chunk && | ||
1712 | chunk < (s->first_merging_chunk + | ||
1713 | s->num_merging_chunks)) { | ||
1714 | bio->bi_bdev = s->origin->bdev; | ||
1715 | bio_list_add(&s->bios_queued_during_merge, bio); | ||
1716 | r = DM_MAPIO_SUBMITTED; | ||
1717 | goto out_unlock; | ||
1718 | } | ||
1719 | |||
1720 | remap_exception(s, e, bio, chunk); | ||
1721 | |||
1722 | if (bio_rw(bio) == WRITE) | ||
1723 | map_context->ptr = track_chunk(s, chunk); | ||
1724 | goto out_unlock; | ||
1725 | } | ||
1726 | |||
1727 | redirect_to_origin: | ||
1728 | bio->bi_bdev = s->origin->bdev; | ||
1729 | |||
1730 | if (bio_rw(bio) == WRITE) { | ||
1731 | up_write(&s->lock); | ||
1732 | return do_origin(s->origin, bio); | ||
1733 | } | ||
1734 | |||
1735 | out_unlock: | ||
1736 | up_write(&s->lock); | ||
1737 | |||
1738 | return r; | ||
1739 | } | ||
1740 | |||
1128 | static int snapshot_end_io(struct dm_target *ti, struct bio *bio, | 1741 | static int snapshot_end_io(struct dm_target *ti, struct bio *bio, |
1129 | int error, union map_info *map_context) | 1742 | int error, union map_info *map_context) |
1130 | { | 1743 | { |
@@ -1137,40 +1750,135 @@ static int snapshot_end_io(struct dm_target *ti, struct bio *bio, | |||
1137 | return 0; | 1750 | return 0; |
1138 | } | 1751 | } |
1139 | 1752 | ||
1753 | static void snapshot_merge_presuspend(struct dm_target *ti) | ||
1754 | { | ||
1755 | struct dm_snapshot *s = ti->private; | ||
1756 | |||
1757 | stop_merge(s); | ||
1758 | } | ||
1759 | |||
1760 | static void snapshot_postsuspend(struct dm_target *ti) | ||
1761 | { | ||
1762 | struct dm_snapshot *s = ti->private; | ||
1763 | |||
1764 | down_write(&s->lock); | ||
1765 | s->suspended = 1; | ||
1766 | up_write(&s->lock); | ||
1767 | } | ||
1768 | |||
1769 | static int snapshot_preresume(struct dm_target *ti) | ||
1770 | { | ||
1771 | int r = 0; | ||
1772 | struct dm_snapshot *s = ti->private; | ||
1773 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; | ||
1774 | |||
1775 | down_read(&_origins_lock); | ||
1776 | (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); | ||
1777 | if (snap_src && snap_dest) { | ||
1778 | down_read(&snap_src->lock); | ||
1779 | if (s == snap_src) { | ||
1780 | DMERR("Unable to resume snapshot source until " | ||
1781 | "handover completes."); | ||
1782 | r = -EINVAL; | ||
1783 | } else if (!snap_src->suspended) { | ||
1784 | DMERR("Unable to perform snapshot handover until " | ||
1785 | "source is suspended."); | ||
1786 | r = -EINVAL; | ||
1787 | } | ||
1788 | up_read(&snap_src->lock); | ||
1789 | } | ||
1790 | up_read(&_origins_lock); | ||
1791 | |||
1792 | return r; | ||
1793 | } | ||
1794 | |||
1140 | static void snapshot_resume(struct dm_target *ti) | 1795 | static void snapshot_resume(struct dm_target *ti) |
1141 | { | 1796 | { |
1142 | struct dm_snapshot *s = ti->private; | 1797 | struct dm_snapshot *s = ti->private; |
1798 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; | ||
1799 | |||
1800 | down_read(&_origins_lock); | ||
1801 | (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); | ||
1802 | if (snap_src && snap_dest) { | ||
1803 | down_write(&snap_src->lock); | ||
1804 | down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); | ||
1805 | __handover_exceptions(snap_src, snap_dest); | ||
1806 | up_write(&snap_dest->lock); | ||
1807 | up_write(&snap_src->lock); | ||
1808 | } | ||
1809 | up_read(&_origins_lock); | ||
1810 | |||
1811 | /* Now that we have the correct chunk size, reregister */ | ||
1812 | reregister_snapshot(s); | ||
1143 | 1813 | ||
1144 | down_write(&s->lock); | 1814 | down_write(&s->lock); |
1145 | s->active = 1; | 1815 | s->active = 1; |
1816 | s->suspended = 0; | ||
1146 | up_write(&s->lock); | 1817 | up_write(&s->lock); |
1147 | } | 1818 | } |
1148 | 1819 | ||
1820 | static sector_t get_origin_minimum_chunksize(struct block_device *bdev) | ||
1821 | { | ||
1822 | sector_t min_chunksize; | ||
1823 | |||
1824 | down_read(&_origins_lock); | ||
1825 | min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); | ||
1826 | up_read(&_origins_lock); | ||
1827 | |||
1828 | return min_chunksize; | ||
1829 | } | ||
1830 | |||
1831 | static void snapshot_merge_resume(struct dm_target *ti) | ||
1832 | { | ||
1833 | struct dm_snapshot *s = ti->private; | ||
1834 | |||
1835 | /* | ||
1836 | * Hand over exceptions from the existing snapshot. | ||
1837 | */ | ||
1838 | snapshot_resume(ti); | ||
1839 | |||
1840 | /* | ||
1841 | * snapshot-merge acts as an origin, so set ti->split_io | ||
1842 | */ | ||
1843 | ti->split_io = get_origin_minimum_chunksize(s->origin->bdev); | ||
1844 | |||
1845 | start_merge(s); | ||
1846 | } | ||
1847 | |||
1149 | static int snapshot_status(struct dm_target *ti, status_type_t type, | 1848 | static int snapshot_status(struct dm_target *ti, status_type_t type, |
1150 | char *result, unsigned int maxlen) | 1849 | char *result, unsigned int maxlen) |
1151 | { | 1850 | { |
1152 | unsigned sz = 0; | 1851 | unsigned sz = 0; |
1153 | struct dm_snapshot *snap = ti->private; | 1852 | struct dm_snapshot *snap = ti->private; |
1154 | 1853 | ||
1155 | down_write(&snap->lock); | ||
1156 | |||
1157 | switch (type) { | 1854 | switch (type) { |
1158 | case STATUSTYPE_INFO: | 1855 | case STATUSTYPE_INFO: |
1856 | |||
1857 | down_write(&snap->lock); | ||
1858 | |||
1159 | if (!snap->valid) | 1859 | if (!snap->valid) |
1160 | DMEMIT("Invalid"); | 1860 | DMEMIT("Invalid"); |
1861 | else if (snap->merge_failed) | ||
1862 | DMEMIT("Merge failed"); | ||
1161 | else { | 1863 | else { |
1162 | if (snap->store->type->fraction_full) { | 1864 | if (snap->store->type->usage) { |
1163 | sector_t numerator, denominator; | 1865 | sector_t total_sectors, sectors_allocated, |
1164 | snap->store->type->fraction_full(snap->store, | 1866 | metadata_sectors; |
1165 | &numerator, | 1867 | snap->store->type->usage(snap->store, |
1166 | &denominator); | 1868 | &total_sectors, |
1167 | DMEMIT("%llu/%llu", | 1869 | &sectors_allocated, |
1168 | (unsigned long long)numerator, | 1870 | &metadata_sectors); |
1169 | (unsigned long long)denominator); | 1871 | DMEMIT("%llu/%llu %llu", |
1872 | (unsigned long long)sectors_allocated, | ||
1873 | (unsigned long long)total_sectors, | ||
1874 | (unsigned long long)metadata_sectors); | ||
1170 | } | 1875 | } |
1171 | else | 1876 | else |
1172 | DMEMIT("Unknown"); | 1877 | DMEMIT("Unknown"); |
1173 | } | 1878 | } |
1879 | |||
1880 | up_write(&snap->lock); | ||
1881 | |||
1174 | break; | 1882 | break; |
1175 | 1883 | ||
1176 | case STATUSTYPE_TABLE: | 1884 | case STATUSTYPE_TABLE: |
@@ -1179,14 +1887,12 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, | |||
1179 | * to make private copies if the output is to | 1887 | * to make private copies if the output is to |
1180 | * make sense. | 1888 | * make sense. |
1181 | */ | 1889 | */ |
1182 | DMEMIT("%s", snap->origin->name); | 1890 | DMEMIT("%s %s", snap->origin->name, snap->cow->name); |
1183 | snap->store->type->status(snap->store, type, result + sz, | 1891 | snap->store->type->status(snap->store, type, result + sz, |
1184 | maxlen - sz); | 1892 | maxlen - sz); |
1185 | break; | 1893 | break; |
1186 | } | 1894 | } |
1187 | 1895 | ||
1188 | up_write(&snap->lock); | ||
1189 | |||
1190 | return 0; | 1896 | return 0; |
1191 | } | 1897 | } |
1192 | 1898 | ||
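For orientation (not from this patch): with the fraction_full -> usage change above, the INFO status line gains a third field for metadata. A user-space approximation of the new DMEMIT format, with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long total_sectors = 1048576;    /* size of the COW device          */
	unsigned long long sectors_allocated = 21504;  /* sectors used, incl. metadata    */
	unsigned long long metadata_sectors = 160;     /* sectors holding store metadata  */

	/* Mirrors DMEMIT("%llu/%llu %llu", ...) in snapshot_status() */
	printf("%llu/%llu %llu\n", sectors_allocated, total_sectors, metadata_sectors);
	return 0;
}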
@@ -1202,17 +1908,36 @@ static int snapshot_iterate_devices(struct dm_target *ti, | |||
1202 | /*----------------------------------------------------------------- | 1908 | /*----------------------------------------------------------------- |
1203 | * Origin methods | 1909 | * Origin methods |
1204 | *---------------------------------------------------------------*/ | 1910 | *---------------------------------------------------------------*/ |
1205 | static int __origin_write(struct list_head *snapshots, struct bio *bio) | 1911 | |
1912 | /* | ||
1913 | * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any | ||
1914 | * supplied bio was ignored. The caller may submit it immediately. | ||
1915 | * (No remapping actually occurs as the origin is always a direct linear | ||
1916 | * map.) | ||
1917 | * | ||
1918 | * If further exceptions are required, DM_MAPIO_SUBMITTED is returned | ||
1919 | * and any supplied bio is added to a list to be submitted once all | ||
1920 | * the necessary exceptions exist. | ||
1921 | */ | ||
1922 | static int __origin_write(struct list_head *snapshots, sector_t sector, | ||
1923 | struct bio *bio) | ||
1206 | { | 1924 | { |
1207 | int r = DM_MAPIO_REMAPPED, first = 0; | 1925 | int r = DM_MAPIO_REMAPPED; |
1208 | struct dm_snapshot *snap; | 1926 | struct dm_snapshot *snap; |
1209 | struct dm_snap_exception *e; | 1927 | struct dm_exception *e; |
1210 | struct dm_snap_pending_exception *pe, *next_pe, *primary_pe = NULL; | 1928 | struct dm_snap_pending_exception *pe; |
1929 | struct dm_snap_pending_exception *pe_to_start_now = NULL; | ||
1930 | struct dm_snap_pending_exception *pe_to_start_last = NULL; | ||
1211 | chunk_t chunk; | 1931 | chunk_t chunk; |
1212 | LIST_HEAD(pe_queue); | ||
1213 | 1932 | ||
1214 | /* Do all the snapshots on this origin */ | 1933 | /* Do all the snapshots on this origin */ |
1215 | list_for_each_entry (snap, snapshots, list) { | 1934 | list_for_each_entry (snap, snapshots, list) { |
1935 | /* | ||
1936 | * Don't make new exceptions in a merging snapshot | ||
1937 | * because it has effectively been deleted | ||
1938 | */ | ||
1939 | if (dm_target_is_snapshot_merge(snap->ti)) | ||
1940 | continue; | ||
1216 | 1941 | ||
1217 | down_write(&snap->lock); | 1942 | down_write(&snap->lock); |
1218 | 1943 | ||
@@ -1221,24 +1946,21 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) | |||
1221 | goto next_snapshot; | 1946 | goto next_snapshot; |
1222 | 1947 | ||
1223 | /* Nothing to do if writing beyond end of snapshot */ | 1948 | /* Nothing to do if writing beyond end of snapshot */ |
1224 | if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table)) | 1949 | if (sector >= dm_table_get_size(snap->ti->table)) |
1225 | goto next_snapshot; | 1950 | goto next_snapshot; |
1226 | 1951 | ||
1227 | /* | 1952 | /* |
1228 | * Remember, different snapshots can have | 1953 | * Remember, different snapshots can have |
1229 | * different chunk sizes. | 1954 | * different chunk sizes. |
1230 | */ | 1955 | */ |
1231 | chunk = sector_to_chunk(snap->store, bio->bi_sector); | 1956 | chunk = sector_to_chunk(snap->store, sector); |
1232 | 1957 | ||
1233 | /* | 1958 | /* |
1234 | * Check exception table to see if block | 1959 | * Check exception table to see if block |
1235 | * is already remapped in this snapshot | 1960 | * is already remapped in this snapshot |
1236 | * and trigger an exception if not. | 1961 | * and trigger an exception if not. |
1237 | * | ||
1238 | * ref_count is initialised to 1 so pending_complete() | ||
1239 | * won't destroy the primary_pe while we're inside this loop. | ||
1240 | */ | 1962 | */ |
1241 | e = lookup_exception(&snap->complete, chunk); | 1963 | e = dm_lookup_exception(&snap->complete, chunk); |
1242 | if (e) | 1964 | if (e) |
1243 | goto next_snapshot; | 1965 | goto next_snapshot; |
1244 | 1966 | ||
@@ -1253,7 +1975,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) | |||
1253 | goto next_snapshot; | 1975 | goto next_snapshot; |
1254 | } | 1976 | } |
1255 | 1977 | ||
1256 | e = lookup_exception(&snap->complete, chunk); | 1978 | e = dm_lookup_exception(&snap->complete, chunk); |
1257 | if (e) { | 1979 | if (e) { |
1258 | free_pending_exception(pe); | 1980 | free_pending_exception(pe); |
1259 | goto next_snapshot; | 1981 | goto next_snapshot; |
@@ -1266,59 +1988,43 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) | |||
1266 | } | 1988 | } |
1267 | } | 1989 | } |
1268 | 1990 | ||
1269 | if (!primary_pe) { | 1991 | r = DM_MAPIO_SUBMITTED; |
1270 | /* | ||
1271 | * Either every pe here has same | ||
1272 | * primary_pe or none has one yet. | ||
1273 | */ | ||
1274 | if (pe->primary_pe) | ||
1275 | primary_pe = pe->primary_pe; | ||
1276 | else { | ||
1277 | primary_pe = pe; | ||
1278 | first = 1; | ||
1279 | } | ||
1280 | |||
1281 | bio_list_add(&primary_pe->origin_bios, bio); | ||
1282 | 1992 | ||
1283 | r = DM_MAPIO_SUBMITTED; | 1993 | /* |
1284 | } | 1994 | * If an origin bio was supplied, queue it to wait for the |
1995 | * completion of this exception, and start this one last, | ||
1996 | * at the end of the function. | ||
1997 | */ | ||
1998 | if (bio) { | ||
1999 | bio_list_add(&pe->origin_bios, bio); | ||
2000 | bio = NULL; | ||
1285 | 2001 | ||
1286 | if (!pe->primary_pe) { | 2002 | if (!pe->started) { |
1287 | pe->primary_pe = primary_pe; | 2003 | pe->started = 1; |
1288 | get_pending_exception(primary_pe); | 2004 | pe_to_start_last = pe; |
2005 | } | ||
1289 | } | 2006 | } |
1290 | 2007 | ||
1291 | if (!pe->started) { | 2008 | if (!pe->started) { |
1292 | pe->started = 1; | 2009 | pe->started = 1; |
1293 | list_add_tail(&pe->list, &pe_queue); | 2010 | pe_to_start_now = pe; |
1294 | } | 2011 | } |
1295 | 2012 | ||
1296 | next_snapshot: | 2013 | next_snapshot: |
1297 | up_write(&snap->lock); | 2014 | up_write(&snap->lock); |
1298 | } | ||
1299 | 2015 | ||
1300 | if (!primary_pe) | 2016 | if (pe_to_start_now) { |
1301 | return r; | 2017 | start_copy(pe_to_start_now); |
1302 | 2018 | pe_to_start_now = NULL; | |
1303 | /* | 2019 | } |
1304 | * If this is the first time we're processing this chunk and | ||
1305 | * ref_count is now 1 it means all the pending exceptions | ||
1306 | * got completed while we were in the loop above, so it falls to | ||
1307 | * us here to remove the primary_pe and submit any origin_bios. | ||
1308 | */ | ||
1309 | |||
1310 | if (first && atomic_dec_and_test(&primary_pe->ref_count)) { | ||
1311 | flush_bios(bio_list_get(&primary_pe->origin_bios)); | ||
1312 | free_pending_exception(primary_pe); | ||
1313 | /* If we got here, pe_queue is necessarily empty. */ | ||
1314 | return r; | ||
1315 | } | 2020 | } |
1316 | 2021 | ||
1317 | /* | 2022 | /* |
1318 | * Now that we have a complete pe list we can start the copying. | 2023 | * Submit the exception against which the bio is queued last, |
2024 | * to give the other exceptions a head start. | ||
1319 | */ | 2025 | */ |
1320 | list_for_each_entry_safe(pe, next_pe, &pe_queue, list) | 2026 | if (pe_to_start_last) |
1321 | start_copy(pe); | 2027 | start_copy(pe_to_start_last); |
1322 | 2028 | ||
1323 | return r; | 2029 | return r; |
1324 | } | 2030 | } |
@@ -1334,13 +2040,48 @@ static int do_origin(struct dm_dev *origin, struct bio *bio) | |||
1334 | down_read(&_origins_lock); | 2040 | down_read(&_origins_lock); |
1335 | o = __lookup_origin(origin->bdev); | 2041 | o = __lookup_origin(origin->bdev); |
1336 | if (o) | 2042 | if (o) |
1337 | r = __origin_write(&o->snapshots, bio); | 2043 | r = __origin_write(&o->snapshots, bio->bi_sector, bio); |
1338 | up_read(&_origins_lock); | 2044 | up_read(&_origins_lock); |
1339 | 2045 | ||
1340 | return r; | 2046 | return r; |
1341 | } | 2047 | } |
1342 | 2048 | ||
1343 | /* | 2049 | /* |
2050 | * Trigger exceptions in all non-merging snapshots. | ||
2051 | * | ||
2052 | * The chunk size of the merging snapshot may be larger than the chunk | ||
2053 | * size of some other snapshot so we may need to reallocate multiple | ||
2054 | * chunks in other snapshots. | ||
2055 | * | ||
2056 | * We scan all the overlapping exceptions in the other snapshots. | ||
2057 | * Returns 1 if anything was reallocated and must be waited for, | ||
2058 | * otherwise returns 0. | ||
2059 | * | ||
2060 | * size must be a multiple of merging_snap's chunk_size. | ||
2061 | */ | ||
2062 | static int origin_write_extent(struct dm_snapshot *merging_snap, | ||
2063 | sector_t sector, unsigned size) | ||
2064 | { | ||
2065 | int must_wait = 0; | ||
2066 | sector_t n; | ||
2067 | struct origin *o; | ||
2068 | |||
2069 | /* | ||
2070 | * The origin's __minimum_chunk_size() got stored in split_io | ||
2071 | * by snapshot_merge_resume(). | ||
2072 | */ | ||
2073 | down_read(&_origins_lock); | ||
2074 | o = __lookup_origin(merging_snap->origin->bdev); | ||
2075 | for (n = 0; n < size; n += merging_snap->ti->split_io) | ||
2076 | if (__origin_write(&o->snapshots, sector + n, NULL) == | ||
2077 | DM_MAPIO_SUBMITTED) | ||
2078 | must_wait = 1; | ||
2079 | up_read(&_origins_lock); | ||
2080 | |||
2081 | return must_wait; | ||
2082 | } | ||
2083 | |||
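A hedged worked example of the loop in origin_write_extent(), with made-up sizes: if the merging snapshot uses 64-sector chunks but another snapshot of the same origin uses 16-sector chunks, split_io is 16 (the minimum), so merging one 64-sector extent probes the origin four times and may trigger up to four exceptions in the smaller-chunked snapshot:

#include <stdio.h>

int main(void)
{
	unsigned long long sector = 1024; /* start of the extent being merged        */
	unsigned size = 64;               /* one merging-snapshot chunk, in sectors  */
	unsigned long long split_io = 16; /* minimum chunk size over all snapshots   */

	/* Mirrors: for (n = 0; n < size; n += merging_snap->ti->split_io) */
	for (unsigned long long n = 0; n < size; n += split_io)
		printf("__origin_write() at sector %llu\n", sector + n);
	return 0;
}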
2084 | /* | ||
1344 | * Origin: maps a linear range of a device, with hooks for snapshotting. | 2085 | * Origin: maps a linear range of a device, with hooks for snapshotting. |
1345 | */ | 2086 | */ |
1346 | 2087 | ||
@@ -1391,8 +2132,6 @@ static int origin_map(struct dm_target *ti, struct bio *bio, | |||
1391 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; | 2132 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; |
1392 | } | 2133 | } |
1393 | 2134 | ||
1394 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
1395 | |||
1396 | /* | 2135 | /* |
1397 | * Set the target "split_io" field to the minimum of all the snapshots' | 2136 | * Set the target "split_io" field to the minimum of all the snapshots' |
1398 | * chunk sizes. | 2137 | * chunk sizes. |
@@ -1400,19 +2139,8 @@ static int origin_map(struct dm_target *ti, struct bio *bio, | |||
1400 | static void origin_resume(struct dm_target *ti) | 2139 | static void origin_resume(struct dm_target *ti) |
1401 | { | 2140 | { |
1402 | struct dm_dev *dev = ti->private; | 2141 | struct dm_dev *dev = ti->private; |
1403 | struct dm_snapshot *snap; | ||
1404 | struct origin *o; | ||
1405 | unsigned chunk_size = 0; | ||
1406 | |||
1407 | down_read(&_origins_lock); | ||
1408 | o = __lookup_origin(dev->bdev); | ||
1409 | if (o) | ||
1410 | list_for_each_entry (snap, &o->snapshots, list) | ||
1411 | chunk_size = min_not_zero(chunk_size, | ||
1412 | snap->store->chunk_size); | ||
1413 | up_read(&_origins_lock); | ||
1414 | 2142 | ||
1415 | ti->split_io = chunk_size; | 2143 | ti->split_io = get_origin_minimum_chunksize(dev->bdev); |
1416 | } | 2144 | } |
1417 | 2145 | ||
1418 | static int origin_status(struct dm_target *ti, status_type_t type, char *result, | 2146 | static int origin_status(struct dm_target *ti, status_type_t type, char *result, |
@@ -1455,17 +2183,35 @@ static struct target_type origin_target = { | |||
1455 | 2183 | ||
1456 | static struct target_type snapshot_target = { | 2184 | static struct target_type snapshot_target = { |
1457 | .name = "snapshot", | 2185 | .name = "snapshot", |
1458 | .version = {1, 7, 0}, | 2186 | .version = {1, 9, 0}, |
1459 | .module = THIS_MODULE, | 2187 | .module = THIS_MODULE, |
1460 | .ctr = snapshot_ctr, | 2188 | .ctr = snapshot_ctr, |
1461 | .dtr = snapshot_dtr, | 2189 | .dtr = snapshot_dtr, |
1462 | .map = snapshot_map, | 2190 | .map = snapshot_map, |
1463 | .end_io = snapshot_end_io, | 2191 | .end_io = snapshot_end_io, |
2192 | .postsuspend = snapshot_postsuspend, | ||
2193 | .preresume = snapshot_preresume, | ||
1464 | .resume = snapshot_resume, | 2194 | .resume = snapshot_resume, |
1465 | .status = snapshot_status, | 2195 | .status = snapshot_status, |
1466 | .iterate_devices = snapshot_iterate_devices, | 2196 | .iterate_devices = snapshot_iterate_devices, |
1467 | }; | 2197 | }; |
1468 | 2198 | ||
2199 | static struct target_type merge_target = { | ||
2200 | .name = dm_snapshot_merge_target_name, | ||
2201 | .version = {1, 0, 0}, | ||
2202 | .module = THIS_MODULE, | ||
2203 | .ctr = snapshot_ctr, | ||
2204 | .dtr = snapshot_dtr, | ||
2205 | .map = snapshot_merge_map, | ||
2206 | .end_io = snapshot_end_io, | ||
2207 | .presuspend = snapshot_merge_presuspend, | ||
2208 | .postsuspend = snapshot_postsuspend, | ||
2209 | .preresume = snapshot_preresume, | ||
2210 | .resume = snapshot_merge_resume, | ||
2211 | .status = snapshot_status, | ||
2212 | .iterate_devices = snapshot_iterate_devices, | ||
2213 | }; | ||
2214 | |||
1469 | static int __init dm_snapshot_init(void) | 2215 | static int __init dm_snapshot_init(void) |
1470 | { | 2216 | { |
1471 | int r; | 2217 | int r; |
@@ -1477,7 +2223,7 @@ static int __init dm_snapshot_init(void) | |||
1477 | } | 2223 | } |
1478 | 2224 | ||
1479 | r = dm_register_target(&snapshot_target); | 2225 | r = dm_register_target(&snapshot_target); |
1480 | if (r) { | 2226 | if (r < 0) { |
1481 | DMERR("snapshot target register failed %d", r); | 2227 | DMERR("snapshot target register failed %d", r); |
1482 | goto bad_register_snapshot_target; | 2228 | goto bad_register_snapshot_target; |
1483 | } | 2229 | } |
@@ -1485,34 +2231,40 @@ static int __init dm_snapshot_init(void) | |||
1485 | r = dm_register_target(&origin_target); | 2231 | r = dm_register_target(&origin_target); |
1486 | if (r < 0) { | 2232 | if (r < 0) { |
1487 | DMERR("Origin target register failed %d", r); | 2233 | DMERR("Origin target register failed %d", r); |
1488 | goto bad1; | 2234 | goto bad_register_origin_target; |
2235 | } | ||
2236 | |||
2237 | r = dm_register_target(&merge_target); | ||
2238 | if (r < 0) { | ||
2239 | DMERR("Merge target register failed %d", r); | ||
2240 | goto bad_register_merge_target; | ||
1489 | } | 2241 | } |
1490 | 2242 | ||
1491 | r = init_origin_hash(); | 2243 | r = init_origin_hash(); |
1492 | if (r) { | 2244 | if (r) { |
1493 | DMERR("init_origin_hash failed."); | 2245 | DMERR("init_origin_hash failed."); |
1494 | goto bad2; | 2246 | goto bad_origin_hash; |
1495 | } | 2247 | } |
1496 | 2248 | ||
1497 | exception_cache = KMEM_CACHE(dm_snap_exception, 0); | 2249 | exception_cache = KMEM_CACHE(dm_exception, 0); |
1498 | if (!exception_cache) { | 2250 | if (!exception_cache) { |
1499 | DMERR("Couldn't create exception cache."); | 2251 | DMERR("Couldn't create exception cache."); |
1500 | r = -ENOMEM; | 2252 | r = -ENOMEM; |
1501 | goto bad3; | 2253 | goto bad_exception_cache; |
1502 | } | 2254 | } |
1503 | 2255 | ||
1504 | pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); | 2256 | pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); |
1505 | if (!pending_cache) { | 2257 | if (!pending_cache) { |
1506 | DMERR("Couldn't create pending cache."); | 2258 | DMERR("Couldn't create pending cache."); |
1507 | r = -ENOMEM; | 2259 | r = -ENOMEM; |
1508 | goto bad4; | 2260 | goto bad_pending_cache; |
1509 | } | 2261 | } |
1510 | 2262 | ||
1511 | tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); | 2263 | tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); |
1512 | if (!tracked_chunk_cache) { | 2264 | if (!tracked_chunk_cache) { |
1513 | DMERR("Couldn't create cache to track chunks in use."); | 2265 | DMERR("Couldn't create cache to track chunks in use."); |
1514 | r = -ENOMEM; | 2266 | r = -ENOMEM; |
1515 | goto bad5; | 2267 | goto bad_tracked_chunk_cache; |
1516 | } | 2268 | } |
1517 | 2269 | ||
1518 | ksnapd = create_singlethread_workqueue("ksnapd"); | 2270 | ksnapd = create_singlethread_workqueue("ksnapd"); |
@@ -1526,19 +2278,21 @@ static int __init dm_snapshot_init(void) | |||
1526 | 2278 | ||
1527 | bad_pending_pool: | 2279 | bad_pending_pool: |
1528 | kmem_cache_destroy(tracked_chunk_cache); | 2280 | kmem_cache_destroy(tracked_chunk_cache); |
1529 | bad5: | 2281 | bad_tracked_chunk_cache: |
1530 | kmem_cache_destroy(pending_cache); | 2282 | kmem_cache_destroy(pending_cache); |
1531 | bad4: | 2283 | bad_pending_cache: |
1532 | kmem_cache_destroy(exception_cache); | 2284 | kmem_cache_destroy(exception_cache); |
1533 | bad3: | 2285 | bad_exception_cache: |
1534 | exit_origin_hash(); | 2286 | exit_origin_hash(); |
1535 | bad2: | 2287 | bad_origin_hash: |
2288 | dm_unregister_target(&merge_target); | ||
2289 | bad_register_merge_target: | ||
1536 | dm_unregister_target(&origin_target); | 2290 | dm_unregister_target(&origin_target); |
1537 | bad1: | 2291 | bad_register_origin_target: |
1538 | dm_unregister_target(&snapshot_target); | 2292 | dm_unregister_target(&snapshot_target); |
1539 | |||
1540 | bad_register_snapshot_target: | 2293 | bad_register_snapshot_target: |
1541 | dm_exception_store_exit(); | 2294 | dm_exception_store_exit(); |
2295 | |||
1542 | return r; | 2296 | return r; |
1543 | } | 2297 | } |
1544 | 2298 | ||
@@ -1548,6 +2302,7 @@ static void __exit dm_snapshot_exit(void) | |||
1548 | 2302 | ||
1549 | dm_unregister_target(&snapshot_target); | 2303 | dm_unregister_target(&snapshot_target); |
1550 | dm_unregister_target(&origin_target); | 2304 | dm_unregister_target(&origin_target); |
2305 | dm_unregister_target(&merge_target); | ||
1551 | 2306 | ||
1552 | exit_origin_hash(); | 2307 | exit_origin_hash(); |
1553 | kmem_cache_destroy(pending_cache); | 2308 | kmem_cache_destroy(pending_cache); |
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index 4b045903a4e2..f53392df7b97 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c | |||
@@ -59,7 +59,7 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) | |||
59 | 59 | ||
60 | static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) | 60 | static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) |
61 | { | 61 | { |
62 | sprintf(buf, "%d\n", dm_suspended(md)); | 62 | sprintf(buf, "%d\n", dm_suspended_md(md)); |
63 | 63 | ||
64 | return strlen(buf); | 64 | return strlen(buf); |
65 | } | 65 | } |
@@ -80,12 +80,20 @@ static struct sysfs_ops dm_sysfs_ops = { | |||
80 | }; | 80 | }; |
81 | 81 | ||
82 | /* | 82 | /* |
83 | * The sysfs structure is embedded in md struct, nothing to do here | ||
84 | */ | ||
85 | static void dm_sysfs_release(struct kobject *kobj) | ||
86 | { | ||
87 | } | ||
88 | |||
89 | /* | ||
83 | * dm kobject is embedded in mapped_device structure | 90 | * dm kobject is embedded in mapped_device structure |
84 | * no need to define release function here | 91 | * no need to define release function here |
85 | */ | 92 | */ |
86 | static struct kobj_type dm_ktype = { | 93 | static struct kobj_type dm_ktype = { |
87 | .sysfs_ops = &dm_sysfs_ops, | 94 | .sysfs_ops = &dm_sysfs_ops, |
88 | .default_attrs = dm_attrs, | 95 | .default_attrs = dm_attrs, |
96 | .release = dm_sysfs_release | ||
89 | }; | 97 | }; |
90 | 98 | ||
91 | /* | 99 | /* |
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 91976e8fae5f..be625475cf6d 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -238,6 +238,9 @@ void dm_table_destroy(struct dm_table *t) | |||
238 | { | 238 | { |
239 | unsigned int i; | 239 | unsigned int i; |
240 | 240 | ||
241 | if (!t) | ||
242 | return; | ||
243 | |||
241 | while (atomic_read(&t->holders)) | 244 | while (atomic_read(&t->holders)) |
242 | msleep(1); | 245 | msleep(1); |
243 | smp_mb(); | 246 | smp_mb(); |
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c index 6f65883aef12..c7c555a8c7b2 100644 --- a/drivers/md/dm-uevent.c +++ b/drivers/md/dm-uevent.c | |||
@@ -139,14 +139,13 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj) | |||
139 | list_del_init(&event->elist); | 139 | list_del_init(&event->elist); |
140 | 140 | ||
141 | /* | 141 | /* |
142 | * Need to call dm_copy_name_and_uuid from here for now. | 142 | * When a device is being removed this copy fails and we |
143 | * Context of previous var adds and locking used for | 143 | * discard these unsent events. |
144 | * hash_cell not compatable. | ||
145 | */ | 144 | */ |
146 | if (dm_copy_name_and_uuid(event->md, event->name, | 145 | if (dm_copy_name_and_uuid(event->md, event->name, |
147 | event->uuid)) { | 146 | event->uuid)) { |
148 | DMERR("%s: dm_copy_name_and_uuid() failed", | 147 | DMINFO("%s: skipping sending uevent for lost device", |
149 | __func__); | 148 | __func__); |
150 | goto uevent_free; | 149 | goto uevent_free; |
151 | } | 150 | } |
152 | 151 | ||
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 724efc63904d..3167480b532c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -143,9 +143,19 @@ struct mapped_device { | |||
143 | int barrier_error; | 143 | int barrier_error; |
144 | 144 | ||
145 | /* | 145 | /* |
146 | * Protect barrier_error from concurrent endio processing | ||
147 | * in request-based dm. | ||
148 | */ | ||
149 | spinlock_t barrier_error_lock; | ||
150 | |||
151 | /* | ||
146 | * Processing queue (flush/barriers) | 152 | * Processing queue (flush/barriers) |
147 | */ | 153 | */ |
148 | struct workqueue_struct *wq; | 154 | struct workqueue_struct *wq; |
155 | struct work_struct barrier_work; | ||
156 | |||
157 | /* A pointer to the currently processing pre/post flush request */ | ||
158 | struct request *flush_request; | ||
149 | 159 | ||
150 | /* | 160 | /* |
151 | * The current mapping. | 161 | * The current mapping. |
@@ -178,9 +188,6 @@ struct mapped_device { | |||
178 | /* forced geometry settings */ | 188 | /* forced geometry settings */ |
179 | struct hd_geometry geometry; | 189 | struct hd_geometry geometry; |
180 | 190 | ||
181 | /* marker of flush suspend for request-based dm */ | ||
182 | struct request suspend_rq; | ||
183 | |||
184 | /* For saving the address of __make_request for request based dm */ | 191 | /* For saving the address of __make_request for request based dm */ |
185 | make_request_fn *saved_make_request_fn; | 192 | make_request_fn *saved_make_request_fn; |
186 | 193 | ||
@@ -275,6 +282,7 @@ static int (*_inits[])(void) __initdata = { | |||
275 | dm_target_init, | 282 | dm_target_init, |
276 | dm_linear_init, | 283 | dm_linear_init, |
277 | dm_stripe_init, | 284 | dm_stripe_init, |
285 | dm_io_init, | ||
278 | dm_kcopyd_init, | 286 | dm_kcopyd_init, |
279 | dm_interface_init, | 287 | dm_interface_init, |
280 | }; | 288 | }; |
@@ -284,6 +292,7 @@ static void (*_exits[])(void) = { | |||
284 | dm_target_exit, | 292 | dm_target_exit, |
285 | dm_linear_exit, | 293 | dm_linear_exit, |
286 | dm_stripe_exit, | 294 | dm_stripe_exit, |
295 | dm_io_exit, | ||
287 | dm_kcopyd_exit, | 296 | dm_kcopyd_exit, |
288 | dm_interface_exit, | 297 | dm_interface_exit, |
289 | }; | 298 | }; |
@@ -320,6 +329,11 @@ static void __exit dm_exit(void) | |||
320 | /* | 329 | /* |
321 | * Block device functions | 330 | * Block device functions |
322 | */ | 331 | */ |
332 | int dm_deleting_md(struct mapped_device *md) | ||
333 | { | ||
334 | return test_bit(DMF_DELETING, &md->flags); | ||
335 | } | ||
336 | |||
323 | static int dm_blk_open(struct block_device *bdev, fmode_t mode) | 337 | static int dm_blk_open(struct block_device *bdev, fmode_t mode) |
324 | { | 338 | { |
325 | struct mapped_device *md; | 339 | struct mapped_device *md; |
@@ -331,7 +345,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) | |||
331 | goto out; | 345 | goto out; |
332 | 346 | ||
333 | if (test_bit(DMF_FREEING, &md->flags) || | 347 | if (test_bit(DMF_FREEING, &md->flags) || |
334 | test_bit(DMF_DELETING, &md->flags)) { | 348 | dm_deleting_md(md)) { |
335 | md = NULL; | 349 | md = NULL; |
336 | goto out; | 350 | goto out; |
337 | } | 351 | } |
@@ -388,7 +402,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, | |||
388 | unsigned int cmd, unsigned long arg) | 402 | unsigned int cmd, unsigned long arg) |
389 | { | 403 | { |
390 | struct mapped_device *md = bdev->bd_disk->private_data; | 404 | struct mapped_device *md = bdev->bd_disk->private_data; |
391 | struct dm_table *map = dm_get_table(md); | 405 | struct dm_table *map = dm_get_live_table(md); |
392 | struct dm_target *tgt; | 406 | struct dm_target *tgt; |
393 | int r = -ENOTTY; | 407 | int r = -ENOTTY; |
394 | 408 | ||
@@ -401,7 +415,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, | |||
401 | 415 | ||
402 | tgt = dm_table_get_target(map, 0); | 416 | tgt = dm_table_get_target(map, 0); |
403 | 417 | ||
404 | if (dm_suspended(md)) { | 418 | if (dm_suspended_md(md)) { |
405 | r = -EAGAIN; | 419 | r = -EAGAIN; |
406 | goto out; | 420 | goto out; |
407 | } | 421 | } |
@@ -430,9 +444,10 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio) | |||
430 | mempool_free(tio, md->tio_pool); | 444 | mempool_free(tio, md->tio_pool); |
431 | } | 445 | } |
432 | 446 | ||
433 | static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) | 447 | static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, |
448 | gfp_t gfp_mask) | ||
434 | { | 449 | { |
435 | return mempool_alloc(md->tio_pool, GFP_ATOMIC); | 450 | return mempool_alloc(md->tio_pool, gfp_mask); |
436 | } | 451 | } |
437 | 452 | ||
438 | static void free_rq_tio(struct dm_rq_target_io *tio) | 453 | static void free_rq_tio(struct dm_rq_target_io *tio) |
@@ -450,6 +465,12 @@ static void free_bio_info(struct dm_rq_clone_bio_info *info) | |||
450 | mempool_free(info, info->tio->md->io_pool); | 465 | mempool_free(info, info->tio->md->io_pool); |
451 | } | 466 | } |
452 | 467 | ||
468 | static int md_in_flight(struct mapped_device *md) | ||
469 | { | ||
470 | return atomic_read(&md->pending[READ]) + | ||
471 | atomic_read(&md->pending[WRITE]); | ||
472 | } | ||
473 | |||
453 | static void start_io_acct(struct dm_io *io) | 474 | static void start_io_acct(struct dm_io *io) |
454 | { | 475 | { |
455 | struct mapped_device *md = io->md; | 476 | struct mapped_device *md = io->md; |
@@ -512,7 +533,7 @@ static void queue_io(struct mapped_device *md, struct bio *bio) | |||
512 | * function to access the md->map field, and make sure they call | 533 | * function to access the md->map field, and make sure they call |
513 | * dm_table_put() when finished. | 534 | * dm_table_put() when finished. |
514 | */ | 535 | */ |
515 | struct dm_table *dm_get_table(struct mapped_device *md) | 536 | struct dm_table *dm_get_live_table(struct mapped_device *md) |
516 | { | 537 | { |
517 | struct dm_table *t; | 538 | struct dm_table *t; |
518 | unsigned long flags; | 539 | unsigned long flags; |
@@ -716,28 +737,38 @@ static void end_clone_bio(struct bio *clone, int error) | |||
716 | blk_update_request(tio->orig, 0, nr_bytes); | 737 | blk_update_request(tio->orig, 0, nr_bytes); |
717 | } | 738 | } |
718 | 739 | ||
740 | static void store_barrier_error(struct mapped_device *md, int error) | ||
741 | { | ||
742 | unsigned long flags; | ||
743 | |||
744 | spin_lock_irqsave(&md->barrier_error_lock, flags); | ||
745 | /* | ||
746 | * Basically, the first error is taken, but: | ||
747 | * -EOPNOTSUPP supersedes any I/O error. | ||
748 | * Requeue request supersedes any I/O error but -EOPNOTSUPP. | ||
749 | */ | ||
750 | if (!md->barrier_error || error == -EOPNOTSUPP || | ||
751 | (md->barrier_error != -EOPNOTSUPP && | ||
752 | error == DM_ENDIO_REQUEUE)) | ||
753 | md->barrier_error = error; | ||
754 | spin_unlock_irqrestore(&md->barrier_error_lock, flags); | ||
755 | } | ||
756 | |||
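To make the precedence described in the comment above concrete, it can be read as a priority ordering, highest first: -EOPNOTSUPP, then DM_ENDIO_REQUEUE, then whichever ordinary I/O error arrived first. A small user-space check of store_barrier_error()'s update rule (the exact positive value of DM_ENDIO_REQUEUE is immaterial to the rule):

#include <assert.h>
#include <errno.h>

#define DM_ENDIO_REQUEUE 2	/* positive constant as in device-mapper.h; exact value immaterial here */

/* Pure-function restatement of store_barrier_error()'s update rule. */
static int merge_barrier_error(int cur, int err)
{
	if (!cur || err == -EOPNOTSUPP ||
	    (cur != -EOPNOTSUPP && err == DM_ENDIO_REQUEUE))
		return err;
	return cur;
}

int main(void)
{
	assert(merge_barrier_error(0, -EIO) == -EIO);              /* first error is kept        */
	assert(merge_barrier_error(-EIO, -ENOSPC) == -EIO);        /* later plain errors ignored */
	assert(merge_barrier_error(-EIO, DM_ENDIO_REQUEUE) == DM_ENDIO_REQUEUE);
	assert(merge_barrier_error(DM_ENDIO_REQUEUE, -EIO) == DM_ENDIO_REQUEUE);
	assert(merge_barrier_error(-EIO, -EOPNOTSUPP) == -EOPNOTSUPP);
	assert(merge_barrier_error(-EOPNOTSUPP, DM_ENDIO_REQUEUE) == -EOPNOTSUPP);
	return 0;
}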
719 | /* | 757 | /* |
720 | * Don't touch any member of the md after calling this function because | 758 | * Don't touch any member of the md after calling this function because |
721 | * the md may be freed in dm_put() at the end of this function. | 759 | * the md may be freed in dm_put() at the end of this function. |
722 | * Or do dm_get() before calling this function and dm_put() later. | 760 | * Or do dm_get() before calling this function and dm_put() later. |
723 | */ | 761 | */ |
724 | static void rq_completed(struct mapped_device *md, int run_queue) | 762 | static void rq_completed(struct mapped_device *md, int rw, int run_queue) |
725 | { | 763 | { |
726 | int wakeup_waiters = 0; | 764 | atomic_dec(&md->pending[rw]); |
727 | struct request_queue *q = md->queue; | ||
728 | unsigned long flags; | ||
729 | |||
730 | spin_lock_irqsave(q->queue_lock, flags); | ||
731 | if (!queue_in_flight(q)) | ||
732 | wakeup_waiters = 1; | ||
733 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
734 | 765 | ||
735 | /* nudge anyone waiting on suspend queue */ | 766 | /* nudge anyone waiting on suspend queue */ |
736 | if (wakeup_waiters) | 767 | if (!md_in_flight(md)) |
737 | wake_up(&md->wait); | 768 | wake_up(&md->wait); |
738 | 769 | ||
739 | if (run_queue) | 770 | if (run_queue) |
740 | blk_run_queue(q); | 771 | blk_run_queue(md->queue); |
741 | 772 | ||
742 | /* | 773 | /* |
743 | * dm_put() must be at the end of this function. See the comment above | 774 | * dm_put() must be at the end of this function. See the comment above |
@@ -753,6 +784,44 @@ static void free_rq_clone(struct request *clone) | |||
753 | free_rq_tio(tio); | 784 | free_rq_tio(tio); |
754 | } | 785 | } |
755 | 786 | ||
787 | /* | ||
788 | * Complete the clone and the original request. | ||
789 | * Must be called without queue lock. | ||
790 | */ | ||
791 | static void dm_end_request(struct request *clone, int error) | ||
792 | { | ||
793 | int rw = rq_data_dir(clone); | ||
794 | int run_queue = 1; | ||
795 | bool is_barrier = blk_barrier_rq(clone); | ||
796 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
797 | struct mapped_device *md = tio->md; | ||
798 | struct request *rq = tio->orig; | ||
799 | |||
800 | if (blk_pc_request(rq) && !is_barrier) { | ||
801 | rq->errors = clone->errors; | ||
802 | rq->resid_len = clone->resid_len; | ||
803 | |||
804 | if (rq->sense) | ||
805 | /* | ||
806 | * We are using the sense buffer of the original | ||
807 | * request. | ||
808 | * So setting the length of the sense data is enough. | ||
809 | */ | ||
810 | rq->sense_len = clone->sense_len; | ||
811 | } | ||
812 | |||
813 | free_rq_clone(clone); | ||
814 | |||
815 | if (unlikely(is_barrier)) { | ||
816 | if (unlikely(error)) | ||
817 | store_barrier_error(md, error); | ||
818 | run_queue = 0; | ||
819 | } else | ||
820 | blk_end_request_all(rq, error); | ||
821 | |||
822 | rq_completed(md, rw, run_queue); | ||
823 | } | ||
824 | |||
756 | static void dm_unprep_request(struct request *rq) | 825 | static void dm_unprep_request(struct request *rq) |
757 | { | 826 | { |
758 | struct request *clone = rq->special; | 827 | struct request *clone = rq->special; |
@@ -768,12 +837,23 @@ static void dm_unprep_request(struct request *rq) | |||
768 | */ | 837 | */ |
769 | void dm_requeue_unmapped_request(struct request *clone) | 838 | void dm_requeue_unmapped_request(struct request *clone) |
770 | { | 839 | { |
840 | int rw = rq_data_dir(clone); | ||
771 | struct dm_rq_target_io *tio = clone->end_io_data; | 841 | struct dm_rq_target_io *tio = clone->end_io_data; |
772 | struct mapped_device *md = tio->md; | 842 | struct mapped_device *md = tio->md; |
773 | struct request *rq = tio->orig; | 843 | struct request *rq = tio->orig; |
774 | struct request_queue *q = rq->q; | 844 | struct request_queue *q = rq->q; |
775 | unsigned long flags; | 845 | unsigned long flags; |
776 | 846 | ||
847 | if (unlikely(blk_barrier_rq(clone))) { | ||
848 | /* | ||
849 | * Barrier clones share an original request. | ||
850 | * Leave it to dm_end_request(), which handles this special | ||
851 | * case. | ||
852 | */ | ||
853 | dm_end_request(clone, DM_ENDIO_REQUEUE); | ||
854 | return; | ||
855 | } | ||
856 | |||
777 | dm_unprep_request(rq); | 857 | dm_unprep_request(rq); |
778 | 858 | ||
779 | spin_lock_irqsave(q->queue_lock, flags); | 859 | spin_lock_irqsave(q->queue_lock, flags); |
@@ -782,7 +862,7 @@ void dm_requeue_unmapped_request(struct request *clone) | |||
782 | blk_requeue_request(q, rq); | 862 | blk_requeue_request(q, rq); |
783 | spin_unlock_irqrestore(q->queue_lock, flags); | 863 | spin_unlock_irqrestore(q->queue_lock, flags); |
784 | 864 | ||
785 | rq_completed(md, 0); | 865 | rq_completed(md, rw, 0); |
786 | } | 866 | } |
787 | EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); | 867 | EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); |
788 | 868 | ||
@@ -815,34 +895,28 @@ static void start_queue(struct request_queue *q) | |||
815 | spin_unlock_irqrestore(q->queue_lock, flags); | 895 | spin_unlock_irqrestore(q->queue_lock, flags); |
816 | } | 896 | } |
817 | 897 | ||
818 | /* | 898 | static void dm_done(struct request *clone, int error, bool mapped) |
819 | * Complete the clone and the original request. | ||
820 | * Must be called without queue lock. | ||
821 | */ | ||
822 | static void dm_end_request(struct request *clone, int error) | ||
823 | { | 899 | { |
900 | int r = error; | ||
824 | struct dm_rq_target_io *tio = clone->end_io_data; | 901 | struct dm_rq_target_io *tio = clone->end_io_data; |
825 | struct mapped_device *md = tio->md; | 902 | dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; |
826 | struct request *rq = tio->orig; | ||
827 | 903 | ||
828 | if (blk_pc_request(rq)) { | 904 | if (mapped && rq_end_io) |
829 | rq->errors = clone->errors; | 905 | r = rq_end_io(tio->ti, clone, error, &tio->info); |
830 | rq->resid_len = clone->resid_len; | ||
831 | 906 | ||
832 | if (rq->sense) | 907 | if (r <= 0) |
833 | /* | 908 | /* The target wants to complete the I/O */ |
834 | * We are using the sense buffer of the original | 909 | dm_end_request(clone, r); |
835 | * request. | 910 | else if (r == DM_ENDIO_INCOMPLETE) |
836 | * So setting the length of the sense data is enough. | 911 | /* The target will handle the I/O */ |
837 | */ | 912 | return; |
838 | rq->sense_len = clone->sense_len; | 913 | else if (r == DM_ENDIO_REQUEUE) |
914 | /* The target wants to requeue the I/O */ | ||
915 | dm_requeue_unmapped_request(clone); | ||
916 | else { | ||
917 | DMWARN("unimplemented target endio return value: %d", r); | ||
918 | BUG(); | ||
839 | } | 919 | } |
840 | |||
841 | free_rq_clone(clone); | ||
842 | |||
843 | blk_end_request_all(rq, error); | ||
844 | |||
845 | rq_completed(md, 1); | ||
846 | } | 920 | } |
847 | 921 | ||
848 | /* | 922 | /* |
@@ -850,27 +924,14 @@ static void dm_end_request(struct request *clone, int error) | |||
850 | */ | 924 | */ |
851 | static void dm_softirq_done(struct request *rq) | 925 | static void dm_softirq_done(struct request *rq) |
852 | { | 926 | { |
927 | bool mapped = true; | ||
853 | struct request *clone = rq->completion_data; | 928 | struct request *clone = rq->completion_data; |
854 | struct dm_rq_target_io *tio = clone->end_io_data; | 929 | struct dm_rq_target_io *tio = clone->end_io_data; |
855 | dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; | ||
856 | int error = tio->error; | ||
857 | 930 | ||
858 | if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) | 931 | if (rq->cmd_flags & REQ_FAILED) |
859 | error = rq_end_io(tio->ti, clone, error, &tio->info); | 932 | mapped = false; |
860 | 933 | ||
861 | if (error <= 0) | 934 | dm_done(clone, tio->error, mapped); |
862 | /* The target wants to complete the I/O */ | ||
863 | dm_end_request(clone, error); | ||
864 | else if (error == DM_ENDIO_INCOMPLETE) | ||
865 | /* The target will handle the I/O */ | ||
866 | return; | ||
867 | else if (error == DM_ENDIO_REQUEUE) | ||
868 | /* The target wants to requeue the I/O */ | ||
869 | dm_requeue_unmapped_request(clone); | ||
870 | else { | ||
871 | DMWARN("unimplemented target endio return value: %d", error); | ||
872 | BUG(); | ||
873 | } | ||
874 | } | 935 | } |
875 | 936 | ||
876 | /* | 937 | /* |
@@ -882,6 +943,19 @@ static void dm_complete_request(struct request *clone, int error) | |||
882 | struct dm_rq_target_io *tio = clone->end_io_data; | 943 | struct dm_rq_target_io *tio = clone->end_io_data; |
883 | struct request *rq = tio->orig; | 944 | struct request *rq = tio->orig; |
884 | 945 | ||
946 | if (unlikely(blk_barrier_rq(clone))) { | ||
947 | /* | ||
948 | * Barrier clones share an original request. So can't use | ||
949 | * softirq_done with the original. | ||
950 | * Pass the clone to dm_done() directly in this special case. | ||
951 | * It is safe (even if clone->q->queue_lock is held here) | ||
952 | * because there is no I/O dispatching during the completion | ||
953 | * of barrier clone. | ||
954 | */ | ||
955 | dm_done(clone, error, true); | ||
956 | return; | ||
957 | } | ||
958 | |||
885 | tio->error = error; | 959 | tio->error = error; |
886 | rq->completion_data = clone; | 960 | rq->completion_data = clone; |
887 | blk_complete_request(rq); | 961 | blk_complete_request(rq); |
@@ -898,6 +972,17 @@ void dm_kill_unmapped_request(struct request *clone, int error) | |||
898 | struct dm_rq_target_io *tio = clone->end_io_data; | 972 | struct dm_rq_target_io *tio = clone->end_io_data; |
899 | struct request *rq = tio->orig; | 973 | struct request *rq = tio->orig; |
900 | 974 | ||
975 | if (unlikely(blk_barrier_rq(clone))) { | ||
976 | /* | ||
977 | * Barrier clones share an original request. | ||
978 | * Leave it to dm_end_request(), which handles this special | ||
979 | * case. | ||
980 | */ | ||
981 | BUG_ON(error > 0); | ||
982 | dm_end_request(clone, error); | ||
983 | return; | ||
984 | } | ||
985 | |||
901 | rq->cmd_flags |= REQ_FAILED; | 986 | rq->cmd_flags |= REQ_FAILED; |
902 | dm_complete_request(clone, error); | 987 | dm_complete_request(clone, error); |
903 | } | 988 | } |
@@ -1214,7 +1299,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
1214 | struct clone_info ci; | 1299 | struct clone_info ci; |
1215 | int error = 0; | 1300 | int error = 0; |
1216 | 1301 | ||
1217 | ci.map = dm_get_table(md); | 1302 | ci.map = dm_get_live_table(md); |
1218 | if (unlikely(!ci.map)) { | 1303 | if (unlikely(!ci.map)) { |
1219 | if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) | 1304 | if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) |
1220 | bio_io_error(bio); | 1305 | bio_io_error(bio); |
@@ -1255,7 +1340,7 @@ static int dm_merge_bvec(struct request_queue *q, | |||
1255 | struct bio_vec *biovec) | 1340 | struct bio_vec *biovec) |
1256 | { | 1341 | { |
1257 | struct mapped_device *md = q->queuedata; | 1342 | struct mapped_device *md = q->queuedata; |
1258 | struct dm_table *map = dm_get_table(md); | 1343 | struct dm_table *map = dm_get_live_table(md); |
1259 | struct dm_target *ti; | 1344 | struct dm_target *ti; |
1260 | sector_t max_sectors; | 1345 | sector_t max_sectors; |
1261 | int max_size = 0; | 1346 | int max_size = 0; |
@@ -1352,11 +1437,6 @@ static int dm_make_request(struct request_queue *q, struct bio *bio) | |||
1352 | { | 1437 | { |
1353 | struct mapped_device *md = q->queuedata; | 1438 | struct mapped_device *md = q->queuedata; |
1354 | 1439 | ||
1355 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | ||
1356 | bio_endio(bio, -EOPNOTSUPP); | ||
1357 | return 0; | ||
1358 | } | ||
1359 | |||
1360 | return md->saved_make_request_fn(q, bio); /* call __make_request() */ | 1440 | return md->saved_make_request_fn(q, bio); /* call __make_request() */ |
1361 | } | 1441 | } |
1362 | 1442 | ||
@@ -1375,6 +1455,25 @@ static int dm_request(struct request_queue *q, struct bio *bio) | |||
1375 | return _dm_request(q, bio); | 1455 | return _dm_request(q, bio); |
1376 | } | 1456 | } |
1377 | 1457 | ||
1458 | /* | ||
1459 | * Mark this request as a flush request, so that dm_request_fn() can | ||
1460 | * recognize it. | ||
1461 | */ | ||
1462 | static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq) | ||
1463 | { | ||
1464 | rq->cmd_type = REQ_TYPE_LINUX_BLOCK; | ||
1465 | rq->cmd[0] = REQ_LB_OP_FLUSH; | ||
1466 | } | ||
1467 | |||
1468 | static bool dm_rq_is_flush_request(struct request *rq) | ||
1469 | { | ||
1470 | if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK && | ||
1471 | rq->cmd[0] == REQ_LB_OP_FLUSH) | ||
1472 | return true; | ||
1473 | else | ||
1474 | return false; | ||
1475 | } | ||
1476 | |||
1378 | void dm_dispatch_request(struct request *rq) | 1477 | void dm_dispatch_request(struct request *rq) |
1379 | { | 1478 | { |
1380 | int r; | 1479 | int r; |
@@ -1420,25 +1519,54 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, | |||
1420 | static int setup_clone(struct request *clone, struct request *rq, | 1519 | static int setup_clone(struct request *clone, struct request *rq, |
1421 | struct dm_rq_target_io *tio) | 1520 | struct dm_rq_target_io *tio) |
1422 | { | 1521 | { |
1423 | int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, | 1522 | int r; |
1424 | dm_rq_bio_constructor, tio); | ||
1425 | 1523 | ||
1426 | if (r) | 1524 | if (dm_rq_is_flush_request(rq)) { |
1427 | return r; | 1525 | blk_rq_init(NULL, clone); |
1526 | clone->cmd_type = REQ_TYPE_FS; | ||
1527 | clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); | ||
1528 | } else { | ||
1529 | r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, | ||
1530 | dm_rq_bio_constructor, tio); | ||
1531 | if (r) | ||
1532 | return r; | ||
1533 | |||
1534 | clone->cmd = rq->cmd; | ||
1535 | clone->cmd_len = rq->cmd_len; | ||
1536 | clone->sense = rq->sense; | ||
1537 | clone->buffer = rq->buffer; | ||
1538 | } | ||
1428 | 1539 | ||
1429 | clone->cmd = rq->cmd; | ||
1430 | clone->cmd_len = rq->cmd_len; | ||
1431 | clone->sense = rq->sense; | ||
1432 | clone->buffer = rq->buffer; | ||
1433 | clone->end_io = end_clone_request; | 1540 | clone->end_io = end_clone_request; |
1434 | clone->end_io_data = tio; | 1541 | clone->end_io_data = tio; |
1435 | 1542 | ||
1436 | return 0; | 1543 | return 0; |
1437 | } | 1544 | } |
1438 | 1545 | ||
1439 | static int dm_rq_flush_suspending(struct mapped_device *md) | 1546 | static struct request *clone_rq(struct request *rq, struct mapped_device *md, |
1547 | gfp_t gfp_mask) | ||
1440 | { | 1548 | { |
1441 | return !md->suspend_rq.special; | 1549 | struct request *clone; |
1550 | struct dm_rq_target_io *tio; | ||
1551 | |||
1552 | tio = alloc_rq_tio(md, gfp_mask); | ||
1553 | if (!tio) | ||
1554 | return NULL; | ||
1555 | |||
1556 | tio->md = md; | ||
1557 | tio->ti = NULL; | ||
1558 | tio->orig = rq; | ||
1559 | tio->error = 0; | ||
1560 | memset(&tio->info, 0, sizeof(tio->info)); | ||
1561 | |||
1562 | clone = &tio->clone; | ||
1563 | if (setup_clone(clone, rq, tio)) { | ||
1564 | /* -ENOMEM */ | ||
1565 | free_rq_tio(tio); | ||
1566 | return NULL; | ||
1567 | } | ||
1568 | |||
1569 | return clone; | ||
1442 | } | 1570 | } |
1443 | 1571 | ||
1444 | /* | 1572 | /* |
@@ -1447,39 +1575,19 @@ static int dm_rq_flush_suspending(struct mapped_device *md) | |||
1447 | static int dm_prep_fn(struct request_queue *q, struct request *rq) | 1575 | static int dm_prep_fn(struct request_queue *q, struct request *rq) |
1448 | { | 1576 | { |
1449 | struct mapped_device *md = q->queuedata; | 1577 | struct mapped_device *md = q->queuedata; |
1450 | struct dm_rq_target_io *tio; | ||
1451 | struct request *clone; | 1578 | struct request *clone; |
1452 | 1579 | ||
1453 | if (unlikely(rq == &md->suspend_rq)) { | 1580 | if (unlikely(dm_rq_is_flush_request(rq))) |
1454 | if (dm_rq_flush_suspending(md)) | 1581 | return BLKPREP_OK; |
1455 | return BLKPREP_OK; | ||
1456 | else | ||
1457 | /* The flush suspend was interrupted */ | ||
1458 | return BLKPREP_KILL; | ||
1459 | } | ||
1460 | 1582 | ||
1461 | if (unlikely(rq->special)) { | 1583 | if (unlikely(rq->special)) { |
1462 | DMWARN("Already has something in rq->special."); | 1584 | DMWARN("Already has something in rq->special."); |
1463 | return BLKPREP_KILL; | 1585 | return BLKPREP_KILL; |
1464 | } | 1586 | } |
1465 | 1587 | ||
1466 | tio = alloc_rq_tio(md); /* Only one for each original request */ | 1588 | clone = clone_rq(rq, md, GFP_ATOMIC); |
1467 | if (!tio) | 1589 | if (!clone) |
1468 | /* -ENOMEM */ | ||
1469 | return BLKPREP_DEFER; | ||
1470 | |||
1471 | tio->md = md; | ||
1472 | tio->ti = NULL; | ||
1473 | tio->orig = rq; | ||
1474 | tio->error = 0; | ||
1475 | memset(&tio->info, 0, sizeof(tio->info)); | ||
1476 | |||
1477 | clone = &tio->clone; | ||
1478 | if (setup_clone(clone, rq, tio)) { | ||
1479 | /* -ENOMEM */ | ||
1480 | free_rq_tio(tio); | ||
1481 | return BLKPREP_DEFER; | 1590 | return BLKPREP_DEFER; |
1482 | } | ||
1483 | 1591 | ||
1484 | rq->special = clone; | 1592 | rq->special = clone; |
1485 | rq->cmd_flags |= REQ_DONTPREP; | 1593 | rq->cmd_flags |= REQ_DONTPREP; |
@@ -1487,11 +1595,10 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) | |||
1487 | return BLKPREP_OK; | 1595 | return BLKPREP_OK; |
1488 | } | 1596 | } |
1489 | 1597 | ||
1490 | static void map_request(struct dm_target *ti, struct request *rq, | 1598 | static void map_request(struct dm_target *ti, struct request *clone, |
1491 | struct mapped_device *md) | 1599 | struct mapped_device *md) |
1492 | { | 1600 | { |
1493 | int r; | 1601 | int r; |
1494 | struct request *clone = rq->special; | ||
1495 | struct dm_rq_target_io *tio = clone->end_io_data; | 1602 | struct dm_rq_target_io *tio = clone->end_io_data; |
1496 | 1603 | ||
1497 | /* | 1604 | /* |
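Once dm_prep_fn() has attached the clone, the REQ_DONTPREP flag tells the block layer not to run the prep hook again for this request (for example after a requeue), and dm_request_fn() simply picks the clone back up from rq->special. A minimal recap of that hand-off, using only lines visible in this diff:

        /* dm_prep_fn(): attach the clone and mark the request prepared. */
        rq->special = clone;
        rq->cmd_flags |= REQ_DONTPREP;

        /* dm_request_fn(): retrieve the prepared clone for mapping. */
        clone = rq->special;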
@@ -1511,6 +1618,8 @@ static void map_request(struct dm_target *ti, struct request *rq, | |||
1511 | break; | 1618 | break; |
1512 | case DM_MAPIO_REMAPPED: | 1619 | case DM_MAPIO_REMAPPED: |
1513 | /* The target has remapped the I/O so dispatch it */ | 1620 | /* The target has remapped the I/O so dispatch it */ |
1621 | trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), | ||
1622 | blk_rq_pos(tio->orig)); | ||
1514 | dm_dispatch_request(clone); | 1623 | dm_dispatch_request(clone); |
1515 | break; | 1624 | break; |
1516 | case DM_MAPIO_REQUEUE: | 1625 | case DM_MAPIO_REQUEUE: |
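The added trace_block_rq_remap() call records the clone's remap in blktrace before it is dispatched. For context, the DM_MAPIO_* value handled here comes from the target's map_rq hook; the sketch below is a hypothetical request-based target map function, not code from this patch, and assumes the target has already picked an underlying device (held in a made-up example_ctx).

        /* Hypothetical request-based target ->map_rq (illustration only). */
        struct example_ctx {
                struct dm_dev *dev;     /* opened in the ctr via dm_get_device() */
        };

        static int example_map_rq(struct dm_target *ti, struct request *clone,
                                  union map_info *map_context)
        {
                struct example_ctx *ec = ti->private;
                struct block_device *bdev = ec->dev->bdev;

                /* Redirect the clone to the underlying queue; dm then
                 * dispatches it (the DM_MAPIO_REMAPPED case above). */
                clone->q = bdev_get_queue(bdev);
                clone->rq_disk = bdev->bd_disk;

                return DM_MAPIO_REMAPPED;
        }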
@@ -1536,29 +1645,26 @@ static void map_request(struct dm_target *ti, struct request *rq, | |||
1536 | static void dm_request_fn(struct request_queue *q) | 1645 | static void dm_request_fn(struct request_queue *q) |
1537 | { | 1646 | { |
1538 | struct mapped_device *md = q->queuedata; | 1647 | struct mapped_device *md = q->queuedata; |
1539 | struct dm_table *map = dm_get_table(md); | 1648 | struct dm_table *map = dm_get_live_table(md); |
1540 | struct dm_target *ti; | 1649 | struct dm_target *ti; |
1541 | struct request *rq; | 1650 | struct request *rq, *clone; |
1542 | 1651 | ||
1543 | /* | 1652 | /* |
1544 | * For noflush suspend, check blk_queue_stopped() to immediately | 1653 | * For suspend, check blk_queue_stopped() and increment |
1545 | * quit I/O dispatching. | 1654 | * ->pending within a single queue_lock not to increment the |
1655 | * number of in-flight I/Os after the queue is stopped in | ||
1656 | * dm_suspend(). | ||
1546 | */ | 1657 | */ |
1547 | while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { | 1658 | while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { |
1548 | rq = blk_peek_request(q); | 1659 | rq = blk_peek_request(q); |
1549 | if (!rq) | 1660 | if (!rq) |
1550 | goto plug_and_out; | 1661 | goto plug_and_out; |
1551 | 1662 | ||
1552 | if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend marker */ | 1663 | if (unlikely(dm_rq_is_flush_request(rq))) {
1553 | if (queue_in_flight(q)) | 1664 | BUG_ON(md->flush_request); |
1554 | /* Not quiet yet. Wait more */ | 1665 | md->flush_request = rq; |
1555 | goto plug_and_out; | ||
1556 | |||
1557 | /* This device should be quiet now */ | ||
1558 | __stop_queue(q); | ||
1559 | blk_start_request(rq); | 1666 | blk_start_request(rq); |
1560 | __blk_end_request_all(rq, 0); | 1667 | queue_work(md->wq, &md->barrier_work); |
1561 | wake_up(&md->wait); | ||
1562 | goto out; | 1668 | goto out; |
1563 | } | 1669 | } |
1564 | 1670 | ||
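A flush request cannot be handled directly in request_fn context, since it has to spawn per-target clones and wait for them, so it is parked in md->flush_request and handed to the barrier work item; the BUG_ON() relies on the drain-style barrier ordering delivering at most one flush at a time. The hand-off uses the usual workqueue pattern, recapped here with the names from this diff:

        /* Workqueue hand-off used for barriers (names as in the diff). */

        /* alloc_dev(): bind the work item to its handler once. */
        INIT_WORK(&md->barrier_work, dm_rq_barrier_work);

        /* dm_request_fn(): park the flush request, defer to process context. */
        md->flush_request = rq;
        blk_start_request(rq);
        queue_work(md->wq, &md->barrier_work);

        /* dm_rq_barrier_work(): recover the owning mapped_device. */
        struct mapped_device *md = container_of(work, struct mapped_device,
                                                barrier_work);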
@@ -1567,8 +1673,11 @@ static void dm_request_fn(struct request_queue *q) | |||
1567 | goto plug_and_out; | 1673 | goto plug_and_out; |
1568 | 1674 | ||
1569 | blk_start_request(rq); | 1675 | blk_start_request(rq); |
1676 | clone = rq->special; | ||
1677 | atomic_inc(&md->pending[rq_data_dir(clone)]); | ||
1678 | |||
1570 | spin_unlock(q->queue_lock); | 1679 | spin_unlock(q->queue_lock); |
1571 | map_request(ti, rq, md); | 1680 | map_request(ti, clone, md); |
1572 | spin_lock_irq(q->queue_lock); | 1681 | spin_lock_irq(q->queue_lock); |
1573 | } | 1682 | } |
1574 | 1683 | ||
@@ -1595,7 +1704,7 @@ static int dm_lld_busy(struct request_queue *q) | |||
1595 | { | 1704 | { |
1596 | int r; | 1705 | int r; |
1597 | struct mapped_device *md = q->queuedata; | 1706 | struct mapped_device *md = q->queuedata; |
1598 | struct dm_table *map = dm_get_table(md); | 1707 | struct dm_table *map = dm_get_live_table(md); |
1599 | 1708 | ||
1600 | if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) | 1709 | if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) |
1601 | r = 1; | 1710 | r = 1; |
@@ -1610,7 +1719,7 @@ static int dm_lld_busy(struct request_queue *q) | |||
1610 | static void dm_unplug_all(struct request_queue *q) | 1719 | static void dm_unplug_all(struct request_queue *q) |
1611 | { | 1720 | { |
1612 | struct mapped_device *md = q->queuedata; | 1721 | struct mapped_device *md = q->queuedata; |
1613 | struct dm_table *map = dm_get_table(md); | 1722 | struct dm_table *map = dm_get_live_table(md); |
1614 | 1723 | ||
1615 | if (map) { | 1724 | if (map) { |
1616 | if (dm_request_based(md)) | 1725 | if (dm_request_based(md)) |
@@ -1628,7 +1737,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits) | |||
1628 | struct dm_table *map; | 1737 | struct dm_table *map; |
1629 | 1738 | ||
1630 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { | 1739 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { |
1631 | map = dm_get_table(md); | 1740 | map = dm_get_live_table(md); |
1632 | if (map) { | 1741 | if (map) { |
1633 | /* | 1742 | /* |
1634 | * Request-based dm cares only about its own queue for | 1743 | * Request-based dm cares only about its own queue for
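These hunks are part of the dm_get_table() to dm_get_live_table() rename; the reference-counting contract is unchanged, as the usage pattern throughout this file shows:

        /* Reference pattern for the live table (unchanged by the rename). */
        struct dm_table *map = dm_get_live_table(md);

        if (map) {
                /* ... inspect or iterate over the table ... */
                dm_table_put(map);
        }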
@@ -1725,6 +1834,7 @@ out: | |||
1725 | static const struct block_device_operations dm_blk_dops; | 1834 | static const struct block_device_operations dm_blk_dops; |
1726 | 1835 | ||
1727 | static void dm_wq_work(struct work_struct *work); | 1836 | static void dm_wq_work(struct work_struct *work); |
1837 | static void dm_rq_barrier_work(struct work_struct *work); | ||
1728 | 1838 | ||
1729 | /* | 1839 | /* |
1730 | * Allocate and initialise a blank device with a given minor. | 1840 | * Allocate and initialise a blank device with a given minor. |
@@ -1754,6 +1864,7 @@ static struct mapped_device *alloc_dev(int minor) | |||
1754 | init_rwsem(&md->io_lock); | 1864 | init_rwsem(&md->io_lock); |
1755 | mutex_init(&md->suspend_lock); | 1865 | mutex_init(&md->suspend_lock); |
1756 | spin_lock_init(&md->deferred_lock); | 1866 | spin_lock_init(&md->deferred_lock); |
1867 | spin_lock_init(&md->barrier_error_lock); | ||
1757 | rwlock_init(&md->map_lock); | 1868 | rwlock_init(&md->map_lock); |
1758 | atomic_set(&md->holders, 1); | 1869 | atomic_set(&md->holders, 1); |
1759 | atomic_set(&md->open_count, 0); | 1870 | atomic_set(&md->open_count, 0); |
@@ -1788,6 +1899,8 @@ static struct mapped_device *alloc_dev(int minor) | |||
1788 | blk_queue_softirq_done(md->queue, dm_softirq_done); | 1899 | blk_queue_softirq_done(md->queue, dm_softirq_done); |
1789 | blk_queue_prep_rq(md->queue, dm_prep_fn); | 1900 | blk_queue_prep_rq(md->queue, dm_prep_fn); |
1790 | blk_queue_lld_busy(md->queue, dm_lld_busy); | 1901 | blk_queue_lld_busy(md->queue, dm_lld_busy); |
1902 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH, | ||
1903 | dm_rq_prepare_flush); | ||
1791 | 1904 | ||
1792 | md->disk = alloc_disk(1); | 1905 | md->disk = alloc_disk(1); |
1793 | if (!md->disk) | 1906 | if (!md->disk) |
@@ -1797,6 +1910,7 @@ static struct mapped_device *alloc_dev(int minor) | |||
1797 | atomic_set(&md->pending[1], 0); | 1910 | atomic_set(&md->pending[1], 0); |
1798 | init_waitqueue_head(&md->wait); | 1911 | init_waitqueue_head(&md->wait); |
1799 | INIT_WORK(&md->work, dm_wq_work); | 1912 | INIT_WORK(&md->work, dm_wq_work); |
1913 | INIT_WORK(&md->barrier_work, dm_rq_barrier_work); | ||
1800 | init_waitqueue_head(&md->eventq); | 1914 | init_waitqueue_head(&md->eventq); |
1801 | 1915 | ||
1802 | md->disk->major = _major; | 1916 | md->disk->major = _major; |
@@ -1921,9 +2035,13 @@ static void __set_size(struct mapped_device *md, sector_t size) | |||
1921 | mutex_unlock(&md->bdev->bd_inode->i_mutex); | 2035 | mutex_unlock(&md->bdev->bd_inode->i_mutex); |
1922 | } | 2036 | } |
1923 | 2037 | ||
1924 | static int __bind(struct mapped_device *md, struct dm_table *t, | 2038 | /* |
1925 | struct queue_limits *limits) | 2039 | * Returns old map, which caller must destroy. |
2040 | */ | ||
2041 | static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, | ||
2042 | struct queue_limits *limits) | ||
1926 | { | 2043 | { |
2044 | struct dm_table *old_map; | ||
1927 | struct request_queue *q = md->queue; | 2045 | struct request_queue *q = md->queue; |
1928 | sector_t size; | 2046 | sector_t size; |
1929 | unsigned long flags; | 2047 | unsigned long flags; |
@@ -1938,11 +2056,6 @@ static int __bind(struct mapped_device *md, struct dm_table *t, | |||
1938 | 2056 | ||
1939 | __set_size(md, size); | 2057 | __set_size(md, size); |
1940 | 2058 | ||
1941 | if (!size) { | ||
1942 | dm_table_destroy(t); | ||
1943 | return 0; | ||
1944 | } | ||
1945 | |||
1946 | dm_table_event_callback(t, event_callback, md); | 2059 | dm_table_event_callback(t, event_callback, md); |
1947 | 2060 | ||
1948 | /* | 2061 | /* |
@@ -1958,26 +2071,31 @@ static int __bind(struct mapped_device *md, struct dm_table *t, | |||
1958 | __bind_mempools(md, t); | 2071 | __bind_mempools(md, t); |
1959 | 2072 | ||
1960 | write_lock_irqsave(&md->map_lock, flags); | 2073 | write_lock_irqsave(&md->map_lock, flags); |
2074 | old_map = md->map; | ||
1961 | md->map = t; | 2075 | md->map = t; |
1962 | dm_table_set_restrictions(t, q, limits); | 2076 | dm_table_set_restrictions(t, q, limits); |
1963 | write_unlock_irqrestore(&md->map_lock, flags); | 2077 | write_unlock_irqrestore(&md->map_lock, flags); |
1964 | 2078 | ||
1965 | return 0; | 2079 | return old_map; |
1966 | } | 2080 | } |
1967 | 2081 | ||
1968 | static void __unbind(struct mapped_device *md) | 2082 | /* |
2083 | * Returns unbound table for the caller to free. | ||
2084 | */ | ||
2085 | static struct dm_table *__unbind(struct mapped_device *md) | ||
1969 | { | 2086 | { |
1970 | struct dm_table *map = md->map; | 2087 | struct dm_table *map = md->map; |
1971 | unsigned long flags; | 2088 | unsigned long flags; |
1972 | 2089 | ||
1973 | if (!map) | 2090 | if (!map) |
1974 | return; | 2091 | return NULL; |
1975 | 2092 | ||
1976 | dm_table_event_callback(map, NULL, NULL); | 2093 | dm_table_event_callback(map, NULL, NULL); |
1977 | write_lock_irqsave(&md->map_lock, flags); | 2094 | write_lock_irqsave(&md->map_lock, flags); |
1978 | md->map = NULL; | 2095 | md->map = NULL; |
1979 | write_unlock_irqrestore(&md->map_lock, flags); | 2096 | write_unlock_irqrestore(&md->map_lock, flags); |
1980 | dm_table_destroy(map); | 2097 | |
2098 | return map; | ||
1981 | } | 2099 | } |
1982 | 2100 | ||
1983 | /* | 2101 | /* |
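__bind() and __unbind() now hand the displaced table back instead of destroying it, so the caller decides when it is safe to free. The two callers visible in this diff use the new contract as follows (recap only):

        /* Callers now own the displaced table. */

        /* dm_put(): tear down whatever table was bound. */
        dm_table_destroy(__unbind(md));

        /* dm_swap_table(): return the previous live table to its caller. */
        map = __bind(md, table, &limits);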
@@ -2059,18 +2177,18 @@ void dm_put(struct mapped_device *md) | |||
2059 | BUG_ON(test_bit(DMF_FREEING, &md->flags)); | 2177 | BUG_ON(test_bit(DMF_FREEING, &md->flags)); |
2060 | 2178 | ||
2061 | if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { | 2179 | if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { |
2062 | map = dm_get_table(md); | 2180 | map = dm_get_live_table(md); |
2063 | idr_replace(&_minor_idr, MINOR_ALLOCED, | 2181 | idr_replace(&_minor_idr, MINOR_ALLOCED, |
2064 | MINOR(disk_devt(dm_disk(md)))); | 2182 | MINOR(disk_devt(dm_disk(md)))); |
2065 | set_bit(DMF_FREEING, &md->flags); | 2183 | set_bit(DMF_FREEING, &md->flags); |
2066 | spin_unlock(&_minor_lock); | 2184 | spin_unlock(&_minor_lock); |
2067 | if (!dm_suspended(md)) { | 2185 | if (!dm_suspended_md(md)) { |
2068 | dm_table_presuspend_targets(map); | 2186 | dm_table_presuspend_targets(map); |
2069 | dm_table_postsuspend_targets(map); | 2187 | dm_table_postsuspend_targets(map); |
2070 | } | 2188 | } |
2071 | dm_sysfs_exit(md); | 2189 | dm_sysfs_exit(md); |
2072 | dm_table_put(map); | 2190 | dm_table_put(map); |
2073 | __unbind(md); | 2191 | dm_table_destroy(__unbind(md)); |
2074 | free_dev(md); | 2192 | free_dev(md); |
2075 | } | 2193 | } |
2076 | } | 2194 | } |
@@ -2080,8 +2198,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
2080 | { | 2198 | { |
2081 | int r = 0; | 2199 | int r = 0; |
2082 | DECLARE_WAITQUEUE(wait, current); | 2200 | DECLARE_WAITQUEUE(wait, current); |
2083 | struct request_queue *q = md->queue; | ||
2084 | unsigned long flags; | ||
2085 | 2201 | ||
2086 | dm_unplug_all(md->queue); | 2202 | dm_unplug_all(md->queue); |
2087 | 2203 | ||
@@ -2091,15 +2207,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
2091 | set_current_state(interruptible); | 2207 | set_current_state(interruptible); |
2092 | 2208 | ||
2093 | smp_mb(); | 2209 | smp_mb(); |
2094 | if (dm_request_based(md)) { | 2210 | if (!md_in_flight(md)) |
2095 | spin_lock_irqsave(q->queue_lock, flags); | ||
2096 | if (!queue_in_flight(q) && blk_queue_stopped(q)) { | ||
2097 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2098 | break; | ||
2099 | } | ||
2100 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2101 | } else if (!atomic_read(&md->pending[0]) && | ||
2102 | !atomic_read(&md->pending[1])) | ||
2103 | break; | 2211 | break; |
2104 | 2212 | ||
2105 | if (interruptible == TASK_INTERRUPTIBLE && | 2213 | if (interruptible == TASK_INTERRUPTIBLE && |
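dm_wait_for_completion() no longer pokes at the request_queue; both bio-based and request-based I/O are now counted in md->pending[], which dm_request_fn() increments before dropping the queue lock. md_in_flight() itself is not shown in this diff; presumably it is a small helper along these lines (an assumption, not taken from the patch):

        /* Assumed shape of md_in_flight(); not shown in this diff. */
        static int md_in_flight(struct mapped_device *md)
        {
                return atomic_read(&md->pending[READ]) +
                       atomic_read(&md->pending[WRITE]);
        }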
@@ -2194,98 +2302,106 @@ static void dm_queue_flush(struct mapped_device *md) | |||
2194 | queue_work(md->wq, &md->work); | 2302 | queue_work(md->wq, &md->work); |
2195 | } | 2303 | } |
2196 | 2304 | ||
2197 | /* | 2305 | static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr) |
2198 | * Swap in a new table (destroying old one). | ||
2199 | */ | ||
2200 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) | ||
2201 | { | 2306 | { |
2202 | struct queue_limits limits; | 2307 | struct dm_rq_target_io *tio = clone->end_io_data; |
2203 | int r = -EINVAL; | ||
2204 | 2308 | ||
2205 | mutex_lock(&md->suspend_lock); | 2309 | tio->info.flush_request = flush_nr; |
2310 | } | ||
2206 | 2311 | ||
2207 | /* device must be suspended */ | 2312 | /* Issue barrier requests to targets and wait for their completion. */ |
2208 | if (!dm_suspended(md)) | 2313 | static int dm_rq_barrier(struct mapped_device *md) |
2209 | goto out; | 2314 | { |
2315 | int i, j; | ||
2316 | struct dm_table *map = dm_get_live_table(md); | ||
2317 | unsigned num_targets = dm_table_get_num_targets(map); | ||
2318 | struct dm_target *ti; | ||
2319 | struct request *clone; | ||
2210 | 2320 | ||
2211 | r = dm_calculate_queue_limits(table, &limits); | 2321 | md->barrier_error = 0; |
2212 | if (r) | ||
2213 | goto out; | ||
2214 | 2322 | ||
2215 | /* cannot change the device type, once a table is bound */ | 2323 | for (i = 0; i < num_targets; i++) { |
2216 | if (md->map && | 2324 | ti = dm_table_get_target(map, i); |
2217 | (dm_table_get_type(md->map) != dm_table_get_type(table))) { | 2325 | for (j = 0; j < ti->num_flush_requests; j++) { |
2218 | DMWARN("can't change the device type after a table is bound"); | 2326 | clone = clone_rq(md->flush_request, md, GFP_NOIO); |
2219 | goto out; | 2327 | dm_rq_set_flush_nr(clone, j); |
2328 | atomic_inc(&md->pending[rq_data_dir(clone)]); | ||
2329 | map_request(ti, clone, md); | ||
2330 | } | ||
2220 | } | 2331 | } |
2221 | 2332 | ||
2222 | __unbind(md); | 2333 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); |
2223 | r = __bind(md, table, &limits); | 2334 | dm_table_put(map); |
2224 | |||
2225 | out: | ||
2226 | mutex_unlock(&md->suspend_lock); | ||
2227 | return r; | ||
2228 | } | ||
2229 | 2335 | ||
2230 | static void dm_rq_invalidate_suspend_marker(struct mapped_device *md) | 2336 | return md->barrier_error; |
2231 | { | ||
2232 | md->suspend_rq.special = (void *)0x1; | ||
2233 | } | 2337 | } |
2234 | 2338 | ||
2235 | static void dm_rq_abort_suspend(struct mapped_device *md, int noflush) | 2339 | static void dm_rq_barrier_work(struct work_struct *work) |
2236 | { | 2340 | { |
2341 | int error; | ||
2342 | struct mapped_device *md = container_of(work, struct mapped_device, | ||
2343 | barrier_work); | ||
2237 | struct request_queue *q = md->queue; | 2344 | struct request_queue *q = md->queue; |
2345 | struct request *rq; | ||
2238 | unsigned long flags; | 2346 | unsigned long flags; |
2239 | 2347 | ||
2240 | spin_lock_irqsave(q->queue_lock, flags); | 2348 | /* |
2241 | if (!noflush) | 2349 | * Hold the md reference here and leave it at the last part so that |
2242 | dm_rq_invalidate_suspend_marker(md); | 2350 | * the md can't be deleted by device opener when the barrier request |
2243 | __start_queue(q); | 2351 | * completes. |
2244 | spin_unlock_irqrestore(q->queue_lock, flags); | 2352 | */ |
2245 | } | 2353 | dm_get(md); |
2246 | 2354 | ||
2247 | static void dm_rq_start_suspend(struct mapped_device *md, int noflush) | 2355 | error = dm_rq_barrier(md); |
2248 | { | ||
2249 | struct request *rq = &md->suspend_rq; | ||
2250 | struct request_queue *q = md->queue; | ||
2251 | 2356 | ||
2252 | if (noflush) | 2357 | rq = md->flush_request; |
2253 | stop_queue(q); | 2358 | md->flush_request = NULL; |
2254 | else { | 2359 | |
2255 | blk_rq_init(q, rq); | 2360 | if (error == DM_ENDIO_REQUEUE) { |
2256 | blk_insert_request(q, rq, 0, NULL); | 2361 | spin_lock_irqsave(q->queue_lock, flags); |
2257 | } | 2362 | blk_requeue_request(q, rq); |
2363 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2364 | } else | ||
2365 | blk_end_request_all(rq, error); | ||
2366 | |||
2367 | blk_run_queue(q); | ||
2368 | |||
2369 | dm_put(md); | ||
2258 | } | 2370 | } |
2259 | 2371 | ||
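dm_rq_barrier() fans the barrier out as one clone per flush request per target, tags each clone through tio->info.flush_request, and reuses dm_wait_for_completion() to wait for all of them; dm_rq_barrier_work() then completes or requeues the original flush request. On the target side the fan-out width comes from ti->num_flush_requests; a hypothetical target (not from this patch) would consume the tag roughly like this:

        /* Hypothetical target-side view of the barrier fan-out. */

        /* In the ctr: advertise how many flush clones to receive per barrier. */
        ti->num_flush_requests = 1;

        /* In ->map_rq: read back the tag set by dm_rq_set_flush_nr(). */
        unsigned flush_nr = map_context->flush_request;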
2260 | static int dm_rq_suspend_available(struct mapped_device *md, int noflush) | 2372 | /* |
2373 | * Swap in a new table, returning the old one for the caller to destroy. | ||
2374 | */ | ||
2375 | struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) | ||
2261 | { | 2376 | { |
2262 | int r = 1; | 2377 | struct dm_table *map = ERR_PTR(-EINVAL); |
2263 | struct request *rq = &md->suspend_rq; | 2378 | struct queue_limits limits; |
2264 | struct request_queue *q = md->queue; | 2379 | int r; |
2265 | unsigned long flags; | ||
2266 | 2380 | ||
2267 | if (noflush) | 2381 | mutex_lock(&md->suspend_lock); |
2268 | return r; | ||
2269 | 2382 | ||
2270 | /* The marker must be protected by queue lock if it is in use */ | 2383 | /* device must be suspended */ |
2271 | spin_lock_irqsave(q->queue_lock, flags); | 2384 | if (!dm_suspended_md(md)) |
2272 | if (unlikely(rq->ref_count)) { | 2385 | goto out; |
2273 | /* | 2386 | |
2274 | * This can happen, when the previous flush suspend was | 2387 | r = dm_calculate_queue_limits(table, &limits); |
2275 | * interrupted, the marker is still in the queue and | 2388 | if (r) { |
2276 | * this flush suspend has been invoked, because we don't | 2389 | map = ERR_PTR(r); |
2277 | * remove the marker at the time of suspend interruption. | 2390 | goto out; |
2278 | * We have only one marker per mapped_device, so we can't | ||
2279 | * start another flush suspend while it is in use. | ||
2280 | */ | ||
2281 | BUG_ON(!rq->special); /* The marker should be invalidated */ | ||
2282 | DMWARN("Invalidating the previous flush suspend is still in" | ||
2283 | " progress. Please retry later."); | ||
2284 | r = 0; | ||
2285 | } | 2391 | } |
2286 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2287 | 2392 | ||
2288 | return r; | 2393 | /* cannot change the device type, once a table is bound */ |
2394 | if (md->map && | ||
2395 | (dm_table_get_type(md->map) != dm_table_get_type(table))) { | ||
2396 | DMWARN("can't change the device type after a table is bound"); | ||
2397 | goto out; | ||
2398 | } | ||
2399 | |||
2400 | map = __bind(md, table, &limits); | ||
2401 | |||
2402 | out: | ||
2403 | mutex_unlock(&md->suspend_lock); | ||
2404 | return map; | ||
2289 | } | 2405 | } |
2290 | 2406 | ||
2291 | /* | 2407 | /* |
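dm_swap_table() now returns a pointer rather than an int: either the old live table, which the caller destroys once it is out of use, or an ERR_PTR() on failure. A caller would therefore be expected to check it along these lines (sketch, not code from this diff):

        /* Sketch of the new dm_swap_table() calling convention. */
        struct dm_table *old_map;

        old_map = dm_swap_table(md, new_table);
        if (IS_ERR(old_map))
                return PTR_ERR(old_map);

        /* Destroy the previous table (there may be none on the first bind). */
        if (old_map)
                dm_table_destroy(old_map);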
@@ -2330,49 +2446,11 @@ static void unlock_fs(struct mapped_device *md) | |||
2330 | /* | 2446 | /* |
2331 | * Suspend mechanism in request-based dm. | 2447 | * Suspend mechanism in request-based dm. |
2332 | * | 2448 | * |
2333 | * After the suspend starts, further incoming requests are kept in | 2449 | * 1. Flush all I/Os by lock_fs() if needed. |
2334 | * the request_queue and deferred. | 2450 | * 2. Stop dispatching any I/O by stopping the request_queue. |
2335 | * Remaining requests in the request_queue at the start of suspend are flushed | 2451 | * 3. Wait for all in-flight I/Os to be completed or requeued. |
2336 | * if it is flush suspend. | ||
2337 | * The suspend completes when the following conditions have been satisfied, | ||
2338 | * so wait for it: | ||
2339 | * 1. q->in_flight is 0 (which means no in_flight request) | ||
2340 | * 2. queue has been stopped (which means no request dispatching) | ||
2341 | * | ||
2342 | * | 2452 | * |
2343 | * Noflush suspend | 2453 | * To abort suspend, start the request_queue. |
2344 | * --------------- | ||
2345 | * Noflush suspend doesn't need to dispatch remaining requests. | ||
2346 | * So stop the queue immediately. Then, wait for all in_flight requests | ||
2347 | * to be completed or requeued. | ||
2348 | * | ||
2349 | * To abort noflush suspend, start the queue. | ||
2350 | * | ||
2351 | * | ||
2352 | * Flush suspend | ||
2353 | * ------------- | ||
2354 | * Flush suspend needs to dispatch remaining requests. So stop the queue | ||
2355 | * after the remaining requests are completed. (Requeued request must be also | ||
2356 | * re-dispatched and completed. Until then, we can't stop the queue.) | ||
2357 | * | ||
2358 | * During flushing the remaining requests, further incoming requests are also | ||
2359 | * inserted to the same queue. To distinguish which requests are to be | ||
2360 | * flushed, we insert a marker request to the queue at the time of starting | ||
2361 | * flush suspend, like a barrier. | ||
2362 | * The dispatching is blocked when the marker is found on the top of the queue. | ||
2363 | * And the queue is stopped when all in_flight requests are completed, since | ||
2364 | * that means the remaining requests are completely flushed. | ||
2365 | * Then, the marker is removed from the queue. | ||
2366 | * | ||
2367 | * To abort flush suspend, we also need to take care of the marker, not only | ||
2368 | * starting the queue. | ||
2369 | * We don't remove the marker forcibly from the queue since it's against | ||
2370 | * the block-layer manner. Instead, we put an invalidated mark on the marker. | ||
2371 | * When the invalidated marker is found on the top of the queue, it is | ||
2372 | * immediately removed from the queue, so it doesn't block dispatching. | ||
2373 | * Because we have only one marker per mapped_device, we can't start another | ||
2374 | * flush suspend until the invalidated marker is removed from the queue. | ||
2375 | * So fail and return with -EBUSY in such a case. | ||
2376 | */ | 2454 | */ |
2377 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | 2455 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) |
2378 | { | 2456 | { |
@@ -2383,17 +2461,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2383 | 2461 | ||
2384 | mutex_lock(&md->suspend_lock); | 2462 | mutex_lock(&md->suspend_lock); |
2385 | 2463 | ||
2386 | if (dm_suspended(md)) { | 2464 | if (dm_suspended_md(md)) { |
2387 | r = -EINVAL; | 2465 | r = -EINVAL; |
2388 | goto out_unlock; | 2466 | goto out_unlock; |
2389 | } | 2467 | } |
2390 | 2468 | ||
2391 | if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) { | 2469 | map = dm_get_live_table(md); |
2392 | r = -EBUSY; | ||
2393 | goto out_unlock; | ||
2394 | } | ||
2395 | |||
2396 | map = dm_get_table(md); | ||
2397 | 2470 | ||
2398 | /* | 2471 | /* |
2399 | * DMF_NOFLUSH_SUSPENDING must be set before presuspend. | 2472 | * DMF_NOFLUSH_SUSPENDING must be set before presuspend. |
@@ -2406,8 +2479,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2406 | dm_table_presuspend_targets(map); | 2479 | dm_table_presuspend_targets(map); |
2407 | 2480 | ||
2408 | /* | 2481 | /* |
2409 | * Flush I/O to the device. noflush supersedes do_lockfs, | 2482 | * Flush I/O to the device. |
2410 | * because lock_fs() needs to flush I/Os. | 2483 | * Any I/O submitted after lock_fs() may not be flushed. |
2484 | * noflush takes precedence over do_lockfs. | ||
2485 | * (lock_fs() flushes I/Os and waits for them to complete.) | ||
2411 | */ | 2486 | */ |
2412 | if (!noflush && do_lockfs) { | 2487 | if (!noflush && do_lockfs) { |
2413 | r = lock_fs(md); | 2488 | r = lock_fs(md); |
@@ -2436,10 +2511,15 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2436 | set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); | 2511 | set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); |
2437 | up_write(&md->io_lock); | 2512 | up_write(&md->io_lock); |
2438 | 2513 | ||
2439 | flush_workqueue(md->wq); | 2514 | /* |
2440 | 2515 | * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which | |
2516 | * can be kicked until md->queue is stopped. So stop md->queue before | ||
2517 | * flushing md->wq. | ||
2518 | */ | ||
2441 | if (dm_request_based(md)) | 2519 | if (dm_request_based(md)) |
2442 | dm_rq_start_suspend(md, noflush); | 2520 | stop_queue(md->queue); |
2521 | |||
2522 | flush_workqueue(md->wq); | ||
2443 | 2523 | ||
2444 | /* | 2524 | /* |
2445 | * At this point no more requests are entering target request routines. | 2525 | * At this point no more requests are entering target request routines. |
@@ -2458,7 +2538,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2458 | dm_queue_flush(md); | 2538 | dm_queue_flush(md); |
2459 | 2539 | ||
2460 | if (dm_request_based(md)) | 2540 | if (dm_request_based(md)) |
2461 | dm_rq_abort_suspend(md, noflush); | 2541 | start_queue(md->queue); |
2462 | 2542 | ||
2463 | unlock_fs(md); | 2543 | unlock_fs(md); |
2464 | goto out; /* pushback list is already flushed, so skip flush */ | 2544 | goto out; /* pushback list is already flushed, so skip flush */ |
@@ -2470,10 +2550,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2470 | * requests are being added to md->deferred list. | 2550 | * requests are being added to md->deferred list. |
2471 | */ | 2551 | */ |
2472 | 2552 | ||
2473 | dm_table_postsuspend_targets(map); | ||
2474 | |||
2475 | set_bit(DMF_SUSPENDED, &md->flags); | 2553 | set_bit(DMF_SUSPENDED, &md->flags); |
2476 | 2554 | ||
2555 | dm_table_postsuspend_targets(map); | ||
2556 | |||
2477 | out: | 2557 | out: |
2478 | dm_table_put(map); | 2558 | dm_table_put(map); |
2479 | 2559 | ||
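Taken together, these hunks reduce request-based suspend to the three steps in the rewritten comment above: optional lock_fs(), stop_queue(), then wait for the pending counters to drain, with md->wq flushed only after the queue is stopped so the barrier worker cannot be re-kicked, and DMF_SUSPENDED now set before the postsuspend hooks run. A condensed, illustrative outline for a request-based device (bio-based details and locking omitted):

        /* Condensed outline of dm_suspend() for request-based dm
         * (illustration only; see the hunks above for the real code). */
        dm_table_presuspend_targets(map);

        if (!noflush && do_lockfs)
                r = lock_fs(md);        /* 1. flush already-submitted I/O */

        stop_queue(md->queue);          /* 2. stop dispatching new requests */
        flush_workqueue(md->wq);        /*    barrier work drained, not re-kicked */

        r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
                                        /* 3. wait for in-flight I/O */
        if (r < 0) {                    /* interrupted: abort the suspend */
                dm_queue_flush(md);
                start_queue(md->queue);
                unlock_fs(md);
        } else {
                set_bit(DMF_SUSPENDED, &md->flags);
                dm_table_postsuspend_targets(map);
        }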
@@ -2488,10 +2568,10 @@ int dm_resume(struct mapped_device *md) | |||
2488 | struct dm_table *map = NULL; | 2568 | struct dm_table *map = NULL; |
2489 | 2569 | ||
2490 | mutex_lock(&md->suspend_lock); | 2570 | mutex_lock(&md->suspend_lock); |
2491 | if (!dm_suspended(md)) | 2571 | if (!dm_suspended_md(md)) |
2492 | goto out; | 2572 | goto out; |
2493 | 2573 | ||
2494 | map = dm_get_table(md); | 2574 | map = dm_get_live_table(md); |
2495 | if (!map || !dm_table_get_size(map)) | 2575 | if (!map || !dm_table_get_size(map)) |
2496 | goto out; | 2576 | goto out; |
2497 | 2577 | ||
@@ -2592,18 +2672,29 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) | |||
2592 | return NULL; | 2672 | return NULL; |
2593 | 2673 | ||
2594 | if (test_bit(DMF_FREEING, &md->flags) || | 2674 | if (test_bit(DMF_FREEING, &md->flags) || |
2595 | test_bit(DMF_DELETING, &md->flags)) | 2675 | dm_deleting_md(md)) |
2596 | return NULL; | 2676 | return NULL; |
2597 | 2677 | ||
2598 | dm_get(md); | 2678 | dm_get(md); |
2599 | return md; | 2679 | return md; |
2600 | } | 2680 | } |
2601 | 2681 | ||
2602 | int dm_suspended(struct mapped_device *md) | 2682 | int dm_suspended_md(struct mapped_device *md) |
2603 | { | 2683 | { |
2604 | return test_bit(DMF_SUSPENDED, &md->flags); | 2684 | return test_bit(DMF_SUSPENDED, &md->flags); |
2605 | } | 2685 | } |
2606 | 2686 | ||
2687 | int dm_suspended(struct dm_target *ti) | ||
2688 | { | ||
2689 | struct mapped_device *md = dm_table_get_md(ti->table); | ||
2690 | int r = dm_suspended_md(md); | ||
2691 | |||
2692 | dm_put(md); | ||
2693 | |||
2694 | return r; | ||
2695 | } | ||
2696 | EXPORT_SYMBOL_GPL(dm_suspended); | ||
2697 | |||
2607 | int dm_noflush_suspending(struct dm_target *ti) | 2698 | int dm_noflush_suspending(struct dm_target *ti) |
2608 | { | 2699 | { |
2609 | struct mapped_device *md = dm_table_get_md(ti->table); | 2700 | struct mapped_device *md = dm_table_get_md(ti->table); |
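The newly exported dm_suspended() lets a target query the suspended state of its own mapped_device, with the dm_get()/dm_put() handled internally. A hypothetical use (not code from this diff) is a target declining work that is unsafe while the device is suspended:

        /* Hypothetical target hook using the new helper. */
        static int example_message(struct dm_target *ti, unsigned argc, char **argv)
        {
                if (dm_suspended(ti))
                        return -EBUSY;  /* ask the caller to retry after resume */

                /* ... handle the message ... */
                return 0;
        }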
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a7663eba17e2..8dadaa5bc396 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
@@ -89,6 +89,16 @@ int dm_target_iterate(void (*iter_func)(struct target_type *tt, | |||
89 | int dm_split_args(int *argc, char ***argvp, char *input); | 89 | int dm_split_args(int *argc, char ***argvp, char *input); |
90 | 90 | ||
91 | /* | 91 | /* |
92 | * Is this mapped_device being deleted? | ||
93 | */ | ||
94 | int dm_deleting_md(struct mapped_device *md); | ||
95 | |||
96 | /* | ||
97 | * Is this mapped_device suspended? | ||
98 | */ | ||
99 | int dm_suspended_md(struct mapped_device *md); | ||
100 | |||
101 | /* | ||
92 | * The device-mapper can be driven through one of two interfaces; | 102 | * The device-mapper can be driven through one of two interfaces; |
93 | * ioctl or filesystem, depending which patch you have applied. | 103 | * ioctl or filesystem, depending which patch you have applied. |
94 | */ | 104 | */ |
@@ -118,6 +128,9 @@ int dm_lock_for_deletion(struct mapped_device *md); | |||
118 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, | 128 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
119 | unsigned cookie); | 129 | unsigned cookie); |
120 | 130 | ||
131 | int dm_io_init(void); | ||
132 | void dm_io_exit(void); | ||
133 | |||
121 | int dm_kcopyd_init(void); | 134 | int dm_kcopyd_init(void); |
122 | void dm_kcopyd_exit(void); | 135 | void dm_kcopyd_exit(void); |
123 | 136 | ||
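The new dm_io_init()/dm_io_exit() pair sits alongside the existing dm_kcopyd_init()/dm_kcopyd_exit() declarations, so presumably it is wired into the same init/exit tables in dm.c. The shape assumed below is based on that parallel, not on anything shown in this diff:

        /* Assumed registration in dm.c (illustrative only). */
        static int (*_inits[])(void) = {
                local_init,
                /* ... other dm subsystems ... */
                dm_io_init,
                dm_kcopyd_init,
                dm_interface_init,
        };

        static void (*_exits[])(void) = {
                local_exit,
                /* ... */
                dm_io_exit,
                dm_kcopyd_exit,
                dm_interface_exit,
        };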