Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/dm-crypt.c            |  207
-rw-r--r--  drivers/md/dm-exception-store.c  |   33
-rw-r--r--  drivers/md/dm-exception-store.h  |   62
-rw-r--r--  drivers/md/dm-io.c               |  120
-rw-r--r--  drivers/md/dm-ioctl.c            |  123
-rw-r--r--  drivers/md/dm-kcopyd.c           |    5
-rw-r--r--  drivers/md/dm-log.c              |   77
-rw-r--r--  drivers/md/dm-mpath.c            |   95
-rw-r--r--  drivers/md/dm-raid1.c            |  219
-rw-r--r--  drivers/md/dm-region-hash.c      |   31
-rw-r--r--  drivers/md/dm-snap-persistent.c  |  195
-rw-r--r--  drivers/md/dm-snap-transient.c   |   24
-rw-r--r--  drivers/md/dm-snap.c             | 1279
-rw-r--r--  drivers/md/dm-sysfs.c            |   10
-rw-r--r--  drivers/md/dm-table.c            |    3
-rw-r--r--  drivers/md/dm-uevent.c           |    9
-rw-r--r--  drivers/md/dm.c                  |  643
-rw-r--r--  drivers/md/dm.h                  |   13
18 files changed, 2274 insertions, 874 deletions
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index e412980763bd..a93637223c8d 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Copyright (C) 2003 Christophe Saout <christophe@saout.de> 2 * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> 3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
4 * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. 4 * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
5 * 5 *
6 * This file is released under the GPL. 6 * This file is released under the GPL.
7 */ 7 */
@@ -71,10 +71,21 @@ struct crypt_iv_operations {
71 int (*ctr)(struct crypt_config *cc, struct dm_target *ti, 71 int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
72 const char *opts); 72 const char *opts);
73 void (*dtr)(struct crypt_config *cc); 73 void (*dtr)(struct crypt_config *cc);
74 const char *(*status)(struct crypt_config *cc); 74 int (*init)(struct crypt_config *cc);
75 int (*wipe)(struct crypt_config *cc);
75 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); 76 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
76}; 77};
77 78
79struct iv_essiv_private {
80 struct crypto_cipher *tfm;
81 struct crypto_hash *hash_tfm;
82 u8 *salt;
83};
84
85struct iv_benbi_private {
86 int shift;
87};
88
78/* 89/*
79 * Crypt: maps a linear range of a block device 90 * Crypt: maps a linear range of a block device
80 * and encrypts / decrypts at the same time. 91 * and encrypts / decrypts at the same time.
@@ -102,8 +113,8 @@ struct crypt_config {
102 struct crypt_iv_operations *iv_gen_ops; 113 struct crypt_iv_operations *iv_gen_ops;
103 char *iv_mode; 114 char *iv_mode;
104 union { 115 union {
105 struct crypto_cipher *essiv_tfm; 116 struct iv_essiv_private essiv;
106 int benbi_shift; 117 struct iv_benbi_private benbi;
107 } iv_gen_private; 118 } iv_gen_private;
108 sector_t iv_offset; 119 sector_t iv_offset;
109 unsigned int iv_size; 120 unsigned int iv_size;
@@ -147,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
147 * plain: the initial vector is the 32-bit little-endian version of the sector 158 * plain: the initial vector is the 32-bit little-endian version of the sector
148 * number, padded with zeros if necessary. 159 * number, padded with zeros if necessary.
149 * 160 *
161 * plain64: the initial vector is the 64-bit little-endian version of the sector
162 * number, padded with zeros if necessary.
163 *
150 * essiv: "encrypted sector|salt initial vector", the sector number is 164 * essiv: "encrypted sector|salt initial vector", the sector number is
151 * encrypted with the bulk cipher using a salt as key. The salt 165 * encrypted with the bulk cipher using a salt as key. The salt
152 * should be derived from the bulk cipher's key via hashing. 166 * should be derived from the bulk cipher's key via hashing.
@@ -169,88 +183,123 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
169 return 0; 183 return 0;
170} 184}
171 185
172static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, 186static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
173 const char *opts) 187 sector_t sector)
174{ 188{
175 struct crypto_cipher *essiv_tfm; 189 memset(iv, 0, cc->iv_size);
176 struct crypto_hash *hash_tfm; 190 *(u64 *)iv = cpu_to_le64(sector);
191
192 return 0;
193}
194
195/* Initialise ESSIV - compute salt but no local memory allocations */
196static int crypt_iv_essiv_init(struct crypt_config *cc)
197{
198 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
177 struct hash_desc desc; 199 struct hash_desc desc;
178 struct scatterlist sg; 200 struct scatterlist sg;
179 unsigned int saltsize;
180 u8 *salt;
181 int err; 201 int err;
182 202
183 if (opts == NULL) { 203 sg_init_one(&sg, cc->key, cc->key_size);
204 desc.tfm = essiv->hash_tfm;
205 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
206
207 err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt);
208 if (err)
209 return err;
210
211 return crypto_cipher_setkey(essiv->tfm, essiv->salt,
212 crypto_hash_digestsize(essiv->hash_tfm));
213}
214
215/* Wipe salt and reset key derived from volume key */
216static int crypt_iv_essiv_wipe(struct crypt_config *cc)
217{
218 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
219 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
220
221 memset(essiv->salt, 0, salt_size);
222
223 return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
224}
225
226static void crypt_iv_essiv_dtr(struct crypt_config *cc)
227{
228 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
229
230 crypto_free_cipher(essiv->tfm);
231 essiv->tfm = NULL;
232
233 crypto_free_hash(essiv->hash_tfm);
234 essiv->hash_tfm = NULL;
235
236 kzfree(essiv->salt);
237 essiv->salt = NULL;
238}
239
240static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
241 const char *opts)
242{
243 struct crypto_cipher *essiv_tfm = NULL;
244 struct crypto_hash *hash_tfm = NULL;
245 u8 *salt = NULL;
246 int err;
247
248 if (!opts) {
184 ti->error = "Digest algorithm missing for ESSIV mode"; 249 ti->error = "Digest algorithm missing for ESSIV mode";
185 return -EINVAL; 250 return -EINVAL;
186 } 251 }
187 252
188 /* Hash the cipher key with the given hash algorithm */ 253 /* Allocate hash algorithm */
189 hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); 254 hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
190 if (IS_ERR(hash_tfm)) { 255 if (IS_ERR(hash_tfm)) {
191 ti->error = "Error initializing ESSIV hash"; 256 ti->error = "Error initializing ESSIV hash";
192 return PTR_ERR(hash_tfm); 257 err = PTR_ERR(hash_tfm);
258 goto bad;
193 } 259 }
194 260
195 saltsize = crypto_hash_digestsize(hash_tfm); 261 salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL);
196 salt = kmalloc(saltsize, GFP_KERNEL); 262 if (!salt) {
197 if (salt == NULL) {
198 ti->error = "Error kmallocing salt storage in ESSIV"; 263 ti->error = "Error kmallocing salt storage in ESSIV";
199 crypto_free_hash(hash_tfm); 264 err = -ENOMEM;
200 return -ENOMEM; 265 goto bad;
201 } 266 }
202 267
203 sg_init_one(&sg, cc->key, cc->key_size); 268 /* Allocate essiv_tfm */
204 desc.tfm = hash_tfm;
205 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
206 err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
207 crypto_free_hash(hash_tfm);
208
209 if (err) {
210 ti->error = "Error calculating hash in ESSIV";
211 kfree(salt);
212 return err;
213 }
214
215 /* Setup the essiv_tfm with the given salt */
216 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); 269 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
217 if (IS_ERR(essiv_tfm)) { 270 if (IS_ERR(essiv_tfm)) {
218 ti->error = "Error allocating crypto tfm for ESSIV"; 271 ti->error = "Error allocating crypto tfm for ESSIV";
219 kfree(salt); 272 err = PTR_ERR(essiv_tfm);
220 return PTR_ERR(essiv_tfm); 273 goto bad;
221 } 274 }
222 if (crypto_cipher_blocksize(essiv_tfm) != 275 if (crypto_cipher_blocksize(essiv_tfm) !=
223 crypto_ablkcipher_ivsize(cc->tfm)) { 276 crypto_ablkcipher_ivsize(cc->tfm)) {
224 ti->error = "Block size of ESSIV cipher does " 277 ti->error = "Block size of ESSIV cipher does "
225 "not match IV size of block cipher"; 278 "not match IV size of block cipher";
226 crypto_free_cipher(essiv_tfm); 279 err = -EINVAL;
227 kfree(salt); 280 goto bad;
228 return -EINVAL;
229 } 281 }
230 err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
231 if (err) {
232 ti->error = "Failed to set key for ESSIV cipher";
233 crypto_free_cipher(essiv_tfm);
234 kfree(salt);
235 return err;
236 }
237 kfree(salt);
238 282
239 cc->iv_gen_private.essiv_tfm = essiv_tfm; 283 cc->iv_gen_private.essiv.salt = salt;
284 cc->iv_gen_private.essiv.tfm = essiv_tfm;
285 cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
286
240 return 0; 287 return 0;
241}
242 288
243static void crypt_iv_essiv_dtr(struct crypt_config *cc) 289bad:
244{ 290 if (essiv_tfm && !IS_ERR(essiv_tfm))
245 crypto_free_cipher(cc->iv_gen_private.essiv_tfm); 291 crypto_free_cipher(essiv_tfm);
246 cc->iv_gen_private.essiv_tfm = NULL; 292 if (hash_tfm && !IS_ERR(hash_tfm))
293 crypto_free_hash(hash_tfm);
294 kfree(salt);
295 return err;
247} 296}
248 297
249static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 298static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
250{ 299{
251 memset(iv, 0, cc->iv_size); 300 memset(iv, 0, cc->iv_size);
252 *(u64 *)iv = cpu_to_le64(sector); 301 *(u64 *)iv = cpu_to_le64(sector);
253 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv); 302 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
254 return 0; 303 return 0;
255} 304}
256 305
@@ -273,7 +322,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
273 return -EINVAL; 322 return -EINVAL;
274 } 323 }
275 324
276 cc->iv_gen_private.benbi_shift = 9 - log; 325 cc->iv_gen_private.benbi.shift = 9 - log;
277 326
278 return 0; 327 return 0;
279} 328}
@@ -288,7 +337,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
288 337
289 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ 338 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
290 339
291 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1); 340 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
292 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); 341 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
293 342
294 return 0; 343 return 0;
@@ -305,9 +354,15 @@ static struct crypt_iv_operations crypt_iv_plain_ops = {
305 .generator = crypt_iv_plain_gen 354 .generator = crypt_iv_plain_gen
306}; 355};
307 356
357static struct crypt_iv_operations crypt_iv_plain64_ops = {
358 .generator = crypt_iv_plain64_gen
359};
360
308static struct crypt_iv_operations crypt_iv_essiv_ops = { 361static struct crypt_iv_operations crypt_iv_essiv_ops = {
309 .ctr = crypt_iv_essiv_ctr, 362 .ctr = crypt_iv_essiv_ctr,
310 .dtr = crypt_iv_essiv_dtr, 363 .dtr = crypt_iv_essiv_dtr,
364 .init = crypt_iv_essiv_init,
365 .wipe = crypt_iv_essiv_wipe,
311 .generator = crypt_iv_essiv_gen 366 .generator = crypt_iv_essiv_gen
312}; 367};
313 368
@@ -934,14 +989,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
934 989
935 set_bit(DM_CRYPT_KEY_VALID, &cc->flags); 990 set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
936 991
937 return 0; 992 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
938} 993}
939 994
940static int crypt_wipe_key(struct crypt_config *cc) 995static int crypt_wipe_key(struct crypt_config *cc)
941{ 996{
942 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); 997 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
943 memset(&cc->key, 0, cc->key_size * sizeof(u8)); 998 memset(&cc->key, 0, cc->key_size * sizeof(u8));
944 return 0; 999 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
945} 1000}
946 1001
947/* 1002/*
@@ -983,11 +1038,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
983 return -ENOMEM; 1038 return -ENOMEM;
984 } 1039 }
985 1040
986 if (crypt_set_key(cc, argv[1])) {
987 ti->error = "Error decoding key";
988 goto bad_cipher;
989 }
990
991 /* Compatibility mode for old dm-crypt cipher strings */ 1041 /* Compatibility mode for old dm-crypt cipher strings */
992 if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { 1042 if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
993 chainmode = "cbc"; 1043 chainmode = "cbc";
@@ -1015,6 +1065,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1015 strcpy(cc->chainmode, chainmode); 1065 strcpy(cc->chainmode, chainmode);
1016 cc->tfm = tfm; 1066 cc->tfm = tfm;
1017 1067
1068 if (crypt_set_key(cc, argv[1]) < 0) {
1069 ti->error = "Error decoding and setting key";
1070 goto bad_ivmode;
1071 }
1072
1018 /* 1073 /*
1019 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". 1074 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
1020 * See comments at iv code 1075 * See comments at iv code
@@ -1024,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1024 cc->iv_gen_ops = NULL; 1079 cc->iv_gen_ops = NULL;
1025 else if (strcmp(ivmode, "plain") == 0) 1080 else if (strcmp(ivmode, "plain") == 0)
1026 cc->iv_gen_ops = &crypt_iv_plain_ops; 1081 cc->iv_gen_ops = &crypt_iv_plain_ops;
1082 else if (strcmp(ivmode, "plain64") == 0)
1083 cc->iv_gen_ops = &crypt_iv_plain64_ops;
1027 else if (strcmp(ivmode, "essiv") == 0) 1084 else if (strcmp(ivmode, "essiv") == 0)
1028 cc->iv_gen_ops = &crypt_iv_essiv_ops; 1085 cc->iv_gen_ops = &crypt_iv_essiv_ops;
1029 else if (strcmp(ivmode, "benbi") == 0) 1086 else if (strcmp(ivmode, "benbi") == 0)
@@ -1039,6 +1096,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1039 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) 1096 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
1040 goto bad_ivmode; 1097 goto bad_ivmode;
1041 1098
1099 if (cc->iv_gen_ops && cc->iv_gen_ops->init &&
1100 cc->iv_gen_ops->init(cc) < 0) {
1101 ti->error = "Error initialising IV";
1102 goto bad_slab_pool;
1103 }
1104
1042 cc->iv_size = crypto_ablkcipher_ivsize(tfm); 1105 cc->iv_size = crypto_ablkcipher_ivsize(tfm);
1043 if (cc->iv_size) 1106 if (cc->iv_size)
1044 /* at least a 64 bit sector number should fit in our buffer */ 1107 /* at least a 64 bit sector number should fit in our buffer */
@@ -1085,11 +1148,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1085 goto bad_bs; 1148 goto bad_bs;
1086 } 1149 }
1087 1150
1088 if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) {
1089 ti->error = "Error setting key";
1090 goto bad_device;
1091 }
1092
1093 if (sscanf(argv[2], "%llu", &tmpll) != 1) { 1151 if (sscanf(argv[2], "%llu", &tmpll) != 1) {
1094 ti->error = "Invalid iv_offset sector"; 1152 ti->error = "Invalid iv_offset sector";
1095 goto bad_device; 1153 goto bad_device;
@@ -1278,6 +1336,7 @@ static void crypt_resume(struct dm_target *ti)
1278static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) 1336static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
1279{ 1337{
1280 struct crypt_config *cc = ti->private; 1338 struct crypt_config *cc = ti->private;
1339 int ret = -EINVAL;
1281 1340
1282 if (argc < 2) 1341 if (argc < 2)
1283 goto error; 1342 goto error;
@@ -1287,10 +1346,22 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
1287 DMWARN("not suspended during key manipulation."); 1346 DMWARN("not suspended during key manipulation.");
1288 return -EINVAL; 1347 return -EINVAL;
1289 } 1348 }
1290 if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) 1349 if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
1291 return crypt_set_key(cc, argv[2]); 1350 ret = crypt_set_key(cc, argv[2]);
1292 if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) 1351 if (ret)
1352 return ret;
1353 if (cc->iv_gen_ops && cc->iv_gen_ops->init)
1354 ret = cc->iv_gen_ops->init(cc);
1355 return ret;
1356 }
1357 if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
1358 if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
1359 ret = cc->iv_gen_ops->wipe(cc);
1360 if (ret)
1361 return ret;
1362 }
1293 return crypt_wipe_key(cc); 1363 return crypt_wipe_key(cc);
1364 }
1294 } 1365 }
1295 1366
1296error: 1367error:
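
The new plain64 IV generator above exists because "plain" truncates the sector number to 32 bits, so on devices larger than 2 TiB (with 512-byte sectors) the IV sequence repeats. Below is a minimal standalone C sketch of the two generators for comparison; it is not kernel code and is not part of the patch: the helper names and little-endian stores are illustrative, and it assumes an IV of at least 8 bytes.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void store_le32(uint8_t *dst, uint32_t v)
{
	for (int i = 0; i < 4; i++)
		dst[i] = (uint8_t)(v >> (8 * i));
}

static void store_le64(uint8_t *dst, uint64_t v)
{
	for (int i = 0; i < 8; i++)
		dst[i] = (uint8_t)(v >> (8 * i));
}

/* "plain": low 32 bits of the sector number, little-endian, zero-padded. */
static void iv_plain(uint8_t *iv, size_t iv_size, uint64_t sector)
{
	memset(iv, 0, iv_size);
	store_le32(iv, (uint32_t)sector);
}

/* "plain64": the full 64-bit sector number, little-endian, zero-padded. */
static void iv_plain64(uint8_t *iv, size_t iv_size, uint64_t sector)
{
	memset(iv, 0, iv_size);
	store_le64(iv, sector);
}

int main(void)
{
	uint8_t a[16], b[16];
	uint64_t sector = 0x100000001ULL;	/* beyond 2^32 sectors: the modes diverge */

	iv_plain(a, sizeof(a), sector);
	iv_plain64(b, sizeof(b), sector);
	for (int i = 0; i < 8; i++)
		printf("%02x", a[i]);
	printf("  (plain)\n");
	for (int i = 0; i < 8; i++)
		printf("%02x", b[i]);
	printf("  (plain64)\n");
	return 0;
}
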
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 7dbe652efb5a..2b7907b6dd09 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -172,7 +172,8 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
172 } 172 }
173 173
174 /* Validate the chunk size against the device block size */ 174 /* Validate the chunk size against the device block size */
175 if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) { 175 if (chunk_size %
176 (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) {
176 *error = "Chunk size is not a multiple of device blocksize"; 177 *error = "Chunk size is not a multiple of device blocksize";
177 return -EINVAL; 178 return -EINVAL;
178 } 179 }
@@ -190,6 +191,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
190} 191}
191 192
192int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, 193int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
194 struct dm_snapshot *snap,
193 unsigned *args_used, 195 unsigned *args_used,
194 struct dm_exception_store **store) 196 struct dm_exception_store **store)
195{ 197{
@@ -198,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
198 struct dm_exception_store *tmp_store; 200 struct dm_exception_store *tmp_store;
199 char persistent; 201 char persistent;
200 202
201 if (argc < 3) { 203 if (argc < 2) {
202 ti->error = "Insufficient exception store arguments"; 204 ti->error = "Insufficient exception store arguments";
203 return -EINVAL; 205 return -EINVAL;
204 } 206 }
@@ -209,14 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
209 return -ENOMEM; 211 return -ENOMEM;
210 } 212 }
211 213
212 persistent = toupper(*argv[1]); 214 persistent = toupper(*argv[0]);
213 if (persistent == 'P') 215 if (persistent == 'P')
214 type = get_type("P"); 216 type = get_type("P");
215 else if (persistent == 'N') 217 else if (persistent == 'N')
216 type = get_type("N"); 218 type = get_type("N");
217 else { 219 else {
218 ti->error = "Persistent flag is not P or N"; 220 ti->error = "Persistent flag is not P or N";
219 return -EINVAL; 221 r = -EINVAL;
222 goto bad_type;
220 } 223 }
221 224
222 if (!type) { 225 if (!type) {
@@ -226,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
226 } 229 }
227 230
228 tmp_store->type = type; 231 tmp_store->type = type;
229 tmp_store->ti = ti; 232 tmp_store->snap = snap;
230
231 r = dm_get_device(ti, argv[0], 0, 0,
232 FMODE_READ | FMODE_WRITE, &tmp_store->cow);
233 if (r) {
234 ti->error = "Cannot get COW device";
235 goto bad_cow;
236 }
237 233
238 r = set_chunk_size(tmp_store, argv[2], &ti->error); 234 r = set_chunk_size(tmp_store, argv[1], &ti->error);
239 if (r) 235 if (r)
240 goto bad_ctr; 236 goto bad;
241 237
242 r = type->ctr(tmp_store, 0, NULL); 238 r = type->ctr(tmp_store, 0, NULL);
243 if (r) { 239 if (r) {
244 ti->error = "Exception store type constructor failed"; 240 ti->error = "Exception store type constructor failed";
245 goto bad_ctr; 241 goto bad;
246 } 242 }
247 243
248 *args_used = 3; 244 *args_used = 2;
249 *store = tmp_store; 245 *store = tmp_store;
250 return 0; 246 return 0;
251 247
252bad_ctr: 248bad:
253 dm_put_device(ti, tmp_store->cow);
254bad_cow:
255 put_type(type); 249 put_type(type);
256bad_type: 250bad_type:
257 kfree(tmp_store); 251 kfree(tmp_store);
@@ -262,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create);
262void dm_exception_store_destroy(struct dm_exception_store *store) 256void dm_exception_store_destroy(struct dm_exception_store *store)
263{ 257{
264 store->type->dtr(store); 258 store->type->dtr(store);
265 dm_put_device(store->ti, store->cow);
266 put_type(store->type); 259 put_type(store->type);
267 kfree(store); 260 kfree(store);
268} 261}
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 8a223a48802c..e8dfa06af3ba 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -26,7 +26,7 @@ typedef sector_t chunk_t;
26 * of chunks that follow contiguously. Remaining bits hold the number of the 26 * of chunks that follow contiguously. Remaining bits hold the number of the
27 * chunk within the device. 27 * chunk within the device.
28 */ 28 */
29struct dm_snap_exception { 29struct dm_exception {
30 struct list_head hash_list; 30 struct list_head hash_list;
31 31
32 chunk_t old_chunk; 32 chunk_t old_chunk;
@@ -64,17 +64,34 @@ struct dm_exception_store_type {
64 * Find somewhere to store the next exception. 64 * Find somewhere to store the next exception.
65 */ 65 */
66 int (*prepare_exception) (struct dm_exception_store *store, 66 int (*prepare_exception) (struct dm_exception_store *store,
67 struct dm_snap_exception *e); 67 struct dm_exception *e);
68 68
69 /* 69 /*
70 * Update the metadata with this exception. 70 * Update the metadata with this exception.
71 */ 71 */
72 void (*commit_exception) (struct dm_exception_store *store, 72 void (*commit_exception) (struct dm_exception_store *store,
73 struct dm_snap_exception *e, 73 struct dm_exception *e,
74 void (*callback) (void *, int success), 74 void (*callback) (void *, int success),
75 void *callback_context); 75 void *callback_context);
76 76
77 /* 77 /*
78 * Returns 0 if the exception store is empty.
79 *
80 * If there are exceptions still to be merged, sets
81 * *last_old_chunk and *last_new_chunk to the most recent
82 * still-to-be-merged chunk and returns the number of
83 * consecutive previous ones.
84 */
85 int (*prepare_merge) (struct dm_exception_store *store,
86 chunk_t *last_old_chunk, chunk_t *last_new_chunk);
87
88 /*
89 * Clear the last n exceptions.
90 * nr_merged must be <= the value returned by prepare_merge.
91 */
92 int (*commit_merge) (struct dm_exception_store *store, int nr_merged);
93
94 /*
78 * The snapshot is invalid, note this in the metadata. 95 * The snapshot is invalid, note this in the metadata.
79 */ 96 */
80 void (*drop_snapshot) (struct dm_exception_store *store); 97 void (*drop_snapshot) (struct dm_exception_store *store);
@@ -86,19 +103,19 @@ struct dm_exception_store_type {
86 /* 103 /*
87 * Return how full the snapshot is. 104 * Return how full the snapshot is.
88 */ 105 */
89 void (*fraction_full) (struct dm_exception_store *store, 106 void (*usage) (struct dm_exception_store *store,
90 sector_t *numerator, 107 sector_t *total_sectors, sector_t *sectors_allocated,
91 sector_t *denominator); 108 sector_t *metadata_sectors);
92 109
93 /* For internal device-mapper use only. */ 110 /* For internal device-mapper use only. */
94 struct list_head list; 111 struct list_head list;
95}; 112};
96 113
114struct dm_snapshot;
115
97struct dm_exception_store { 116struct dm_exception_store {
98 struct dm_exception_store_type *type; 117 struct dm_exception_store_type *type;
99 struct dm_target *ti; 118 struct dm_snapshot *snap;
100
101 struct dm_dev *cow;
102 119
103 /* Size of data blocks saved - must be a power of 2 */ 120 /* Size of data blocks saved - must be a power of 2 */
104 unsigned chunk_size; 121 unsigned chunk_size;
@@ -109,6 +126,11 @@ struct dm_exception_store {
109}; 126};
110 127
111/* 128/*
129 * Obtain the cow device used by a given snapshot.
130 */
131struct dm_dev *dm_snap_cow(struct dm_snapshot *snap);
132
133/*
112 * Funtions to manipulate consecutive chunks 134 * Funtions to manipulate consecutive chunks
113 */ 135 */
114# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) 136# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
@@ -120,18 +142,25 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
120 return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); 142 return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
121} 143}
122 144
123static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) 145static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
124{ 146{
125 return e->new_chunk >> DM_CHUNK_NUMBER_BITS; 147 return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
126} 148}
127 149
128static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) 150static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
129{ 151{
130 e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); 152 e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
131 153
132 BUG_ON(!dm_consecutive_chunk_count(e)); 154 BUG_ON(!dm_consecutive_chunk_count(e));
133} 155}
134 156
157static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
158{
159 BUG_ON(!dm_consecutive_chunk_count(e));
160
161 e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS);
162}
163
135# else 164# else
136# define DM_CHUNK_CONSECUTIVE_BITS 0 165# define DM_CHUNK_CONSECUTIVE_BITS 0
137 166
@@ -140,12 +169,16 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
140 return chunk; 169 return chunk;
141} 170}
142 171
143static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) 172static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
144{ 173{
145 return 0; 174 return 0;
146} 175}
147 176
148static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) 177static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
178{
179}
180
181static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
149{ 182{
150} 183}
151 184
@@ -162,7 +195,7 @@ static inline sector_t get_dev_size(struct block_device *bdev)
162static inline chunk_t sector_to_chunk(struct dm_exception_store *store, 195static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
163 sector_t sector) 196 sector_t sector)
164{ 197{
165 return (sector & ~store->chunk_mask) >> store->chunk_shift; 198 return sector >> store->chunk_shift;
166} 199}
167 200
168int dm_exception_store_type_register(struct dm_exception_store_type *type); 201int dm_exception_store_type_register(struct dm_exception_store_type *type);
@@ -173,6 +206,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
173 char **error); 206 char **error);
174 207
175int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, 208int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
209 struct dm_snapshot *snap,
176 unsigned *args_used, 210 unsigned *args_used,
177 struct dm_exception_store **store); 211 struct dm_exception_store **store);
178void dm_exception_store_destroy(struct dm_exception_store *store); 212void dm_exception_store_destroy(struct dm_exception_store *store);
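
The exception store encodes a run of consecutive chunks in the spare high bits of new_chunk: the low DM_CHUNK_NUMBER_BITS hold the chunk number and the bits above hold how many chunks follow contiguously, which is what the new dm_consecutive_chunk_count_dec() (added alongside the prepare_merge/commit_merge hooks) unwinds. A small standalone sketch of that packing, not part of the patch, assuming the 56/8 bit split used when consecutive-chunk support is enabled; the names are illustrative, not the kernel's.

#include <assert.h>
#include <stdint.h>

#define CHUNK_NUMBER_BITS 56	/* assumed split; the consecutive count lives above */

typedef uint64_t chunk_t;

static chunk_t chunk_number(chunk_t c)
{
	return c & ((1ULL << CHUNK_NUMBER_BITS) - 1);
}

static unsigned consecutive_count(chunk_t c)
{
	return (unsigned)(c >> CHUNK_NUMBER_BITS);
}

int main(void)
{
	chunk_t e = 1000;			/* exception starts at chunk 1000 */

	e += 1ULL << CHUNK_NUMBER_BITS;		/* count_inc: covers chunks 1000-1001 */
	e += 1ULL << CHUNK_NUMBER_BITS;		/* covers chunks 1000-1002 */
	assert(chunk_number(e) == 1000 && consecutive_count(e) == 2);

	e -= 1ULL << CHUNK_NUMBER_BITS;		/* count_dec, as used while merging */
	assert(consecutive_count(e) == 1);
	return 0;
}
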
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 3a2e6a2f8bdd..10f457ca6af2 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -5,6 +5,8 @@
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8#include "dm.h"
9
8#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
9 11
10#include <linux/bio.h> 12#include <linux/bio.h>
@@ -14,12 +16,19 @@
14#include <linux/slab.h> 16#include <linux/slab.h>
15#include <linux/dm-io.h> 17#include <linux/dm-io.h>
16 18
19#define DM_MSG_PREFIX "io"
20
21#define DM_IO_MAX_REGIONS BITS_PER_LONG
22
17struct dm_io_client { 23struct dm_io_client {
18 mempool_t *pool; 24 mempool_t *pool;
19 struct bio_set *bios; 25 struct bio_set *bios;
20}; 26};
21 27
22/* FIXME: can we shrink this ? */ 28/*
29 * Aligning 'struct io' reduces the number of bits required to store
30 * its address. Refer to store_io_and_region_in_bio() below.
31 */
23struct io { 32struct io {
24 unsigned long error_bits; 33 unsigned long error_bits;
25 unsigned long eopnotsupp_bits; 34 unsigned long eopnotsupp_bits;
@@ -28,7 +37,9 @@ struct io {
28 struct dm_io_client *client; 37 struct dm_io_client *client;
29 io_notify_fn callback; 38 io_notify_fn callback;
30 void *context; 39 void *context;
31}; 40} __attribute__((aligned(DM_IO_MAX_REGIONS)));
41
42static struct kmem_cache *_dm_io_cache;
32 43
33/* 44/*
34 * io contexts are only dynamically allocated for asynchronous 45 * io contexts are only dynamically allocated for asynchronous
@@ -53,7 +64,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
53 if (!client) 64 if (!client)
54 return ERR_PTR(-ENOMEM); 65 return ERR_PTR(-ENOMEM);
55 66
56 client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io)); 67 client->pool = mempool_create_slab_pool(ios, _dm_io_cache);
57 if (!client->pool) 68 if (!client->pool)
58 goto bad; 69 goto bad;
59 70
@@ -88,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy);
88 99
89/*----------------------------------------------------------------- 100/*-----------------------------------------------------------------
90 * We need to keep track of which region a bio is doing io for. 101 * We need to keep track of which region a bio is doing io for.
91 * In order to save a memory allocation we store this the last 102 * To avoid a memory allocation to store just 5 or 6 bits, we
92 * bvec which we know is unused (blech). 103 * ensure the 'struct io' pointer is aligned so enough low bits are
93 * XXX This is ugly and can OOPS with some configs... find another way. 104 * always zero and then combine it with the region number directly in
105 * bi_private.
94 *---------------------------------------------------------------*/ 106 *---------------------------------------------------------------*/
95static inline void bio_set_region(struct bio *bio, unsigned region) 107static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
108 unsigned region)
96{ 109{
97 bio->bi_io_vec[bio->bi_max_vecs].bv_len = region; 110 if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
111 DMCRIT("Unaligned struct io pointer %p", io);
112 BUG();
113 }
114
115 bio->bi_private = (void *)((unsigned long)io | region);
98} 116}
99 117
100static inline unsigned bio_get_region(struct bio *bio) 118static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
119 unsigned *region)
101{ 120{
102 return bio->bi_io_vec[bio->bi_max_vecs].bv_len; 121 unsigned long val = (unsigned long)bio->bi_private;
122
123 *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
124 *region = val & (DM_IO_MAX_REGIONS - 1);
103} 125}
104 126
105/*----------------------------------------------------------------- 127/*-----------------------------------------------------------------
@@ -140,10 +162,8 @@ static void endio(struct bio *bio, int error)
140 /* 162 /*
141 * The bio destructor in bio_put() may use the io object. 163 * The bio destructor in bio_put() may use the io object.
142 */ 164 */
143 io = bio->bi_private; 165 retrieve_io_and_region_from_bio(bio, &io, &region);
144 region = bio_get_region(bio);
145 166
146 bio->bi_max_vecs++;
147 bio_put(bio); 167 bio_put(bio);
148 168
149 dec_count(io, region, error); 169 dec_count(io, region, error);
@@ -243,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data)
243 263
244static void dm_bio_destructor(struct bio *bio) 264static void dm_bio_destructor(struct bio *bio)
245{ 265{
246 struct io *io = bio->bi_private; 266 unsigned region;
267 struct io *io;
268
269 retrieve_io_and_region_from_bio(bio, &io, &region);
247 270
248 bio_free(bio, io->client->bios); 271 bio_free(bio, io->client->bios);
249} 272}
@@ -286,26 +309,23 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
286 unsigned num_bvecs; 309 unsigned num_bvecs;
287 sector_t remaining = where->count; 310 sector_t remaining = where->count;
288 311
289 while (remaining) { 312 /*
313 * where->count may be zero if rw holds a write barrier and we
314 * need to send a zero-sized barrier.
315 */
316 do {
290 /* 317 /*
291 * Allocate a suitably sized-bio: we add an extra 318 * Allocate a suitably sized-bio.
292 * bvec for bio_get/set_region() and decrement bi_max_vecs
293 * to hide it from bio_add_page().
294 */ 319 */
295 num_bvecs = dm_sector_div_up(remaining, 320 num_bvecs = dm_sector_div_up(remaining,
296 (PAGE_SIZE >> SECTOR_SHIFT)); 321 (PAGE_SIZE >> SECTOR_SHIFT));
297 num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev), 322 num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
298 num_bvecs);
299 if (unlikely(num_bvecs > BIO_MAX_PAGES))
300 num_bvecs = BIO_MAX_PAGES;
301 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); 323 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
302 bio->bi_sector = where->sector + (where->count - remaining); 324 bio->bi_sector = where->sector + (where->count - remaining);
303 bio->bi_bdev = where->bdev; 325 bio->bi_bdev = where->bdev;
304 bio->bi_end_io = endio; 326 bio->bi_end_io = endio;
305 bio->bi_private = io;
306 bio->bi_destructor = dm_bio_destructor; 327 bio->bi_destructor = dm_bio_destructor;
307 bio->bi_max_vecs--; 328 store_io_and_region_in_bio(bio, io, region);
308 bio_set_region(bio, region);
309 329
310 /* 330 /*
311 * Try and add as many pages as possible. 331 * Try and add as many pages as possible.
@@ -323,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
323 343
324 atomic_inc(&io->count); 344 atomic_inc(&io->count);
325 submit_bio(rw, bio); 345 submit_bio(rw, bio);
326 } 346 } while (remaining);
327} 347}
328 348
329static void dispatch_io(int rw, unsigned int num_regions, 349static void dispatch_io(int rw, unsigned int num_regions,
@@ -333,6 +353,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
333 int i; 353 int i;
334 struct dpages old_pages = *dp; 354 struct dpages old_pages = *dp;
335 355
356 BUG_ON(num_regions > DM_IO_MAX_REGIONS);
357
336 if (sync) 358 if (sync)
337 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 359 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
338 360
@@ -342,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
342 */ 364 */
343 for (i = 0; i < num_regions; i++) { 365 for (i = 0; i < num_regions; i++) {
344 *dp = old_pages; 366 *dp = old_pages;
345 if (where[i].count) 367 if (where[i].count || (rw & (1 << BIO_RW_BARRIER)))
346 do_region(rw, i, where + i, dp, io); 368 do_region(rw, i, where + i, dp, io);
347 } 369 }
348 370
@@ -357,7 +379,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
357 struct dm_io_region *where, int rw, struct dpages *dp, 379 struct dm_io_region *where, int rw, struct dpages *dp,
358 unsigned long *error_bits) 380 unsigned long *error_bits)
359{ 381{
360 struct io io; 382 /*
383 * gcc <= 4.3 can't do the alignment for stack variables, so we must
384 * align it on our own.
385 * volatile prevents the optimizer from removing or reusing
386 * "io_" field from the stack frame (allowed in ANSI C).
387 */
388 volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
389 struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
361 390
362 if (num_regions > 1 && (rw & RW_MASK) != WRITE) { 391 if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
363 WARN_ON(1); 392 WARN_ON(1);
@@ -365,33 +394,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
365 } 394 }
366 395
367retry: 396retry:
368 io.error_bits = 0; 397 io->error_bits = 0;
369 io.eopnotsupp_bits = 0; 398 io->eopnotsupp_bits = 0;
370 atomic_set(&io.count, 1); /* see dispatch_io() */ 399 atomic_set(&io->count, 1); /* see dispatch_io() */
371 io.sleeper = current; 400 io->sleeper = current;
372 io.client = client; 401 io->client = client;
373 402
374 dispatch_io(rw, num_regions, where, dp, &io, 1); 403 dispatch_io(rw, num_regions, where, dp, io, 1);
375 404
376 while (1) { 405 while (1) {
377 set_current_state(TASK_UNINTERRUPTIBLE); 406 set_current_state(TASK_UNINTERRUPTIBLE);
378 407
379 if (!atomic_read(&io.count)) 408 if (!atomic_read(&io->count))
380 break; 409 break;
381 410
382 io_schedule(); 411 io_schedule();
383 } 412 }
384 set_current_state(TASK_RUNNING); 413 set_current_state(TASK_RUNNING);
385 414
386 if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { 415 if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
387 rw &= ~(1 << BIO_RW_BARRIER); 416 rw &= ~(1 << BIO_RW_BARRIER);
388 goto retry; 417 goto retry;
389 } 418 }
390 419
391 if (error_bits) 420 if (error_bits)
392 *error_bits = io.error_bits; 421 *error_bits = io->error_bits;
393 422
394 return io.error_bits ? -EIO : 0; 423 return io->error_bits ? -EIO : 0;
395} 424}
396 425
397static int async_io(struct dm_io_client *client, unsigned int num_regions, 426static int async_io(struct dm_io_client *client, unsigned int num_regions,
@@ -472,3 +501,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
472 &dp, io_req->notify.fn, io_req->notify.context); 501 &dp, io_req->notify.fn, io_req->notify.context);
473} 502}
474EXPORT_SYMBOL(dm_io); 503EXPORT_SYMBOL(dm_io);
504
505int __init dm_io_init(void)
506{
507 _dm_io_cache = KMEM_CACHE(io, 0);
508 if (!_dm_io_cache)
509 return -ENOMEM;
510
511 return 0;
512}
513
514void dm_io_exit(void)
515{
516 kmem_cache_destroy(_dm_io_cache);
517 _dm_io_cache = NULL;
518}
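
The dm-io rework above drops the old "spare bvec" trick and instead packs the region number into the low bits of the 'struct io' pointer stored in bi_private: because 'struct io' is aligned to DM_IO_MAX_REGIONS (BITS_PER_LONG), those low bits are guaranteed to be zero. A userspace sketch of the same packing scheme, not part of the patch; the names and the aligned allocation are illustrative, not the kernel API.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define MAX_REGIONS (8 * sizeof(unsigned long))	/* mirrors DM_IO_MAX_REGIONS */

struct io {
	unsigned long error_bits;
	/* ... */
} __attribute__((aligned(MAX_REGIONS)));

/* Combine an aligned pointer and a small region index in one word. */
static void *pack(struct io *io, unsigned region)
{
	assert(((uintptr_t)io % MAX_REGIONS) == 0 && region < MAX_REGIONS);
	return (void *)((uintptr_t)io | region);
}

static void unpack(void *priv, struct io **io, unsigned *region)
{
	uintptr_t val = (uintptr_t)priv;

	*io = (struct io *)(val & ~(uintptr_t)(MAX_REGIONS - 1));
	*region = (unsigned)(val & (MAX_REGIONS - 1));
}

int main(void)
{
	struct io *io = aligned_alloc(MAX_REGIONS, sizeof(*io));
	struct io *out;
	unsigned region;

	unpack(pack(io, 5), &out, &region);
	assert(out == io && region == 5);
	free(io);
	return 0;
}
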
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index a67942931582..1d669322b27c 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -56,6 +56,11 @@ static void dm_hash_remove_all(int keep_open_devices);
56 */ 56 */
57static DECLARE_RWSEM(_hash_lock); 57static DECLARE_RWSEM(_hash_lock);
58 58
59/*
60 * Protects use of mdptr to obtain hash cell name and uuid from mapped device.
61 */
62static DEFINE_MUTEX(dm_hash_cells_mutex);
63
59static void init_buckets(struct list_head *buckets) 64static void init_buckets(struct list_head *buckets)
60{ 65{
61 unsigned int i; 66 unsigned int i;
@@ -206,7 +211,9 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
206 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); 211 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
207 } 212 }
208 dm_get(md); 213 dm_get(md);
214 mutex_lock(&dm_hash_cells_mutex);
209 dm_set_mdptr(md, cell); 215 dm_set_mdptr(md, cell);
216 mutex_unlock(&dm_hash_cells_mutex);
210 up_write(&_hash_lock); 217 up_write(&_hash_lock);
211 218
212 return 0; 219 return 0;
@@ -224,9 +231,11 @@ static void __hash_remove(struct hash_cell *hc)
224 /* remove from the dev hash */ 231 /* remove from the dev hash */
225 list_del(&hc->uuid_list); 232 list_del(&hc->uuid_list);
226 list_del(&hc->name_list); 233 list_del(&hc->name_list);
234 mutex_lock(&dm_hash_cells_mutex);
227 dm_set_mdptr(hc->md, NULL); 235 dm_set_mdptr(hc->md, NULL);
236 mutex_unlock(&dm_hash_cells_mutex);
228 237
229 table = dm_get_table(hc->md); 238 table = dm_get_live_table(hc->md);
230 if (table) { 239 if (table) {
231 dm_table_event(table); 240 dm_table_event(table);
232 dm_table_put(table); 241 dm_table_put(table);
@@ -321,13 +330,15 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
321 */ 330 */
322 list_del(&hc->name_list); 331 list_del(&hc->name_list);
323 old_name = hc->name; 332 old_name = hc->name;
333 mutex_lock(&dm_hash_cells_mutex);
324 hc->name = new_name; 334 hc->name = new_name;
335 mutex_unlock(&dm_hash_cells_mutex);
325 list_add(&hc->name_list, _name_buckets + hash_str(new_name)); 336 list_add(&hc->name_list, _name_buckets + hash_str(new_name));
326 337
327 /* 338 /*
328 * Wake up any dm event waiters. 339 * Wake up any dm event waiters.
329 */ 340 */
330 table = dm_get_table(hc->md); 341 table = dm_get_live_table(hc->md);
331 if (table) { 342 if (table) {
332 dm_table_event(table); 343 dm_table_event(table);
333 dm_table_put(table); 344 dm_table_put(table);
@@ -512,8 +523,6 @@ static int list_versions(struct dm_ioctl *param, size_t param_size)
512 return 0; 523 return 0;
513} 524}
514 525
515
516
517static int check_name(const char *name) 526static int check_name(const char *name)
518{ 527{
519 if (strchr(name, '/')) { 528 if (strchr(name, '/')) {
@@ -525,6 +534,40 @@ static int check_name(const char *name)
525} 534}
526 535
527/* 536/*
537 * On successful return, the caller must not attempt to acquire
538 * _hash_lock without first calling dm_table_put, because dm_table_destroy
539 * waits for this dm_table_put and could be called under this lock.
540 */
541static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
542{
543 struct hash_cell *hc;
544 struct dm_table *table = NULL;
545
546 down_read(&_hash_lock);
547 hc = dm_get_mdptr(md);
548 if (!hc || hc->md != md) {
549 DMWARN("device has been removed from the dev hash table.");
550 goto out;
551 }
552
553 table = hc->new_map;
554 if (table)
555 dm_table_get(table);
556
557out:
558 up_read(&_hash_lock);
559
560 return table;
561}
562
563static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
564 struct dm_ioctl *param)
565{
566 return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ?
567 dm_get_inactive_table(md) : dm_get_live_table(md);
568}
569
570/*
528 * Fills in a dm_ioctl structure, ready for sending back to 571 * Fills in a dm_ioctl structure, ready for sending back to
529 * userland. 572 * userland.
530 */ 573 */
@@ -536,7 +579,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
536 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | 579 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
537 DM_ACTIVE_PRESENT_FLAG); 580 DM_ACTIVE_PRESENT_FLAG);
538 581
539 if (dm_suspended(md)) 582 if (dm_suspended_md(md))
540 param->flags |= DM_SUSPEND_FLAG; 583 param->flags |= DM_SUSPEND_FLAG;
541 584
542 param->dev = huge_encode_dev(disk_devt(disk)); 585 param->dev = huge_encode_dev(disk_devt(disk));
@@ -548,18 +591,30 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
548 */ 591 */
549 param->open_count = dm_open_count(md); 592 param->open_count = dm_open_count(md);
550 593
551 if (get_disk_ro(disk))
552 param->flags |= DM_READONLY_FLAG;
553
554 param->event_nr = dm_get_event_nr(md); 594 param->event_nr = dm_get_event_nr(md);
595 param->target_count = 0;
555 596
556 table = dm_get_table(md); 597 table = dm_get_live_table(md);
557 if (table) { 598 if (table) {
558 param->flags |= DM_ACTIVE_PRESENT_FLAG; 599 if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
559 param->target_count = dm_table_get_num_targets(table); 600 if (get_disk_ro(disk))
601 param->flags |= DM_READONLY_FLAG;
602 param->target_count = dm_table_get_num_targets(table);
603 }
560 dm_table_put(table); 604 dm_table_put(table);
561 } else 605
562 param->target_count = 0; 606 param->flags |= DM_ACTIVE_PRESENT_FLAG;
607 }
608
609 if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) {
610 table = dm_get_inactive_table(md);
611 if (table) {
612 if (!(dm_table_get_mode(table) & FMODE_WRITE))
613 param->flags |= DM_READONLY_FLAG;
614 param->target_count = dm_table_get_num_targets(table);
615 dm_table_put(table);
616 }
617 }
563 618
564 return 0; 619 return 0;
565} 620}
@@ -634,9 +689,9 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
634 * Sneakily write in both the name and the uuid 689 * Sneakily write in both the name and the uuid
635 * while we have the cell. 690 * while we have the cell.
636 */ 691 */
637 strncpy(param->name, hc->name, sizeof(param->name)); 692 strlcpy(param->name, hc->name, sizeof(param->name));
638 if (hc->uuid) 693 if (hc->uuid)
639 strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1); 694 strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
640 else 695 else
641 param->uuid[0] = '\0'; 696 param->uuid[0] = '\0';
642 697
@@ -784,7 +839,7 @@ static int do_suspend(struct dm_ioctl *param)
784 if (param->flags & DM_NOFLUSH_FLAG) 839 if (param->flags & DM_NOFLUSH_FLAG)
785 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; 840 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
786 841
787 if (!dm_suspended(md)) 842 if (!dm_suspended_md(md))
788 r = dm_suspend(md, suspend_flags); 843 r = dm_suspend(md, suspend_flags);
789 844
790 if (!r) 845 if (!r)
@@ -800,7 +855,7 @@ static int do_resume(struct dm_ioctl *param)
800 unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; 855 unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
801 struct hash_cell *hc; 856 struct hash_cell *hc;
802 struct mapped_device *md; 857 struct mapped_device *md;
803 struct dm_table *new_map; 858 struct dm_table *new_map, *old_map = NULL;
804 859
805 down_write(&_hash_lock); 860 down_write(&_hash_lock);
806 861
@@ -826,14 +881,14 @@ static int do_resume(struct dm_ioctl *param)
826 suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; 881 suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
827 if (param->flags & DM_NOFLUSH_FLAG) 882 if (param->flags & DM_NOFLUSH_FLAG)
828 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; 883 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
829 if (!dm_suspended(md)) 884 if (!dm_suspended_md(md))
830 dm_suspend(md, suspend_flags); 885 dm_suspend(md, suspend_flags);
831 886
832 r = dm_swap_table(md, new_map); 887 old_map = dm_swap_table(md, new_map);
833 if (r) { 888 if (IS_ERR(old_map)) {
834 dm_table_destroy(new_map); 889 dm_table_destroy(new_map);
835 dm_put(md); 890 dm_put(md);
836 return r; 891 return PTR_ERR(old_map);
837 } 892 }
838 893
839 if (dm_table_get_mode(new_map) & FMODE_WRITE) 894 if (dm_table_get_mode(new_map) & FMODE_WRITE)
@@ -842,9 +897,11 @@ static int do_resume(struct dm_ioctl *param)
842 set_disk_ro(dm_disk(md), 1); 897 set_disk_ro(dm_disk(md), 1);
843 } 898 }
844 899
845 if (dm_suspended(md)) 900 if (dm_suspended_md(md))
846 r = dm_resume(md); 901 r = dm_resume(md);
847 902
903 if (old_map)
904 dm_table_destroy(old_map);
848 905
849 if (!r) { 906 if (!r) {
850 dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); 907 dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr);
@@ -982,7 +1039,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
982 if (r) 1039 if (r)
983 goto out; 1040 goto out;
984 1041
985 table = dm_get_table(md); 1042 table = dm_get_live_or_inactive_table(md, param);
986 if (table) { 1043 if (table) {
987 retrieve_status(table, param, param_size); 1044 retrieve_status(table, param, param_size);
988 dm_table_put(table); 1045 dm_table_put(table);
@@ -1215,7 +1272,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
1215 if (r) 1272 if (r)
1216 goto out; 1273 goto out;
1217 1274
1218 table = dm_get_table(md); 1275 table = dm_get_live_or_inactive_table(md, param);
1219 if (table) { 1276 if (table) {
1220 retrieve_deps(table, param, param_size); 1277 retrieve_deps(table, param, param_size);
1221 dm_table_put(table); 1278 dm_table_put(table);
@@ -1244,13 +1301,13 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
1244 if (r) 1301 if (r)
1245 goto out; 1302 goto out;
1246 1303
1247 table = dm_get_table(md); 1304 table = dm_get_live_or_inactive_table(md, param);
1248 if (table) { 1305 if (table) {
1249 retrieve_status(table, param, param_size); 1306 retrieve_status(table, param, param_size);
1250 dm_table_put(table); 1307 dm_table_put(table);
1251 } 1308 }
1252 1309
1253 out: 1310out:
1254 dm_put(md); 1311 dm_put(md);
1255 return r; 1312 return r;
1256} 1313}
@@ -1288,10 +1345,15 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1288 goto out; 1345 goto out;
1289 } 1346 }
1290 1347
1291 table = dm_get_table(md); 1348 table = dm_get_live_table(md);
1292 if (!table) 1349 if (!table)
1293 goto out_argv; 1350 goto out_argv;
1294 1351
1352 if (dm_deleting_md(md)) {
1353 r = -ENXIO;
1354 goto out_table;
1355 }
1356
1295 ti = dm_table_find_target(table, tmsg->sector); 1357 ti = dm_table_find_target(table, tmsg->sector);
1296 if (!dm_target_is_valid(ti)) { 1358 if (!dm_target_is_valid(ti)) {
1297 DMWARN("Target message sector outside device."); 1359 DMWARN("Target message sector outside device.");
@@ -1303,6 +1365,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1303 r = -EINVAL; 1365 r = -EINVAL;
1304 } 1366 }
1305 1367
1368 out_table:
1306 dm_table_put(table); 1369 dm_table_put(table);
1307 out_argv: 1370 out_argv:
1308 kfree(argv); 1371 kfree(argv);
@@ -1582,8 +1645,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
1582 if (!md) 1645 if (!md)
1583 return -ENXIO; 1646 return -ENXIO;
1584 1647
1585 dm_get(md); 1648 mutex_lock(&dm_hash_cells_mutex);
1586 down_read(&_hash_lock);
1587 hc = dm_get_mdptr(md); 1649 hc = dm_get_mdptr(md);
1588 if (!hc || hc->md != md) { 1650 if (!hc || hc->md != md) {
1589 r = -ENXIO; 1651 r = -ENXIO;
@@ -1596,8 +1658,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
1596 strcpy(uuid, hc->uuid ? : ""); 1658 strcpy(uuid, hc->uuid ? : "");
1597 1659
1598out: 1660out:
1599 up_read(&_hash_lock); 1661 mutex_unlock(&dm_hash_cells_mutex);
1600 dm_put(md);
1601 1662
1602 return r; 1663 return r;
1603} 1664}
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 3e3fc06cb861..addf83475040 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -450,7 +450,10 @@ static void dispatch_job(struct kcopyd_job *job)
450{ 450{
451 struct dm_kcopyd_client *kc = job->kc; 451 struct dm_kcopyd_client *kc = job->kc;
452 atomic_inc(&kc->nr_jobs); 452 atomic_inc(&kc->nr_jobs);
453 push(&kc->pages_jobs, job); 453 if (unlikely(!job->source.count))
454 push(&kc->complete_jobs, job);
455 else
456 push(&kc->pages_jobs, job);
454 wake(kc); 457 wake(kc);
455} 458}
456 459
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 9443896ede07..7035582786fb 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -145,8 +145,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
145EXPORT_SYMBOL(dm_dirty_log_type_unregister); 145EXPORT_SYMBOL(dm_dirty_log_type_unregister);
146 146
147struct dm_dirty_log *dm_dirty_log_create(const char *type_name, 147struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
148 struct dm_target *ti, 148 struct dm_target *ti,
149 unsigned int argc, char **argv) 149 int (*flush_callback_fn)(struct dm_target *ti),
150 unsigned int argc, char **argv)
150{ 151{
151 struct dm_dirty_log_type *type; 152 struct dm_dirty_log_type *type;
152 struct dm_dirty_log *log; 153 struct dm_dirty_log *log;
@@ -161,6 +162,7 @@ struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
161 return NULL; 162 return NULL;
162 } 163 }
163 164
165 log->flush_callback_fn = flush_callback_fn;
164 log->type = type; 166 log->type = type;
165 if (type->ctr(log, ti, argc, argv)) { 167 if (type->ctr(log, ti, argc, argv)) {
166 kfree(log); 168 kfree(log);
@@ -208,7 +210,9 @@ struct log_header {
208 210
209struct log_c { 211struct log_c {
210 struct dm_target *ti; 212 struct dm_target *ti;
211 int touched; 213 int touched_dirtied;
214 int touched_cleaned;
215 int flush_failed;
212 uint32_t region_size; 216 uint32_t region_size;
213 unsigned int region_count; 217 unsigned int region_count;
214 region_t sync_count; 218 region_t sync_count;
@@ -233,6 +237,7 @@ struct log_c {
233 * Disk log fields 237 * Disk log fields
234 */ 238 */
235 int log_dev_failed; 239 int log_dev_failed;
240 int log_dev_flush_failed;
236 struct dm_dev *log_dev; 241 struct dm_dev *log_dev;
237 struct log_header header; 242 struct log_header header;
238 243
@@ -253,14 +258,14 @@ static inline void log_set_bit(struct log_c *l,
253 uint32_t *bs, unsigned bit) 258 uint32_t *bs, unsigned bit)
254{ 259{
255 ext2_set_bit(bit, (unsigned long *) bs); 260 ext2_set_bit(bit, (unsigned long *) bs);
256 l->touched = 1; 261 l->touched_cleaned = 1;
257} 262}
258 263
259static inline void log_clear_bit(struct log_c *l, 264static inline void log_clear_bit(struct log_c *l,
260 uint32_t *bs, unsigned bit) 265 uint32_t *bs, unsigned bit)
261{ 266{
262 ext2_clear_bit(bit, (unsigned long *) bs); 267 ext2_clear_bit(bit, (unsigned long *) bs);
263 l->touched = 1; 268 l->touched_dirtied = 1;
264} 269}
265 270
266/*---------------------------------------------------------------- 271/*----------------------------------------------------------------
@@ -287,6 +292,19 @@ static int rw_header(struct log_c *lc, int rw)
287 return dm_io(&lc->io_req, 1, &lc->header_location, NULL); 292 return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
288} 293}
289 294
295static int flush_header(struct log_c *lc)
296{
297 struct dm_io_region null_location = {
298 .bdev = lc->header_location.bdev,
299 .sector = 0,
300 .count = 0,
301 };
302
303 lc->io_req.bi_rw = WRITE_BARRIER;
304
305 return dm_io(&lc->io_req, 1, &null_location, NULL);
306}
307
290static int read_header(struct log_c *log) 308static int read_header(struct log_c *log)
291{ 309{
292 int r; 310 int r;
@@ -378,7 +396,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
378 } 396 }
379 397
380 lc->ti = ti; 398 lc->ti = ti;
381 lc->touched = 0; 399 lc->touched_dirtied = 0;
400 lc->touched_cleaned = 0;
401 lc->flush_failed = 0;
382 lc->region_size = region_size; 402 lc->region_size = region_size;
383 lc->region_count = region_count; 403 lc->region_count = region_count;
384 lc->sync = sync; 404 lc->sync = sync;
@@ -406,6 +426,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
406 } else { 426 } else {
407 lc->log_dev = dev; 427 lc->log_dev = dev;
408 lc->log_dev_failed = 0; 428 lc->log_dev_failed = 0;
429 lc->log_dev_flush_failed = 0;
409 lc->header_location.bdev = lc->log_dev->bdev; 430 lc->header_location.bdev = lc->log_dev->bdev;
410 lc->header_location.sector = 0; 431 lc->header_location.sector = 0;
411 432
@@ -614,6 +635,11 @@ static int disk_resume(struct dm_dirty_log *log)
614 635
615 /* write the new header */ 636 /* write the new header */
616 r = rw_header(lc, WRITE); 637 r = rw_header(lc, WRITE);
638 if (!r) {
639 r = flush_header(lc);
640 if (r)
641 lc->log_dev_flush_failed = 1;
642 }
617 if (r) { 643 if (r) {
618 DMWARN("%s: Failed to write header on dirty region log device", 644 DMWARN("%s: Failed to write header on dirty region log device",
619 lc->log_dev->name); 645 lc->log_dev->name);
@@ -656,18 +682,40 @@ static int core_flush(struct dm_dirty_log *log)
656 682
657static int disk_flush(struct dm_dirty_log *log) 683static int disk_flush(struct dm_dirty_log *log)
658{ 684{
659 int r; 685 int r, i;
660 struct log_c *lc = (struct log_c *) log->context; 686 struct log_c *lc = log->context;
661 687
662 /* only write if the log has changed */ 688 /* only write if the log has changed */
663 if (!lc->touched) 689 if (!lc->touched_cleaned && !lc->touched_dirtied)
664 return 0; 690 return 0;
665 691
692 if (lc->touched_cleaned && log->flush_callback_fn &&
693 log->flush_callback_fn(lc->ti)) {
694 /*
695 * At this point it is impossible to determine which
696 * regions are clean and which are dirty (without
697 * re-reading the log off disk). So mark all of them
698 * dirty.
699 */
700 lc->flush_failed = 1;
701 for (i = 0; i < lc->region_count; i++)
702 log_clear_bit(lc, lc->clean_bits, i);
703 }
704
666 r = rw_header(lc, WRITE); 705 r = rw_header(lc, WRITE);
667 if (r) 706 if (r)
668 fail_log_device(lc); 707 fail_log_device(lc);
669 else 708 else {
670 lc->touched = 0; 709 if (lc->touched_dirtied) {
710 r = flush_header(lc);
711 if (r) {
712 lc->log_dev_flush_failed = 1;
713 fail_log_device(lc);
714 } else
715 lc->touched_dirtied = 0;
716 }
717 lc->touched_cleaned = 0;
718 }
671 719
672 return r; 720 return r;
673} 721}
@@ -681,7 +729,8 @@ static void core_mark_region(struct dm_dirty_log *log, region_t region)
681static void core_clear_region(struct dm_dirty_log *log, region_t region) 729static void core_clear_region(struct dm_dirty_log *log, region_t region)
682{ 730{
683 struct log_c *lc = (struct log_c *) log->context; 731 struct log_c *lc = (struct log_c *) log->context;
684 log_set_bit(lc, lc->clean_bits, region); 732 if (likely(!lc->flush_failed))
733 log_set_bit(lc, lc->clean_bits, region);
685} 734}
686 735
687static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) 736static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
@@ -762,7 +811,9 @@ static int disk_status(struct dm_dirty_log *log, status_type_t status,
762 switch(status) { 811 switch(status) {
763 case STATUSTYPE_INFO: 812 case STATUSTYPE_INFO:
764 DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, 813 DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name,
765 lc->log_dev_failed ? 'D' : 'A'); 814 lc->log_dev_flush_failed ? 'F' :
815 lc->log_dev_failed ? 'D' :
816 'A');
766 break; 817 break;
767 818
768 case STATUSTYPE_TABLE: 819 case STATUSTYPE_TABLE:
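
The disk_flush() changes above enforce two orderings: clean bits may only be written out after the mirror has flushed the writes that made those regions clean (the flush callback; if it fails, every region is marked dirty again because the clean bits can no longer be trusted), and newly set dirty bits must reach stable storage, via the zero-length barrier in flush_header(), before writes to those regions proceed. A condensed sketch of that control flow, not part of the patch, with the I/O helpers stubbed out as function pointers; everything except the logic visible in the hunks above is a placeholder, and error handling is simplified.

/*
 * Illustrative control flow only: write_header, flush_disk_cache and
 * flush_pending_writes stand in for rw_header(), flush_header() and the
 * mirror's flush callback in the patch above.
 */
struct log_state {
	int touched_dirtied;	/* a clean region was marked dirty since the last flush */
	int touched_cleaned;	/* a dirty region was marked clean since the last flush */
	int flush_failed;
};

int disk_flush_sketch(struct log_state *lc,
		      int (*write_header)(void),
		      int (*flush_disk_cache)(void),
		      int (*flush_pending_writes)(void),
		      void (*mark_all_regions_dirty)(void))
{
	int r;

	if (!lc->touched_cleaned && !lc->touched_dirtied)
		return 0;				/* nothing changed: skip the write */

	if (lc->touched_cleaned && flush_pending_writes()) {
		/* Clean bits can no longer be trusted: dirty everything. */
		lc->flush_failed = 1;
		mark_all_regions_dirty();
		lc->touched_dirtied = 1;		/* clearing clean bits dirties the header */
	}

	r = write_header();
	if (r)
		return r;

	if (lc->touched_dirtied) {
		r = flush_disk_cache();			/* dirty bits must hit the media first */
		if (r)
			return r;
		lc->touched_dirtied = 0;
	}
	lc->touched_cleaned = 0;

	return 0;
}
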
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index dce971dbdfa3..e81345a1d08f 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -93,6 +93,10 @@ struct multipath {
93 * can resubmit bios on error. 93 * can resubmit bios on error.
94 */ 94 */
95 mempool_t *mpio_pool; 95 mempool_t *mpio_pool;
96
97 struct mutex work_mutex;
98
99 unsigned suspended; /* Don't create new I/O internally when set. */
96}; 100};
97 101
98/* 102/*
@@ -198,6 +202,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
198 m->queue_io = 1; 202 m->queue_io = 1;
199 INIT_WORK(&m->process_queued_ios, process_queued_ios); 203 INIT_WORK(&m->process_queued_ios, process_queued_ios);
200 INIT_WORK(&m->trigger_event, trigger_event); 204 INIT_WORK(&m->trigger_event, trigger_event);
205 mutex_init(&m->work_mutex);
201 m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); 206 m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
202 if (!m->mpio_pool) { 207 if (!m->mpio_pool) {
203 kfree(m); 208 kfree(m);
@@ -885,13 +890,18 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
885 return r; 890 return r;
886} 891}
887 892
888static void multipath_dtr(struct dm_target *ti) 893static void flush_multipath_work(void)
889{ 894{
890 struct multipath *m = (struct multipath *) ti->private;
891
892 flush_workqueue(kmpath_handlerd); 895 flush_workqueue(kmpath_handlerd);
893 flush_workqueue(kmultipathd); 896 flush_workqueue(kmultipathd);
894 flush_scheduled_work(); 897 flush_scheduled_work();
898}
899
900static void multipath_dtr(struct dm_target *ti)
901{
902 struct multipath *m = ti->private;
903
904 flush_multipath_work();
895 free_multipath(m); 905 free_multipath(m);
896} 906}
897 907
@@ -1261,6 +1271,16 @@ static void multipath_presuspend(struct dm_target *ti)
1261 queue_if_no_path(m, 0, 1); 1271 queue_if_no_path(m, 0, 1);
1262} 1272}
1263 1273
1274static void multipath_postsuspend(struct dm_target *ti)
1275{
1276 struct multipath *m = ti->private;
1277
1278 mutex_lock(&m->work_mutex);
1279 m->suspended = 1;
1280 flush_multipath_work();
1281 mutex_unlock(&m->work_mutex);
1282}
1283
1264/* 1284/*
1265 * Restore the queue_if_no_path setting. 1285 * Restore the queue_if_no_path setting.
1266 */ 1286 */
@@ -1269,6 +1289,10 @@ static void multipath_resume(struct dm_target *ti)
1269 struct multipath *m = (struct multipath *) ti->private; 1289 struct multipath *m = (struct multipath *) ti->private;
1270 unsigned long flags; 1290 unsigned long flags;
1271 1291
1292 mutex_lock(&m->work_mutex);
1293 m->suspended = 0;
1294 mutex_unlock(&m->work_mutex);
1295
1272 spin_lock_irqsave(&m->lock, flags); 1296 spin_lock_irqsave(&m->lock, flags);
1273 m->queue_if_no_path = m->saved_queue_if_no_path; 1297 m->queue_if_no_path = m->saved_queue_if_no_path;
1274 spin_unlock_irqrestore(&m->lock, flags); 1298 spin_unlock_irqrestore(&m->lock, flags);
@@ -1397,51 +1421,71 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
1397 1421
1398static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) 1422static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1399{ 1423{
1400 int r; 1424 int r = -EINVAL;
1401 struct dm_dev *dev; 1425 struct dm_dev *dev;
1402 struct multipath *m = (struct multipath *) ti->private; 1426 struct multipath *m = (struct multipath *) ti->private;
1403 action_fn action; 1427 action_fn action;
1404 1428
1429 mutex_lock(&m->work_mutex);
1430
1431 if (m->suspended) {
1432 r = -EBUSY;
1433 goto out;
1434 }
1435
1436 if (dm_suspended(ti)) {
1437 r = -EBUSY;
1438 goto out;
1439 }
1440
1405 if (argc == 1) { 1441 if (argc == 1) {
1406 if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) 1442 if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) {
1407 return queue_if_no_path(m, 1, 0); 1443 r = queue_if_no_path(m, 1, 0);
1408 else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) 1444 goto out;
1409 return queue_if_no_path(m, 0, 0); 1445 } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) {
1446 r = queue_if_no_path(m, 0, 0);
1447 goto out;
1448 }
1410 } 1449 }
1411 1450
1412 if (argc != 2) 1451 if (argc != 2) {
1413 goto error; 1452 DMWARN("Unrecognised multipath message received.");
1453 goto out;
1454 }
1414 1455
1415 if (!strnicmp(argv[0], MESG_STR("disable_group"))) 1456 if (!strnicmp(argv[0], MESG_STR("disable_group"))) {
1416 return bypass_pg_num(m, argv[1], 1); 1457 r = bypass_pg_num(m, argv[1], 1);
1417 else if (!strnicmp(argv[0], MESG_STR("enable_group"))) 1458 goto out;
1418 return bypass_pg_num(m, argv[1], 0); 1459 } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) {
1419 else if (!strnicmp(argv[0], MESG_STR("switch_group"))) 1460 r = bypass_pg_num(m, argv[1], 0);
1420 return switch_pg_num(m, argv[1]); 1461 goto out;
1421 else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) 1462 } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) {
1463 r = switch_pg_num(m, argv[1]);
1464 goto out;
1465 } else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
1422 action = reinstate_path; 1466 action = reinstate_path;
1423 else if (!strnicmp(argv[0], MESG_STR("fail_path"))) 1467 else if (!strnicmp(argv[0], MESG_STR("fail_path")))
1424 action = fail_path; 1468 action = fail_path;
1425 else 1469 else {
1426 goto error; 1470 DMWARN("Unrecognised multipath message received.");
1471 goto out;
1472 }
1427 1473
1428 r = dm_get_device(ti, argv[1], ti->begin, ti->len, 1474 r = dm_get_device(ti, argv[1], ti->begin, ti->len,
1429 dm_table_get_mode(ti->table), &dev); 1475 dm_table_get_mode(ti->table), &dev);
1430 if (r) { 1476 if (r) {
1431 DMWARN("message: error getting device %s", 1477 DMWARN("message: error getting device %s",
1432 argv[1]); 1478 argv[1]);
1433 return -EINVAL; 1479 goto out;
1434 } 1480 }
1435 1481
1436 r = action_dev(m, dev, action); 1482 r = action_dev(m, dev, action);
1437 1483
1438 dm_put_device(ti, dev); 1484 dm_put_device(ti, dev);
1439 1485
1486out:
1487 mutex_unlock(&m->work_mutex);
1440 return r; 1488 return r;
1441
1442error:
1443 DMWARN("Unrecognised multipath message received.");
1444 return -EINVAL;
1445} 1489}
1446 1490
1447static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, 1491static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
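
The restructured multipath_message() holds the new work_mutex for the whole call, refuses messages with -EBUSY while the device is suspended, and funnels every path through one unlock-and-return exit instead of the old scattered returns. A rough user-space sketch of that shape, using pthread primitives and a cut-down command set; the names are illustrative only.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct mp_state {
	pthread_mutex_t work_mutex;
	int suspended;
};

static int do_queue_if_no_path(struct mp_state *m, int enable)
{
	(void)m;
	printf("queue_if_no_path = %d\n", enable);
	return 0;
}

/* Single-exit dispatcher: every branch funnels through 'out'. */
static int mp_message(struct mp_state *m, int argc, char **argv)
{
	int r = -EINVAL;

	pthread_mutex_lock(&m->work_mutex);

	if (m->suspended) {
		r = -EBUSY;		/* no new internal I/O while suspended */
		goto out;
	}

	if (argc == 1) {
		if (!strcmp(argv[0], "queue_if_no_path"))
			r = do_queue_if_no_path(m, 1);
		else if (!strcmp(argv[0], "fail_if_no_path"))
			r = do_queue_if_no_path(m, 0);
		goto out;
	}

	fprintf(stderr, "unrecognised message\n");
out:
	pthread_mutex_unlock(&m->work_mutex);
	return r;
}

int main(void)
{
	static struct mp_state m = { PTHREAD_MUTEX_INITIALIZER, 0 };
	char *cmd[] = { "queue_if_no_path" };

	return mp_message(&m, 1, cmd) ? 1 : 0;
}
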
@@ -1567,13 +1611,14 @@ out:
1567 *---------------------------------------------------------------*/ 1611 *---------------------------------------------------------------*/
1568static struct target_type multipath_target = { 1612static struct target_type multipath_target = {
1569 .name = "multipath", 1613 .name = "multipath",
1570 .version = {1, 1, 0}, 1614 .version = {1, 1, 1},
1571 .module = THIS_MODULE, 1615 .module = THIS_MODULE,
1572 .ctr = multipath_ctr, 1616 .ctr = multipath_ctr,
1573 .dtr = multipath_dtr, 1617 .dtr = multipath_dtr,
1574 .map_rq = multipath_map, 1618 .map_rq = multipath_map,
1575 .rq_end_io = multipath_end_io, 1619 .rq_end_io = multipath_end_io,
1576 .presuspend = multipath_presuspend, 1620 .presuspend = multipath_presuspend,
1621 .postsuspend = multipath_postsuspend,
1577 .resume = multipath_resume, 1622 .resume = multipath_resume,
1578 .status = multipath_status, 1623 .status = multipath_status,
1579 .message = multipath_message, 1624 .message = multipath_message,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index cc9dc79b0784..ad779bd13aec 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
35 *---------------------------------------------------------------*/ 35 *---------------------------------------------------------------*/
36enum dm_raid1_error { 36enum dm_raid1_error {
37 DM_RAID1_WRITE_ERROR, 37 DM_RAID1_WRITE_ERROR,
38 DM_RAID1_FLUSH_ERROR,
38 DM_RAID1_SYNC_ERROR, 39 DM_RAID1_SYNC_ERROR,
39 DM_RAID1_READ_ERROR 40 DM_RAID1_READ_ERROR
40}; 41};
@@ -57,6 +58,7 @@ struct mirror_set {
57 struct bio_list reads; 58 struct bio_list reads;
58 struct bio_list writes; 59 struct bio_list writes;
59 struct bio_list failures; 60 struct bio_list failures;
61 struct bio_list holds; /* bios are waiting until suspend */
60 62
61 struct dm_region_hash *rh; 63 struct dm_region_hash *rh;
62 struct dm_kcopyd_client *kcopyd_client; 64 struct dm_kcopyd_client *kcopyd_client;
@@ -67,6 +69,7 @@ struct mirror_set {
67 region_t nr_regions; 69 region_t nr_regions;
68 int in_sync; 70 int in_sync;
69 int log_failure; 71 int log_failure;
72 int leg_failure;
70 atomic_t suspend; 73 atomic_t suspend;
71 74
72 atomic_t default_mirror; /* Default mirror */ 75 atomic_t default_mirror; /* Default mirror */
@@ -179,6 +182,17 @@ static void set_default_mirror(struct mirror *m)
179 atomic_set(&ms->default_mirror, m - m0); 182 atomic_set(&ms->default_mirror, m - m0);
180} 183}
181 184
185static struct mirror *get_valid_mirror(struct mirror_set *ms)
186{
187 struct mirror *m;
188
189 for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++)
190 if (!atomic_read(&m->error_count))
191 return m;
192
193 return NULL;
194}
195
182/* fail_mirror 196/* fail_mirror
183 * @m: mirror device to fail 197 * @m: mirror device to fail
184 * @error_type: one of the enums, DM_RAID1_*_ERROR 198 * @error_type: one of the enums, DM_RAID1_*_ERROR
@@ -198,6 +212,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
198 struct mirror_set *ms = m->ms; 212 struct mirror_set *ms = m->ms;
199 struct mirror *new; 213 struct mirror *new;
200 214
215 ms->leg_failure = 1;
216
201 /* 217 /*
202 * error_count is used for nothing more than a 218 * error_count is used for nothing more than a
203 * simple way to tell if a device has encountered 219 * simple way to tell if a device has encountered
@@ -224,19 +240,50 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
224 goto out; 240 goto out;
225 } 241 }
226 242
227 for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) 243 new = get_valid_mirror(ms);
228 if (!atomic_read(&new->error_count)) { 244 if (new)
229 set_default_mirror(new); 245 set_default_mirror(new);
230 break; 246 else
231 }
232
233 if (unlikely(new == ms->mirror + ms->nr_mirrors))
234 DMWARN("All sides of mirror have failed."); 247 DMWARN("All sides of mirror have failed.");
235 248
236out: 249out:
237 schedule_work(&ms->trigger_event); 250 schedule_work(&ms->trigger_event);
238} 251}
239 252
253static int mirror_flush(struct dm_target *ti)
254{
255 struct mirror_set *ms = ti->private;
256 unsigned long error_bits;
257
258 unsigned int i;
259 struct dm_io_region io[ms->nr_mirrors];
260 struct mirror *m;
261 struct dm_io_request io_req = {
262 .bi_rw = WRITE_BARRIER,
263 .mem.type = DM_IO_KMEM,
264 .mem.ptr.bvec = NULL,
265 .client = ms->io_client,
266 };
267
268 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) {
269 io[i].bdev = m->dev->bdev;
270 io[i].sector = 0;
271 io[i].count = 0;
272 }
273
274 error_bits = -1;
275 dm_io(&io_req, ms->nr_mirrors, io, &error_bits);
276 if (unlikely(error_bits != 0)) {
277 for (i = 0; i < ms->nr_mirrors; i++)
278 if (test_bit(i, &error_bits))
279 fail_mirror(ms->mirror + i,
280 DM_RAID1_FLUSH_ERROR);
281 return -EIO;
282 }
283
284 return 0;
285}
286
240/*----------------------------------------------------------------- 287/*-----------------------------------------------------------------
241 * Recovery. 288 * Recovery.
242 * 289 *
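
mirror_flush(), added above, issues a single empty-barrier write to every leg and receives one error bitmask back, with bit i set if leg i failed; each failed leg is marked with DM_RAID1_FLUSH_ERROR and the flush as a whole returns -EIO. A small sketch of that bitmask handling, with the actual submission stubbed out; the helper names are assumptions, not kernel APIs.

#include <errno.h>
#include <stdio.h>

#define NR_MIRRORS 3

/* Pretend leg 1 failed its barrier: bit 1 set in the returned mask. */
static unsigned long submit_flush_to_all_legs(void)
{
	return 1UL << 1;
}

static void fail_leg(int leg)
{
	printf("leg %d marked failed (flush error)\n", leg);
}

static int mirror_flush(void)
{
	unsigned long error_bits = submit_flush_to_all_legs();

	if (error_bits != 0) {
		for (int i = 0; i < NR_MIRRORS; i++)
			if (error_bits & (1UL << i))
				fail_leg(i);	/* only the failing legs are failed */
		return -EIO;
	}
	return 0;
}

int main(void)
{
	return mirror_flush() ? 1 : 0;
}
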
@@ -396,6 +443,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
396 */ 443 */
397static sector_t map_sector(struct mirror *m, struct bio *bio) 444static sector_t map_sector(struct mirror *m, struct bio *bio)
398{ 445{
446 if (unlikely(!bio->bi_size))
447 return 0;
399 return m->offset + (bio->bi_sector - m->ms->ti->begin); 448 return m->offset + (bio->bi_sector - m->ms->ti->begin);
400} 449}
401 450
@@ -413,6 +462,27 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
413 io->count = bio->bi_size >> 9; 462 io->count = bio->bi_size >> 9;
414} 463}
415 464
465static void hold_bio(struct mirror_set *ms, struct bio *bio)
466{
467 /*
468 * If device is suspended, complete the bio.
469 */
470 if (atomic_read(&ms->suspend)) {
471 if (dm_noflush_suspending(ms->ti))
472 bio_endio(bio, DM_ENDIO_REQUEUE);
473 else
474 bio_endio(bio, -EIO);
475 return;
476 }
477
478 /*
479 * Hold bio until the suspend is complete.
480 */
481 spin_lock_irq(&ms->lock);
482 bio_list_add(&ms->holds, bio);
483 spin_unlock_irq(&ms->lock);
484}
485
416/*----------------------------------------------------------------- 486/*-----------------------------------------------------------------
417 * Reads 487 * Reads
418 *---------------------------------------------------------------*/ 488 *---------------------------------------------------------------*/
@@ -511,7 +581,6 @@ static void write_callback(unsigned long error, void *context)
511 unsigned i, ret = 0; 581 unsigned i, ret = 0;
512 struct bio *bio = (struct bio *) context; 582 struct bio *bio = (struct bio *) context;
513 struct mirror_set *ms; 583 struct mirror_set *ms;
514 int uptodate = 0;
515 int should_wake = 0; 584 int should_wake = 0;
516 unsigned long flags; 585 unsigned long flags;
517 586
@@ -524,36 +593,27 @@ static void write_callback(unsigned long error, void *context)
524 * This way we handle both writes to SYNC and NOSYNC 593 * This way we handle both writes to SYNC and NOSYNC
525 * regions with the same code. 594 * regions with the same code.
526 */ 595 */
527 if (likely(!error)) 596 if (likely(!error)) {
528 goto out; 597 bio_endio(bio, ret);
598 return;
599 }
529 600
530 for (i = 0; i < ms->nr_mirrors; i++) 601 for (i = 0; i < ms->nr_mirrors; i++)
531 if (test_bit(i, &error)) 602 if (test_bit(i, &error))
532 fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); 603 fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
533 else
534 uptodate = 1;
535 604
536 if (unlikely(!uptodate)) { 605 /*
537 DMERR("All replicated volumes dead, failing I/O"); 606 * Need to raise event. Since raising
538 /* None of the writes succeeded, fail the I/O. */ 607 * events can block, we need to do it in
539 ret = -EIO; 608 * the main thread.
540 } else if (errors_handled(ms)) { 609 */
541 /* 610 spin_lock_irqsave(&ms->lock, flags);
542 * Need to raise event. Since raising 611 if (!ms->failures.head)
543 * events can block, we need to do it in 612 should_wake = 1;
544 * the main thread. 613 bio_list_add(&ms->failures, bio);
545 */ 614 spin_unlock_irqrestore(&ms->lock, flags);
546 spin_lock_irqsave(&ms->lock, flags); 615 if (should_wake)
547 if (!ms->failures.head) 616 wakeup_mirrord(ms);
548 should_wake = 1;
549 bio_list_add(&ms->failures, bio);
550 spin_unlock_irqrestore(&ms->lock, flags);
551 if (should_wake)
552 wakeup_mirrord(ms);
553 return;
554 }
555out:
556 bio_endio(bio, ret);
557} 617}
558 618
559static void do_write(struct mirror_set *ms, struct bio *bio) 619static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -562,7 +622,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
562 struct dm_io_region io[ms->nr_mirrors], *dest = io; 622 struct dm_io_region io[ms->nr_mirrors], *dest = io;
563 struct mirror *m; 623 struct mirror *m;
564 struct dm_io_request io_req = { 624 struct dm_io_request io_req = {
565 .bi_rw = WRITE, 625 .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
566 .mem.type = DM_IO_BVEC, 626 .mem.type = DM_IO_BVEC,
567 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, 627 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
568 .notify.fn = write_callback, 628 .notify.fn = write_callback,
@@ -603,6 +663,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
603 bio_list_init(&requeue); 663 bio_list_init(&requeue);
604 664
605 while ((bio = bio_list_pop(writes))) { 665 while ((bio = bio_list_pop(writes))) {
666 if (unlikely(bio_empty_barrier(bio))) {
667 bio_list_add(&sync, bio);
668 continue;
669 }
670
606 region = dm_rh_bio_to_region(ms->rh, bio); 671 region = dm_rh_bio_to_region(ms->rh, bio);
607 672
608 if (log->type->is_remote_recovering && 673 if (log->type->is_remote_recovering &&
@@ -672,8 +737,12 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
672 dm_rh_delay(ms->rh, bio); 737 dm_rh_delay(ms->rh, bio);
673 738
674 while ((bio = bio_list_pop(&nosync))) { 739 while ((bio = bio_list_pop(&nosync))) {
675 map_bio(get_default_mirror(ms), bio); 740 if (unlikely(ms->leg_failure) && errors_handled(ms))
676 generic_make_request(bio); 741 hold_bio(ms, bio);
742 else {
743 map_bio(get_default_mirror(ms), bio);
744 generic_make_request(bio);
745 }
677 } 746 }
678} 747}
679 748
@@ -681,20 +750,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
681{ 750{
682 struct bio *bio; 751 struct bio *bio;
683 752
684 if (!failures->head) 753 if (likely(!failures->head))
685 return;
686
687 if (!ms->log_failure) {
688 while ((bio = bio_list_pop(failures))) {
689 ms->in_sync = 0;
690 dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
691 }
692 return; 754 return;
693 }
694 755
695 /* 756 /*
696 * If the log has failed, unattempted writes are being 757 * If the log has failed, unattempted writes are being
697 * put on the failures list. We can't issue those writes 758 * put on the holds list. We can't issue those writes
698 * until a log has been marked, so we must store them. 759 * until a log has been marked, so we must store them.
699 * 760 *
700 * If a 'noflush' suspend is in progress, we can requeue 761 * If a 'noflush' suspend is in progress, we can requeue
@@ -709,23 +770,27 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
709 * for us to treat them the same and requeue them 770 * for us to treat them the same and requeue them
710 * as well. 771 * as well.
711 */ 772 */
712 if (dm_noflush_suspending(ms->ti)) { 773 while ((bio = bio_list_pop(failures))) {
713 while ((bio = bio_list_pop(failures))) 774 if (!ms->log_failure) {
714 bio_endio(bio, DM_ENDIO_REQUEUE); 775 ms->in_sync = 0;
715 return; 776 dm_rh_mark_nosync(ms->rh, bio);
716 } 777 }
717 778
718 if (atomic_read(&ms->suspend)) { 779 /*
719 while ((bio = bio_list_pop(failures))) 780 * If all the legs are dead, fail the I/O.
781 * If we have been told to handle errors, hold the bio
782 * and wait for userspace to deal with the problem.
783 * Otherwise pretend that the I/O succeeded. (This would
784 * be wrong if the failed leg returned after reboot and
785 * got replicated back to the good legs.)
786 */
787 if (!get_valid_mirror(ms))
720 bio_endio(bio, -EIO); 788 bio_endio(bio, -EIO);
721 return; 789 else if (errors_handled(ms))
790 hold_bio(ms, bio);
791 else
792 bio_endio(bio, 0);
722 } 793 }
723
724 spin_lock_irq(&ms->lock);
725 bio_list_merge(&ms->failures, failures);
726 spin_unlock_irq(&ms->lock);
727
728 delayed_wake(ms);
729} 794}
730 795
731static void trigger_event(struct work_struct *work) 796static void trigger_event(struct work_struct *work)
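
The rewritten do_failures() applies a three-way policy per failed write: if no leg survived, the bio completes with -EIO; if error handling is enabled, the bio is parked on the holds list until userspace intervenes; otherwise the write is reported as successful. A simplified sketch of that classification, with the mirror state reduced to two flags (assumed, not the kernel's types):

#include <stdbool.h>
#include <stdio.h>

enum outcome { COMPLETE_EIO, HOLD_FOR_USERSPACE, COMPLETE_OK };

struct mirror_state {
	bool any_valid_leg;	/* at least one leg has no errors */
	bool errors_handled;	/* a userspace handler wants to be involved */
};

static enum outcome classify_failed_write(const struct mirror_state *ms)
{
	if (!ms->any_valid_leg)
		return COMPLETE_EIO;		/* every leg is dead */
	if (ms->errors_handled)
		return HOLD_FOR_USERSPACE;	/* park the bio until suspend */
	return COMPLETE_OK;			/* pretend the write worked */
}

int main(void)
{
	struct mirror_state ms = { .any_valid_leg = true, .errors_handled = true };

	printf("outcome = %d\n", classify_failed_write(&ms));	/* 1: hold */
	return 0;
}
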
@@ -784,12 +849,17 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
784 } 849 }
785 850
786 spin_lock_init(&ms->lock); 851 spin_lock_init(&ms->lock);
852 bio_list_init(&ms->reads);
853 bio_list_init(&ms->writes);
854 bio_list_init(&ms->failures);
855 bio_list_init(&ms->holds);
787 856
788 ms->ti = ti; 857 ms->ti = ti;
789 ms->nr_mirrors = nr_mirrors; 858 ms->nr_mirrors = nr_mirrors;
790 ms->nr_regions = dm_sector_div_up(ti->len, region_size); 859 ms->nr_regions = dm_sector_div_up(ti->len, region_size);
791 ms->in_sync = 0; 860 ms->in_sync = 0;
792 ms->log_failure = 0; 861 ms->log_failure = 0;
862 ms->leg_failure = 0;
793 atomic_set(&ms->suspend, 0); 863 atomic_set(&ms->suspend, 0);
794 atomic_set(&ms->default_mirror, DEFAULT_MIRROR); 864 atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
795 865
@@ -889,7 +959,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
889 return NULL; 959 return NULL;
890 } 960 }
891 961
892 dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2); 962 dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count,
963 argv + 2);
893 if (!dl) { 964 if (!dl) {
894 ti->error = "Error creating mirror dirty log"; 965 ti->error = "Error creating mirror dirty log";
895 return NULL; 966 return NULL;
@@ -995,6 +1066,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
995 1066
996 ti->private = ms; 1067 ti->private = ms;
997 ti->split_io = dm_rh_get_region_size(ms->rh); 1068 ti->split_io = dm_rh_get_region_size(ms->rh);
1069 ti->num_flush_requests = 1;
998 1070
999 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); 1071 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
1000 if (!ms->kmirrord_wq) { 1072 if (!ms->kmirrord_wq) {
@@ -1122,7 +1194,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1122 * We need to dec pending if this was a write. 1194 * We need to dec pending if this was a write.
1123 */ 1195 */
1124 if (rw == WRITE) { 1196 if (rw == WRITE) {
1125 dm_rh_dec(ms->rh, map_context->ll); 1197 if (likely(!bio_empty_barrier(bio)))
1198 dm_rh_dec(ms->rh, map_context->ll);
1126 return error; 1199 return error;
1127 } 1200 }
1128 1201
@@ -1180,6 +1253,9 @@ static void mirror_presuspend(struct dm_target *ti)
1180 struct mirror_set *ms = (struct mirror_set *) ti->private; 1253 struct mirror_set *ms = (struct mirror_set *) ti->private;
1181 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 1254 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1182 1255
1256 struct bio_list holds;
1257 struct bio *bio;
1258
1183 atomic_set(&ms->suspend, 1); 1259 atomic_set(&ms->suspend, 1);
1184 1260
1185 /* 1261 /*
@@ -1202,6 +1278,22 @@ static void mirror_presuspend(struct dm_target *ti)
1202 * we know that all of our I/O has been pushed. 1278 * we know that all of our I/O has been pushed.
1203 */ 1279 */
1204 flush_workqueue(ms->kmirrord_wq); 1280 flush_workqueue(ms->kmirrord_wq);
1281
1282 /*
1283 * Now that ms->suspend is set and the workqueue has been flushed, no more
1284 * entries can be added to the ms->holds list, so process it.
1285 *
1286 * Bios can still arrive concurrently with or after this
1287 * presuspend function, but they cannot join the hold list
1288 * because ms->suspend is set.
1289 */
1290 spin_lock_irq(&ms->lock);
1291 holds = ms->holds;
1292 bio_list_init(&ms->holds);
1293 spin_unlock_irq(&ms->lock);
1294
1295 while ((bio = bio_list_pop(&holds)))
1296 hold_bio(ms, bio);
1205} 1297}
1206 1298
1207static void mirror_postsuspend(struct dm_target *ti) 1299static void mirror_postsuspend(struct dm_target *ti)
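
mirror_presuspend() now detaches the whole holds list under the lock and feeds each parked bio back through hold_bio(); since ms->suspend is already set at that point, hold_bio() completes the bios instead of re-parking them, so nothing stays queued across the suspend. A simplified user-space sketch of the detach-then-drain pattern; the types and names are illustrative.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct bio { int id; struct bio *next; };

struct mirror {
	pthread_mutex_t lock;
	struct bio *holds;	/* writes parked until suspend */
	int suspend;
};

static void complete_bio(struct bio *b)
{
	printf("completed bio %d\n", b->id);
	free(b);
}

/* With suspend set, a held bio is completed rather than parked again. */
static void hold_bio(struct mirror *ms, struct bio *b)
{
	if (ms->suspend) {
		complete_bio(b);
		return;
	}
	pthread_mutex_lock(&ms->lock);
	b->next = ms->holds;
	ms->holds = b;
	pthread_mutex_unlock(&ms->lock);
}

static void presuspend(struct mirror *ms)
{
	struct bio *list;

	ms->suspend = 1;

	/* Detach the list under the lock, then drain it outside. */
	pthread_mutex_lock(&ms->lock);
	list = ms->holds;
	ms->holds = NULL;
	pthread_mutex_unlock(&ms->lock);

	while (list) {
		struct bio *b = list;

		list = list->next;
		hold_bio(ms, b);	/* completes, because suspend is set */
	}
}

int main(void)
{
	static struct mirror ms = { PTHREAD_MUTEX_INITIALIZER, NULL, 0 };
	struct bio *b = malloc(sizeof(*b));

	if (!b)
		return 1;
	b->id = 1;
	b->next = NULL;
	hold_bio(&ms, b);
	presuspend(&ms);
	return 0;
}
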
@@ -1244,7 +1336,8 @@ static char device_status_char(struct mirror *m)
1244 if (!atomic_read(&(m->error_count))) 1336 if (!atomic_read(&(m->error_count)))
1245 return 'A'; 1337 return 'A';
1246 1338
1247 return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : 1339 return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' :
1340 (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
1248 (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : 1341 (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
1249 (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; 1342 (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
1250} 1343}
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 36dbe29f2fd6..5f19ceb6fe91 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -79,6 +79,11 @@ struct dm_region_hash {
79 struct list_head recovered_regions; 79 struct list_head recovered_regions;
80 struct list_head failed_recovered_regions; 80 struct list_head failed_recovered_regions;
81 81
82 /*
83 * If there was a barrier failure no regions can be marked clean.
84 */
85 int barrier_failure;
86
82 void *context; 87 void *context;
83 sector_t target_begin; 88 sector_t target_begin;
84 89
@@ -211,6 +216,7 @@ struct dm_region_hash *dm_region_hash_create(
211 INIT_LIST_HEAD(&rh->quiesced_regions); 216 INIT_LIST_HEAD(&rh->quiesced_regions);
212 INIT_LIST_HEAD(&rh->recovered_regions); 217 INIT_LIST_HEAD(&rh->recovered_regions);
213 INIT_LIST_HEAD(&rh->failed_recovered_regions); 218 INIT_LIST_HEAD(&rh->failed_recovered_regions);
219 rh->barrier_failure = 0;
214 220
215 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, 221 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
216 sizeof(struct dm_region)); 222 sizeof(struct dm_region));
@@ -377,8 +383,6 @@ static void complete_resync_work(struct dm_region *reg, int success)
377/* dm_rh_mark_nosync 383/* dm_rh_mark_nosync
378 * @ms 384 * @ms
379 * @bio 385 * @bio
380 * @done
381 * @error
382 * 386 *
383 * The bio was written on some mirror(s) but failed on other mirror(s). 387 * The bio was written on some mirror(s) but failed on other mirror(s).
384 * We can successfully endio the bio but should avoid the region being 388 * We can successfully endio the bio but should avoid the region being
@@ -386,8 +390,7 @@ static void complete_resync_work(struct dm_region *reg, int success)
386 * 390 *
387 * This function is _not_ safe in interrupt context! 391 * This function is _not_ safe in interrupt context!
388 */ 392 */
389void dm_rh_mark_nosync(struct dm_region_hash *rh, 393void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
390 struct bio *bio, unsigned done, int error)
391{ 394{
392 unsigned long flags; 395 unsigned long flags;
393 struct dm_dirty_log *log = rh->log; 396 struct dm_dirty_log *log = rh->log;
@@ -395,6 +398,11 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
395 region_t region = dm_rh_bio_to_region(rh, bio); 398 region_t region = dm_rh_bio_to_region(rh, bio);
396 int recovering = 0; 399 int recovering = 0;
397 400
401 if (bio_empty_barrier(bio)) {
402 rh->barrier_failure = 1;
403 return;
404 }
405
398 /* We must inform the log that the sync count has changed. */ 406 /* We must inform the log that the sync count has changed. */
399 log->type->set_region_sync(log, region, 0); 407 log->type->set_region_sync(log, region, 0);
400 408
@@ -419,7 +427,6 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
419 BUG_ON(!list_empty(&reg->list)); 427 BUG_ON(!list_empty(&reg->list));
420 spin_unlock_irqrestore(&rh->region_lock, flags); 428 spin_unlock_irqrestore(&rh->region_lock, flags);
421 429
422 bio_endio(bio, error);
423 if (recovering) 430 if (recovering)
424 complete_resync_work(reg, 0); 431 complete_resync_work(reg, 0);
425} 432}
@@ -515,8 +522,11 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
515{ 522{
516 struct bio *bio; 523 struct bio *bio;
517 524
518 for (bio = bios->head; bio; bio = bio->bi_next) 525 for (bio = bios->head; bio; bio = bio->bi_next) {
526 if (bio_empty_barrier(bio))
527 continue;
519 rh_inc(rh, dm_rh_bio_to_region(rh, bio)); 528 rh_inc(rh, dm_rh_bio_to_region(rh, bio));
529 }
520} 530}
521EXPORT_SYMBOL_GPL(dm_rh_inc_pending); 531EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
522 532
@@ -544,7 +554,14 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region)
544 */ 554 */
545 555
546 /* do nothing for DM_RH_NOSYNC */ 556 /* do nothing for DM_RH_NOSYNC */
547 if (reg->state == DM_RH_RECOVERING) { 557 if (unlikely(rh->barrier_failure)) {
558 /*
559 * If a write barrier failed some time ago, we
560 * don't know whether or not this write made it
561 * to the disk, so we must resync the device.
562 */
563 reg->state = DM_RH_NOSYNC;
564 } else if (reg->state == DM_RH_RECOVERING) {
548 list_add_tail(&reg->list, &rh->quiesced_regions); 565 list_add_tail(&reg->list, &rh->quiesced_regions);
549 } else if (reg->state == DM_RH_DIRTY) { 566 } else if (reg->state == DM_RH_DIRTY) {
550 reg->state = DM_RH_CLEAN; 567 reg->state = DM_RH_CLEAN;
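
Once rh->barrier_failure is set, dm_rh_dec() stops promoting released regions to the clean state and forces them to NOSYNC, because a failed barrier means an earlier write may never have reached the media. A compact sketch of that state decision, with the region states reduced to an enum (the handling of the other states is an assumed simplification; the kernel also re-queues RECOVERING regions):

#include <stdio.h>

enum rh_state { RH_CLEAN, RH_DIRTY, RH_NOSYNC, RH_RECOVERING };

/* What the last writer decides for a region when its pending count drops to 0. */
static enum rh_state region_release_state(enum rh_state cur, int barrier_failure)
{
	if (barrier_failure)
		return RH_NOSYNC;	/* earlier writes may be lost: force a resync */
	if (cur == RH_DIRTY)
		return RH_CLEAN;	/* normal case: region can be marked clean */
	return cur;			/* simplified handling of the other states */
}

int main(void)
{
	printf("%d %d\n", region_release_state(RH_DIRTY, 1),	/* 2: RH_NOSYNC */
	       region_release_state(RH_DIRTY, 0));		/* 0: RH_CLEAN  */
	return 0;
}
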
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 0c746420c008..7d08879689ac 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -55,6 +55,8 @@
55 */ 55 */
56#define SNAPSHOT_DISK_VERSION 1 56#define SNAPSHOT_DISK_VERSION 1
57 57
58#define NUM_SNAPSHOT_HDR_CHUNKS 1
59
58struct disk_header { 60struct disk_header {
59 uint32_t magic; 61 uint32_t magic;
60 62
@@ -120,7 +122,22 @@ struct pstore {
120 122
121 /* 123 /*
122 * The next free chunk for an exception. 124 * The next free chunk for an exception.
125 *
126 * When creating exceptions, all the chunks here and above are
127 * free. It holds the next chunk to be allocated. On rare
128 * occasions (e.g. after a system crash) holes can be left in
129 * the exception store because chunks can be committed out of
130 * order.
131 *
132 * When merging exceptions, it does not necessarily mean all the
133 * chunks here and above are free. It holds the value it would
134 * have held if all chunks had been committed in order of
135 * allocation. Consequently the value may occasionally be
136 * slightly too low, but since it's only used for 'status' and
137 * it can never reach its minimum value too early this doesn't
138 * matter.
123 */ 139 */
140
124 chunk_t next_free; 141 chunk_t next_free;
125 142
126 /* 143 /*
@@ -214,7 +231,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
214 int metadata) 231 int metadata)
215{ 232{
216 struct dm_io_region where = { 233 struct dm_io_region where = {
217 .bdev = ps->store->cow->bdev, 234 .bdev = dm_snap_cow(ps->store->snap)->bdev,
218 .sector = ps->store->chunk_size * chunk, 235 .sector = ps->store->chunk_size * chunk,
219 .count = ps->store->chunk_size, 236 .count = ps->store->chunk_size,
220 }; 237 };
@@ -294,7 +311,8 @@ static int read_header(struct pstore *ps, int *new_snapshot)
294 */ 311 */
295 if (!ps->store->chunk_size) { 312 if (!ps->store->chunk_size) {
296 ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, 313 ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
297 bdev_logical_block_size(ps->store->cow->bdev) >> 9); 314 bdev_logical_block_size(dm_snap_cow(ps->store->snap)->
315 bdev) >> 9);
298 ps->store->chunk_mask = ps->store->chunk_size - 1; 316 ps->store->chunk_mask = ps->store->chunk_size - 1;
299 ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; 317 ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
300 chunk_size_supplied = 0; 318 chunk_size_supplied = 0;
@@ -408,6 +426,15 @@ static void write_exception(struct pstore *ps,
408 e->new_chunk = cpu_to_le64(de->new_chunk); 426 e->new_chunk = cpu_to_le64(de->new_chunk);
409} 427}
410 428
429static void clear_exception(struct pstore *ps, uint32_t index)
430{
431 struct disk_exception *e = get_exception(ps, index);
432
433 /* clear it */
434 e->old_chunk = 0;
435 e->new_chunk = 0;
436}
437
411/* 438/*
412 * Registers the exceptions that are present in the current area. 439 * Registers the exceptions that are present in the current area.
413 * 'full' is filled in to indicate if the area has been 440 * 'full' is filled in to indicate if the area has been
@@ -489,11 +516,23 @@ static struct pstore *get_info(struct dm_exception_store *store)
489 return (struct pstore *) store->context; 516 return (struct pstore *) store->context;
490} 517}
491 518
492static void persistent_fraction_full(struct dm_exception_store *store, 519static void persistent_usage(struct dm_exception_store *store,
493 sector_t *numerator, sector_t *denominator) 520 sector_t *total_sectors,
521 sector_t *sectors_allocated,
522 sector_t *metadata_sectors)
494{ 523{
495 *numerator = get_info(store)->next_free * store->chunk_size; 524 struct pstore *ps = get_info(store);
496 *denominator = get_dev_size(store->cow->bdev); 525
526 *sectors_allocated = ps->next_free * store->chunk_size;
527 *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
528
529 /*
530 * First chunk is the fixed header.
531 * Then there are (ps->current_area + 1) metadata chunks, each one
532 * separated from the next by ps->exceptions_per_area data chunks.
533 */
534 *metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) *
535 store->chunk_size;
497} 536}
498 537
499static void persistent_dtr(struct dm_exception_store *store) 538static void persistent_dtr(struct dm_exception_store *store)
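
persistent_usage() derives the metadata overhead purely from the on-disk layout: one fixed header chunk plus one metadata chunk per area in use, i.e. (current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * chunk_size. As a quick worked example under assumed numbers, a 16-sector chunk with three areas in use (current_area == 2) gives (2 + 1 + 1) * 16 = 64 metadata sectors:

#include <stdio.h>

#define NUM_SNAPSHOT_HDR_CHUNKS 1

static unsigned long long metadata_sectors(unsigned long long current_area,
					   unsigned long long chunk_size)
{
	/* one header chunk plus one metadata chunk per area in use */
	return (current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * chunk_size;
}

int main(void)
{
	printf("%llu\n", metadata_sectors(2, 16));	/* prints 64 */
	return 0;
}
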
@@ -552,44 +591,40 @@ static int persistent_read_metadata(struct dm_exception_store *store,
552 ps->current_area = 0; 591 ps->current_area = 0;
553 zero_memory_area(ps); 592 zero_memory_area(ps);
554 r = zero_disk_area(ps, 0); 593 r = zero_disk_area(ps, 0);
555 if (r) { 594 if (r)
556 DMWARN("zero_disk_area(0) failed"); 595 DMWARN("zero_disk_area(0) failed");
557 return r; 596 return r;
558 } 597 }
559 } else { 598 /*
560 /* 599 * Sanity checks.
561 * Sanity checks. 600 */
562 */ 601 if (ps->version != SNAPSHOT_DISK_VERSION) {
563 if (ps->version != SNAPSHOT_DISK_VERSION) { 602 DMWARN("unable to handle snapshot disk version %d",
564 DMWARN("unable to handle snapshot disk version %d", 603 ps->version);
565 ps->version); 604 return -EINVAL;
566 return -EINVAL; 605 }
567 }
568 606
569 /* 607 /*
570 * Metadata are valid, but snapshot is invalidated 608 * Metadata are valid, but snapshot is invalidated
571 */ 609 */
572 if (!ps->valid) 610 if (!ps->valid)
573 return 1; 611 return 1;
574 612
575 /* 613 /*
576 * Read the metadata. 614 * Read the metadata.
577 */ 615 */
578 r = read_exceptions(ps, callback, callback_context); 616 r = read_exceptions(ps, callback, callback_context);
579 if (r)
580 return r;
581 }
582 617
583 return 0; 618 return r;
584} 619}
585 620
586static int persistent_prepare_exception(struct dm_exception_store *store, 621static int persistent_prepare_exception(struct dm_exception_store *store,
587 struct dm_snap_exception *e) 622 struct dm_exception *e)
588{ 623{
589 struct pstore *ps = get_info(store); 624 struct pstore *ps = get_info(store);
590 uint32_t stride; 625 uint32_t stride;
591 chunk_t next_free; 626 chunk_t next_free;
592 sector_t size = get_dev_size(store->cow->bdev); 627 sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
593 628
594 /* Is there enough room ? */ 629 /* Is there enough room ? */
595 if (size < ((ps->next_free + 1) * store->chunk_size)) 630 if (size < ((ps->next_free + 1) * store->chunk_size))
@@ -611,7 +646,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
611} 646}
612 647
613static void persistent_commit_exception(struct dm_exception_store *store, 648static void persistent_commit_exception(struct dm_exception_store *store,
614 struct dm_snap_exception *e, 649 struct dm_exception *e,
615 void (*callback) (void *, int success), 650 void (*callback) (void *, int success),
616 void *callback_context) 651 void *callback_context)
617{ 652{
@@ -672,6 +707,85 @@ static void persistent_commit_exception(struct dm_exception_store *store,
672 ps->callback_count = 0; 707 ps->callback_count = 0;
673} 708}
674 709
710static int persistent_prepare_merge(struct dm_exception_store *store,
711 chunk_t *last_old_chunk,
712 chunk_t *last_new_chunk)
713{
714 struct pstore *ps = get_info(store);
715 struct disk_exception de;
716 int nr_consecutive;
717 int r;
718
719 /*
720 * When current area is empty, move back to preceding area.
721 */
722 if (!ps->current_committed) {
723 /*
724 * Have we finished?
725 */
726 if (!ps->current_area)
727 return 0;
728
729 ps->current_area--;
730 r = area_io(ps, READ);
731 if (r < 0)
732 return r;
733 ps->current_committed = ps->exceptions_per_area;
734 }
735
736 read_exception(ps, ps->current_committed - 1, &de);
737 *last_old_chunk = de.old_chunk;
738 *last_new_chunk = de.new_chunk;
739
740 /*
741 * Find number of consecutive chunks within the current area,
742 * working backwards.
743 */
744 for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
745 nr_consecutive++) {
746 read_exception(ps, ps->current_committed - 1 - nr_consecutive,
747 &de);
748 if (de.old_chunk != *last_old_chunk - nr_consecutive ||
749 de.new_chunk != *last_new_chunk - nr_consecutive)
750 break;
751 }
752
753 return nr_consecutive;
754}
755
756static int persistent_commit_merge(struct dm_exception_store *store,
757 int nr_merged)
758{
759 int r, i;
760 struct pstore *ps = get_info(store);
761
762 BUG_ON(nr_merged > ps->current_committed);
763
764 for (i = 0; i < nr_merged; i++)
765 clear_exception(ps, ps->current_committed - 1 - i);
766
767 r = area_io(ps, WRITE);
768 if (r < 0)
769 return r;
770
771 ps->current_committed -= nr_merged;
772
773 /*
774 * At this stage, only persistent_usage() uses ps->next_free, so
775 * we make no attempt to keep ps->next_free strictly accurate
776 * as exceptions may have been committed out-of-order originally.
777 * Once a snapshot has become merging, we set it to the value it
778 * would have held had all the exceptions been committed in order.
779 *
780 * ps->current_area does not get reduced by prepare_merge() until
781 * after commit_merge() has removed the nr_merged previous exceptions.
782 */
783 ps->next_free = (area_location(ps, ps->current_area) - 1) +
784 (ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS;
785
786 return 0;
787}
788
675static void persistent_drop_snapshot(struct dm_exception_store *store) 789static void persistent_drop_snapshot(struct dm_exception_store *store)
676{ 790{
677 struct pstore *ps = get_info(store); 791 struct pstore *ps = get_info(store);
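
persistent_prepare_merge() walks backwards from the most recently committed exception and counts how many of the preceding entries form a run that is consecutive in both old and new chunk numbers, so a whole run can be handed back, merged, and later cleared by commit_merge() in one pass. A user-space sketch of that backward scan over an in-memory array; the struct and helper names are illustrative.

#include <stdio.h>

struct disk_exception { unsigned long long old_chunk, new_chunk; };

/*
 * Given exceptions committed in order e[0..n-1], count how many at the
 * tail form a run where both chunk numbers decrease by exactly one per
 * step when read backwards.
 */
static int count_consecutive_tail(const struct disk_exception *e, int n)
{
	unsigned long long last_old, last_new;
	int run = 1;

	if (n == 0)
		return 0;

	last_old = e[n - 1].old_chunk;
	last_new = e[n - 1].new_chunk;

	while (run < n &&
	       e[n - 1 - run].old_chunk == last_old - run &&
	       e[n - 1 - run].new_chunk == last_new - run)
		run++;

	return run;
}

int main(void)
{
	struct disk_exception e[] = {
		{ 10, 100 }, { 20, 200 }, { 21, 201 }, { 22, 202 },
	};

	printf("%d\n", count_consecutive_tail(e, 4));	/* prints 3 */
	return 0;
}
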
@@ -697,7 +811,7 @@ static int persistent_ctr(struct dm_exception_store *store,
697 ps->area = NULL; 811 ps->area = NULL;
698 ps->zero_area = NULL; 812 ps->zero_area = NULL;
699 ps->header_area = NULL; 813 ps->header_area = NULL;
700 ps->next_free = 2; /* skipping the header and first area */ 814 ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */
701 ps->current_committed = 0; 815 ps->current_committed = 0;
702 816
703 ps->callback_count = 0; 817 ps->callback_count = 0;
@@ -726,8 +840,7 @@ static unsigned persistent_status(struct dm_exception_store *store,
726 case STATUSTYPE_INFO: 840 case STATUSTYPE_INFO:
727 break; 841 break;
728 case STATUSTYPE_TABLE: 842 case STATUSTYPE_TABLE:
729 DMEMIT(" %s P %llu", store->cow->name, 843 DMEMIT(" P %llu", (unsigned long long)store->chunk_size);
730 (unsigned long long)store->chunk_size);
731 } 844 }
732 845
733 return sz; 846 return sz;
@@ -741,8 +854,10 @@ static struct dm_exception_store_type _persistent_type = {
741 .read_metadata = persistent_read_metadata, 854 .read_metadata = persistent_read_metadata,
742 .prepare_exception = persistent_prepare_exception, 855 .prepare_exception = persistent_prepare_exception,
743 .commit_exception = persistent_commit_exception, 856 .commit_exception = persistent_commit_exception,
857 .prepare_merge = persistent_prepare_merge,
858 .commit_merge = persistent_commit_merge,
744 .drop_snapshot = persistent_drop_snapshot, 859 .drop_snapshot = persistent_drop_snapshot,
745 .fraction_full = persistent_fraction_full, 860 .usage = persistent_usage,
746 .status = persistent_status, 861 .status = persistent_status,
747}; 862};
748 863
@@ -754,8 +869,10 @@ static struct dm_exception_store_type _persistent_compat_type = {
754 .read_metadata = persistent_read_metadata, 869 .read_metadata = persistent_read_metadata,
755 .prepare_exception = persistent_prepare_exception, 870 .prepare_exception = persistent_prepare_exception,
756 .commit_exception = persistent_commit_exception, 871 .commit_exception = persistent_commit_exception,
872 .prepare_merge = persistent_prepare_merge,
873 .commit_merge = persistent_commit_merge,
757 .drop_snapshot = persistent_drop_snapshot, 874 .drop_snapshot = persistent_drop_snapshot,
758 .fraction_full = persistent_fraction_full, 875 .usage = persistent_usage,
759 .status = persistent_status, 876 .status = persistent_status,
760}; 877};
761 878
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index cde5aa558e6d..a0898a66a2f8 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -36,10 +36,10 @@ static int transient_read_metadata(struct dm_exception_store *store,
36} 36}
37 37
38static int transient_prepare_exception(struct dm_exception_store *store, 38static int transient_prepare_exception(struct dm_exception_store *store,
39 struct dm_snap_exception *e) 39 struct dm_exception *e)
40{ 40{
41 struct transient_c *tc = store->context; 41 struct transient_c *tc = store->context;
42 sector_t size = get_dev_size(store->cow->bdev); 42 sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
43 43
44 if (size < (tc->next_free + store->chunk_size)) 44 if (size < (tc->next_free + store->chunk_size))
45 return -1; 45 return -1;
@@ -51,7 +51,7 @@ static int transient_prepare_exception(struct dm_exception_store *store,
51} 51}
52 52
53static void transient_commit_exception(struct dm_exception_store *store, 53static void transient_commit_exception(struct dm_exception_store *store,
54 struct dm_snap_exception *e, 54 struct dm_exception *e,
55 void (*callback) (void *, int success), 55 void (*callback) (void *, int success),
56 void *callback_context) 56 void *callback_context)
57{ 57{
@@ -59,11 +59,14 @@ static void transient_commit_exception(struct dm_exception_store *store,
59 callback(callback_context, 1); 59 callback(callback_context, 1);
60} 60}
61 61
62static void transient_fraction_full(struct dm_exception_store *store, 62static void transient_usage(struct dm_exception_store *store,
63 sector_t *numerator, sector_t *denominator) 63 sector_t *total_sectors,
64 sector_t *sectors_allocated,
65 sector_t *metadata_sectors)
64{ 66{
65 *numerator = ((struct transient_c *) store->context)->next_free; 67 *sectors_allocated = ((struct transient_c *) store->context)->next_free;
66 *denominator = get_dev_size(store->cow->bdev); 68 *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
69 *metadata_sectors = 0;
67} 70}
68 71
69static int transient_ctr(struct dm_exception_store *store, 72static int transient_ctr(struct dm_exception_store *store,
@@ -91,8 +94,7 @@ static unsigned transient_status(struct dm_exception_store *store,
91 case STATUSTYPE_INFO: 94 case STATUSTYPE_INFO:
92 break; 95 break;
93 case STATUSTYPE_TABLE: 96 case STATUSTYPE_TABLE:
94 DMEMIT(" %s N %llu", store->cow->name, 97 DMEMIT(" N %llu", (unsigned long long)store->chunk_size);
95 (unsigned long long)store->chunk_size);
96 } 98 }
97 99
98 return sz; 100 return sz;
@@ -106,7 +108,7 @@ static struct dm_exception_store_type _transient_type = {
106 .read_metadata = transient_read_metadata, 108 .read_metadata = transient_read_metadata,
107 .prepare_exception = transient_prepare_exception, 109 .prepare_exception = transient_prepare_exception,
108 .commit_exception = transient_commit_exception, 110 .commit_exception = transient_commit_exception,
109 .fraction_full = transient_fraction_full, 111 .usage = transient_usage,
110 .status = transient_status, 112 .status = transient_status,
111}; 113};
112 114
@@ -118,7 +120,7 @@ static struct dm_exception_store_type _transient_compat_type = {
118 .read_metadata = transient_read_metadata, 120 .read_metadata = transient_read_metadata,
119 .prepare_exception = transient_prepare_exception, 121 .prepare_exception = transient_prepare_exception,
120 .commit_exception = transient_commit_exception, 122 .commit_exception = transient_commit_exception,
121 .fraction_full = transient_fraction_full, 123 .usage = transient_usage,
122 .status = transient_status, 124 .status = transient_status,
123}; 125};
124 126
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 3a3ba46e6d4b..ee8eb283650d 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -25,6 +25,11 @@
25 25
26#define DM_MSG_PREFIX "snapshots" 26#define DM_MSG_PREFIX "snapshots"
27 27
28static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
29
30#define dm_target_is_snapshot_merge(ti) \
31 ((ti)->type->name == dm_snapshot_merge_target_name)
32
28/* 33/*
29 * The percentage increment we will wake up users at 34 * The percentage increment we will wake up users at
30 */ 35 */
@@ -49,7 +54,7 @@
49#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ 54#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
50 (DM_TRACKED_CHUNK_HASH_SIZE - 1)) 55 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
51 56
52struct exception_table { 57struct dm_exception_table {
53 uint32_t hash_mask; 58 uint32_t hash_mask;
54 unsigned hash_shift; 59 unsigned hash_shift;
55 struct list_head *table; 60 struct list_head *table;
@@ -59,22 +64,31 @@ struct dm_snapshot {
59 struct rw_semaphore lock; 64 struct rw_semaphore lock;
60 65
61 struct dm_dev *origin; 66 struct dm_dev *origin;
67 struct dm_dev *cow;
68
69 struct dm_target *ti;
62 70
63 /* List of snapshots per Origin */ 71 /* List of snapshots per Origin */
64 struct list_head list; 72 struct list_head list;
65 73
66 /* You can't use a snapshot if this is 0 (e.g. if full) */ 74 /*
75 * You can't use a snapshot if this is 0 (e.g. if full).
76 * A snapshot-merge target never clears this.
77 */
67 int valid; 78 int valid;
68 79
69 /* Origin writes don't trigger exceptions until this is set */ 80 /* Origin writes don't trigger exceptions until this is set */
70 int active; 81 int active;
71 82
83 /* Whether or not owning mapped_device is suspended */
84 int suspended;
85
72 mempool_t *pending_pool; 86 mempool_t *pending_pool;
73 87
74 atomic_t pending_exceptions_count; 88 atomic_t pending_exceptions_count;
75 89
76 struct exception_table pending; 90 struct dm_exception_table pending;
77 struct exception_table complete; 91 struct dm_exception_table complete;
78 92
79 /* 93 /*
80 * pe_lock protects all pending_exception operations and access 94 * pe_lock protects all pending_exception operations and access
@@ -95,8 +109,51 @@ struct dm_snapshot {
95 mempool_t *tracked_chunk_pool; 109 mempool_t *tracked_chunk_pool;
96 spinlock_t tracked_chunk_lock; 110 spinlock_t tracked_chunk_lock;
97 struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; 111 struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
112
113 /*
114 * The merge operation failed if this flag is set.
115 * Failure modes are handled as follows:
116 * - I/O error reading the header
117 * => don't load the target; abort.
118 * - Header does not have "valid" flag set
119 * => use the origin; forget about the snapshot.
120 * - I/O error when reading exceptions
121 * => don't load the target; abort.
122 * (We can't use the intermediate origin state.)
123 * - I/O error while merging
124 * => stop merging; set merge_failed; process I/O normally.
125 */
126 int merge_failed;
127
128 /* Wait for events based on state_bits */
129 unsigned long state_bits;
130
131 /* Range of chunks currently being merged. */
132 chunk_t first_merging_chunk;
133 int num_merging_chunks;
134
135 /*
136 * Incoming bios that overlap with chunks being merged must wait
137 * for them to be committed.
138 */
139 struct bio_list bios_queued_during_merge;
98}; 140};
99 141
142/*
143 * state_bits:
144 * RUNNING_MERGE - Merge operation is in progress.
145 * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped;
146 * cleared afterwards.
147 */
148#define RUNNING_MERGE 0
149#define SHUTDOWN_MERGE 1
150
151struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
152{
153 return s->cow;
154}
155EXPORT_SYMBOL(dm_snap_cow);
156
100static struct workqueue_struct *ksnapd; 157static struct workqueue_struct *ksnapd;
101static void flush_queued_bios(struct work_struct *work); 158static void flush_queued_bios(struct work_struct *work);
102 159
@@ -116,7 +173,7 @@ static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
116} 173}
117 174
118struct dm_snap_pending_exception { 175struct dm_snap_pending_exception {
119 struct dm_snap_exception e; 176 struct dm_exception e;
120 177
121 /* 178 /*
122 * Origin buffers waiting for this to complete are held 179 * Origin buffers waiting for this to complete are held
@@ -125,28 +182,6 @@ struct dm_snap_pending_exception {
125 struct bio_list origin_bios; 182 struct bio_list origin_bios;
126 struct bio_list snapshot_bios; 183 struct bio_list snapshot_bios;
127 184
128 /*
129 * Short-term queue of pending exceptions prior to submission.
130 */
131 struct list_head list;
132
133 /*
134 * The primary pending_exception is the one that holds
135 * the ref_count and the list of origin_bios for a
136 * group of pending_exceptions. It is always last to get freed.
137 * These fields get set up when writing to the origin.
138 */
139 struct dm_snap_pending_exception *primary_pe;
140
141 /*
142 * Number of pending_exceptions processing this chunk.
143 * When this drops to zero we must complete the origin bios.
144 * If incrementing or decrementing this, hold pe->snap->lock for
145 * the sibling concerned and not pe->primary_pe->snap->lock unless
146 * they are the same.
147 */
148 atomic_t ref_count;
149
150 /* Pointer back to snapshot context */ 185 /* Pointer back to snapshot context */
151 struct dm_snapshot *snap; 186 struct dm_snapshot *snap;
152 187
@@ -222,6 +257,16 @@ static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
222} 257}
223 258
224/* 259/*
260 * This conflicting I/O is extremely improbable in the caller,
261 * so msleep(1) is sufficient and there is no need for a wait queue.
262 */
263static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
264{
265 while (__chunk_is_tracked(s, chunk))
266 msleep(1);
267}
268
269/*
225 * One of these per registered origin, held in the snapshot_origins hash 270 * One of these per registered origin, held in the snapshot_origins hash
226 */ 271 */
227struct origin { 272struct origin {
@@ -243,6 +288,10 @@ struct origin {
243static struct list_head *_origins; 288static struct list_head *_origins;
244static struct rw_semaphore _origins_lock; 289static struct rw_semaphore _origins_lock;
245 290
291static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
292static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock);
293static uint64_t _pending_exceptions_done_count;
294
246static int init_origin_hash(void) 295static int init_origin_hash(void)
247{ 296{
248 int i; 297 int i;
@@ -291,22 +340,144 @@ static void __insert_origin(struct origin *o)
291} 340}
292 341
293/* 342/*
343 * _origins_lock must be held when calling this function.
344 * Returns number of snapshots registered using the supplied cow device, plus:
345 * snap_src - a snapshot suitable for use as a source of exception handover
346 * snap_dest - a snapshot capable of receiving exception handover.
347 * snap_merge - an existing snapshot-merge target linked to the same origin.
348 * There can be at most one snapshot-merge target. The parameter is optional.
349 *
350 * Possible return values and states of snap_src and snap_dest.
351 * 0: NULL, NULL - first new snapshot
352 * 1: snap_src, NULL - normal snapshot
353 * 2: snap_src, snap_dest - waiting for handover
354 * 2: snap_src, NULL - handed over, waiting for old to be deleted
355 * 1: NULL, snap_dest - source got destroyed without handover
356 */
357static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
358 struct dm_snapshot **snap_src,
359 struct dm_snapshot **snap_dest,
360 struct dm_snapshot **snap_merge)
361{
362 struct dm_snapshot *s;
363 struct origin *o;
364 int count = 0;
365 int active;
366
367 o = __lookup_origin(snap->origin->bdev);
368 if (!o)
369 goto out;
370
371 list_for_each_entry(s, &o->snapshots, list) {
372 if (dm_target_is_snapshot_merge(s->ti) && snap_merge)
373 *snap_merge = s;
374 if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
375 continue;
376
377 down_read(&s->lock);
378 active = s->active;
379 up_read(&s->lock);
380
381 if (active) {
382 if (snap_src)
383 *snap_src = s;
384 } else if (snap_dest)
385 *snap_dest = s;
386
387 count++;
388 }
389
390out:
391 return count;
392}
393
394/*
395 * On success, returns 1 if this snapshot is a handover destination,
396 * otherwise returns 0.
397 */
398static int __validate_exception_handover(struct dm_snapshot *snap)
399{
400 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
401 struct dm_snapshot *snap_merge = NULL;
402
403 /* Does snapshot need exceptions handed over to it? */
404 if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
405 &snap_merge) == 2) ||
406 snap_dest) {
407 snap->ti->error = "Snapshot cow pairing for exception "
408 "table handover failed";
409 return -EINVAL;
410 }
411
412 /*
413 * If no snap_src was found, snap cannot become a handover
414 * destination.
415 */
416 if (!snap_src)
417 return 0;
418
419 /*
420 * Non-snapshot-merge handover?
421 */
422 if (!dm_target_is_snapshot_merge(snap->ti))
423 return 1;
424
425 /*
426 * Do not allow more than one merging snapshot.
427 */
428 if (snap_merge) {
429 snap->ti->error = "A snapshot is already merging.";
430 return -EINVAL;
431 }
432
433 if (!snap_src->store->type->prepare_merge ||
434 !snap_src->store->type->commit_merge) {
435 snap->ti->error = "Snapshot exception store does not "
436 "support snapshot-merge.";
437 return -EINVAL;
438 }
439
440 return 1;
441}
442
443static void __insert_snapshot(struct origin *o, struct dm_snapshot *s)
444{
445 struct dm_snapshot *l;
446
447 /* Sort the list according to chunk size, largest-first smallest-last */
448 list_for_each_entry(l, &o->snapshots, list)
449 if (l->store->chunk_size < s->store->chunk_size)
450 break;
451 list_add_tail(&s->list, &l->list);
452}
453
454/*
294 * Make a note of the snapshot and its origin so we can look it 455 * Make a note of the snapshot and its origin so we can look it
295 * up when the origin has a write on it. 456 * up when the origin has a write on it.
457 *
458 * Also validate snapshot exception store handovers.
459 * On success, returns 1 if this registration is a handover destination,
460 * otherwise returns 0.
296 */ 461 */
297static int register_snapshot(struct dm_snapshot *snap) 462static int register_snapshot(struct dm_snapshot *snap)
298{ 463{
299 struct dm_snapshot *l; 464 struct origin *o, *new_o = NULL;
300 struct origin *o, *new_o;
301 struct block_device *bdev = snap->origin->bdev; 465 struct block_device *bdev = snap->origin->bdev;
466 int r = 0;
302 467
303 new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); 468 new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
304 if (!new_o) 469 if (!new_o)
305 return -ENOMEM; 470 return -ENOMEM;
306 471
307 down_write(&_origins_lock); 472 down_write(&_origins_lock);
308 o = __lookup_origin(bdev);
309 473
474 r = __validate_exception_handover(snap);
475 if (r < 0) {
476 kfree(new_o);
477 goto out;
478 }
479
480 o = __lookup_origin(bdev);
310 if (o) 481 if (o)
311 kfree(new_o); 482 kfree(new_o);
312 else { 483 else {
@@ -320,14 +491,27 @@ static int register_snapshot(struct dm_snapshot *snap)
320 __insert_origin(o); 491 __insert_origin(o);
321 } 492 }
322 493
323 /* Sort the list according to chunk size, largest-first smallest-last */ 494 __insert_snapshot(o, snap);
324 list_for_each_entry(l, &o->snapshots, list) 495
325 if (l->store->chunk_size < snap->store->chunk_size) 496out:
326 break; 497 up_write(&_origins_lock);
327 list_add_tail(&snap->list, &l->list); 498
499 return r;
500}
501
502/*
503 * Move snapshot to correct place in list according to chunk size.
504 */
505static void reregister_snapshot(struct dm_snapshot *s)
506{
507 struct block_device *bdev = s->origin->bdev;
508
509 down_write(&_origins_lock);
510
511 list_del(&s->list);
512 __insert_snapshot(__lookup_origin(bdev), s);
328 513
329 up_write(&_origins_lock); 514 up_write(&_origins_lock);
330 return 0;
331} 515}
332 516
333static void unregister_snapshot(struct dm_snapshot *s) 517static void unregister_snapshot(struct dm_snapshot *s)
@@ -338,7 +522,7 @@ static void unregister_snapshot(struct dm_snapshot *s)
338 o = __lookup_origin(s->origin->bdev); 522 o = __lookup_origin(s->origin->bdev);
339 523
340 list_del(&s->list); 524 list_del(&s->list);
341 if (list_empty(&o->snapshots)) { 525 if (o && list_empty(&o->snapshots)) {
342 list_del(&o->hash_list); 526 list_del(&o->hash_list);
343 kfree(o); 527 kfree(o);
344 } 528 }
@@ -351,8 +535,8 @@ static void unregister_snapshot(struct dm_snapshot *s)
351 * The lowest hash_shift bits of the chunk number are ignored, allowing 535 * The lowest hash_shift bits of the chunk number are ignored, allowing
352 * some consecutive chunks to be grouped together. 536 * some consecutive chunks to be grouped together.
353 */ 537 */
354static int init_exception_table(struct exception_table *et, uint32_t size, 538static int dm_exception_table_init(struct dm_exception_table *et,
355 unsigned hash_shift) 539 uint32_t size, unsigned hash_shift)
356{ 540{
357 unsigned int i; 541 unsigned int i;
358 542
@@ -368,10 +552,11 @@ static int init_exception_table(struct exception_table *et, uint32_t size,
368 return 0; 552 return 0;
369} 553}
370 554
371static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem) 555static void dm_exception_table_exit(struct dm_exception_table *et,
556 struct kmem_cache *mem)
372{ 557{
373 struct list_head *slot; 558 struct list_head *slot;
374 struct dm_snap_exception *ex, *next; 559 struct dm_exception *ex, *next;
375 int i, size; 560 int i, size;
376 561
377 size = et->hash_mask + 1; 562 size = et->hash_mask + 1;
@@ -385,19 +570,12 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache *
385 vfree(et->table); 570 vfree(et->table);
386} 571}
387 572
388static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) 573static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
389{ 574{
390 return (chunk >> et->hash_shift) & et->hash_mask; 575 return (chunk >> et->hash_shift) & et->hash_mask;
391} 576}
392 577
393static void insert_exception(struct exception_table *eh, 578static void dm_remove_exception(struct dm_exception *e)
394 struct dm_snap_exception *e)
395{
396 struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
397 list_add(&e->hash_list, l);
398}
399
400static void remove_exception(struct dm_snap_exception *e)
401{ 579{
402 list_del(&e->hash_list); 580 list_del(&e->hash_list);
403} 581}
@@ -406,11 +584,11 @@ static void remove_exception(struct dm_snap_exception *e)
406 * Return the exception data for a sector, or NULL if not 584 * Return the exception data for a sector, or NULL if not
407 * remapped. 585 * remapped.
408 */ 586 */
409static struct dm_snap_exception *lookup_exception(struct exception_table *et, 587static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
410 chunk_t chunk) 588 chunk_t chunk)
411{ 589{
412 struct list_head *slot; 590 struct list_head *slot;
413 struct dm_snap_exception *e; 591 struct dm_exception *e;
414 592
415 slot = &et->table[exception_hash(et, chunk)]; 593 slot = &et->table[exception_hash(et, chunk)];
416 list_for_each_entry (e, slot, hash_list) 594 list_for_each_entry (e, slot, hash_list)
@@ -421,9 +599,9 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et,
421 return NULL; 599 return NULL;
422} 600}
423 601
424static struct dm_snap_exception *alloc_exception(void) 602static struct dm_exception *alloc_completed_exception(void)
425{ 603{
426 struct dm_snap_exception *e; 604 struct dm_exception *e;
427 605
428 e = kmem_cache_alloc(exception_cache, GFP_NOIO); 606 e = kmem_cache_alloc(exception_cache, GFP_NOIO);
429 if (!e) 607 if (!e)
@@ -432,7 +610,7 @@ static struct dm_snap_exception *alloc_exception(void)
432 return e; 610 return e;
433} 611}
434 612
435static void free_exception(struct dm_snap_exception *e) 613static void free_completed_exception(struct dm_exception *e)
436{ 614{
437 kmem_cache_free(exception_cache, e); 615 kmem_cache_free(exception_cache, e);
438} 616}
@@ -457,12 +635,11 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
457 atomic_dec(&s->pending_exceptions_count); 635 atomic_dec(&s->pending_exceptions_count);
458} 636}
459 637
460static void insert_completed_exception(struct dm_snapshot *s, 638static void dm_insert_exception(struct dm_exception_table *eh,
461 struct dm_snap_exception *new_e) 639 struct dm_exception *new_e)
462{ 640{
463 struct exception_table *eh = &s->complete;
464 struct list_head *l; 641 struct list_head *l;
465 struct dm_snap_exception *e = NULL; 642 struct dm_exception *e = NULL;
466 643
467 l = &eh->table[exception_hash(eh, new_e->old_chunk)]; 644 l = &eh->table[exception_hash(eh, new_e->old_chunk)];
468 645
@@ -478,7 +655,7 @@ static void insert_completed_exception(struct dm_snapshot *s,
478 new_e->new_chunk == (dm_chunk_number(e->new_chunk) + 655 new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
479 dm_consecutive_chunk_count(e) + 1)) { 656 dm_consecutive_chunk_count(e) + 1)) {
480 dm_consecutive_chunk_count_inc(e); 657 dm_consecutive_chunk_count_inc(e);
481 free_exception(new_e); 658 free_completed_exception(new_e);
482 return; 659 return;
483 } 660 }
484 661
@@ -488,7 +665,7 @@ static void insert_completed_exception(struct dm_snapshot *s,
488 dm_consecutive_chunk_count_inc(e); 665 dm_consecutive_chunk_count_inc(e);
489 e->old_chunk--; 666 e->old_chunk--;
490 e->new_chunk--; 667 e->new_chunk--;
491 free_exception(new_e); 668 free_completed_exception(new_e);
492 return; 669 return;
493 } 670 }
494 671
@@ -507,9 +684,9 @@ out:
507static int dm_add_exception(void *context, chunk_t old, chunk_t new) 684static int dm_add_exception(void *context, chunk_t old, chunk_t new)
508{ 685{
509 struct dm_snapshot *s = context; 686 struct dm_snapshot *s = context;
510 struct dm_snap_exception *e; 687 struct dm_exception *e;
511 688
512 e = alloc_exception(); 689 e = alloc_completed_exception();
513 if (!e) 690 if (!e)
514 return -ENOMEM; 691 return -ENOMEM;
515 692
@@ -518,11 +695,30 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
518 /* Consecutive_count is implicitly initialised to zero */ 695 /* Consecutive_count is implicitly initialised to zero */
519 e->new_chunk = new; 696 e->new_chunk = new;
520 697
521 insert_completed_exception(s, e); 698 dm_insert_exception(&s->complete, e);
522 699
523 return 0; 700 return 0;
524} 701}
525 702
703#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
704
705/*
706 * Return a minimum chunk size of all snapshots that have the specified origin.
707 * Return zero if the origin has no snapshots.
708 */
709static sector_t __minimum_chunk_size(struct origin *o)
710{
711 struct dm_snapshot *snap;
712 unsigned chunk_size = 0;
713
714 if (o)
715 list_for_each_entry(snap, &o->snapshots, list)
716 chunk_size = min_not_zero(chunk_size,
717 snap->store->chunk_size);
718
719 return chunk_size;
720}
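The min_not_zero() macro used above treats zero as "no chunk size yet", so an origin with no snapshots reports zero and any real chunk size wins over zero. A minimal userspace sketch of that selection (the sample sizes are made up, and min() is redefined locally so the snippet stands alone):

#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))
#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))

int main(void)
{
	/* Chunk sizes (in sectors) of the snapshots on one origin;
	 * 0 stands for "not set yet" and must never win. */
	unsigned sizes[] = { 0, 16, 8, 32 };
	unsigned i, chunk_size = 0;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		chunk_size = min_not_zero(chunk_size, sizes[i]);

	printf("minimum chunk size: %u\n", chunk_size);	/* prints 8 */
	return 0;
}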
721
526/* 722/*
527 * Hard coded magic. 723 * Hard coded magic.
528 */ 724 */
@@ -546,16 +742,18 @@ static int init_hash_tables(struct dm_snapshot *s)
546 * Calculate based on the size of the original volume or 742 * Calculate based on the size of the original volume or
547 * the COW volume... 743 * the COW volume...
548 */ 744 */
549 cow_dev_size = get_dev_size(s->store->cow->bdev); 745 cow_dev_size = get_dev_size(s->cow->bdev);
550 origin_dev_size = get_dev_size(s->origin->bdev); 746 origin_dev_size = get_dev_size(s->origin->bdev);
551 max_buckets = calc_max_buckets(); 747 max_buckets = calc_max_buckets();
552 748
553 hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; 749 hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
554 hash_size = min(hash_size, max_buckets); 750 hash_size = min(hash_size, max_buckets);
555 751
752 if (hash_size < 64)
753 hash_size = 64;
556 hash_size = rounddown_pow_of_two(hash_size); 754 hash_size = rounddown_pow_of_two(hash_size);
557 if (init_exception_table(&s->complete, hash_size, 755 if (dm_exception_table_init(&s->complete, hash_size,
558 DM_CHUNK_CONSECUTIVE_BITS)) 756 DM_CHUNK_CONSECUTIVE_BITS))
559 return -ENOMEM; 757 return -ENOMEM;
560 758
561 /* 759 /*
@@ -566,14 +764,284 @@ static int init_hash_tables(struct dm_snapshot *s)
566 if (hash_size < 64) 764 if (hash_size < 64)
567 hash_size = 64; 765 hash_size = 64;
568 766
569 if (init_exception_table(&s->pending, hash_size, 0)) { 767 if (dm_exception_table_init(&s->pending, hash_size, 0)) {
570 exit_exception_table(&s->complete, exception_cache); 768 dm_exception_table_exit(&s->complete, exception_cache);
571 return -ENOMEM; 769 return -ENOMEM;
572 } 770 }
573 771
574 return 0; 772 return 0;
575} 773}
576 774
775static void merge_shutdown(struct dm_snapshot *s)
776{
777 clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
778 smp_mb__after_clear_bit();
779 wake_up_bit(&s->state_bits, RUNNING_MERGE);
780}
781
782static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s)
783{
784 s->first_merging_chunk = 0;
785 s->num_merging_chunks = 0;
786
787 return bio_list_get(&s->bios_queued_during_merge);
788}
789
790/*
791 * Remove one chunk from the index of completed exceptions.
792 */
793static int __remove_single_exception_chunk(struct dm_snapshot *s,
794 chunk_t old_chunk)
795{
796 struct dm_exception *e;
797
798 e = dm_lookup_exception(&s->complete, old_chunk);
799 if (!e) {
800 DMERR("Corruption detected: exception for block %llu is "
801 "on disk but not in memory",
802 (unsigned long long)old_chunk);
803 return -EINVAL;
804 }
805
806 /*
807 * If this is the only chunk using this exception, remove exception.
808 */
809 if (!dm_consecutive_chunk_count(e)) {
810 dm_remove_exception(e);
811 free_completed_exception(e);
812 return 0;
813 }
814
815 /*
816 * The chunk may be either at the beginning or the end of a
817 * group of consecutive chunks - never in the middle. We are
818 * removing chunks in the opposite order to that in which they
819 * were added, so this should always be true.
820 * Decrement the consecutive chunk counter and adjust the
821 * starting point if necessary.
822 */
823 if (old_chunk == e->old_chunk) {
824 e->old_chunk++;
825 e->new_chunk++;
826 } else if (old_chunk != e->old_chunk +
827 dm_consecutive_chunk_count(e)) {
828 DMERR("Attempt to merge block %llu from the "
829 "middle of a chunk range [%llu - %llu]",
830 (unsigned long long)old_chunk,
831 (unsigned long long)e->old_chunk,
832 (unsigned long long)
833 e->old_chunk + dm_consecutive_chunk_count(e));
834 return -EINVAL;
835 }
836
837 dm_consecutive_chunk_count_dec(e);
838
839 return 0;
840}
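A completed exception can stand for a run of consecutive chunks, and the function above relies on merging only ever trimming a chunk from the front or the back of that run (chunks are removed in the reverse of the order they were added). A simplified userspace model of that encoding; the struct and helper are illustrative, not the kernel's dm_exception layout:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* One completed exception covering a run of consecutive chunks:
 * old_chunk..old_chunk+count is remapped to new_chunk..new_chunk+count. */
struct run {
	uint64_t old_chunk;
	uint64_t new_chunk;
	unsigned count;		/* extra chunks beyond the first */
};

/* Trim one chunk off the run; returns 1 when the run held a single
 * chunk and the caller should free it.  Chunks may only leave from
 * either end, never the middle. */
static int trim_chunk(struct run *e, uint64_t old_chunk)
{
	if (!e->count)
		return 1;
	if (old_chunk == e->old_chunk) {		/* front of the run */
		e->old_chunk++;
		e->new_chunk++;
	} else {
		assert(old_chunk == e->old_chunk + e->count);	/* back of the run */
	}
	e->count--;
	return 0;
}

int main(void)
{
	struct run e = { .old_chunk = 100, .new_chunk = 7, .count = 3 };	/* 100..103 -> 7..10 */

	trim_chunk(&e, 103);	/* back:  100..102 -> 7..9 */
	trim_chunk(&e, 100);	/* front: 101..102 -> 8..9 */
	printf("%llu..%llu -> %llu.., count %u\n",
	       (unsigned long long)e.old_chunk,
	       (unsigned long long)(e.old_chunk + e.count),
	       (unsigned long long)e.new_chunk, e.count);
	return 0;
}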
841
842static void flush_bios(struct bio *bio);
843
844static int remove_single_exception_chunk(struct dm_snapshot *s)
845{
846 struct bio *b = NULL;
847 int r;
848 chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
849
850 down_write(&s->lock);
851
852 /*
853 * Process chunks (and associated exceptions) in reverse order
854 * so that dm_consecutive_chunk_count_dec() accounting works.
855 */
856 do {
857 r = __remove_single_exception_chunk(s, old_chunk);
858 if (r)
859 goto out;
860 } while (old_chunk-- > s->first_merging_chunk);
861
862 b = __release_queued_bios_after_merge(s);
863
864out:
865 up_write(&s->lock);
866 if (b)
867 flush_bios(b);
868
869 return r;
870}
871
872static int origin_write_extent(struct dm_snapshot *merging_snap,
873 sector_t sector, unsigned chunk_size);
874
875static void merge_callback(int read_err, unsigned long write_err,
876 void *context);
877
878static uint64_t read_pending_exceptions_done_count(void)
879{
880 uint64_t pending_exceptions_done;
881
882 spin_lock(&_pending_exceptions_done_spinlock);
883 pending_exceptions_done = _pending_exceptions_done_count;
884 spin_unlock(&_pending_exceptions_done_spinlock);
885
886 return pending_exceptions_done;
887}
888
889static void increment_pending_exceptions_done_count(void)
890{
891 spin_lock(&_pending_exceptions_done_spinlock);
892 _pending_exceptions_done_count++;
893 spin_unlock(&_pending_exceptions_done_spinlock);
894
895 wake_up_all(&_pending_exceptions_done);
896}
897
898static void snapshot_merge_next_chunks(struct dm_snapshot *s)
899{
900 int i, linear_chunks;
901 chunk_t old_chunk, new_chunk;
902 struct dm_io_region src, dest;
903 sector_t io_size;
904 uint64_t previous_count;
905
906 BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
907 if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits)))
908 goto shut;
909
910 /*
911 * valid flag never changes during merge, so no lock required.
912 */
913 if (!s->valid) {
914 DMERR("Snapshot is invalid: can't merge");
915 goto shut;
916 }
917
918 linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk,
919 &new_chunk);
920 if (linear_chunks <= 0) {
921 if (linear_chunks < 0) {
922 DMERR("Read error in exception store: "
923 "shutting down merge");
924 down_write(&s->lock);
925 s->merge_failed = 1;
926 up_write(&s->lock);
927 }
928 goto shut;
929 }
930
931 /* Adjust old_chunk and new_chunk to reflect start of linear region */
932 old_chunk = old_chunk + 1 - linear_chunks;
933 new_chunk = new_chunk + 1 - linear_chunks;
934
935 /*
936 * Use one (potentially large) I/O to copy all 'linear_chunks'
937 * from the exception store to the origin
938 */
939 io_size = linear_chunks * s->store->chunk_size;
940
941 dest.bdev = s->origin->bdev;
942 dest.sector = chunk_to_sector(s->store, old_chunk);
943 dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector);
944
945 src.bdev = s->cow->bdev;
946 src.sector = chunk_to_sector(s->store, new_chunk);
947 src.count = dest.count;
948
949 /*
950 * Reallocate any exceptions needed in other snapshots then
951 * wait for the pending exceptions to complete.
952 * Each time any pending exception (globally on the system)
953 * completes we are woken and repeat the process to find out
954 * if we can proceed. While this may not seem a particularly
955 * efficient algorithm, it is not expected to have any
956 * significant impact on performance.
957 */
958 previous_count = read_pending_exceptions_done_count();
959 while (origin_write_extent(s, dest.sector, io_size)) {
960 wait_event(_pending_exceptions_done,
961 (read_pending_exceptions_done_count() !=
962 previous_count));
963 /* Retry after the wait, until all exceptions are done. */
964 previous_count = read_pending_exceptions_done_count();
965 }
966
967 down_write(&s->lock);
968 s->first_merging_chunk = old_chunk;
969 s->num_merging_chunks = linear_chunks;
970 up_write(&s->lock);
971
972 /* Wait until writes to all 'linear_chunks' drain */
973 for (i = 0; i < linear_chunks; i++)
974 __check_for_conflicting_io(s, old_chunk + i);
975
976 dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
977 return;
978
979shut:
980 merge_shutdown(s);
981}
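The merge path above reallocates exceptions in other snapshots and then waits, but it does not track which specific exceptions it created: it watches one system-wide "pending exceptions done" counter and re-checks its predicate after every completion. A userspace sketch of that pattern, with a pthread mutex and condition variable standing in for the kernel spinlock and wait queue, and the predicate passed as a placeholder callback:

#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;
static uint64_t done_count;		/* counts every completion, system-wide */

uint64_t read_done_count(void)
{
	uint64_t v;

	pthread_mutex_lock(&done_lock);
	v = done_count;
	pthread_mutex_unlock(&done_lock);
	return v;
}

void completion_done(void)		/* analogue of increment_pending_exceptions_done_count() */
{
	pthread_mutex_lock(&done_lock);
	done_count++;
	pthread_cond_broadcast(&done_cond);	/* wake_up_all() */
	pthread_mutex_unlock(&done_lock);
}

/* Re-issue the request until nothing further needs waiting for,
 * sleeping whenever the completion counter has not moved. */
void wait_until_nothing_pending(int (*still_pending)(void))
{
	uint64_t previous = read_done_count();

	while (still_pending()) {
		pthread_mutex_lock(&done_lock);
		while (done_count == previous)
			pthread_cond_wait(&done_cond, &done_lock);
		pthread_mutex_unlock(&done_lock);
		previous = read_done_count();
	}
}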
982
983static void error_bios(struct bio *bio);
984
985static void merge_callback(int read_err, unsigned long write_err, void *context)
986{
987 struct dm_snapshot *s = context;
988 struct bio *b = NULL;
989
990 if (read_err || write_err) {
991 if (read_err)
992 DMERR("Read error: shutting down merge.");
993 else
994 DMERR("Write error: shutting down merge.");
995 goto shut;
996 }
997
998 if (s->store->type->commit_merge(s->store,
999 s->num_merging_chunks) < 0) {
1000 DMERR("Write error in exception store: shutting down merge");
1001 goto shut;
1002 }
1003
1004 if (remove_single_exception_chunk(s) < 0)
1005 goto shut;
1006
1007 snapshot_merge_next_chunks(s);
1008
1009 return;
1010
1011shut:
1012 down_write(&s->lock);
1013 s->merge_failed = 1;
1014 b = __release_queued_bios_after_merge(s);
1015 up_write(&s->lock);
1016 error_bios(b);
1017
1018 merge_shutdown(s);
1019}
1020
1021static void start_merge(struct dm_snapshot *s)
1022{
1023 if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits))
1024 snapshot_merge_next_chunks(s);
1025}
1026
1027static int wait_schedule(void *ptr)
1028{
1029 schedule();
1030
1031 return 0;
1032}
1033
1034/*
1035 * Stop the merging process and wait until it finishes.
1036 */
1037static void stop_merge(struct dm_snapshot *s)
1038{
1039 set_bit(SHUTDOWN_MERGE, &s->state_bits);
1040 wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule,
1041 TASK_UNINTERRUPTIBLE);
1042 clear_bit(SHUTDOWN_MERGE, &s->state_bits);
1043}
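start_merge() and stop_merge() above form a small two-flag protocol: RUNNING_MERGE is claimed with a test-and-set so only one merge worker runs at a time, SHUTDOWN_MERGE asks it to stop at the next safe point, and the stopper waits for RUNNING_MERGE to clear. A userspace sketch with C11 atomics, assuming a spin-and-yield loop in place of wait_on_bit() and a made-up do_merge_step() that also stops on its own when nothing is left to merge:

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool running;			/* RUNNING_MERGE */
static atomic_bool shutdown_requested;		/* SHUTDOWN_MERGE */

static int chunks_left = 1000;

static int do_merge_step(void)		/* stand-in: pretend to merge one chunk */
{
	return --chunks_left > 0;
}

void start_merge(void)
{
	/* Only the thread that flips running from false to true merges. */
	if (atomic_exchange(&running, true))
		return;

	while (!atomic_load(&shutdown_requested) && do_merge_step())
		;

	atomic_store(&running, false);		/* merge_shutdown() */
}

void stop_merge(void)
{
	atomic_store(&shutdown_requested, true);
	while (atomic_load(&running))		/* wait_on_bit() analogue */
		sched_yield();
	atomic_store(&shutdown_requested, false);
}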
1044
577/* 1045/*
578 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> 1046 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
579 */ 1047 */
@@ -582,50 +1050,73 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
582 struct dm_snapshot *s; 1050 struct dm_snapshot *s;
583 int i; 1051 int i;
584 int r = -EINVAL; 1052 int r = -EINVAL;
585 char *origin_path; 1053 char *origin_path, *cow_path;
586 struct dm_exception_store *store; 1054 unsigned args_used, num_flush_requests = 1;
587 unsigned args_used; 1055 fmode_t origin_mode = FMODE_READ;
588 1056
589 if (argc != 4) { 1057 if (argc != 4) {
590 ti->error = "requires exactly 4 arguments"; 1058 ti->error = "requires exactly 4 arguments";
591 r = -EINVAL; 1059 r = -EINVAL;
592 goto bad_args; 1060 goto bad;
1061 }
1062
1063 if (dm_target_is_snapshot_merge(ti)) {
1064 num_flush_requests = 2;
1065 origin_mode = FMODE_WRITE;
593 } 1066 }
594 1067
595 origin_path = argv[0]; 1068 origin_path = argv[0];
596 argv++; 1069 argv++;
597 argc--; 1070 argc--;
598 1071
599 r = dm_exception_store_create(ti, argc, argv, &args_used, &store); 1072 s = kmalloc(sizeof(*s), GFP_KERNEL);
1073 if (!s) {
1074 ti->error = "Cannot allocate snapshot context private "
1075 "structure";
1076 r = -ENOMEM;
1077 goto bad;
1078 }
1079
1080 cow_path = argv[0];
1081 argv++;
1082 argc--;
1083
1084 r = dm_get_device(ti, cow_path, 0, 0,
1085 FMODE_READ | FMODE_WRITE, &s->cow);
1086 if (r) {
1087 ti->error = "Cannot get COW device";
1088 goto bad_cow;
1089 }
1090
1091 r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
600 if (r) { 1092 if (r) {
601 ti->error = "Couldn't create exception store"; 1093 ti->error = "Couldn't create exception store";
602 r = -EINVAL; 1094 r = -EINVAL;
603 goto bad_args; 1095 goto bad_store;
604 } 1096 }
605 1097
606 argv += args_used; 1098 argv += args_used;
607 argc -= args_used; 1099 argc -= args_used;
608 1100
609 s = kmalloc(sizeof(*s), GFP_KERNEL); 1101 r = dm_get_device(ti, origin_path, 0, ti->len, origin_mode, &s->origin);
610 if (!s) {
611 ti->error = "Cannot allocate snapshot context private "
612 "structure";
613 r = -ENOMEM;
614 goto bad_snap;
615 }
616
617 r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
618 if (r) { 1102 if (r) {
619 ti->error = "Cannot get origin device"; 1103 ti->error = "Cannot get origin device";
620 goto bad_origin; 1104 goto bad_origin;
621 } 1105 }
622 1106
623 s->store = store; 1107 s->ti = ti;
624 s->valid = 1; 1108 s->valid = 1;
625 s->active = 0; 1109 s->active = 0;
1110 s->suspended = 0;
626 atomic_set(&s->pending_exceptions_count, 0); 1111 atomic_set(&s->pending_exceptions_count, 0);
627 init_rwsem(&s->lock); 1112 init_rwsem(&s->lock);
1113 INIT_LIST_HEAD(&s->list);
628 spin_lock_init(&s->pe_lock); 1114 spin_lock_init(&s->pe_lock);
1115 s->state_bits = 0;
1116 s->merge_failed = 0;
1117 s->first_merging_chunk = 0;
1118 s->num_merging_chunks = 0;
1119 bio_list_init(&s->bios_queued_during_merge);
629 1120
630 /* Allocate hash table for COW data */ 1121 /* Allocate hash table for COW data */
631 if (init_hash_tables(s)) { 1122 if (init_hash_tables(s)) {
@@ -659,39 +1150,55 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
659 1150
660 spin_lock_init(&s->tracked_chunk_lock); 1151 spin_lock_init(&s->tracked_chunk_lock);
661 1152
662 /* Metadata must only be loaded into one table at once */ 1153 bio_list_init(&s->queued_bios);
1154 INIT_WORK(&s->queued_bios_work, flush_queued_bios);
1155
1156 ti->private = s;
1157 ti->num_flush_requests = num_flush_requests;
1158
1159 /* Add snapshot to the list of snapshots for this origin */
1160 /* Exceptions aren't triggered till snapshot_resume() is called */
1161 r = register_snapshot(s);
1162 if (r == -ENOMEM) {
1163 ti->error = "Snapshot origin struct allocation failed";
1164 goto bad_load_and_register;
1165 } else if (r < 0) {
1166 /* invalid handover, register_snapshot has set ti->error */
1167 goto bad_load_and_register;
1168 }
1169
1170 /*
1171 * Metadata must only be loaded into one table at once, so skip this
1172 * if metadata will be handed over during resume.
1173 * Chunk size will be set during the handover - set it to zero to
1174 * ensure it's ignored.
1175 */
1176 if (r > 0) {
1177 s->store->chunk_size = 0;
1178 return 0;
1179 }
1180
663 r = s->store->type->read_metadata(s->store, dm_add_exception, 1181 r = s->store->type->read_metadata(s->store, dm_add_exception,
664 (void *)s); 1182 (void *)s);
665 if (r < 0) { 1183 if (r < 0) {
666 ti->error = "Failed to read snapshot metadata"; 1184 ti->error = "Failed to read snapshot metadata";
667 goto bad_load_and_register; 1185 goto bad_read_metadata;
668 } else if (r > 0) { 1186 } else if (r > 0) {
669 s->valid = 0; 1187 s->valid = 0;
670 DMWARN("Snapshot is marked invalid."); 1188 DMWARN("Snapshot is marked invalid.");
671 } 1189 }
672 1190
673 bio_list_init(&s->queued_bios);
674 INIT_WORK(&s->queued_bios_work, flush_queued_bios);
675
676 if (!s->store->chunk_size) { 1191 if (!s->store->chunk_size) {
677 ti->error = "Chunk size not set"; 1192 ti->error = "Chunk size not set";
678 goto bad_load_and_register; 1193 goto bad_read_metadata;
679 }
680
681 /* Add snapshot to the list of snapshots for this origin */
682 /* Exceptions aren't triggered till snapshot_resume() is called */
683 if (register_snapshot(s)) {
684 r = -EINVAL;
685 ti->error = "Cannot register snapshot origin";
686 goto bad_load_and_register;
687 } 1194 }
688
689 ti->private = s;
690 ti->split_io = s->store->chunk_size; 1195 ti->split_io = s->store->chunk_size;
691 ti->num_flush_requests = 1;
692 1196
693 return 0; 1197 return 0;
694 1198
1199bad_read_metadata:
1200 unregister_snapshot(s);
1201
695bad_load_and_register: 1202bad_load_and_register:
696 mempool_destroy(s->tracked_chunk_pool); 1203 mempool_destroy(s->tracked_chunk_pool);
697 1204
@@ -702,19 +1209,22 @@ bad_pending_pool:
702 dm_kcopyd_client_destroy(s->kcopyd_client); 1209 dm_kcopyd_client_destroy(s->kcopyd_client);
703 1210
704bad_kcopyd: 1211bad_kcopyd:
705 exit_exception_table(&s->pending, pending_cache); 1212 dm_exception_table_exit(&s->pending, pending_cache);
706 exit_exception_table(&s->complete, exception_cache); 1213 dm_exception_table_exit(&s->complete, exception_cache);
707 1214
708bad_hash_tables: 1215bad_hash_tables:
709 dm_put_device(ti, s->origin); 1216 dm_put_device(ti, s->origin);
710 1217
711bad_origin: 1218bad_origin:
712 kfree(s); 1219 dm_exception_store_destroy(s->store);
713 1220
714bad_snap: 1221bad_store:
715 dm_exception_store_destroy(store); 1222 dm_put_device(ti, s->cow);
1223
1224bad_cow:
1225 kfree(s);
716 1226
717bad_args: 1227bad:
718 return r; 1228 return r;
719} 1229}
720 1230
@@ -723,8 +1233,39 @@ static void __free_exceptions(struct dm_snapshot *s)
723 dm_kcopyd_client_destroy(s->kcopyd_client); 1233 dm_kcopyd_client_destroy(s->kcopyd_client);
724 s->kcopyd_client = NULL; 1234 s->kcopyd_client = NULL;
725 1235
726 exit_exception_table(&s->pending, pending_cache); 1236 dm_exception_table_exit(&s->pending, pending_cache);
727 exit_exception_table(&s->complete, exception_cache); 1237 dm_exception_table_exit(&s->complete, exception_cache);
1238}
1239
1240static void __handover_exceptions(struct dm_snapshot *snap_src,
1241 struct dm_snapshot *snap_dest)
1242{
1243 union {
1244 struct dm_exception_table table_swap;
1245 struct dm_exception_store *store_swap;
1246 } u;
1247
1248 /*
1249 * Swap all snapshot context information between the two instances.
1250 */
1251 u.table_swap = snap_dest->complete;
1252 snap_dest->complete = snap_src->complete;
1253 snap_src->complete = u.table_swap;
1254
1255 u.store_swap = snap_dest->store;
1256 snap_dest->store = snap_src->store;
1257 snap_src->store = u.store_swap;
1258
1259 snap_dest->store->snap = snap_dest;
1260 snap_src->store->snap = snap_src;
1261
1262 snap_dest->ti->split_io = snap_dest->store->chunk_size;
1263 snap_dest->valid = snap_src->valid;
1264
1265 /*
1266 * Set source invalid to ensure it receives no further I/O.
1267 */
1268 snap_src->valid = 0;
728} 1269}
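__handover_exceptions() moves a fully loaded exception table and store from the table that read the metadata to the table replacing it, swapping the fields through a scratch union and then invalidating the source so it takes no further I/O. A stripped-down userspace model of that swap; the types, field names and sample values are simplified placeholders:

#include <stdio.h>

struct exception_table { int entries; };
struct exception_store { int chunk_size; };

struct snap {
	int valid;
	struct exception_table complete;	/* table held by value */
	struct exception_store *store;		/* store held by pointer */
};

/* Hand the loaded table and store from src to dest, then mute src. */
static void handover(struct snap *src, struct snap *dest)
{
	union {
		struct exception_table table_swap;
		struct exception_store *store_swap;
	} u;

	u.table_swap = dest->complete;
	dest->complete = src->complete;
	src->complete = u.table_swap;

	u.store_swap = dest->store;
	dest->store = src->store;
	src->store = u.store_swap;

	dest->valid = src->valid;
	src->valid = 0;			/* source receives no further I/O */
}

int main(void)
{
	struct exception_store loaded = { .chunk_size = 16 };
	struct exception_store empty = { .chunk_size = 0 };
	struct snap src = { .valid = 1, .complete = { .entries = 42 }, .store = &loaded };
	struct snap dest = { .valid = 0, .complete = { .entries = 0 }, .store = &empty };

	handover(&src, &dest);
	printf("dest: valid=%d entries=%d chunk=%d\n",
	       dest.valid, dest.complete.entries, dest.store->chunk_size);
	return 0;
}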
729 1270
730static void snapshot_dtr(struct dm_target *ti) 1271static void snapshot_dtr(struct dm_target *ti)
@@ -733,9 +1274,24 @@ static void snapshot_dtr(struct dm_target *ti)
733 int i; 1274 int i;
734#endif 1275#endif
735 struct dm_snapshot *s = ti->private; 1276 struct dm_snapshot *s = ti->private;
1277 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
736 1278
737 flush_workqueue(ksnapd); 1279 flush_workqueue(ksnapd);
738 1280
1281 down_read(&_origins_lock);
1282 /* Check whether exception handover must be cancelled */
1283 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1284 if (snap_src && snap_dest && (s == snap_src)) {
1285 down_write(&snap_dest->lock);
1286 snap_dest->valid = 0;
1287 up_write(&snap_dest->lock);
1288 DMERR("Cancelling snapshot handover.");
1289 }
1290 up_read(&_origins_lock);
1291
1292 if (dm_target_is_snapshot_merge(ti))
1293 stop_merge(s);
1294
739 /* Prevent further origin writes from using this snapshot. */ 1295 /* Prevent further origin writes from using this snapshot. */
740 /* After this returns there can be no new kcopyd jobs. */ 1296 /* After this returns there can be no new kcopyd jobs. */
741 unregister_snapshot(s); 1297 unregister_snapshot(s);
@@ -763,6 +1319,8 @@ static void snapshot_dtr(struct dm_target *ti)
763 1319
764 dm_exception_store_destroy(s->store); 1320 dm_exception_store_destroy(s->store);
765 1321
1322 dm_put_device(ti, s->cow);
1323
766 kfree(s); 1324 kfree(s);
767} 1325}
768 1326
@@ -795,6 +1353,26 @@ static void flush_queued_bios(struct work_struct *work)
795 flush_bios(queued_bios); 1353 flush_bios(queued_bios);
796} 1354}
797 1355
1356static int do_origin(struct dm_dev *origin, struct bio *bio);
1357
1358/*
1359 * Flush a list of buffers.
1360 */
1361static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
1362{
1363 struct bio *n;
1364 int r;
1365
1366 while (bio) {
1367 n = bio->bi_next;
1368 bio->bi_next = NULL;
1369 r = do_origin(s->origin, bio);
1370 if (r == DM_MAPIO_REMAPPED)
1371 generic_make_request(bio);
1372 bio = n;
1373 }
1374}
1375
798/* 1376/*
799 * Error a list of buffers. 1377 * Error a list of buffers.
800 */ 1378 */
@@ -825,45 +1403,12 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
825 1403
826 s->valid = 0; 1404 s->valid = 0;
827 1405
828 dm_table_event(s->store->ti->table); 1406 dm_table_event(s->ti->table);
829}
830
831static void get_pending_exception(struct dm_snap_pending_exception *pe)
832{
833 atomic_inc(&pe->ref_count);
834}
835
836static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe)
837{
838 struct dm_snap_pending_exception *primary_pe;
839 struct bio *origin_bios = NULL;
840
841 primary_pe = pe->primary_pe;
842
843 /*
844 * If this pe is involved in a write to the origin and
845 * it is the last sibling to complete then release
846 * the bios for the original write to the origin.
847 */
848 if (primary_pe &&
849 atomic_dec_and_test(&primary_pe->ref_count)) {
850 origin_bios = bio_list_get(&primary_pe->origin_bios);
851 free_pending_exception(primary_pe);
852 }
853
854 /*
855 * Free the pe if it's not linked to an origin write or if
856 * it's not itself a primary pe.
857 */
858 if (!primary_pe || primary_pe != pe)
859 free_pending_exception(pe);
860
861 return origin_bios;
862} 1407}
863 1408
864static void pending_complete(struct dm_snap_pending_exception *pe, int success) 1409static void pending_complete(struct dm_snap_pending_exception *pe, int success)
865{ 1410{
866 struct dm_snap_exception *e; 1411 struct dm_exception *e;
867 struct dm_snapshot *s = pe->snap; 1412 struct dm_snapshot *s = pe->snap;
868 struct bio *origin_bios = NULL; 1413 struct bio *origin_bios = NULL;
869 struct bio *snapshot_bios = NULL; 1414 struct bio *snapshot_bios = NULL;
@@ -877,7 +1422,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
877 goto out; 1422 goto out;
878 } 1423 }
879 1424
880 e = alloc_exception(); 1425 e = alloc_completed_exception();
881 if (!e) { 1426 if (!e) {
882 down_write(&s->lock); 1427 down_write(&s->lock);
883 __invalidate_snapshot(s, -ENOMEM); 1428 __invalidate_snapshot(s, -ENOMEM);
@@ -888,28 +1433,27 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
888 1433
889 down_write(&s->lock); 1434 down_write(&s->lock);
890 if (!s->valid) { 1435 if (!s->valid) {
891 free_exception(e); 1436 free_completed_exception(e);
892 error = 1; 1437 error = 1;
893 goto out; 1438 goto out;
894 } 1439 }
895 1440
896 /* 1441 /* Check for conflicting reads */
897 * Check for conflicting reads. This is extremely improbable, 1442 __check_for_conflicting_io(s, pe->e.old_chunk);
898 * so msleep(1) is sufficient and there is no need for a wait queue.
899 */
900 while (__chunk_is_tracked(s, pe->e.old_chunk))
901 msleep(1);
902 1443
903 /* 1444 /*
904 * Add a proper exception, and remove the 1445 * Add a proper exception, and remove the
905 * in-flight exception from the list. 1446 * in-flight exception from the list.
906 */ 1447 */
907 insert_completed_exception(s, e); 1448 dm_insert_exception(&s->complete, e);
908 1449
909 out: 1450 out:
910 remove_exception(&pe->e); 1451 dm_remove_exception(&pe->e);
911 snapshot_bios = bio_list_get(&pe->snapshot_bios); 1452 snapshot_bios = bio_list_get(&pe->snapshot_bios);
912 origin_bios = put_pending_exception(pe); 1453 origin_bios = bio_list_get(&pe->origin_bios);
1454 free_pending_exception(pe);
1455
1456 increment_pending_exceptions_done_count();
913 1457
914 up_write(&s->lock); 1458 up_write(&s->lock);
915 1459
@@ -919,7 +1463,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
919 else 1463 else
920 flush_bios(snapshot_bios); 1464 flush_bios(snapshot_bios);
921 1465
922 flush_bios(origin_bios); 1466 retry_origin_bios(s, origin_bios);
923} 1467}
924 1468
925static void commit_callback(void *context, int success) 1469static void commit_callback(void *context, int success)
@@ -963,7 +1507,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
963 src.sector = chunk_to_sector(s->store, pe->e.old_chunk); 1507 src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
964 src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); 1508 src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
965 1509
966 dest.bdev = s->store->cow->bdev; 1510 dest.bdev = s->cow->bdev;
967 dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); 1511 dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
968 dest.count = src.count; 1512 dest.count = src.count;
969 1513
@@ -975,7 +1519,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
975static struct dm_snap_pending_exception * 1519static struct dm_snap_pending_exception *
976__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) 1520__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
977{ 1521{
978 struct dm_snap_exception *e = lookup_exception(&s->pending, chunk); 1522 struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
979 1523
980 if (!e) 1524 if (!e)
981 return NULL; 1525 return NULL;
@@ -1006,8 +1550,6 @@ __find_pending_exception(struct dm_snapshot *s,
1006 pe->e.old_chunk = chunk; 1550 pe->e.old_chunk = chunk;
1007 bio_list_init(&pe->origin_bios); 1551 bio_list_init(&pe->origin_bios);
1008 bio_list_init(&pe->snapshot_bios); 1552 bio_list_init(&pe->snapshot_bios);
1009 pe->primary_pe = NULL;
1010 atomic_set(&pe->ref_count, 0);
1011 pe->started = 0; 1553 pe->started = 0;
1012 1554
1013 if (s->store->type->prepare_exception(s->store, &pe->e)) { 1555 if (s->store->type->prepare_exception(s->store, &pe->e)) {
@@ -1015,16 +1557,15 @@ __find_pending_exception(struct dm_snapshot *s,
1015 return NULL; 1557 return NULL;
1016 } 1558 }
1017 1559
1018 get_pending_exception(pe); 1560 dm_insert_exception(&s->pending, &pe->e);
1019 insert_exception(&s->pending, &pe->e);
1020 1561
1021 return pe; 1562 return pe;
1022} 1563}
1023 1564
1024static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, 1565static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
1025 struct bio *bio, chunk_t chunk) 1566 struct bio *bio, chunk_t chunk)
1026{ 1567{
1027 bio->bi_bdev = s->store->cow->bdev; 1568 bio->bi_bdev = s->cow->bdev;
1028 bio->bi_sector = chunk_to_sector(s->store, 1569 bio->bi_sector = chunk_to_sector(s->store,
1029 dm_chunk_number(e->new_chunk) + 1570 dm_chunk_number(e->new_chunk) +
1030 (chunk - e->old_chunk)) + 1571 (chunk - e->old_chunk)) +
@@ -1035,14 +1576,14 @@ static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e,
1035static int snapshot_map(struct dm_target *ti, struct bio *bio, 1576static int snapshot_map(struct dm_target *ti, struct bio *bio,
1036 union map_info *map_context) 1577 union map_info *map_context)
1037{ 1578{
1038 struct dm_snap_exception *e; 1579 struct dm_exception *e;
1039 struct dm_snapshot *s = ti->private; 1580 struct dm_snapshot *s = ti->private;
1040 int r = DM_MAPIO_REMAPPED; 1581 int r = DM_MAPIO_REMAPPED;
1041 chunk_t chunk; 1582 chunk_t chunk;
1042 struct dm_snap_pending_exception *pe = NULL; 1583 struct dm_snap_pending_exception *pe = NULL;
1043 1584
1044 if (unlikely(bio_empty_barrier(bio))) { 1585 if (unlikely(bio_empty_barrier(bio))) {
1045 bio->bi_bdev = s->store->cow->bdev; 1586 bio->bi_bdev = s->cow->bdev;
1046 return DM_MAPIO_REMAPPED; 1587 return DM_MAPIO_REMAPPED;
1047 } 1588 }
1048 1589
@@ -1063,7 +1604,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1063 } 1604 }
1064 1605
1065 /* If the block is already remapped - use that, else remap it */ 1606 /* If the block is already remapped - use that, else remap it */
1066 e = lookup_exception(&s->complete, chunk); 1607 e = dm_lookup_exception(&s->complete, chunk);
1067 if (e) { 1608 if (e) {
1068 remap_exception(s, e, bio, chunk); 1609 remap_exception(s, e, bio, chunk);
1069 goto out_unlock; 1610 goto out_unlock;
@@ -1087,7 +1628,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1087 goto out_unlock; 1628 goto out_unlock;
1088 } 1629 }
1089 1630
1090 e = lookup_exception(&s->complete, chunk); 1631 e = dm_lookup_exception(&s->complete, chunk);
1091 if (e) { 1632 if (e) {
1092 free_pending_exception(pe); 1633 free_pending_exception(pe);
1093 remap_exception(s, e, bio, chunk); 1634 remap_exception(s, e, bio, chunk);
@@ -1125,6 +1666,78 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1125 return r; 1666 return r;
1126} 1667}
1127 1668
1669/*
1670 * A snapshot-merge target behaves like a combination of a snapshot
1671 * target and a snapshot-origin target. It only generates new
1672 * exceptions in other snapshots and not in the one that is being
1673 * merged.
1674 *
1675 * For each chunk, if there is an existing exception, it is used to
1676 * redirect I/O to the cow device. Otherwise I/O is sent to the origin,
1677 * which in turn might generate exceptions in other snapshots.
1678 * If merging is currently taking place on the chunk in question, the
1679 * I/O is deferred by adding it to s->bios_queued_during_merge.
1680 */
1681static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
1682 union map_info *map_context)
1683{
1684 struct dm_exception *e;
1685 struct dm_snapshot *s = ti->private;
1686 int r = DM_MAPIO_REMAPPED;
1687 chunk_t chunk;
1688
1689 if (unlikely(bio_empty_barrier(bio))) {
1690 if (!map_context->flush_request)
1691 bio->bi_bdev = s->origin->bdev;
1692 else
1693 bio->bi_bdev = s->cow->bdev;
1694 map_context->ptr = NULL;
1695 return DM_MAPIO_REMAPPED;
1696 }
1697
1698 chunk = sector_to_chunk(s->store, bio->bi_sector);
1699
1700 down_write(&s->lock);
1701
1702 /* Full merging snapshots are redirected to the origin */
1703 if (!s->valid)
1704 goto redirect_to_origin;
1705
1706 /* If the block is already remapped - use that */
1707 e = dm_lookup_exception(&s->complete, chunk);
1708 if (e) {
1709 /* Queue writes overlapping with chunks being merged */
1710 if (bio_rw(bio) == WRITE &&
1711 chunk >= s->first_merging_chunk &&
1712 chunk < (s->first_merging_chunk +
1713 s->num_merging_chunks)) {
1714 bio->bi_bdev = s->origin->bdev;
1715 bio_list_add(&s->bios_queued_during_merge, bio);
1716 r = DM_MAPIO_SUBMITTED;
1717 goto out_unlock;
1718 }
1719
1720 remap_exception(s, e, bio, chunk);
1721
1722 if (bio_rw(bio) == WRITE)
1723 map_context->ptr = track_chunk(s, chunk);
1724 goto out_unlock;
1725 }
1726
1727redirect_to_origin:
1728 bio->bi_bdev = s->origin->bdev;
1729
1730 if (bio_rw(bio) == WRITE) {
1731 up_write(&s->lock);
1732 return do_origin(s->origin, bio);
1733 }
1734
1735out_unlock:
1736 up_write(&s->lock);
1737
1738 return r;
1739}
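snapshot_merge_map() above reduces to a three-way decision per chunk: writes landing on chunks currently being copied back are parked until that merge range completes, other already-remapped chunks are redirected to the COW device, and everything else is treated like a plain origin write. A userspace sketch of just that decision; the types and the remapped() lookup are placeholders for the snapshot state and dm_lookup_exception():

#include <stdbool.h>
#include <stdint.h>

enum target { DEFER, TO_COW, TO_ORIGIN };

struct merge_state {
	bool valid;			/* fully merged snapshots redirect to origin */
	uint64_t first_merging_chunk;
	uint64_t num_merging_chunks;
};

/* Placeholder for dm_lookup_exception(): does this chunk have a
 * completed exception, i.e. live data on the COW device? */
static bool remapped(uint64_t chunk)
{
	return chunk % 2;	/* arbitrary pattern, illustration only */
}

enum target merge_map_decision(const struct merge_state *s, uint64_t chunk,
			       bool is_write)
{
	if (!s->valid)
		return TO_ORIGIN;

	if (remapped(chunk)) {
		if (is_write &&
		    chunk >= s->first_merging_chunk &&
		    chunk < s->first_merging_chunk + s->num_merging_chunks)
			return DEFER;	/* queued on bios_queued_during_merge */
		return TO_COW;
	}

	return TO_ORIGIN;	/* may trigger exceptions in other snapshots */
}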
1740
1128static int snapshot_end_io(struct dm_target *ti, struct bio *bio, 1741static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
1129 int error, union map_info *map_context) 1742 int error, union map_info *map_context)
1130{ 1743{
@@ -1137,40 +1750,135 @@ static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
1137 return 0; 1750 return 0;
1138} 1751}
1139 1752
1753static void snapshot_merge_presuspend(struct dm_target *ti)
1754{
1755 struct dm_snapshot *s = ti->private;
1756
1757 stop_merge(s);
1758}
1759
1760static void snapshot_postsuspend(struct dm_target *ti)
1761{
1762 struct dm_snapshot *s = ti->private;
1763
1764 down_write(&s->lock);
1765 s->suspended = 1;
1766 up_write(&s->lock);
1767}
1768
1769static int snapshot_preresume(struct dm_target *ti)
1770{
1771 int r = 0;
1772 struct dm_snapshot *s = ti->private;
1773 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1774
1775 down_read(&_origins_lock);
1776 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1777 if (snap_src && snap_dest) {
1778 down_read(&snap_src->lock);
1779 if (s == snap_src) {
1780 DMERR("Unable to resume snapshot source until "
1781 "handover completes.");
1782 r = -EINVAL;
1783 } else if (!snap_src->suspended) {
1784 DMERR("Unable to perform snapshot handover until "
1785 "source is suspended.");
1786 r = -EINVAL;
1787 }
1788 up_read(&snap_src->lock);
1789 }
1790 up_read(&_origins_lock);
1791
1792 return r;
1793}
1794
1140static void snapshot_resume(struct dm_target *ti) 1795static void snapshot_resume(struct dm_target *ti)
1141{ 1796{
1142 struct dm_snapshot *s = ti->private; 1797 struct dm_snapshot *s = ti->private;
1798 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1799
1800 down_read(&_origins_lock);
1801 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1802 if (snap_src && snap_dest) {
1803 down_write(&snap_src->lock);
1804 down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
1805 __handover_exceptions(snap_src, snap_dest);
1806 up_write(&snap_dest->lock);
1807 up_write(&snap_src->lock);
1808 }
1809 up_read(&_origins_lock);
1810
1811 /* Now we have correct chunk size, reregister */
1812 reregister_snapshot(s);
1143 1813
1144 down_write(&s->lock); 1814 down_write(&s->lock);
1145 s->active = 1; 1815 s->active = 1;
1816 s->suspended = 0;
1146 up_write(&s->lock); 1817 up_write(&s->lock);
1147} 1818}
1148 1819
1820static sector_t get_origin_minimum_chunksize(struct block_device *bdev)
1821{
1822 sector_t min_chunksize;
1823
1824 down_read(&_origins_lock);
1825 min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
1826 up_read(&_origins_lock);
1827
1828 return min_chunksize;
1829}
1830
1831static void snapshot_merge_resume(struct dm_target *ti)
1832{
1833 struct dm_snapshot *s = ti->private;
1834
1835 /*
1836 * Handover exceptions from existing snapshot.
1837 */
1838 snapshot_resume(ti);
1839
1840 /*
1841 * snapshot-merge acts as an origin, so set ti->split_io
1842 */
1843 ti->split_io = get_origin_minimum_chunksize(s->origin->bdev);
1844
1845 start_merge(s);
1846}
1847
1149static int snapshot_status(struct dm_target *ti, status_type_t type, 1848static int snapshot_status(struct dm_target *ti, status_type_t type,
1150 char *result, unsigned int maxlen) 1849 char *result, unsigned int maxlen)
1151{ 1850{
1152 unsigned sz = 0; 1851 unsigned sz = 0;
1153 struct dm_snapshot *snap = ti->private; 1852 struct dm_snapshot *snap = ti->private;
1154 1853
1155 down_write(&snap->lock);
1156
1157 switch (type) { 1854 switch (type) {
1158 case STATUSTYPE_INFO: 1855 case STATUSTYPE_INFO:
1856
1857 down_write(&snap->lock);
1858
1159 if (!snap->valid) 1859 if (!snap->valid)
1160 DMEMIT("Invalid"); 1860 DMEMIT("Invalid");
1861 else if (snap->merge_failed)
1862 DMEMIT("Merge failed");
1161 else { 1863 else {
1162 if (snap->store->type->fraction_full) { 1864 if (snap->store->type->usage) {
1163 sector_t numerator, denominator; 1865 sector_t total_sectors, sectors_allocated,
1164 snap->store->type->fraction_full(snap->store, 1866 metadata_sectors;
1165 &numerator, 1867 snap->store->type->usage(snap->store,
1166 &denominator); 1868 &total_sectors,
1167 DMEMIT("%llu/%llu", 1869 &sectors_allocated,
1168 (unsigned long long)numerator, 1870 &metadata_sectors);
1169 (unsigned long long)denominator); 1871 DMEMIT("%llu/%llu %llu",
1872 (unsigned long long)sectors_allocated,
1873 (unsigned long long)total_sectors,
1874 (unsigned long long)metadata_sectors);
1170 } 1875 }
1171 else 1876 else
1172 DMEMIT("Unknown"); 1877 DMEMIT("Unknown");
1173 } 1878 }
1879
1880 up_write(&snap->lock);
1881
1174 break; 1882 break;
1175 1883
1176 case STATUSTYPE_TABLE: 1884 case STATUSTYPE_TABLE:
@@ -1179,14 +1887,12 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
1179 * to make private copies if the output is to 1887 * to make private copies if the output is to
1180 * make sense. 1888 * make sense.
1181 */ 1889 */
1182 DMEMIT("%s", snap->origin->name); 1890 DMEMIT("%s %s", snap->origin->name, snap->cow->name);
1183 snap->store->type->status(snap->store, type, result + sz, 1891 snap->store->type->status(snap->store, type, result + sz,
1184 maxlen - sz); 1892 maxlen - sz);
1185 break; 1893 break;
1186 } 1894 }
1187 1895
1188 up_write(&snap->lock);
1189
1190 return 0; 1896 return 0;
1191} 1897}
1192 1898
@@ -1202,17 +1908,36 @@ static int snapshot_iterate_devices(struct dm_target *ti,
1202/*----------------------------------------------------------------- 1908/*-----------------------------------------------------------------
1203 * Origin methods 1909 * Origin methods
1204 *---------------------------------------------------------------*/ 1910 *---------------------------------------------------------------*/
1205static int __origin_write(struct list_head *snapshots, struct bio *bio) 1911
1912/*
1913 * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
1914 * supplied bio was ignored. The caller may submit it immediately.
1915 * (No remapping actually occurs as the origin is always a direct linear
1916 * map.)
1917 *
1918 * If further exceptions are required, DM_MAPIO_SUBMITTED is returned
1919 * and any supplied bio is added to a list to be submitted once all
1920 * the necessary exceptions exist.
1921 */
1922static int __origin_write(struct list_head *snapshots, sector_t sector,
1923 struct bio *bio)
1206{ 1924{
1207 int r = DM_MAPIO_REMAPPED, first = 0; 1925 int r = DM_MAPIO_REMAPPED;
1208 struct dm_snapshot *snap; 1926 struct dm_snapshot *snap;
1209 struct dm_snap_exception *e; 1927 struct dm_exception *e;
1210 struct dm_snap_pending_exception *pe, *next_pe, *primary_pe = NULL; 1928 struct dm_snap_pending_exception *pe;
1929 struct dm_snap_pending_exception *pe_to_start_now = NULL;
1930 struct dm_snap_pending_exception *pe_to_start_last = NULL;
1211 chunk_t chunk; 1931 chunk_t chunk;
1212 LIST_HEAD(pe_queue);
1213 1932
1214 /* Do all the snapshots on this origin */ 1933 /* Do all the snapshots on this origin */
1215 list_for_each_entry (snap, snapshots, list) { 1934 list_for_each_entry (snap, snapshots, list) {
1935 /*
1936 * Don't make new exceptions in a merging snapshot
1937 * because it has effectively been deleted
1938 */
1939 if (dm_target_is_snapshot_merge(snap->ti))
1940 continue;
1216 1941
1217 down_write(&snap->lock); 1942 down_write(&snap->lock);
1218 1943
@@ -1221,24 +1946,21 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
1221 goto next_snapshot; 1946 goto next_snapshot;
1222 1947
1223 /* Nothing to do if writing beyond end of snapshot */ 1948 /* Nothing to do if writing beyond end of snapshot */
1224 if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table)) 1949 if (sector >= dm_table_get_size(snap->ti->table))
1225 goto next_snapshot; 1950 goto next_snapshot;
1226 1951
1227 /* 1952 /*
1228 * Remember, different snapshots can have 1953 * Remember, different snapshots can have
1229 * different chunk sizes. 1954 * different chunk sizes.
1230 */ 1955 */
1231 chunk = sector_to_chunk(snap->store, bio->bi_sector); 1956 chunk = sector_to_chunk(snap->store, sector);
1232 1957
1233 /* 1958 /*
1234 * Check exception table to see if block 1959 * Check exception table to see if block
1235 * is already remapped in this snapshot 1960 * is already remapped in this snapshot
1236 * and trigger an exception if not. 1961 * and trigger an exception if not.
1237 *
1238 * ref_count is initialised to 1 so pending_complete()
1239 * won't destroy the primary_pe while we're inside this loop.
1240 */ 1962 */
1241 e = lookup_exception(&snap->complete, chunk); 1963 e = dm_lookup_exception(&snap->complete, chunk);
1242 if (e) 1964 if (e)
1243 goto next_snapshot; 1965 goto next_snapshot;
1244 1966
@@ -1253,7 +1975,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
1253 goto next_snapshot; 1975 goto next_snapshot;
1254 } 1976 }
1255 1977
1256 e = lookup_exception(&snap->complete, chunk); 1978 e = dm_lookup_exception(&snap->complete, chunk);
1257 if (e) { 1979 if (e) {
1258 free_pending_exception(pe); 1980 free_pending_exception(pe);
1259 goto next_snapshot; 1981 goto next_snapshot;
@@ -1266,59 +1988,43 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
1266 } 1988 }
1267 } 1989 }
1268 1990
1269 if (!primary_pe) { 1991 r = DM_MAPIO_SUBMITTED;
1270 /*
1271 * Either every pe here has same
1272 * primary_pe or none has one yet.
1273 */
1274 if (pe->primary_pe)
1275 primary_pe = pe->primary_pe;
1276 else {
1277 primary_pe = pe;
1278 first = 1;
1279 }
1280
1281 bio_list_add(&primary_pe->origin_bios, bio);
1282 1992
1283 r = DM_MAPIO_SUBMITTED; 1993 /*
1284 } 1994 * If an origin bio was supplied, queue it to wait for the
1995 * completion of this exception, and start this one last,
1996 * at the end of the function.
1997 */
1998 if (bio) {
1999 bio_list_add(&pe->origin_bios, bio);
2000 bio = NULL;
1285 2001
1286 if (!pe->primary_pe) { 2002 if (!pe->started) {
1287 pe->primary_pe = primary_pe; 2003 pe->started = 1;
1288 get_pending_exception(primary_pe); 2004 pe_to_start_last = pe;
2005 }
1289 } 2006 }
1290 2007
1291 if (!pe->started) { 2008 if (!pe->started) {
1292 pe->started = 1; 2009 pe->started = 1;
1293 list_add_tail(&pe->list, &pe_queue); 2010 pe_to_start_now = pe;
1294 } 2011 }
1295 2012
1296 next_snapshot: 2013 next_snapshot:
1297 up_write(&snap->lock); 2014 up_write(&snap->lock);
1298 }
1299 2015
1300 if (!primary_pe) 2016 if (pe_to_start_now) {
1301 return r; 2017 start_copy(pe_to_start_now);
1302 2018 pe_to_start_now = NULL;
1303 /* 2019 }
1304 * If this is the first time we're processing this chunk and
1305 * ref_count is now 1 it means all the pending exceptions
1306 * got completed while we were in the loop above, so it falls to
1307 * us here to remove the primary_pe and submit any origin_bios.
1308 */
1309
1310 if (first && atomic_dec_and_test(&primary_pe->ref_count)) {
1311 flush_bios(bio_list_get(&primary_pe->origin_bios));
1312 free_pending_exception(primary_pe);
1313 /* If we got here, pe_queue is necessarily empty. */
1314 return r;
1315 } 2020 }
1316 2021
1317 /* 2022 /*
1318 * Now that we have a complete pe list we can start the copying. 2023 * Submit the exception against which the bio is queued last,
2024 * to give the other exceptions a head start.
1319 */ 2025 */
1320 list_for_each_entry_safe(pe, next_pe, &pe_queue, list) 2026 if (pe_to_start_last)
1321 start_copy(pe); 2027 start_copy(pe_to_start_last);
1322 2028
1323 return r; 2029 return r;
1324} 2030}
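The rewritten __origin_write() queues the caller's bio on exactly one pending exception and deliberately kicks that copy off last, so the copies made for the other snapshots get a head start before the one the write is actually waiting on. A small userspace model of that ordering over a list of pending copies; the pending struct and start_copy() are stand-ins, not the kernel structures:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct pending {
	int id;
	bool started;
	bool holds_origin_bio;	/* the caller's write is queued on this one */
};

static void start_copy(struct pending *pe)	/* stand-in for kicking off kcopyd */
{
	printf("starting copy %d\n", pe->id);
}

/* Start every not-yet-started copy, saving the one the bio waits on
 * for last so the others get a head start. */
void start_pending_copies(struct pending *pes, size_t n)
{
	struct pending *start_last = NULL;
	size_t i;

	for (i = 0; i < n; i++) {
		if (pes[i].started)
			continue;
		pes[i].started = true;
		if (pes[i].holds_origin_bio)
			start_last = &pes[i];
		else
			start_copy(&pes[i]);
	}

	if (start_last)
		start_copy(start_last);
}

int main(void)
{
	struct pending pes[] = {
		{ 1, false, false }, { 2, false, true }, { 3, false, false },
	};

	start_pending_copies(pes, sizeof(pes) / sizeof(pes[0]));	/* 1, 3, then 2 */
	return 0;
}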
@@ -1334,13 +2040,48 @@ static int do_origin(struct dm_dev *origin, struct bio *bio)
1334 down_read(&_origins_lock); 2040 down_read(&_origins_lock);
1335 o = __lookup_origin(origin->bdev); 2041 o = __lookup_origin(origin->bdev);
1336 if (o) 2042 if (o)
1337 r = __origin_write(&o->snapshots, bio); 2043 r = __origin_write(&o->snapshots, bio->bi_sector, bio);
1338 up_read(&_origins_lock); 2044 up_read(&_origins_lock);
1339 2045
1340 return r; 2046 return r;
1341} 2047}
1342 2048
1343/* 2049/*
2050 * Trigger exceptions in all non-merging snapshots.
2051 *
2052 * The chunk size of the merging snapshot may be larger than the chunk
2053 * size of some other snapshot so we may need to reallocate multiple
2054 * chunks in other snapshots.
2055 *
2056 * We scan all the overlapping exceptions in the other snapshots.
2057 * Returns 1 if anything was reallocated and must be waited for,
2058 * otherwise returns 0.
2059 *
2060 * size must be a multiple of merging_snap's chunk_size.
2061 */
2062static int origin_write_extent(struct dm_snapshot *merging_snap,
2063 sector_t sector, unsigned size)
2064{
2065 int must_wait = 0;
2066 sector_t n;
2067 struct origin *o;
2068
2069 /*
2070 * The origin's __minimum_chunk_size() got stored in split_io
2071 * by snapshot_merge_resume().
2072 */
2073 down_read(&_origins_lock);
2074 o = __lookup_origin(merging_snap->origin->bdev);
2075 for (n = 0; n < size; n += merging_snap->ti->split_io)
2076 if (__origin_write(&o->snapshots, sector + n, NULL) ==
2077 DM_MAPIO_SUBMITTED)
2078 must_wait = 1;
2079 up_read(&_origins_lock);
2080
2081 return must_wait;
2082}
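origin_write_extent() walks the merged extent in steps of the merging snapshot's own chunk size (stashed in ti->split_io) and remembers whether any step had to schedule a copy, since other snapshots may use smaller chunks and need several exceptions per merged chunk. A userspace sketch of that stepping and aggregation; write_step() is a placeholder for __origin_write(..., NULL) and the sample values are made up:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Cover [sector, sector + size) in steps of the merging snapshot's
 * chunk size; the caller must wait iff any step scheduled a copy. */
bool write_extent_must_wait(sector_t sector, sector_t size, sector_t step,
			    bool (*write_step)(sector_t))
{
	bool must_wait = false;
	sector_t n;

	for (n = 0; n < size; n += step)
		if (write_step(sector + n))
			must_wait = true;

	return must_wait;
}

static bool fake_write_step(sector_t sector)
{
	return (sector / 8) % 2;	/* pretend every other chunk needs a copy */
}

int main(void)
{
	printf("must wait: %d\n",
	       write_extent_must_wait(0, 64, 8, fake_write_step));
	return 0;
}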
2083
2084/*
1344 * Origin: maps a linear range of a device, with hooks for snapshotting. 2085 * Origin: maps a linear range of a device, with hooks for snapshotting.
1345 */ 2086 */
1346 2087
@@ -1391,8 +2132,6 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
1391 return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; 2132 return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
1392} 2133}
1393 2134
1394#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
1395
1396/* 2135/*
1397 * Set the target "split_io" field to the minimum of all the snapshots' 2136 * Set the target "split_io" field to the minimum of all the snapshots'
1398 * chunk sizes. 2137 * chunk sizes.
@@ -1400,19 +2139,8 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
1400static void origin_resume(struct dm_target *ti) 2139static void origin_resume(struct dm_target *ti)
1401{ 2140{
1402 struct dm_dev *dev = ti->private; 2141 struct dm_dev *dev = ti->private;
1403 struct dm_snapshot *snap;
1404 struct origin *o;
1405 unsigned chunk_size = 0;
1406
1407 down_read(&_origins_lock);
1408 o = __lookup_origin(dev->bdev);
1409 if (o)
1410 list_for_each_entry (snap, &o->snapshots, list)
1411 chunk_size = min_not_zero(chunk_size,
1412 snap->store->chunk_size);
1413 up_read(&_origins_lock);
1414 2142
1415 ti->split_io = chunk_size; 2143 ti->split_io = get_origin_minimum_chunksize(dev->bdev);
1416} 2144}
1417 2145
1418static int origin_status(struct dm_target *ti, status_type_t type, char *result, 2146static int origin_status(struct dm_target *ti, status_type_t type, char *result,
@@ -1455,17 +2183,35 @@ static struct target_type origin_target = {
1455 2183
1456static struct target_type snapshot_target = { 2184static struct target_type snapshot_target = {
1457 .name = "snapshot", 2185 .name = "snapshot",
1458 .version = {1, 7, 0}, 2186 .version = {1, 9, 0},
1459 .module = THIS_MODULE, 2187 .module = THIS_MODULE,
1460 .ctr = snapshot_ctr, 2188 .ctr = snapshot_ctr,
1461 .dtr = snapshot_dtr, 2189 .dtr = snapshot_dtr,
1462 .map = snapshot_map, 2190 .map = snapshot_map,
1463 .end_io = snapshot_end_io, 2191 .end_io = snapshot_end_io,
2192 .postsuspend = snapshot_postsuspend,
2193 .preresume = snapshot_preresume,
1464 .resume = snapshot_resume, 2194 .resume = snapshot_resume,
1465 .status = snapshot_status, 2195 .status = snapshot_status,
1466 .iterate_devices = snapshot_iterate_devices, 2196 .iterate_devices = snapshot_iterate_devices,
1467}; 2197};
1468 2198
2199static struct target_type merge_target = {
2200 .name = dm_snapshot_merge_target_name,
2201 .version = {1, 0, 0},
2202 .module = THIS_MODULE,
2203 .ctr = snapshot_ctr,
2204 .dtr = snapshot_dtr,
2205 .map = snapshot_merge_map,
2206 .end_io = snapshot_end_io,
2207 .presuspend = snapshot_merge_presuspend,
2208 .postsuspend = snapshot_postsuspend,
2209 .preresume = snapshot_preresume,
2210 .resume = snapshot_merge_resume,
2211 .status = snapshot_status,
2212 .iterate_devices = snapshot_iterate_devices,
2213};
2214
1469static int __init dm_snapshot_init(void) 2215static int __init dm_snapshot_init(void)
1470{ 2216{
1471 int r; 2217 int r;
@@ -1477,7 +2223,7 @@ static int __init dm_snapshot_init(void)
1477 } 2223 }
1478 2224
1479 r = dm_register_target(&snapshot_target); 2225 r = dm_register_target(&snapshot_target);
1480 if (r) { 2226 if (r < 0) {
1481 DMERR("snapshot target register failed %d", r); 2227 DMERR("snapshot target register failed %d", r);
1482 goto bad_register_snapshot_target; 2228 goto bad_register_snapshot_target;
1483 } 2229 }
@@ -1485,34 +2231,40 @@ static int __init dm_snapshot_init(void)
1485 r = dm_register_target(&origin_target); 2231 r = dm_register_target(&origin_target);
1486 if (r < 0) { 2232 if (r < 0) {
1487 DMERR("Origin target register failed %d", r); 2233 DMERR("Origin target register failed %d", r);
1488 goto bad1; 2234 goto bad_register_origin_target;
2235 }
2236
2237 r = dm_register_target(&merge_target);
2238 if (r < 0) {
2239 DMERR("Merge target register failed %d", r);
2240 goto bad_register_merge_target;
1489 } 2241 }
1490 2242
1491 r = init_origin_hash(); 2243 r = init_origin_hash();
1492 if (r) { 2244 if (r) {
1493 DMERR("init_origin_hash failed."); 2245 DMERR("init_origin_hash failed.");
1494 goto bad2; 2246 goto bad_origin_hash;
1495 } 2247 }
1496 2248
1497 exception_cache = KMEM_CACHE(dm_snap_exception, 0); 2249 exception_cache = KMEM_CACHE(dm_exception, 0);
1498 if (!exception_cache) { 2250 if (!exception_cache) {
1499 DMERR("Couldn't create exception cache."); 2251 DMERR("Couldn't create exception cache.");
1500 r = -ENOMEM; 2252 r = -ENOMEM;
1501 goto bad3; 2253 goto bad_exception_cache;
1502 } 2254 }
1503 2255
1504 pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); 2256 pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
1505 if (!pending_cache) { 2257 if (!pending_cache) {
1506 DMERR("Couldn't create pending cache."); 2258 DMERR("Couldn't create pending cache.");
1507 r = -ENOMEM; 2259 r = -ENOMEM;
1508 goto bad4; 2260 goto bad_pending_cache;
1509 } 2261 }
1510 2262
1511 tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); 2263 tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0);
1512 if (!tracked_chunk_cache) { 2264 if (!tracked_chunk_cache) {
1513 DMERR("Couldn't create cache to track chunks in use."); 2265 DMERR("Couldn't create cache to track chunks in use.");
1514 r = -ENOMEM; 2266 r = -ENOMEM;
1515 goto bad5; 2267 goto bad_tracked_chunk_cache;
1516 } 2268 }
1517 2269
1518 ksnapd = create_singlethread_workqueue("ksnapd"); 2270 ksnapd = create_singlethread_workqueue("ksnapd");
@@ -1526,19 +2278,21 @@ static int __init dm_snapshot_init(void)
1526 2278
1527bad_pending_pool: 2279bad_pending_pool:
1528 kmem_cache_destroy(tracked_chunk_cache); 2280 kmem_cache_destroy(tracked_chunk_cache);
1529bad5: 2281bad_tracked_chunk_cache:
1530 kmem_cache_destroy(pending_cache); 2282 kmem_cache_destroy(pending_cache);
1531bad4: 2283bad_pending_cache:
1532 kmem_cache_destroy(exception_cache); 2284 kmem_cache_destroy(exception_cache);
1533bad3: 2285bad_exception_cache:
1534 exit_origin_hash(); 2286 exit_origin_hash();
1535bad2: 2287bad_origin_hash:
2288 dm_unregister_target(&merge_target);
2289bad_register_merge_target:
1536 dm_unregister_target(&origin_target); 2290 dm_unregister_target(&origin_target);
1537bad1: 2291bad_register_origin_target:
1538 dm_unregister_target(&snapshot_target); 2292 dm_unregister_target(&snapshot_target);
1539
1540bad_register_snapshot_target: 2293bad_register_snapshot_target:
1541 dm_exception_store_exit(); 2294 dm_exception_store_exit();
2295
1542 return r; 2296 return r;
1543} 2297}
1544 2298
@@ -1548,6 +2302,7 @@ static void __exit dm_snapshot_exit(void)
1548 2302
1549 dm_unregister_target(&snapshot_target); 2303 dm_unregister_target(&snapshot_target);
1550 dm_unregister_target(&origin_target); 2304 dm_unregister_target(&origin_target);
2305 dm_unregister_target(&merge_target);
1551 2306
1552 exit_origin_hash(); 2307 exit_origin_hash();
1553 kmem_cache_destroy(pending_cache); 2308 kmem_cache_destroy(pending_cache);
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index 4b045903a4e2..f53392df7b97 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -59,7 +59,7 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
59 59
60static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) 60static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
61{ 61{
62 sprintf(buf, "%d\n", dm_suspended(md)); 62 sprintf(buf, "%d\n", dm_suspended_md(md));
63 63
64 return strlen(buf); 64 return strlen(buf);
65} 65}
@@ -80,12 +80,20 @@ static struct sysfs_ops dm_sysfs_ops = {
80}; 80};
81 81
82/* 82/*
83 * The sysfs structure is embedded in md struct, nothing to do here
84 */
85static void dm_sysfs_release(struct kobject *kobj)
86{
87}
88
89/*
83 * dm kobject is embedded in mapped_device structure 90 * dm kobject is embedded in mapped_device structure
84 * no need to define release function here 91 * no need to define release function here
85 */ 92 */
86static struct kobj_type dm_ktype = { 93static struct kobj_type dm_ktype = {
87 .sysfs_ops = &dm_sysfs_ops, 94 .sysfs_ops = &dm_sysfs_ops,
88 .default_attrs = dm_attrs, 95 .default_attrs = dm_attrs,
96 .release = dm_sysfs_release
89}; 97};
90 98
91/* 99/*
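
The empty dm_sysfs_release() is presumably there to satisfy the kobject core, which warns when a kobject is freed through a ktype that has no release method; the memory itself belongs to the mapped_device that embeds the kobject and is released by dm's own reference counting. A kernel-style sketch of that arrangement, using hypothetical names (my_dev, my_dev_ktype), not the dm code:

#include <linux/kobject.h>

/* Hypothetical container: the kobject is embedded, so its memory is owned
 * and freed by whoever allocates struct my_dev, not by the kobject core. */
struct my_dev {
	struct kobject kobj;
	int value;
};

/* No-op release: it only states explicitly that freeing happens elsewhere,
 * which quiets the missing-release warning. */
static void my_dev_kobj_release(struct kobject *kobj)
{
}

static struct kobj_type my_dev_ktype = {
	.release = my_dev_kobj_release,
};

static int my_dev_register(struct my_dev *d, struct kobject *parent)
{
	return kobject_init_and_add(&d->kobj, &my_dev_ktype, parent, "my_dev");
}
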
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 91976e8fae5f..be625475cf6d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -238,6 +238,9 @@ void dm_table_destroy(struct dm_table *t)
238{ 238{
239 unsigned int i; 239 unsigned int i;
240 240
241 if (!t)
242 return;
243
241 while (atomic_read(&t->holders)) 244 while (atomic_read(&t->holders))
242 msleep(1); 245 msleep(1);
243 smp_mb(); 246 smp_mb();
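
Letting dm_table_destroy() accept NULL gives it kfree()-like semantics, so callers can hand it the result of a function that may legitimately return no table (as dm.c now does with dm_table_destroy(__unbind(md))) without an extra check. A minimal userspace sketch of the idiom, with invented names:

#include <stdio.h>
#include <stdlib.h>

struct table {
	unsigned int num_targets;
};

/* Destroy tolerates NULL, like kfree(), so destroy(maybe_get_table())
 * is always safe for the caller. */
static void table_destroy(struct table *t)
{
	if (!t)
		return;
	printf("freeing table with %u targets\n", t->num_targets);
	free(t);
}

static struct table *maybe_get_table(int have_one)
{
	struct table *t = NULL;

	if (have_one) {
		t = calloc(1, sizeof(*t));
		if (t)
			t->num_targets = 2;
	}
	return t;
}

int main(void)
{
	table_destroy(maybe_get_table(1));
	table_destroy(maybe_get_table(0));	/* NULL: harmless no-op */
	return 0;
}
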
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c
index 6f65883aef12..c7c555a8c7b2 100644
--- a/drivers/md/dm-uevent.c
+++ b/drivers/md/dm-uevent.c
@@ -139,14 +139,13 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj)
139 list_del_init(&event->elist); 139 list_del_init(&event->elist);
140 140
141 /* 141 /*
142 * Need to call dm_copy_name_and_uuid from here for now. 142 * When a device is being removed this copy fails and we
143 * Context of previous var adds and locking used for 143 * discard these unsent events.
144 * hash_cell not compatable.
145 */ 144 */
146 if (dm_copy_name_and_uuid(event->md, event->name, 145 if (dm_copy_name_and_uuid(event->md, event->name,
147 event->uuid)) { 146 event->uuid)) {
148 DMERR("%s: dm_copy_name_and_uuid() failed", 147 DMINFO("%s: skipping sending uevent for lost device",
149 __func__); 148 __func__);
150 goto uevent_free; 149 goto uevent_free;
151 } 150 }
152 151
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 724efc63904d..3167480b532c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -143,9 +143,19 @@ struct mapped_device {
143 int barrier_error; 143 int barrier_error;
144 144
145 /* 145 /*
146 * Protect barrier_error from concurrent endio processing
147 * in request-based dm.
148 */
149 spinlock_t barrier_error_lock;
150
151 /*
146 * Processing queue (flush/barriers) 152 * Processing queue (flush/barriers)
147 */ 153 */
148 struct workqueue_struct *wq; 154 struct workqueue_struct *wq;
155 struct work_struct barrier_work;
156
157 /* A pointer to the currently processing pre/post flush request */
158 struct request *flush_request;
149 159
150 /* 160 /*
151 * The current mapping. 161 * The current mapping.
@@ -178,9 +188,6 @@ struct mapped_device {
178 /* forced geometry settings */ 188 /* forced geometry settings */
179 struct hd_geometry geometry; 189 struct hd_geometry geometry;
180 190
181 /* marker of flush suspend for request-based dm */
182 struct request suspend_rq;
183
184 /* For saving the address of __make_request for request based dm */ 191 /* For saving the address of __make_request for request based dm */
185 make_request_fn *saved_make_request_fn; 192 make_request_fn *saved_make_request_fn;
186 193
@@ -275,6 +282,7 @@ static int (*_inits[])(void) __initdata = {
275 dm_target_init, 282 dm_target_init,
276 dm_linear_init, 283 dm_linear_init,
277 dm_stripe_init, 284 dm_stripe_init,
285 dm_io_init,
278 dm_kcopyd_init, 286 dm_kcopyd_init,
279 dm_interface_init, 287 dm_interface_init,
280}; 288};
@@ -284,6 +292,7 @@ static void (*_exits[])(void) = {
284 dm_target_exit, 292 dm_target_exit,
285 dm_linear_exit, 293 dm_linear_exit,
286 dm_stripe_exit, 294 dm_stripe_exit,
295 dm_io_exit,
287 dm_kcopyd_exit, 296 dm_kcopyd_exit,
288 dm_interface_exit, 297 dm_interface_exit,
289}; 298};
@@ -320,6 +329,11 @@ static void __exit dm_exit(void)
320/* 329/*
321 * Block device functions 330 * Block device functions
322 */ 331 */
332int dm_deleting_md(struct mapped_device *md)
333{
334 return test_bit(DMF_DELETING, &md->flags);
335}
336
323static int dm_blk_open(struct block_device *bdev, fmode_t mode) 337static int dm_blk_open(struct block_device *bdev, fmode_t mode)
324{ 338{
325 struct mapped_device *md; 339 struct mapped_device *md;
@@ -331,7 +345,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
331 goto out; 345 goto out;
332 346
333 if (test_bit(DMF_FREEING, &md->flags) || 347 if (test_bit(DMF_FREEING, &md->flags) ||
334 test_bit(DMF_DELETING, &md->flags)) { 348 dm_deleting_md(md)) {
335 md = NULL; 349 md = NULL;
336 goto out; 350 goto out;
337 } 351 }
@@ -388,7 +402,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
388 unsigned int cmd, unsigned long arg) 402 unsigned int cmd, unsigned long arg)
389{ 403{
390 struct mapped_device *md = bdev->bd_disk->private_data; 404 struct mapped_device *md = bdev->bd_disk->private_data;
391 struct dm_table *map = dm_get_table(md); 405 struct dm_table *map = dm_get_live_table(md);
392 struct dm_target *tgt; 406 struct dm_target *tgt;
393 int r = -ENOTTY; 407 int r = -ENOTTY;
394 408
@@ -401,7 +415,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
401 415
402 tgt = dm_table_get_target(map, 0); 416 tgt = dm_table_get_target(map, 0);
403 417
404 if (dm_suspended(md)) { 418 if (dm_suspended_md(md)) {
405 r = -EAGAIN; 419 r = -EAGAIN;
406 goto out; 420 goto out;
407 } 421 }
@@ -430,9 +444,10 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
430 mempool_free(tio, md->tio_pool); 444 mempool_free(tio, md->tio_pool);
431} 445}
432 446
433static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) 447static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
448 gfp_t gfp_mask)
434{ 449{
435 return mempool_alloc(md->tio_pool, GFP_ATOMIC); 450 return mempool_alloc(md->tio_pool, gfp_mask);
436} 451}
437 452
438static void free_rq_tio(struct dm_rq_target_io *tio) 453static void free_rq_tio(struct dm_rq_target_io *tio)
@@ -450,6 +465,12 @@ static void free_bio_info(struct dm_rq_clone_bio_info *info)
450 mempool_free(info, info->tio->md->io_pool); 465 mempool_free(info, info->tio->md->io_pool);
451} 466}
452 467
468static int md_in_flight(struct mapped_device *md)
469{
470 return atomic_read(&md->pending[READ]) +
471 atomic_read(&md->pending[WRITE]);
472}
473
453static void start_io_acct(struct dm_io *io) 474static void start_io_acct(struct dm_io *io)
454{ 475{
455 struct mapped_device *md = io->md; 476 struct mapped_device *md = io->md;
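
md_in_flight() simply sums the per-direction pending counters, which lets request-based dm track its own in-flight I/O instead of poking at the block layer's queue counters; rq_completed() below wakes md->wait once the sum drops to zero. A kernel-style sketch of that counting scheme with hypothetical names (counter and waitqueue initialisation omitted; the includes assume a 2.6.32-era tree):

#include <linux/wait.h>
#include <asm/atomic.h>

/* Hypothetical device: per-direction in-flight counters plus a waitqueue,
 * mirroring md->pending[] and md->wait. */
struct my_dev {
	atomic_t pending[2];		/* [READ] and [WRITE] */
	wait_queue_head_t wait;
};

static int my_in_flight(struct my_dev *d)
{
	return atomic_read(&d->pending[0]) + atomic_read(&d->pending[1]);
}

static void my_io_start(struct my_dev *d, int rw)
{
	atomic_inc(&d->pending[rw]);
}

static void my_io_done(struct my_dev *d, int rw)
{
	atomic_dec(&d->pending[rw]);
	if (!my_in_flight(d))		/* nudge anyone waiting to quiesce */
		wake_up(&d->wait);
}

static void my_wait_for_quiesce(struct my_dev *d)
{
	wait_event(d->wait, !my_in_flight(d));
}
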
@@ -512,7 +533,7 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
512 * function to access the md->map field, and make sure they call 533 * function to access the md->map field, and make sure they call
513 * dm_table_put() when finished. 534 * dm_table_put() when finished.
514 */ 535 */
515struct dm_table *dm_get_table(struct mapped_device *md) 536struct dm_table *dm_get_live_table(struct mapped_device *md)
516{ 537{
517 struct dm_table *t; 538 struct dm_table *t;
518 unsigned long flags; 539 unsigned long flags;
@@ -716,28 +737,38 @@ static void end_clone_bio(struct bio *clone, int error)
716 blk_update_request(tio->orig, 0, nr_bytes); 737 blk_update_request(tio->orig, 0, nr_bytes);
717} 738}
718 739
740static void store_barrier_error(struct mapped_device *md, int error)
741{
742 unsigned long flags;
743
744 spin_lock_irqsave(&md->barrier_error_lock, flags);
745 /*
746 * Basically, the first error is taken, but:
747 * -EOPNOTSUPP supersedes any I/O error.
748 * Requeue request supersedes any I/O error but -EOPNOTSUPP.
749 */
750 if (!md->barrier_error || error == -EOPNOTSUPP ||
751 (md->barrier_error != -EOPNOTSUPP &&
752 error == DM_ENDIO_REQUEUE))
753 md->barrier_error = error;
754 spin_unlock_irqrestore(&md->barrier_error_lock, flags);
755}
756
719/* 757/*
720 * Don't touch any member of the md after calling this function because 758 * Don't touch any member of the md after calling this function because
721 * the md may be freed in dm_put() at the end of this function. 759 * the md may be freed in dm_put() at the end of this function.
722 * Or do dm_get() before calling this function and dm_put() later. 760 * Or do dm_get() before calling this function and dm_put() later.
723 */ 761 */
724static void rq_completed(struct mapped_device *md, int run_queue) 762static void rq_completed(struct mapped_device *md, int rw, int run_queue)
725{ 763{
726 int wakeup_waiters = 0; 764 atomic_dec(&md->pending[rw]);
727 struct request_queue *q = md->queue;
728 unsigned long flags;
729
730 spin_lock_irqsave(q->queue_lock, flags);
731 if (!queue_in_flight(q))
732 wakeup_waiters = 1;
733 spin_unlock_irqrestore(q->queue_lock, flags);
734 765
735 /* nudge anyone waiting on suspend queue */ 766 /* nudge anyone waiting on suspend queue */
736 if (wakeup_waiters) 767 if (!md_in_flight(md))
737 wake_up(&md->wait); 768 wake_up(&md->wait);
738 769
739 if (run_queue) 770 if (run_queue)
740 blk_run_queue(q); 771 blk_run_queue(md->queue);
741 772
742 /* 773 /*
743 * dm_put() must be at the end of this function. See the comment above 774 * dm_put() must be at the end of this function. See the comment above
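
store_barrier_error() keeps a single error for the whole barrier and applies a fixed precedence: the first error recorded wins, except that -EOPNOTSUPP supersedes everything and a requeue result supersedes ordinary I/O errors but not -EOPNOTSUPP. The same rule written as a small pure function, using stand-in constants rather than the real DM_ENDIO_REQUEUE / -EOPNOTSUPP values:

#include <stdio.h>

#define MY_EOPNOTSUPP	(-95)	/* stand-in for -EOPNOTSUPP */
#define MY_REQUEUE	2	/* stand-in for DM_ENDIO_REQUEUE */

/* Returns the error to keep, given what is recorded so far and a new one.
 * Precedence: -EOPNOTSUPP > requeue > first ordinary error. */
static int merge_barrier_error(int cur, int next)
{
	if (!cur || next == MY_EOPNOTSUPP ||
	    (cur != MY_EOPNOTSUPP && next == MY_REQUEUE))
		return next;
	return cur;
}

int main(void)
{
	int e = 0;

	e = merge_barrier_error(e, -5);			/* first error (-EIO) kept */
	e = merge_barrier_error(e, MY_REQUEUE);		/* requeue supersedes -EIO */
	e = merge_barrier_error(e, -5);			/* later -EIO is ignored   */
	e = merge_barrier_error(e, MY_EOPNOTSUPP);	/* -EOPNOTSUPP wins        */
	printf("kept error: %d\n", e);			/* prints -95              */
	return 0;
}
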
@@ -753,6 +784,44 @@ static void free_rq_clone(struct request *clone)
753 free_rq_tio(tio); 784 free_rq_tio(tio);
754} 785}
755 786
787/*
788 * Complete the clone and the original request.
789 * Must be called without queue lock.
790 */
791static void dm_end_request(struct request *clone, int error)
792{
793 int rw = rq_data_dir(clone);
794 int run_queue = 1;
795 bool is_barrier = blk_barrier_rq(clone);
796 struct dm_rq_target_io *tio = clone->end_io_data;
797 struct mapped_device *md = tio->md;
798 struct request *rq = tio->orig;
799
800 if (blk_pc_request(rq) && !is_barrier) {
801 rq->errors = clone->errors;
802 rq->resid_len = clone->resid_len;
803
804 if (rq->sense)
805 /*
806 * We are using the sense buffer of the original
807 * request.
808 * So setting the length of the sense data is enough.
809 */
810 rq->sense_len = clone->sense_len;
811 }
812
813 free_rq_clone(clone);
814
815 if (unlikely(is_barrier)) {
816 if (unlikely(error))
817 store_barrier_error(md, error);
818 run_queue = 0;
819 } else
820 blk_end_request_all(rq, error);
821
822 rq_completed(md, rw, run_queue);
823}
824
756static void dm_unprep_request(struct request *rq) 825static void dm_unprep_request(struct request *rq)
757{ 826{
758 struct request *clone = rq->special; 827 struct request *clone = rq->special;
@@ -768,12 +837,23 @@ static void dm_unprep_request(struct request *rq)
768 */ 837 */
769void dm_requeue_unmapped_request(struct request *clone) 838void dm_requeue_unmapped_request(struct request *clone)
770{ 839{
840 int rw = rq_data_dir(clone);
771 struct dm_rq_target_io *tio = clone->end_io_data; 841 struct dm_rq_target_io *tio = clone->end_io_data;
772 struct mapped_device *md = tio->md; 842 struct mapped_device *md = tio->md;
773 struct request *rq = tio->orig; 843 struct request *rq = tio->orig;
774 struct request_queue *q = rq->q; 844 struct request_queue *q = rq->q;
775 unsigned long flags; 845 unsigned long flags;
776 846
847 if (unlikely(blk_barrier_rq(clone))) {
848 /*
849 * Barrier clones share an original request.
850 * Leave it to dm_end_request(), which handles this special
851 * case.
852 */
853 dm_end_request(clone, DM_ENDIO_REQUEUE);
854 return;
855 }
856
777 dm_unprep_request(rq); 857 dm_unprep_request(rq);
778 858
779 spin_lock_irqsave(q->queue_lock, flags); 859 spin_lock_irqsave(q->queue_lock, flags);
@@ -782,7 +862,7 @@ void dm_requeue_unmapped_request(struct request *clone)
782 blk_requeue_request(q, rq); 862 blk_requeue_request(q, rq);
783 spin_unlock_irqrestore(q->queue_lock, flags); 863 spin_unlock_irqrestore(q->queue_lock, flags);
784 864
785 rq_completed(md, 0); 865 rq_completed(md, rw, 0);
786} 866}
787EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 867EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
788 868
@@ -815,34 +895,28 @@ static void start_queue(struct request_queue *q)
815 spin_unlock_irqrestore(q->queue_lock, flags); 895 spin_unlock_irqrestore(q->queue_lock, flags);
816} 896}
817 897
818/* 898static void dm_done(struct request *clone, int error, bool mapped)
819 * Complete the clone and the original request.
820 * Must be called without queue lock.
821 */
822static void dm_end_request(struct request *clone, int error)
823{ 899{
900 int r = error;
824 struct dm_rq_target_io *tio = clone->end_io_data; 901 struct dm_rq_target_io *tio = clone->end_io_data;
825 struct mapped_device *md = tio->md; 902 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
826 struct request *rq = tio->orig;
827 903
828 if (blk_pc_request(rq)) { 904 if (mapped && rq_end_io)
829 rq->errors = clone->errors; 905 r = rq_end_io(tio->ti, clone, error, &tio->info);
830 rq->resid_len = clone->resid_len;
831 906
832 if (rq->sense) 907 if (r <= 0)
833 /* 908 /* The target wants to complete the I/O */
834 * We are using the sense buffer of the original 909 dm_end_request(clone, r);
835 * request. 910 else if (r == DM_ENDIO_INCOMPLETE)
836 * So setting the length of the sense data is enough. 911 /* The target will handle the I/O */
837 */ 912 return;
838 rq->sense_len = clone->sense_len; 913 else if (r == DM_ENDIO_REQUEUE)
914 /* The target wants to requeue the I/O */
915 dm_requeue_unmapped_request(clone);
916 else {
917 DMWARN("unimplemented target endio return value: %d", r);
918 BUG();
839 } 919 }
840
841 free_rq_clone(clone);
842
843 blk_end_request_all(rq, error);
844
845 rq_completed(md, 1);
846} 920}
847 921
848/* 922/*
@@ -850,27 +924,14 @@ static void dm_end_request(struct request *clone, int error)
850 */ 924 */
851static void dm_softirq_done(struct request *rq) 925static void dm_softirq_done(struct request *rq)
852{ 926{
927 bool mapped = true;
853 struct request *clone = rq->completion_data; 928 struct request *clone = rq->completion_data;
854 struct dm_rq_target_io *tio = clone->end_io_data; 929 struct dm_rq_target_io *tio = clone->end_io_data;
855 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
856 int error = tio->error;
857 930
858 if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) 931 if (rq->cmd_flags & REQ_FAILED)
859 error = rq_end_io(tio->ti, clone, error, &tio->info); 932 mapped = false;
860 933
861 if (error <= 0) 934 dm_done(clone, tio->error, mapped);
862 /* The target wants to complete the I/O */
863 dm_end_request(clone, error);
864 else if (error == DM_ENDIO_INCOMPLETE)
865 /* The target will handle the I/O */
866 return;
867 else if (error == DM_ENDIO_REQUEUE)
868 /* The target wants to requeue the I/O */
869 dm_requeue_unmapped_request(clone);
870 else {
871 DMWARN("unimplemented target endio return value: %d", error);
872 BUG();
873 }
874} 935}
875 936
876/* 937/*
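
dm_done() now holds the endio dispatch in one place: if the request was actually mapped, the target's rq_end_io hook gets the first say, and its return value follows the usual device-mapper convention (<= 0 completes the request, DM_ENDIO_INCOMPLETE leaves it with the target, DM_ENDIO_REQUEUE puts it back on the queue). A sketch of a target-side hook written against that convention; the target and its choice to retry transport errors are hypothetical:

#include <linux/device-mapper.h>
#include <linux/errno.h>

/*
 * Sketch of a request-based target's rq_end_io hook.  The return value is
 * what dm_done() dispatches on:
 *   <= 0                 device-mapper completes the original request
 *   DM_ENDIO_INCOMPLETE  the target keeps ownership and finishes it later
 *   DM_ENDIO_REQUEUE     device-mapper requeues the original request
 */
static int my_target_rq_end_io(struct dm_target *ti, struct request *clone,
			       int error, union map_info *map_context)
{
	if (error == -EREMOTEIO)	/* treat transport errors as retryable */
		return DM_ENDIO_REQUEUE;

	return error;			/* pass everything else through */
}
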
@@ -882,6 +943,19 @@ static void dm_complete_request(struct request *clone, int error)
882 struct dm_rq_target_io *tio = clone->end_io_data; 943 struct dm_rq_target_io *tio = clone->end_io_data;
883 struct request *rq = tio->orig; 944 struct request *rq = tio->orig;
884 945
946 if (unlikely(blk_barrier_rq(clone))) {
947 /*
948 * Barrier clones share an original request. So can't use
949 * softirq_done with the original.
950 * Pass the clone to dm_done() directly in this special case.
951 * It is safe (even if clone->q->queue_lock is held here)
952 * because there is no I/O dispatching during the completion
953 * of barrier clone.
954 */
955 dm_done(clone, error, true);
956 return;
957 }
958
885 tio->error = error; 959 tio->error = error;
886 rq->completion_data = clone; 960 rq->completion_data = clone;
887 blk_complete_request(rq); 961 blk_complete_request(rq);
@@ -898,6 +972,17 @@ void dm_kill_unmapped_request(struct request *clone, int error)
898 struct dm_rq_target_io *tio = clone->end_io_data; 972 struct dm_rq_target_io *tio = clone->end_io_data;
899 struct request *rq = tio->orig; 973 struct request *rq = tio->orig;
900 974
975 if (unlikely(blk_barrier_rq(clone))) {
976 /*
977 * Barrier clones share an original request.
978 * Leave it to dm_end_request(), which handles this special
979 * case.
980 */
981 BUG_ON(error > 0);
982 dm_end_request(clone, error);
983 return;
984 }
985
901 rq->cmd_flags |= REQ_FAILED; 986 rq->cmd_flags |= REQ_FAILED;
902 dm_complete_request(clone, error); 987 dm_complete_request(clone, error);
903} 988}
@@ -1214,7 +1299,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1214 struct clone_info ci; 1299 struct clone_info ci;
1215 int error = 0; 1300 int error = 0;
1216 1301
1217 ci.map = dm_get_table(md); 1302 ci.map = dm_get_live_table(md);
1218 if (unlikely(!ci.map)) { 1303 if (unlikely(!ci.map)) {
1219 if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) 1304 if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
1220 bio_io_error(bio); 1305 bio_io_error(bio);
@@ -1255,7 +1340,7 @@ static int dm_merge_bvec(struct request_queue *q,
1255 struct bio_vec *biovec) 1340 struct bio_vec *biovec)
1256{ 1341{
1257 struct mapped_device *md = q->queuedata; 1342 struct mapped_device *md = q->queuedata;
1258 struct dm_table *map = dm_get_table(md); 1343 struct dm_table *map = dm_get_live_table(md);
1259 struct dm_target *ti; 1344 struct dm_target *ti;
1260 sector_t max_sectors; 1345 sector_t max_sectors;
1261 int max_size = 0; 1346 int max_size = 0;
@@ -1352,11 +1437,6 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
1352{ 1437{
1353 struct mapped_device *md = q->queuedata; 1438 struct mapped_device *md = q->queuedata;
1354 1439
1355 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
1356 bio_endio(bio, -EOPNOTSUPP);
1357 return 0;
1358 }
1359
1360 return md->saved_make_request_fn(q, bio); /* call __make_request() */ 1440 return md->saved_make_request_fn(q, bio); /* call __make_request() */
1361} 1441}
1362 1442
@@ -1375,6 +1455,25 @@ static int dm_request(struct request_queue *q, struct bio *bio)
1375 return _dm_request(q, bio); 1455 return _dm_request(q, bio);
1376} 1456}
1377 1457
1458/*
1459 * Mark this request as flush request, so that dm_request_fn() can
1460 * recognize.
1461 */
1462static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq)
1463{
1464 rq->cmd_type = REQ_TYPE_LINUX_BLOCK;
1465 rq->cmd[0] = REQ_LB_OP_FLUSH;
1466}
1467
1468static bool dm_rq_is_flush_request(struct request *rq)
1469{
1470 if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK &&
1471 rq->cmd[0] == REQ_LB_OP_FLUSH)
1472 return true;
1473 else
1474 return false;
1475}
1476
1378void dm_dispatch_request(struct request *rq) 1477void dm_dispatch_request(struct request *rq)
1379{ 1478{
1380 int r; 1479 int r;
@@ -1420,25 +1519,54 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1420static int setup_clone(struct request *clone, struct request *rq, 1519static int setup_clone(struct request *clone, struct request *rq,
1421 struct dm_rq_target_io *tio) 1520 struct dm_rq_target_io *tio)
1422{ 1521{
1423 int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1522 int r;
1424 dm_rq_bio_constructor, tio);
1425 1523
1426 if (r) 1524 if (dm_rq_is_flush_request(rq)) {
1427 return r; 1525 blk_rq_init(NULL, clone);
1526 clone->cmd_type = REQ_TYPE_FS;
1527 clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
1528 } else {
1529 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1530 dm_rq_bio_constructor, tio);
1531 if (r)
1532 return r;
1533
1534 clone->cmd = rq->cmd;
1535 clone->cmd_len = rq->cmd_len;
1536 clone->sense = rq->sense;
1537 clone->buffer = rq->buffer;
1538 }
1428 1539
1429 clone->cmd = rq->cmd;
1430 clone->cmd_len = rq->cmd_len;
1431 clone->sense = rq->sense;
1432 clone->buffer = rq->buffer;
1433 clone->end_io = end_clone_request; 1540 clone->end_io = end_clone_request;
1434 clone->end_io_data = tio; 1541 clone->end_io_data = tio;
1435 1542
1436 return 0; 1543 return 0;
1437} 1544}
1438 1545
1439static int dm_rq_flush_suspending(struct mapped_device *md) 1546static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1547 gfp_t gfp_mask)
1440{ 1548{
1441 return !md->suspend_rq.special; 1549 struct request *clone;
1550 struct dm_rq_target_io *tio;
1551
1552 tio = alloc_rq_tio(md, gfp_mask);
1553 if (!tio)
1554 return NULL;
1555
1556 tio->md = md;
1557 tio->ti = NULL;
1558 tio->orig = rq;
1559 tio->error = 0;
1560 memset(&tio->info, 0, sizeof(tio->info));
1561
1562 clone = &tio->clone;
1563 if (setup_clone(clone, rq, tio)) {
1564 /* -ENOMEM */
1565 free_rq_tio(tio);
1566 return NULL;
1567 }
1568
1569 return clone;
1442} 1570}
1443 1571
1444/* 1572/*
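
clone_rq() takes the allocation mask from its caller because the two call sites run in different contexts: dm_prep_fn() below passes GFP_ATOMIC since it is invoked with the queue lock held and must defer on failure, while the barrier path later passes GFP_NOIO from process context, where the allocation may sleep rather than fail. A kernel-style sketch of the pattern with a hypothetical allocator:

#include <linux/gfp.h>
#include <linux/slab.h>

struct my_clone {
	int data;
};

/* The caller picks the allocation mask to match its context. */
static struct my_clone *my_clone_alloc(gfp_t gfp_mask)
{
	return kzalloc(sizeof(struct my_clone), gfp_mask);
}

/* Atomic context (e.g. a prep_rq_fn called under the queue lock):
 * GFP_ATOMIC may fail, so the caller must be prepared to defer. */
static struct my_clone *my_clone_from_atomic(void)
{
	return my_clone_alloc(GFP_ATOMIC);
}

/* Process context (e.g. a workqueue handler): GFP_NOIO may sleep and
 * reclaim, but will not recurse into the I/O path being serviced. */
static struct my_clone *my_clone_from_worker(void)
{
	return my_clone_alloc(GFP_NOIO);
}
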
@@ -1447,39 +1575,19 @@ static int dm_rq_flush_suspending(struct mapped_device *md)
1447static int dm_prep_fn(struct request_queue *q, struct request *rq) 1575static int dm_prep_fn(struct request_queue *q, struct request *rq)
1448{ 1576{
1449 struct mapped_device *md = q->queuedata; 1577 struct mapped_device *md = q->queuedata;
1450 struct dm_rq_target_io *tio;
1451 struct request *clone; 1578 struct request *clone;
1452 1579
1453 if (unlikely(rq == &md->suspend_rq)) { 1580 if (unlikely(dm_rq_is_flush_request(rq)))
1454 if (dm_rq_flush_suspending(md)) 1581 return BLKPREP_OK;
1455 return BLKPREP_OK;
1456 else
1457 /* The flush suspend was interrupted */
1458 return BLKPREP_KILL;
1459 }
1460 1582
1461 if (unlikely(rq->special)) { 1583 if (unlikely(rq->special)) {
1462 DMWARN("Already has something in rq->special."); 1584 DMWARN("Already has something in rq->special.");
1463 return BLKPREP_KILL; 1585 return BLKPREP_KILL;
1464 } 1586 }
1465 1587
1466 tio = alloc_rq_tio(md); /* Only one for each original request */ 1588 clone = clone_rq(rq, md, GFP_ATOMIC);
1467 if (!tio) 1589 if (!clone)
1468 /* -ENOMEM */
1469 return BLKPREP_DEFER;
1470
1471 tio->md = md;
1472 tio->ti = NULL;
1473 tio->orig = rq;
1474 tio->error = 0;
1475 memset(&tio->info, 0, sizeof(tio->info));
1476
1477 clone = &tio->clone;
1478 if (setup_clone(clone, rq, tio)) {
1479 /* -ENOMEM */
1480 free_rq_tio(tio);
1481 return BLKPREP_DEFER; 1590 return BLKPREP_DEFER;
1482 }
1483 1591
1484 rq->special = clone; 1592 rq->special = clone;
1485 rq->cmd_flags |= REQ_DONTPREP; 1593 rq->cmd_flags |= REQ_DONTPREP;
@@ -1487,11 +1595,10 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
1487 return BLKPREP_OK; 1595 return BLKPREP_OK;
1488} 1596}
1489 1597
1490static void map_request(struct dm_target *ti, struct request *rq, 1598static void map_request(struct dm_target *ti, struct request *clone,
1491 struct mapped_device *md) 1599 struct mapped_device *md)
1492{ 1600{
1493 int r; 1601 int r;
1494 struct request *clone = rq->special;
1495 struct dm_rq_target_io *tio = clone->end_io_data; 1602 struct dm_rq_target_io *tio = clone->end_io_data;
1496 1603
1497 /* 1604 /*
@@ -1511,6 +1618,8 @@ static void map_request(struct dm_target *ti, struct request *rq,
1511 break; 1618 break;
1512 case DM_MAPIO_REMAPPED: 1619 case DM_MAPIO_REMAPPED:
1513 /* The target has remapped the I/O so dispatch it */ 1620 /* The target has remapped the I/O so dispatch it */
1621 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1622 blk_rq_pos(tio->orig));
1514 dm_dispatch_request(clone); 1623 dm_dispatch_request(clone);
1515 break; 1624 break;
1516 case DM_MAPIO_REQUEUE: 1625 case DM_MAPIO_REQUEUE:
@@ -1536,29 +1645,26 @@ static void map_request(struct dm_target *ti, struct request *rq,
1536static void dm_request_fn(struct request_queue *q) 1645static void dm_request_fn(struct request_queue *q)
1537{ 1646{
1538 struct mapped_device *md = q->queuedata; 1647 struct mapped_device *md = q->queuedata;
1539 struct dm_table *map = dm_get_table(md); 1648 struct dm_table *map = dm_get_live_table(md);
1540 struct dm_target *ti; 1649 struct dm_target *ti;
1541 struct request *rq; 1650 struct request *rq, *clone;
1542 1651
1543 /* 1652 /*
1544 * For noflush suspend, check blk_queue_stopped() to immediately 1653 * For suspend, check blk_queue_stopped() and increment
1545 * quit I/O dispatching. 1654 * ->pending within a single queue_lock not to increment the
1655 * number of in-flight I/Os after the queue is stopped in
1656 * dm_suspend().
1546 */ 1657 */
1547 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { 1658 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
1548 rq = blk_peek_request(q); 1659 rq = blk_peek_request(q);
1549 if (!rq) 1660 if (!rq)
1550 goto plug_and_out; 1661 goto plug_and_out;
1551 1662
1552 if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */ 1663 if (unlikely(dm_rq_is_flush_request(rq))) {
1553 if (queue_in_flight(q)) 1664 BUG_ON(md->flush_request);
1554 /* Not quiet yet. Wait more */ 1665 md->flush_request = rq;
1555 goto plug_and_out;
1556
1557 /* This device should be quiet now */
1558 __stop_queue(q);
1559 blk_start_request(rq); 1666 blk_start_request(rq);
1560 __blk_end_request_all(rq, 0); 1667 queue_work(md->wq, &md->barrier_work);
1561 wake_up(&md->wait);
1562 goto out; 1668 goto out;
1563 } 1669 }
1564 1670
@@ -1567,8 +1673,11 @@ static void dm_request_fn(struct request_queue *q)
1567 goto plug_and_out; 1673 goto plug_and_out;
1568 1674
1569 blk_start_request(rq); 1675 blk_start_request(rq);
1676 clone = rq->special;
1677 atomic_inc(&md->pending[rq_data_dir(clone)]);
1678
1570 spin_unlock(q->queue_lock); 1679 spin_unlock(q->queue_lock);
1571 map_request(ti, rq, md); 1680 map_request(ti, clone, md);
1572 spin_lock_irq(q->queue_lock); 1681 spin_lock_irq(q->queue_lock);
1573 } 1682 }
1574 1683
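
The per-direction ->pending counter is now bumped inside the same queue_lock section that checked blk_queue_stopped(); incrementing after the unlock would leave a window in which dm_suspend() stops the queue, sees no in-flight I/O and declares the device quiesced while a request is still about to be mapped. A self-contained userspace toy of that ordering, with a mutex standing in for the queue lock and a polling loop standing in for the real waitqueue:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int stopped;
static int in_flight;

/* The "stopped" check and the in-flight increment sit under one lock, so a
 * suspender that stops the queue and then sees in_flight == 0 can trust
 * that nothing is being dispatched behind its back. */
static int try_dispatch(void)
{
	int ok = 0;

	pthread_mutex_lock(&lock);
	if (!stopped) {
		in_flight++;		/* accounted before the lock drops */
		ok = 1;
	}
	pthread_mutex_unlock(&lock);

	if (ok) {
		/* ... map and issue the request outside the lock ... */
		pthread_mutex_lock(&lock);
		in_flight--;		/* toy: completion happens inline */
		pthread_mutex_unlock(&lock);
	}
	return ok;
}

static void suspend(void)
{
	pthread_mutex_lock(&lock);
	stopped = 1;
	pthread_mutex_unlock(&lock);

	for (;;) {			/* poll until quiesced */
		pthread_mutex_lock(&lock);
		if (!in_flight) {
			pthread_mutex_unlock(&lock);
			return;
		}
		pthread_mutex_unlock(&lock);
	}
}

int main(void)
{
	try_dispatch();
	suspend();
	printf("dispatch after stop: %d\n", try_dispatch());	/* prints 0 */
	return 0;
}
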
@@ -1595,7 +1704,7 @@ static int dm_lld_busy(struct request_queue *q)
1595{ 1704{
1596 int r; 1705 int r;
1597 struct mapped_device *md = q->queuedata; 1706 struct mapped_device *md = q->queuedata;
1598 struct dm_table *map = dm_get_table(md); 1707 struct dm_table *map = dm_get_live_table(md);
1599 1708
1600 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1709 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1601 r = 1; 1710 r = 1;
@@ -1610,7 +1719,7 @@ static int dm_lld_busy(struct request_queue *q)
1610static void dm_unplug_all(struct request_queue *q) 1719static void dm_unplug_all(struct request_queue *q)
1611{ 1720{
1612 struct mapped_device *md = q->queuedata; 1721 struct mapped_device *md = q->queuedata;
1613 struct dm_table *map = dm_get_table(md); 1722 struct dm_table *map = dm_get_live_table(md);
1614 1723
1615 if (map) { 1724 if (map) {
1616 if (dm_request_based(md)) 1725 if (dm_request_based(md))
@@ -1628,7 +1737,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
1628 struct dm_table *map; 1737 struct dm_table *map;
1629 1738
1630 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1739 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1631 map = dm_get_table(md); 1740 map = dm_get_live_table(md);
1632 if (map) { 1741 if (map) {
1633 /* 1742 /*
1634 * Request-based dm cares about only own queue for 1743 * Request-based dm cares about only own queue for
@@ -1725,6 +1834,7 @@ out:
1725static const struct block_device_operations dm_blk_dops; 1834static const struct block_device_operations dm_blk_dops;
1726 1835
1727static void dm_wq_work(struct work_struct *work); 1836static void dm_wq_work(struct work_struct *work);
1837static void dm_rq_barrier_work(struct work_struct *work);
1728 1838
1729/* 1839/*
1730 * Allocate and initialise a blank device with a given minor. 1840 * Allocate and initialise a blank device with a given minor.
@@ -1754,6 +1864,7 @@ static struct mapped_device *alloc_dev(int minor)
1754 init_rwsem(&md->io_lock); 1864 init_rwsem(&md->io_lock);
1755 mutex_init(&md->suspend_lock); 1865 mutex_init(&md->suspend_lock);
1756 spin_lock_init(&md->deferred_lock); 1866 spin_lock_init(&md->deferred_lock);
1867 spin_lock_init(&md->barrier_error_lock);
1757 rwlock_init(&md->map_lock); 1868 rwlock_init(&md->map_lock);
1758 atomic_set(&md->holders, 1); 1869 atomic_set(&md->holders, 1);
1759 atomic_set(&md->open_count, 0); 1870 atomic_set(&md->open_count, 0);
@@ -1788,6 +1899,8 @@ static struct mapped_device *alloc_dev(int minor)
1788 blk_queue_softirq_done(md->queue, dm_softirq_done); 1899 blk_queue_softirq_done(md->queue, dm_softirq_done);
1789 blk_queue_prep_rq(md->queue, dm_prep_fn); 1900 blk_queue_prep_rq(md->queue, dm_prep_fn);
1790 blk_queue_lld_busy(md->queue, dm_lld_busy); 1901 blk_queue_lld_busy(md->queue, dm_lld_busy);
1902 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH,
1903 dm_rq_prepare_flush);
1791 1904
1792 md->disk = alloc_disk(1); 1905 md->disk = alloc_disk(1);
1793 if (!md->disk) 1906 if (!md->disk)
@@ -1797,6 +1910,7 @@ static struct mapped_device *alloc_dev(int minor)
1797 atomic_set(&md->pending[1], 0); 1910 atomic_set(&md->pending[1], 0);
1798 init_waitqueue_head(&md->wait); 1911 init_waitqueue_head(&md->wait);
1799 INIT_WORK(&md->work, dm_wq_work); 1912 INIT_WORK(&md->work, dm_wq_work);
1913 INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
1800 init_waitqueue_head(&md->eventq); 1914 init_waitqueue_head(&md->eventq);
1801 1915
1802 md->disk->major = _major; 1916 md->disk->major = _major;
@@ -1921,9 +2035,13 @@ static void __set_size(struct mapped_device *md, sector_t size)
1921 mutex_unlock(&md->bdev->bd_inode->i_mutex); 2035 mutex_unlock(&md->bdev->bd_inode->i_mutex);
1922} 2036}
1923 2037
1924static int __bind(struct mapped_device *md, struct dm_table *t, 2038/*
1925 struct queue_limits *limits) 2039 * Returns old map, which caller must destroy.
2040 */
2041static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2042 struct queue_limits *limits)
1926{ 2043{
2044 struct dm_table *old_map;
1927 struct request_queue *q = md->queue; 2045 struct request_queue *q = md->queue;
1928 sector_t size; 2046 sector_t size;
1929 unsigned long flags; 2047 unsigned long flags;
@@ -1938,11 +2056,6 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
1938 2056
1939 __set_size(md, size); 2057 __set_size(md, size);
1940 2058
1941 if (!size) {
1942 dm_table_destroy(t);
1943 return 0;
1944 }
1945
1946 dm_table_event_callback(t, event_callback, md); 2059 dm_table_event_callback(t, event_callback, md);
1947 2060
1948 /* 2061 /*
@@ -1958,26 +2071,31 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
1958 __bind_mempools(md, t); 2071 __bind_mempools(md, t);
1959 2072
1960 write_lock_irqsave(&md->map_lock, flags); 2073 write_lock_irqsave(&md->map_lock, flags);
2074 old_map = md->map;
1961 md->map = t; 2075 md->map = t;
1962 dm_table_set_restrictions(t, q, limits); 2076 dm_table_set_restrictions(t, q, limits);
1963 write_unlock_irqrestore(&md->map_lock, flags); 2077 write_unlock_irqrestore(&md->map_lock, flags);
1964 2078
1965 return 0; 2079 return old_map;
1966} 2080}
1967 2081
1968static void __unbind(struct mapped_device *md) 2082/*
2083 * Returns unbound table for the caller to free.
2084 */
2085static struct dm_table *__unbind(struct mapped_device *md)
1969{ 2086{
1970 struct dm_table *map = md->map; 2087 struct dm_table *map = md->map;
1971 unsigned long flags; 2088 unsigned long flags;
1972 2089
1973 if (!map) 2090 if (!map)
1974 return; 2091 return NULL;
1975 2092
1976 dm_table_event_callback(map, NULL, NULL); 2093 dm_table_event_callback(map, NULL, NULL);
1977 write_lock_irqsave(&md->map_lock, flags); 2094 write_lock_irqsave(&md->map_lock, flags);
1978 md->map = NULL; 2095 md->map = NULL;
1979 write_unlock_irqrestore(&md->map_lock, flags); 2096 write_unlock_irqrestore(&md->map_lock, flags);
1980 dm_table_destroy(map); 2097
2098 return map;
1981} 2099}
1982 2100
1983/* 2101/*
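
__bind() and __unbind() now hand the replaced table back instead of destroying it themselves, so the caller decides when to run dm_table_destroy(): dm_put() just below does it via dm_table_destroy(__unbind(md)), and dm_swap_table() further down returns the old table to its caller. A minimal userspace sketch of the swap-and-return shape, with invented names and a mutex standing in for md->map_lock:

#include <pthread.h>
#include <stdlib.h>

struct table { int dummy; };

struct dev {
	pthread_mutex_t map_lock;
	struct table *map;
};

/* Swap the new table in under the lock and return the old one; teardown is
 * the caller's job and happens after the lock is dropped. */
static struct table *bind_table(struct dev *d, struct table *t)
{
	struct table *old;

	pthread_mutex_lock(&d->map_lock);
	old = d->map;
	d->map = t;
	pthread_mutex_unlock(&d->map_lock);

	return old;
}

/* NULL-tolerant destroy, like the dm helper. */
static void table_destroy(struct table *t)
{
	free(t);
}

int main(void)
{
	struct dev d = { PTHREAD_MUTEX_INITIALIZER, NULL };

	table_destroy(bind_table(&d, calloc(1, sizeof(struct table))));	/* old was NULL */
	table_destroy(bind_table(&d, NULL));	/* "unbind": frees the bound table */
	return 0;
}
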
@@ -2059,18 +2177,18 @@ void dm_put(struct mapped_device *md)
2059 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2177 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2060 2178
2061 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { 2179 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
2062 map = dm_get_table(md); 2180 map = dm_get_live_table(md);
2063 idr_replace(&_minor_idr, MINOR_ALLOCED, 2181 idr_replace(&_minor_idr, MINOR_ALLOCED,
2064 MINOR(disk_devt(dm_disk(md)))); 2182 MINOR(disk_devt(dm_disk(md))));
2065 set_bit(DMF_FREEING, &md->flags); 2183 set_bit(DMF_FREEING, &md->flags);
2066 spin_unlock(&_minor_lock); 2184 spin_unlock(&_minor_lock);
2067 if (!dm_suspended(md)) { 2185 if (!dm_suspended_md(md)) {
2068 dm_table_presuspend_targets(map); 2186 dm_table_presuspend_targets(map);
2069 dm_table_postsuspend_targets(map); 2187 dm_table_postsuspend_targets(map);
2070 } 2188 }
2071 dm_sysfs_exit(md); 2189 dm_sysfs_exit(md);
2072 dm_table_put(map); 2190 dm_table_put(map);
2073 __unbind(md); 2191 dm_table_destroy(__unbind(md));
2074 free_dev(md); 2192 free_dev(md);
2075 } 2193 }
2076} 2194}
@@ -2080,8 +2198,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2080{ 2198{
2081 int r = 0; 2199 int r = 0;
2082 DECLARE_WAITQUEUE(wait, current); 2200 DECLARE_WAITQUEUE(wait, current);
2083 struct request_queue *q = md->queue;
2084 unsigned long flags;
2085 2201
2086 dm_unplug_all(md->queue); 2202 dm_unplug_all(md->queue);
2087 2203
@@ -2091,15 +2207,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2091 set_current_state(interruptible); 2207 set_current_state(interruptible);
2092 2208
2093 smp_mb(); 2209 smp_mb();
2094 if (dm_request_based(md)) { 2210 if (!md_in_flight(md))
2095 spin_lock_irqsave(q->queue_lock, flags);
2096 if (!queue_in_flight(q) && blk_queue_stopped(q)) {
2097 spin_unlock_irqrestore(q->queue_lock, flags);
2098 break;
2099 }
2100 spin_unlock_irqrestore(q->queue_lock, flags);
2101 } else if (!atomic_read(&md->pending[0]) &&
2102 !atomic_read(&md->pending[1]))
2103 break; 2211 break;
2104 2212
2105 if (interruptible == TASK_INTERRUPTIBLE && 2213 if (interruptible == TASK_INTERRUPTIBLE &&
@@ -2194,98 +2302,106 @@ static void dm_queue_flush(struct mapped_device *md)
2194 queue_work(md->wq, &md->work); 2302 queue_work(md->wq, &md->work);
2195} 2303}
2196 2304
2197/* 2305static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
2198 * Swap in a new table (destroying old one).
2199 */
2200int dm_swap_table(struct mapped_device *md, struct dm_table *table)
2201{ 2306{
2202 struct queue_limits limits; 2307 struct dm_rq_target_io *tio = clone->end_io_data;
2203 int r = -EINVAL;
2204 2308
2205 mutex_lock(&md->suspend_lock); 2309 tio->info.flush_request = flush_nr;
2310}
2206 2311
2207 /* device must be suspended */ 2312/* Issue barrier requests to targets and wait for their completion. */
2208 if (!dm_suspended(md)) 2313static int dm_rq_barrier(struct mapped_device *md)
2209 goto out; 2314{
2315 int i, j;
2316 struct dm_table *map = dm_get_live_table(md);
2317 unsigned num_targets = dm_table_get_num_targets(map);
2318 struct dm_target *ti;
2319 struct request *clone;
2210 2320
2211 r = dm_calculate_queue_limits(table, &limits); 2321 md->barrier_error = 0;
2212 if (r)
2213 goto out;
2214 2322
2215 /* cannot change the device type, once a table is bound */ 2323 for (i = 0; i < num_targets; i++) {
2216 if (md->map && 2324 ti = dm_table_get_target(map, i);
2217 (dm_table_get_type(md->map) != dm_table_get_type(table))) { 2325 for (j = 0; j < ti->num_flush_requests; j++) {
2218 DMWARN("can't change the device type after a table is bound"); 2326 clone = clone_rq(md->flush_request, md, GFP_NOIO);
2219 goto out; 2327 dm_rq_set_flush_nr(clone, j);
2328 atomic_inc(&md->pending[rq_data_dir(clone)]);
2329 map_request(ti, clone, md);
2330 }
2220 } 2331 }
2221 2332
2222 __unbind(md); 2333 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2223 r = __bind(md, table, &limits); 2334 dm_table_put(map);
2224
2225out:
2226 mutex_unlock(&md->suspend_lock);
2227 return r;
2228}
2229 2335
2230static void dm_rq_invalidate_suspend_marker(struct mapped_device *md) 2336 return md->barrier_error;
2231{
2232 md->suspend_rq.special = (void *)0x1;
2233} 2337}
2234 2338
2235static void dm_rq_abort_suspend(struct mapped_device *md, int noflush) 2339static void dm_rq_barrier_work(struct work_struct *work)
2236{ 2340{
2341 int error;
2342 struct mapped_device *md = container_of(work, struct mapped_device,
2343 barrier_work);
2237 struct request_queue *q = md->queue; 2344 struct request_queue *q = md->queue;
2345 struct request *rq;
2238 unsigned long flags; 2346 unsigned long flags;
2239 2347
2240 spin_lock_irqsave(q->queue_lock, flags); 2348 /*
2241 if (!noflush) 2349 * Hold the md reference here and leave it at the last part so that
2242 dm_rq_invalidate_suspend_marker(md); 2350 * the md can't be deleted by device opener when the barrier request
2243 __start_queue(q); 2351 * completes.
2244 spin_unlock_irqrestore(q->queue_lock, flags); 2352 */
2245} 2353 dm_get(md);
2246 2354
2247static void dm_rq_start_suspend(struct mapped_device *md, int noflush) 2355 error = dm_rq_barrier(md);
2248{
2249 struct request *rq = &md->suspend_rq;
2250 struct request_queue *q = md->queue;
2251 2356
2252 if (noflush) 2357 rq = md->flush_request;
2253 stop_queue(q); 2358 md->flush_request = NULL;
2254 else { 2359
2255 blk_rq_init(q, rq); 2360 if (error == DM_ENDIO_REQUEUE) {
2256 blk_insert_request(q, rq, 0, NULL); 2361 spin_lock_irqsave(q->queue_lock, flags);
2257 } 2362 blk_requeue_request(q, rq);
2363 spin_unlock_irqrestore(q->queue_lock, flags);
2364 } else
2365 blk_end_request_all(rq, error);
2366
2367 blk_run_queue(q);
2368
2369 dm_put(md);
2258} 2370}
2259 2371
2260static int dm_rq_suspend_available(struct mapped_device *md, int noflush) 2372/*
2373 * Swap in a new table, returning the old one for the caller to destroy.
2374 */
2375struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2261{ 2376{
2262 int r = 1; 2377 struct dm_table *map = ERR_PTR(-EINVAL);
2263 struct request *rq = &md->suspend_rq; 2378 struct queue_limits limits;
2264 struct request_queue *q = md->queue; 2379 int r;
2265 unsigned long flags;
2266 2380
2267 if (noflush) 2381 mutex_lock(&md->suspend_lock);
2268 return r;
2269 2382
2270 /* The marker must be protected by queue lock if it is in use */ 2383 /* device must be suspended */
2271 spin_lock_irqsave(q->queue_lock, flags); 2384 if (!dm_suspended_md(md))
2272 if (unlikely(rq->ref_count)) { 2385 goto out;
2273 /* 2386
2274 * This can happen, when the previous flush suspend was 2387 r = dm_calculate_queue_limits(table, &limits);
2275 * interrupted, the marker is still in the queue and 2388 if (r) {
2276 * this flush suspend has been invoked, because we don't 2389 map = ERR_PTR(r);
2277 * remove the marker at the time of suspend interruption. 2390 goto out;
2278 * We have only one marker per mapped_device, so we can't
2279 * start another flush suspend while it is in use.
2280 */
2281 BUG_ON(!rq->special); /* The marker should be invalidated */
2282 DMWARN("Invalidating the previous flush suspend is still in"
2283 " progress. Please retry later.");
2284 r = 0;
2285 } 2391 }
2286 spin_unlock_irqrestore(q->queue_lock, flags);
2287 2392
2288 return r; 2393 /* cannot change the device type, once a table is bound */
2394 if (md->map &&
2395 (dm_table_get_type(md->map) != dm_table_get_type(table))) {
2396 DMWARN("can't change the device type after a table is bound");
2397 goto out;
2398 }
2399
2400 map = __bind(md, table, &limits);
2401
2402out:
2403 mutex_unlock(&md->suspend_lock);
2404 return map;
2289} 2405}
2290 2406
2291/* 2407/*
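
dm_swap_table() now returns a pointer in every case: the old table on success, or an ERR_PTR()-encoded errno on failure, so one return value carries both outcomes and the caller sorts them out with IS_ERR()/PTR_ERR(). A kernel-style sketch of that convention with a hypothetical constructor, not the dm code itself:

#include <linux/err.h>
#include <linux/slab.h>

struct thing {
	int id;
};

/* Return a valid pointer on success or ERR_PTR(-E...) on failure; NULL is
 * deliberately not used, so "error" stays distinguishable from "nothing". */
static struct thing *thing_create(int id)
{
	struct thing *t;

	if (id < 0)
		return ERR_PTR(-EINVAL);

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		return ERR_PTR(-ENOMEM);

	t->id = id;
	return t;
}

static int use_thing(int id)
{
	struct thing *t = thing_create(id);

	if (IS_ERR(t))
		return PTR_ERR(t);	/* decode the errno for the caller */

	/* ... use t ... */
	kfree(t);
	return 0;
}
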
@@ -2330,49 +2446,11 @@ static void unlock_fs(struct mapped_device *md)
2330/* 2446/*
2331 * Suspend mechanism in request-based dm. 2447 * Suspend mechanism in request-based dm.
2332 * 2448 *
2333 * After the suspend starts, further incoming requests are kept in 2449 * 1. Flush all I/Os by lock_fs() if needed.
2334 * the request_queue and deferred. 2450 * 2. Stop dispatching any I/O by stopping the request_queue.
2335 * Remaining requests in the request_queue at the start of suspend are flushed 2451 * 3. Wait for all in-flight I/Os to be completed or requeued.
2336 * if it is flush suspend.
2337 * The suspend completes when the following conditions have been satisfied,
2338 * so wait for it:
2339 * 1. q->in_flight is 0 (which means no in_flight request)
2340 * 2. queue has been stopped (which means no request dispatching)
2341 *
2342 * 2452 *
2343 * Noflush suspend 2453 * To abort suspend, start the request_queue.
2344 * ---------------
2345 * Noflush suspend doesn't need to dispatch remaining requests.
2346 * So stop the queue immediately. Then, wait for all in_flight requests
2347 * to be completed or requeued.
2348 *
2349 * To abort noflush suspend, start the queue.
2350 *
2351 *
2352 * Flush suspend
2353 * -------------
2354 * Flush suspend needs to dispatch remaining requests. So stop the queue
2355 * after the remaining requests are completed. (Requeued request must be also
2356 * re-dispatched and completed. Until then, we can't stop the queue.)
2357 *
2358 * During flushing the remaining requests, further incoming requests are also
2359 * inserted to the same queue. To distinguish which requests are to be
2360 * flushed, we insert a marker request to the queue at the time of starting
2361 * flush suspend, like a barrier.
2362 * The dispatching is blocked when the marker is found on the top of the queue.
2363 * And the queue is stopped when all in_flight requests are completed, since
2364 * that means the remaining requests are completely flushed.
2365 * Then, the marker is removed from the queue.
2366 *
2367 * To abort flush suspend, we also need to take care of the marker, not only
2368 * starting the queue.
2369 * We don't remove the marker forcibly from the queue since it's against
2370 * the block-layer manner. Instead, we put a invalidated mark on the marker.
2371 * When the invalidated marker is found on the top of the queue, it is
2372 * immediately removed from the queue, so it doesn't block dispatching.
2373 * Because we have only one marker per mapped_device, we can't start another
2374 * flush suspend until the invalidated marker is removed from the queue.
2375 * So fail and return with -EBUSY in such a case.
2376 */ 2454 */
2377int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2455int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2378{ 2456{
@@ -2383,17 +2461,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2383 2461
2384 mutex_lock(&md->suspend_lock); 2462 mutex_lock(&md->suspend_lock);
2385 2463
2386 if (dm_suspended(md)) { 2464 if (dm_suspended_md(md)) {
2387 r = -EINVAL; 2465 r = -EINVAL;
2388 goto out_unlock; 2466 goto out_unlock;
2389 } 2467 }
2390 2468
2391 if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) { 2469 map = dm_get_live_table(md);
2392 r = -EBUSY;
2393 goto out_unlock;
2394 }
2395
2396 map = dm_get_table(md);
2397 2470
2398 /* 2471 /*
2399 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2472 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2406,8 +2479,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2406 dm_table_presuspend_targets(map); 2479 dm_table_presuspend_targets(map);
2407 2480
2408 /* 2481 /*
2409 * Flush I/O to the device. noflush supersedes do_lockfs, 2482 * Flush I/O to the device.
2410 * because lock_fs() needs to flush I/Os. 2483 * Any I/O submitted after lock_fs() may not be flushed.
2484 * noflush takes precedence over do_lockfs.
2485 * (lock_fs() flushes I/Os and waits for them to complete.)
2411 */ 2486 */
2412 if (!noflush && do_lockfs) { 2487 if (!noflush && do_lockfs) {
2413 r = lock_fs(md); 2488 r = lock_fs(md);
@@ -2436,10 +2511,15 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2436 set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); 2511 set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2437 up_write(&md->io_lock); 2512 up_write(&md->io_lock);
2438 2513
2439 flush_workqueue(md->wq); 2514 /*
2440 2515 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
2516 * can be kicked until md->queue is stopped. So stop md->queue before
2517 * flushing md->wq.
2518 */
2441 if (dm_request_based(md)) 2519 if (dm_request_based(md))
2442 dm_rq_start_suspend(md, noflush); 2520 stop_queue(md->queue);
2521
2522 flush_workqueue(md->wq);
2443 2523
2444 /* 2524 /*
2445 * At this point no more requests are entering target request routines. 2525 * At this point no more requests are entering target request routines.
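
The reordering of stop_queue() and flush_workqueue() matters because flush_workqueue() only waits for work that was already queued when it is called: as long as md->queue keeps dispatching, another barrier request can queue dm_rq_barrier_work right after the flush. Stopping the producer first makes the flush a real quiesce point. A short kernel-style sketch of that ordering, with a stubbed-out producer stop:

#include <linux/workqueue.h>

/* Stand-in for whatever prevents new work from being queued; in dm_suspend()
 * this role is played by stop_queue(md->queue). */
static void my_stop_producing(void)
{
}

/* Quiesce order: stop the producer first, then flush.  Doing it the other
 * way round lets work queued after the flush slip through. */
static void my_quiesce(struct workqueue_struct *wq)
{
	my_stop_producing();
	flush_workqueue(wq);
}
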
@@ -2458,7 +2538,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2458 dm_queue_flush(md); 2538 dm_queue_flush(md);
2459 2539
2460 if (dm_request_based(md)) 2540 if (dm_request_based(md))
2461 dm_rq_abort_suspend(md, noflush); 2541 start_queue(md->queue);
2462 2542
2463 unlock_fs(md); 2543 unlock_fs(md);
2464 goto out; /* pushback list is already flushed, so skip flush */ 2544 goto out; /* pushback list is already flushed, so skip flush */
@@ -2470,10 +2550,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2470 * requests are being added to md->deferred list. 2550 * requests are being added to md->deferred list.
2471 */ 2551 */
2472 2552
2473 dm_table_postsuspend_targets(map);
2474
2475 set_bit(DMF_SUSPENDED, &md->flags); 2553 set_bit(DMF_SUSPENDED, &md->flags);
2476 2554
2555 dm_table_postsuspend_targets(map);
2556
2477out: 2557out:
2478 dm_table_put(map); 2558 dm_table_put(map);
2479 2559
@@ -2488,10 +2568,10 @@ int dm_resume(struct mapped_device *md)
2488 struct dm_table *map = NULL; 2568 struct dm_table *map = NULL;
2489 2569
2490 mutex_lock(&md->suspend_lock); 2570 mutex_lock(&md->suspend_lock);
2491 if (!dm_suspended(md)) 2571 if (!dm_suspended_md(md))
2492 goto out; 2572 goto out;
2493 2573
2494 map = dm_get_table(md); 2574 map = dm_get_live_table(md);
2495 if (!map || !dm_table_get_size(map)) 2575 if (!map || !dm_table_get_size(map))
2496 goto out; 2576 goto out;
2497 2577
@@ -2592,18 +2672,29 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2592 return NULL; 2672 return NULL;
2593 2673
2594 if (test_bit(DMF_FREEING, &md->flags) || 2674 if (test_bit(DMF_FREEING, &md->flags) ||
2595 test_bit(DMF_DELETING, &md->flags)) 2675 dm_deleting_md(md))
2596 return NULL; 2676 return NULL;
2597 2677
2598 dm_get(md); 2678 dm_get(md);
2599 return md; 2679 return md;
2600} 2680}
2601 2681
2602int dm_suspended(struct mapped_device *md) 2682int dm_suspended_md(struct mapped_device *md)
2603{ 2683{
2604 return test_bit(DMF_SUSPENDED, &md->flags); 2684 return test_bit(DMF_SUSPENDED, &md->flags);
2605} 2685}
2606 2686
2687int dm_suspended(struct dm_target *ti)
2688{
2689 struct mapped_device *md = dm_table_get_md(ti->table);
2690 int r = dm_suspended_md(md);
2691
2692 dm_put(md);
2693
2694 return r;
2695}
2696EXPORT_SYMBOL_GPL(dm_suspended);
2697
2607int dm_noflush_suspending(struct dm_target *ti) 2698int dm_noflush_suspending(struct dm_target *ti)
2608{ 2699{
2609 struct mapped_device *md = dm_table_get_md(ti->table); 2700 struct mapped_device *md = dm_table_get_md(ti->table);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a7663eba17e2..8dadaa5bc396 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -89,6 +89,16 @@ int dm_target_iterate(void (*iter_func)(struct target_type *tt,
89int dm_split_args(int *argc, char ***argvp, char *input); 89int dm_split_args(int *argc, char ***argvp, char *input);
90 90
91/* 91/*
92 * Is this mapped_device being deleted?
93 */
94int dm_deleting_md(struct mapped_device *md);
95
96/*
97 * Is this mapped_device suspended?
98 */
99int dm_suspended_md(struct mapped_device *md);
100
101/*
92 * The device-mapper can be driven through one of two interfaces; 102 * The device-mapper can be driven through one of two interfaces;
93 * ioctl or filesystem, depending which patch you have applied. 103 * ioctl or filesystem, depending which patch you have applied.
94 */ 104 */
@@ -118,6 +128,9 @@ int dm_lock_for_deletion(struct mapped_device *md);
118void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 128void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
119 unsigned cookie); 129 unsigned cookie);
120 130
131int dm_io_init(void);
132void dm_io_exit(void);
133
121int dm_kcopyd_init(void); 134int dm_kcopyd_init(void);
122void dm_kcopyd_exit(void); 135void dm_kcopyd_exit(void);
123 136