-rw-r--r--  drivers/md/Makefile             |    5
-rw-r--r--  drivers/md/dm-crypt.c           |    6
-rw-r--r--  drivers/md/dm-delay.c           |    6
-rw-r--r--  drivers/md/dm-exception-store.c |  749
-rw-r--r--  drivers/md/dm-exception-store.h |  148
-rw-r--r--  drivers/md/dm-ioctl.c           |   16
-rw-r--r--  drivers/md/dm-linear.c          |    6
-rw-r--r--  drivers/md/dm-log.c             |   40
-rw-r--r--  drivers/md/dm-mpath.c           |   14
-rw-r--r--  drivers/md/dm-raid1.c           |   24
-rw-r--r--  drivers/md/dm-snap-persistent.c |  704
-rw-r--r--  drivers/md/dm-snap-transient.c  |   98
-rw-r--r--  drivers/md/dm-snap.c            |   48
-rw-r--r--  drivers/md/dm-snap.h            |  129
-rw-r--r--  drivers/md/dm-stripe.c          |    4
-rw-r--r--  drivers/md/dm-sysfs.c           |   99
-rw-r--r--  drivers/md/dm-table.c           |   47
-rw-r--r--  drivers/md/dm-target.c          |   15
-rw-r--r--  drivers/md/dm-zero.c            |    5
-rw-r--r--  drivers/md/dm.c                 |  101
-rw-r--r--  drivers/md/dm.h                 |   10
-rw-r--r--  include/linux/device-mapper.h   |   28
22 files changed, 1319 insertions(+), 983 deletions(-)
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 1c615804ea76..72880b7e28d9 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -3,9 +3,10 @@
 #
 
 dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
-		   dm-ioctl.o dm-io.o dm-kcopyd.o
+		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
 dm-multipath-objs := dm-path-selector.o dm-mpath.o
-dm-snapshot-objs := dm-snap.o dm-exception-store.o
+dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \
+		    dm-snap-persistent.o
 dm-mirror-objs := dm-raid1.o
 md-mod-objs := md.o bitmap.o
 raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 3326750ec02c..35bda49796fb 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1322,11 +1322,7 @@ static int __init dm_crypt_init(void)
 
 static void __exit dm_crypt_exit(void)
 {
-	int r = dm_unregister_target(&crypt_target);
-
-	if (r < 0)
-		DMERR("unregister failed %d", r);
-
+	dm_unregister_target(&crypt_target);
 	kmem_cache_destroy(_crypt_io_pool);
 }
 
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 848b381f1173..59ee1b015d2d 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -364,11 +364,7 @@ bad_queue:
 
 static void __exit dm_delay_exit(void)
 {
-	int r = dm_unregister_target(&delay_target);
-
-	if (r < 0)
-		DMERR("unregister failed %d", r);
-
+	dm_unregister_target(&delay_target);
 	kmem_cache_destroy(delayed_cache);
 	destroy_workqueue(kdelayd_wq);
 }
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 01590f3e0009..dccbfb0e010f 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -1,756 +1,45 @@
 /*
- * dm-exception-store.c
- *
  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
- * Copyright (C) 2006 Red Hat GmbH
+ * Copyright (C) 2006-2008 Red Hat GmbH
  *
  * This file is released under the GPL.
  */
 
-#include "dm-snap.h"
+#include "dm-exception-store.h"
 
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
-#include <linux/dm-io.h>
-#include <linux/dm-kcopyd.h>
-
-#define DM_MSG_PREFIX "snapshots"
-#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */
-
-/*-----------------------------------------------------------------
- * Persistent snapshots, by persistent we mean that the snapshot
- * will survive a reboot.
- *---------------------------------------------------------------*/
-
-/*
- * We need to store a record of which parts of the origin have
- * been copied to the snapshot device.  The snapshot code
- * requires that we copy exception chunks to chunk aligned areas
- * of the COW store.  It makes sense therefore, to store the
- * metadata in chunk size blocks.
- *
- * There is no backward or forward compatibility implemented,
- * snapshots with different disk versions than the kernel will
- * not be usable.  It is expected that "lvcreate" will blank out
- * the start of a fresh COW device before calling the snapshot
- * constructor.
- *
- * The first chunk of the COW device just contains the header.
- * After this there is a chunk filled with exception metadata,
- * followed by as many exception chunks as can fit in the
- * metadata areas.
- *
- * All on disk structures are in little-endian format.  The end
- * of the exceptions info is indicated by an exception with a
- * new_chunk of 0, which is invalid since it would point to the
- * header chunk.
- */
-
-/*
- * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
- */
-#define SNAP_MAGIC 0x70416e53
-
-/*
- * The on-disk version of the metadata.
- */
-#define SNAPSHOT_DISK_VERSION 1
-
-struct disk_header {
-	uint32_t magic;
-
-	/*
-	 * Is this snapshot valid.  There is no way of recovering
-	 * an invalid snapshot.
-	 */
-	uint32_t valid;
-
-	/*
-	 * Simple, incrementing version. no backward
-	 * compatibility.
-	 */
-	uint32_t version;
-
-	/* In sectors */
-	uint32_t chunk_size;
-};
-
-struct disk_exception {
-	uint64_t old_chunk;
-	uint64_t new_chunk;
-};
-
-struct commit_callback {
-	void (*callback)(void *, int success);
-	void *context;
-};
-
-/*
- * The top level structure for a persistent exception store.
- */
-struct pstore {
-	struct dm_snapshot *snap;	/* up pointer to my snapshot */
-	int version;
-	int valid;
-	uint32_t exceptions_per_area;
-
-	/*
-	 * Now that we have an asynchronous kcopyd there is no
-	 * need for large chunk sizes, so it wont hurt to have a
-	 * whole chunks worth of metadata in memory at once.
-	 */
-	void *area;
-
-	/*
-	 * An area of zeros used to clear the next area.
-	 */
-	void *zero_area;
-
-	/*
-	 * Used to keep track of which metadata area the data in
-	 * 'chunk' refers to.
-	 */
-	chunk_t current_area;
-
-	/*
-	 * The next free chunk for an exception.
-	 */
-	chunk_t next_free;
-
-	/*
-	 * The index of next free exception in the current
-	 * metadata area.
-	 */
-	uint32_t current_committed;
-
-	atomic_t pending_count;
-	uint32_t callback_count;
-	struct commit_callback *callbacks;
-	struct dm_io_client *io_client;
-
-	struct workqueue_struct *metadata_wq;
-};
-
-static unsigned sectors_to_pages(unsigned sectors)
-{
-	return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
-}
-
-static int alloc_area(struct pstore *ps)
-{
-	int r = -ENOMEM;
-	size_t len;
-
-	len = ps->snap->chunk_size << SECTOR_SHIFT;
-
-	/*
-	 * Allocate the chunk_size block of memory that will hold
-	 * a single metadata area.
-	 */
-	ps->area = vmalloc(len);
-	if (!ps->area)
-		return r;
-
-	ps->zero_area = vmalloc(len);
-	if (!ps->zero_area) {
-		vfree(ps->area);
-		return r;
-	}
-	memset(ps->zero_area, 0, len);
-
-	return 0;
-}
-
-static void free_area(struct pstore *ps)
-{
-	vfree(ps->area);
-	ps->area = NULL;
-	vfree(ps->zero_area);
-	ps->zero_area = NULL;
-}
-
-struct mdata_req {
-	struct dm_io_region *where;
-	struct dm_io_request *io_req;
-	struct work_struct work;
-	int result;
-};
-
-static void do_metadata(struct work_struct *work)
-{
-	struct mdata_req *req = container_of(work, struct mdata_req, work);
-
-	req->result = dm_io(req->io_req, 1, req->where, NULL);
-}
-
-/*
- * Read or write a chunk aligned and sized block of data from a device.
- */
-static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
-{
-	struct dm_io_region where = {
-		.bdev = ps->snap->cow->bdev,
-		.sector = ps->snap->chunk_size * chunk,
-		.count = ps->snap->chunk_size,
-	};
-	struct dm_io_request io_req = {
-		.bi_rw = rw,
-		.mem.type = DM_IO_VMA,
-		.mem.ptr.vma = ps->area,
-		.client = ps->io_client,
-		.notify.fn = NULL,
-	};
-	struct mdata_req req;
-
-	if (!metadata)
-		return dm_io(&io_req, 1, &where, NULL);
-
-	req.where = &where;
-	req.io_req = &io_req;
-
-	/*
-	 * Issue the synchronous I/O from a different thread
-	 * to avoid generic_make_request recursion.
-	 */
-	INIT_WORK(&req.work, do_metadata);
-	queue_work(ps->metadata_wq, &req.work);
-	flush_workqueue(ps->metadata_wq);
-
-	return req.result;
-}
-
-/*
- * Convert a metadata area index to a chunk index.
- */
-static chunk_t area_location(struct pstore *ps, chunk_t area)
-{
-	return 1 + ((ps->exceptions_per_area + 1) * area);
-}
-
-/*
- * Read or write a metadata area.  Remembering to skip the first
- * chunk which holds the header.
- */
-static int area_io(struct pstore *ps, int rw)
-{
-	int r;
-	chunk_t chunk;
-
-	chunk = area_location(ps, ps->current_area);
-
-	r = chunk_io(ps, chunk, rw, 0);
-	if (r)
-		return r;
-
-	return 0;
-}
-
-static void zero_memory_area(struct pstore *ps)
-{
-	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
-}
-
-static int zero_disk_area(struct pstore *ps, chunk_t area)
-{
-	struct dm_io_region where = {
-		.bdev = ps->snap->cow->bdev,
-		.sector = ps->snap->chunk_size * area_location(ps, area),
-		.count = ps->snap->chunk_size,
-	};
-	struct dm_io_request io_req = {
-		.bi_rw = WRITE,
-		.mem.type = DM_IO_VMA,
-		.mem.ptr.vma = ps->zero_area,
-		.client = ps->io_client,
-		.notify.fn = NULL,
-	};
-
-	return dm_io(&io_req, 1, &where, NULL);
-}
-
-static int read_header(struct pstore *ps, int *new_snapshot)
-{
-	int r;
-	struct disk_header *dh;
-	chunk_t chunk_size;
-	int chunk_size_supplied = 1;
-
-	/*
-	 * Use default chunk size (or hardsect_size, if larger) if none supplied
-	 */
-	if (!ps->snap->chunk_size) {
-		ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
-		    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
-		ps->snap->chunk_mask = ps->snap->chunk_size - 1;
-		ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
-		chunk_size_supplied = 0;
-	}
-
-	ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
-							     chunk_size));
-	if (IS_ERR(ps->io_client))
-		return PTR_ERR(ps->io_client);
-
-	r = alloc_area(ps);
-	if (r)
-		return r;
-
-	r = chunk_io(ps, 0, READ, 1);
-	if (r)
-		goto bad;
-
-	dh = (struct disk_header *) ps->area;
-
-	if (le32_to_cpu(dh->magic) == 0) {
-		*new_snapshot = 1;
-		return 0;
-	}
-
-	if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
-		DMWARN("Invalid or corrupt snapshot");
-		r = -ENXIO;
-		goto bad;
-	}
-
-	*new_snapshot = 0;
-	ps->valid = le32_to_cpu(dh->valid);
-	ps->version = le32_to_cpu(dh->version);
-	chunk_size = le32_to_cpu(dh->chunk_size);
-
-	if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
-		return 0;
-
-	DMWARN("chunk size %llu in device metadata overrides "
-	       "table chunk size of %llu.",
-	       (unsigned long long)chunk_size,
-	       (unsigned long long)ps->snap->chunk_size);
-
-	/* We had a bogus chunk_size. Fix stuff up. */
-	free_area(ps);
-
-	ps->snap->chunk_size = chunk_size;
-	ps->snap->chunk_mask = chunk_size - 1;
-	ps->snap->chunk_shift = ffs(chunk_size) - 1;
-
-	r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
-				ps->io_client);
-	if (r)
-		return r;
-
-	r = alloc_area(ps);
-	return r;
-
-bad:
-	free_area(ps);
-	return r;
-}
-
-static int write_header(struct pstore *ps)
-{
-	struct disk_header *dh;
-
-	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
-
-	dh = (struct disk_header *) ps->area;
-	dh->magic = cpu_to_le32(SNAP_MAGIC);
-	dh->valid = cpu_to_le32(ps->valid);
-	dh->version = cpu_to_le32(ps->version);
-	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
-
-	return chunk_io(ps, 0, WRITE, 1);
-}
-
-/*
- * Access functions for the disk exceptions, these do the endian conversions.
- */
-static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
-{
-	BUG_ON(index >= ps->exceptions_per_area);
-
-	return ((struct disk_exception *) ps->area) + index;
-}
 
-static void read_exception(struct pstore *ps,
-			   uint32_t index, struct disk_exception *result)
-{
-	struct disk_exception *e = get_exception(ps, index);
-
-	/* copy it */
-	result->old_chunk = le64_to_cpu(e->old_chunk);
-	result->new_chunk = le64_to_cpu(e->new_chunk);
-}
-
-static void write_exception(struct pstore *ps,
-			    uint32_t index, struct disk_exception *de)
-{
-	struct disk_exception *e = get_exception(ps, index);
-
-	/* copy it */
-	e->old_chunk = cpu_to_le64(de->old_chunk);
-	e->new_chunk = cpu_to_le64(de->new_chunk);
-}
+#define DM_MSG_PREFIX "snapshot exception stores"
 
-/*
- * Registers the exceptions that are present in the current area.
- * 'full' is filled in to indicate if the area has been
- * filled.
- */
-static int insert_exceptions(struct pstore *ps, int *full)
+int dm_exception_store_init(void)
 {
 	int r;
-	unsigned int i;
-	struct disk_exception de;
-
-	/* presume the area is full */
-	*full = 1;
-
-	for (i = 0; i < ps->exceptions_per_area; i++) {
-		read_exception(ps, i, &de);
-
-		/*
-		 * If the new_chunk is pointing at the start of
-		 * the COW device, where the first metadata area
-		 * is we know that we've hit the end of the
-		 * exceptions.  Therefore the area is not full.
-		 */
-		if (de.new_chunk == 0LL) {
-			ps->current_committed = i;
-			*full = 0;
-			break;
-		}
-
-		/*
-		 * Keep track of the start of the free chunks.
-		 */
-		if (ps->next_free <= de.new_chunk)
-			ps->next_free = de.new_chunk + 1;
-
-		/*
-		 * Otherwise we add the exception to the snapshot.
-		 */
-		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
-		if (r)
-			return r;
-	}
-
-	return 0;
-}
-
-static int read_exceptions(struct pstore *ps)
-{
-	int r, full = 1;
-
-	/*
-	 * Keeping reading chunks and inserting exceptions until
-	 * we find a partially full area.
-	 */
-	for (ps->current_area = 0; full; ps->current_area++) {
-		r = area_io(ps, READ);
-		if (r)
-			return r;
 
-		r = insert_exceptions(ps, &full);
-		if (r)
-			return r;
+	r = dm_transient_snapshot_init();
+	if (r) {
+		DMERR("Unable to register transient exception store type.");
+		goto transient_fail;
 	}
 
-	ps->current_area--;
-
-	return 0;
-}
-
-static struct pstore *get_info(struct exception_store *store)
-{
-	return (struct pstore *) store->context;
-}
-
-static void persistent_fraction_full(struct exception_store *store,
-				     sector_t *numerator, sector_t *denominator)
-{
-	*numerator = get_info(store)->next_free * store->snap->chunk_size;
-	*denominator = get_dev_size(store->snap->cow->bdev);
-}
-
-static void persistent_destroy(struct exception_store *store)
-{
-	struct pstore *ps = get_info(store);
-
-	destroy_workqueue(ps->metadata_wq);
-	dm_io_client_destroy(ps->io_client);
-	vfree(ps->callbacks);
-	free_area(ps);
-	kfree(ps);
-}
-
-static int persistent_read_metadata(struct exception_store *store)
-{
-	int r, uninitialized_var(new_snapshot);
-	struct pstore *ps = get_info(store);
-
-	/*
-	 * Read the snapshot header.
-	 */
-	r = read_header(ps, &new_snapshot);
-	if (r)
-		return r;
-
-	/*
-	 * Now we know correct chunk_size, complete the initialisation.
-	 */
-	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
-				  sizeof(struct disk_exception);
-	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
-				   sizeof(*ps->callbacks));
-	if (!ps->callbacks)
-		return -ENOMEM;
-
-	/*
-	 * Do we need to setup a new snapshot ?
-	 */
-	if (new_snapshot) {
-		r = write_header(ps);
-		if (r) {
-			DMWARN("write_header failed");
-			return r;
-		}
-
-		ps->current_area = 0;
-		zero_memory_area(ps);
-		r = zero_disk_area(ps, 0);
-		if (r) {
-			DMWARN("zero_disk_area(0) failed");
-			return r;
-		}
-	} else {
-		/*
-		 * Sanity checks.
-		 */
-		if (ps->version != SNAPSHOT_DISK_VERSION) {
-			DMWARN("unable to handle snapshot disk version %d",
-			       ps->version);
-			return -EINVAL;
-		}
-
-		/*
-		 * Metadata are valid, but snapshot is invalidated
-		 */
-		if (!ps->valid)
-			return 1;
-
-		/*
-		 * Read the metadata.
-		 */
-		r = read_exceptions(ps);
-		if (r)
-			return r;
+	r = dm_persistent_snapshot_init();
+	if (r) {
+		DMERR("Unable to register persistent exception store type");
+		goto persistent_fail;
 	}
 
 	return 0;
-}
-
-static int persistent_prepare(struct exception_store *store,
-			      struct dm_snap_exception *e)
-{
-	struct pstore *ps = get_info(store);
-	uint32_t stride;
-	chunk_t next_free;
-	sector_t size = get_dev_size(store->snap->cow->bdev);
-
-	/* Is there enough room ? */
-	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
-		return -ENOSPC;
 
-	e->new_chunk = ps->next_free;
-
-	/*
-	 * Move onto the next free pending, making sure to take
-	 * into account the location of the metadata chunks.
-	 */
-	stride = (ps->exceptions_per_area + 1);
-	next_free = ++ps->next_free;
-	if (sector_div(next_free, stride) == 1)
-		ps->next_free++;
-
-	atomic_inc(&ps->pending_count);
-	return 0;
-}
-
-static void persistent_commit(struct exception_store *store,
-			      struct dm_snap_exception *e,
-			      void (*callback) (void *, int success),
-			      void *callback_context)
-{
-	unsigned int i;
-	struct pstore *ps = get_info(store);
-	struct disk_exception de;
-	struct commit_callback *cb;
-
-	de.old_chunk = e->old_chunk;
-	de.new_chunk = e->new_chunk;
-	write_exception(ps, ps->current_committed++, &de);
-
-	/*
-	 * Add the callback to the back of the array.  This code
-	 * is the only place where the callback array is
-	 * manipulated, and we know that it will never be called
-	 * multiple times concurrently.
-	 */
-	cb = ps->callbacks + ps->callback_count++;
-	cb->callback = callback;
-	cb->context = callback_context;
-
-	/*
-	 * If there are exceptions in flight and we have not yet
-	 * filled this metadata area there's nothing more to do.
-	 */
-	if (!atomic_dec_and_test(&ps->pending_count) &&
-	    (ps->current_committed != ps->exceptions_per_area))
-		return;
-
-	/*
-	 * If we completely filled the current area, then wipe the next one.
-	 */
-	if ((ps->current_committed == ps->exceptions_per_area) &&
-	    zero_disk_area(ps, ps->current_area + 1))
-		ps->valid = 0;
-
-	/*
-	 * Commit exceptions to disk.
-	 */
-	if (ps->valid && area_io(ps, WRITE))
-		ps->valid = 0;
-
-	/*
-	 * Advance to the next area if this one is full.
-	 */
-	if (ps->current_committed == ps->exceptions_per_area) {
-		ps->current_committed = 0;
-		ps->current_area++;
-		zero_memory_area(ps);
-	}
-
-	for (i = 0; i < ps->callback_count; i++) {
-		cb = ps->callbacks + i;
-		cb->callback(cb->context, ps->valid);
-	}
-
-	ps->callback_count = 0;
-}
-
-static void persistent_drop(struct exception_store *store)
-{
-	struct pstore *ps = get_info(store);
-
-	ps->valid = 0;
-	if (write_header(ps))
-		DMWARN("write header failed");
-}
-
-int dm_create_persistent(struct exception_store *store)
-{
-	struct pstore *ps;
-
-	/* allocate the pstore */
-	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
-	if (!ps)
-		return -ENOMEM;
-
-	ps->snap = store->snap;
-	ps->valid = 1;
-	ps->version = SNAPSHOT_DISK_VERSION;
-	ps->area = NULL;
-	ps->next_free = 2;	/* skipping the header and first area */
-	ps->current_committed = 0;
-
-	ps->callback_count = 0;
-	atomic_set(&ps->pending_count, 0);
-	ps->callbacks = NULL;
-
-	ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
-	if (!ps->metadata_wq) {
-		kfree(ps);
-		DMERR("couldn't start header metadata update thread");
-		return -ENOMEM;
-	}
-
-	store->destroy = persistent_destroy;
-	store->read_metadata = persistent_read_metadata;
-	store->prepare_exception = persistent_prepare;
-	store->commit_exception = persistent_commit;
-	store->drop_snapshot = persistent_drop;
-	store->fraction_full = persistent_fraction_full;
-	store->context = ps;
-
-	return 0;
-}
-
-/*-----------------------------------------------------------------
- * Implementation of the store for non-persistent snapshots.
- *---------------------------------------------------------------*/
-struct transient_c {
-	sector_t next_free;
-};
-
-static void transient_destroy(struct exception_store *store)
-{
-	kfree(store->context);
-}
-
-static int transient_read_metadata(struct exception_store *store)
-{
-	return 0;
-}
-
-static int transient_prepare(struct exception_store *store,
-			     struct dm_snap_exception *e)
-{
-	struct transient_c *tc = (struct transient_c *) store->context;
-	sector_t size = get_dev_size(store->snap->cow->bdev);
-
-	if (size < (tc->next_free + store->snap->chunk_size))
-		return -1;
-
-	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
-	tc->next_free += store->snap->chunk_size;
-
-	return 0;
-}
-
-static void transient_commit(struct exception_store *store,
-			     struct dm_snap_exception *e,
-			     void (*callback) (void *, int success),
-			     void *callback_context)
-{
-	/* Just succeed */
-	callback(callback_context, 1);
-}
-
-static void transient_fraction_full(struct exception_store *store,
-				    sector_t *numerator, sector_t *denominator)
-{
-	*numerator = ((struct transient_c *) store->context)->next_free;
-	*denominator = get_dev_size(store->snap->cow->bdev);
+persistent_fail:
+	dm_persistent_snapshot_exit();
+transient_fail:
+	return r;
 }
 
-int dm_create_transient(struct exception_store *store)
+void dm_exception_store_exit(void)
 {
-	struct transient_c *tc;
-
-	store->destroy = transient_destroy;
-	store->read_metadata = transient_read_metadata;
-	store->prepare_exception = transient_prepare;
-	store->commit_exception = transient_commit;
-	store->drop_snapshot = NULL;
-	store->fraction_full = transient_fraction_full;
-
-	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
-	if (!tc)
-		return -ENOMEM;
-
-	tc->next_free = 0;
-	store->context = tc;
-
-	return 0;
+	dm_persistent_snapshot_exit();
+	dm_transient_snapshot_exit();
 }
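The combined init above becomes the single registration entry point for both store types. A minimal sketch of the expected call site, assuming dm_snapshot_init() in dm-snap.c is the caller (illustrative only, not part of this hunk):

	static int __init dm_snapshot_init(void)
	{
		int r;

		/* Register transient + persistent store types in one call. */
		r = dm_exception_store_init();
		if (r)
			return r;

		/* ... continue with target registration, caches, etc. ... */
		return 0;
	}

With the dm_persistent_snapshot_init()/dm_transient_snapshot_init() functions introduced below currently implemented as stubs returning 0, the persistent_fail unwind path is not reachable in practice.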
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
new file mode 100644
index 000000000000..bb9f33d5daa2
--- /dev/null
+++ b/drivers/md/dm-exception-store.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
+ *
+ * Device-mapper snapshot exception store.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _LINUX_DM_EXCEPTION_STORE
+#define _LINUX_DM_EXCEPTION_STORE
+
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+
+/*
+ * The snapshot code deals with largish chunks of the disk at a
+ * time. Typically 32k - 512k.
+ */
+typedef sector_t chunk_t;
+
+/*
+ * An exception is used where an old chunk of data has been
+ * replaced by a new one.
+ * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number
+ * of chunks that follow contiguously.  Remaining bits hold the number of the
+ * chunk within the device.
+ */
+struct dm_snap_exception {
+	struct list_head hash_list;
+
+	chunk_t old_chunk;
+	chunk_t new_chunk;
+};
+
+/*
+ * Abstraction to handle the meta/layout of exception stores (the
+ * COW device).
+ */
+struct dm_exception_store {
+	/*
+	 * Destroys this object when you've finished with it.
+	 */
+	void (*destroy) (struct dm_exception_store *store);
+
+	/*
+	 * The target shouldn't read the COW device until this is
+	 * called.  As exceptions are read from the COW, they are
+	 * reported back via the callback.
+	 */
+	int (*read_metadata) (struct dm_exception_store *store,
+			      int (*callback)(void *callback_context,
+					      chunk_t old, chunk_t new),
+			      void *callback_context);
+
+	/*
+	 * Find somewhere to store the next exception.
+	 */
+	int (*prepare_exception) (struct dm_exception_store *store,
+				  struct dm_snap_exception *e);
+
+	/*
+	 * Update the metadata with this exception.
+	 */
+	void (*commit_exception) (struct dm_exception_store *store,
+				  struct dm_snap_exception *e,
+				  void (*callback) (void *, int success),
+				  void *callback_context);
+
+	/*
+	 * The snapshot is invalid, note this in the metadata.
+	 */
+	void (*drop_snapshot) (struct dm_exception_store *store);
+
+	int (*status) (struct dm_exception_store *store, status_type_t status,
+		       char *result, unsigned int maxlen);
+
+	/*
+	 * Return how full the snapshot is.
+	 */
+	void (*fraction_full) (struct dm_exception_store *store,
+			       sector_t *numerator,
+			       sector_t *denominator);
+
+	struct dm_snapshot *snap;
+	void *context;
+};
+
+/*
+ * Functions to manipulate consecutive chunks
+ */
+# if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
+#  define DM_CHUNK_CONSECUTIVE_BITS 8
+#  define DM_CHUNK_NUMBER_BITS 56
+
+static inline chunk_t dm_chunk_number(chunk_t chunk)
+{
+	return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
+}
+
+static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+{
+	return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
+}
+
+static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+{
+	e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
+
+	BUG_ON(!dm_consecutive_chunk_count(e));
+}
+
+# else
+#  define DM_CHUNK_CONSECUTIVE_BITS 0
+
+static inline chunk_t dm_chunk_number(chunk_t chunk)
+{
+	return chunk;
+}
+
+static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+{
+	return 0;
+}
+
+static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+{
+}
+
+# endif
+
+int dm_exception_store_init(void);
+void dm_exception_store_exit(void);
+
+/*
+ * Two exception store implementations.
+ */
+int dm_persistent_snapshot_init(void);
+void dm_persistent_snapshot_exit(void);
+
+int dm_transient_snapshot_init(void);
+void dm_transient_snapshot_exit(void);
+
+int dm_create_persistent(struct dm_exception_store *store);
+
+int dm_create_transient(struct dm_exception_store *store);
+
+#endif /* _LINUX_DM_EXCEPTION_STORE */
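A worked example of the consecutive-chunk packing declared above (64-bit chunk_t; values are illustrative):

	struct dm_snap_exception e = { .new_chunk = 5 };

	dm_consecutive_chunk_count_inc(&e);	/* now covers chunks 5..6 */
	dm_consecutive_chunk_count_inc(&e);	/* now covers chunks 5..7 */

	/*
	 * At this point dm_chunk_number(e.new_chunk) == 5 and
	 * dm_consecutive_chunk_count(&e) == 2: the run length lives in
	 * the top 8 bits, the starting chunk number in the low 56.
	 */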
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 777c948180f9..54d0588fc1f6 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -233,7 +233,7 @@ static void __hash_remove(struct hash_cell *hc)
 	}
 
 	if (hc->new_map)
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 	dm_put(hc->md);
 	free_cell(hc);
 }
@@ -827,8 +827,8 @@ static int do_resume(struct dm_ioctl *param)
 
 		r = dm_swap_table(md, new_map);
 		if (r) {
+			dm_table_destroy(new_map);
 			dm_put(md);
-			dm_table_put(new_map);
 			return r;
 		}
 
@@ -836,8 +836,6 @@ static int do_resume(struct dm_ioctl *param)
 			set_disk_ro(dm_disk(md), 0);
 		else
 			set_disk_ro(dm_disk(md), 1);
-
-		dm_table_put(new_map);
 	}
 
 	if (dm_suspended(md))
@@ -1080,7 +1078,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 	}
 
 	if (hc->new_map)
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 	hc->new_map = t;
 	up_write(&_hash_lock);
 
@@ -1109,7 +1107,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
 	}
 
 	if (hc->new_map) {
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 		hc->new_map = NULL;
 	}
 
@@ -1550,8 +1548,10 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
 		goto out;
 	}
 
-	strcpy(name, hc->name);
-	strcpy(uuid, hc->uuid ? : "");
+	if (name)
+		strcpy(name, hc->name);
+	if (uuid)
+		strcpy(uuid, hc->uuid ? : "");
 
 out:
 	up_read(&_hash_lock);
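With the NULL checks added above, dm_copy_name_and_uuid() callers may pass NULL for whichever of the two output strings they do not need. A hypothetical caller, for illustration:

	char name[128];
	int r;

	r = dm_copy_name_and_uuid(md, name, NULL);	/* uuid not wanted */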
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 44042becad8a..bfa107f59d96 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -142,6 +142,7 @@ static struct target_type linear_target = {
 	.status = linear_status,
 	.ioctl  = linear_ioctl,
 	.merge  = linear_merge,
+	.features = DM_TARGET_SUPPORTS_BARRIERS,
 };
 
 int __init dm_linear_init(void)
@@ -156,8 +157,5 @@ int __init dm_linear_init(void)
 
 void dm_linear_exit(void)
 {
-	int r = dm_unregister_target(&linear_target);
-
-	if (r < 0)
-		DMERR("unregister failed %d", r);
+	dm_unregister_target(&linear_target);
 }
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index a8c0fc79ca78..737961f275c1 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -326,8 +326,6 @@ static void header_from_disk(struct log_header *core, struct log_header *disk)
 static int rw_header(struct log_c *lc, int rw)
 {
 	lc->io_req.bi_rw = rw;
-	lc->io_req.mem.ptr.vma = lc->disk_header;
-	lc->io_req.notify.fn = NULL;
 
 	return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
 }
@@ -362,10 +360,15 @@ static int read_header(struct log_c *log)
 	return 0;
 }
 
-static inline int write_header(struct log_c *log)
+static int _check_region_size(struct dm_target *ti, uint32_t region_size)
 {
-	header_to_disk(&log->header, log->disk_header);
-	return rw_header(log, WRITE);
+	if (region_size < 2 || region_size > ti->len)
+		return 0;
+
+	if (!is_power_of_2(region_size))
+		return 0;
+
+	return 1;
 }
 
 /*----------------------------------------------------------------
@@ -403,8 +406,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		}
 	}
 
-	if (sscanf(argv[0], "%u", &region_size) != 1) {
-		DMWARN("invalid region size string");
+	if (sscanf(argv[0], "%u", &region_size) != 1 ||
+	    !_check_region_size(ti, region_size)) {
+		DMWARN("invalid region size %s", argv[0]);
 		return -EINVAL;
 	}
 
@@ -453,8 +457,18 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 	 */
 	buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
 			       bitset_size, ti->limits.hardsect_size);
+
+	if (buf_size > dev->bdev->bd_inode->i_size) {
+		DMWARN("log device %s too small: need %llu bytes",
+		       dev->name, (unsigned long long)buf_size);
+		kfree(lc);
+		return -EINVAL;
+	}
+
 	lc->header_location.count = buf_size >> SECTOR_SHIFT;
+
 	lc->io_req.mem.type = DM_IO_VMA;
+	lc->io_req.notify.fn = NULL;
 	lc->io_req.client = dm_io_client_create(dm_div_up(buf_size,
 							  PAGE_SIZE));
 	if (IS_ERR(lc->io_req.client)) {
@@ -467,10 +481,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		lc->disk_header = vmalloc(buf_size);
 		if (!lc->disk_header) {
 			DMWARN("couldn't allocate disk log buffer");
+			dm_io_client_destroy(lc->io_req.client);
 			kfree(lc);
 			return -ENOMEM;
 		}
 
+		lc->io_req.mem.ptr.vma = lc->disk_header;
 		lc->clean_bits = (void *)lc->disk_header +
 				 (LOG_OFFSET << SECTOR_SHIFT);
 	}
@@ -482,6 +498,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		DMWARN("couldn't allocate sync bitset");
 		if (!dev)
 			vfree(lc->clean_bits);
+		else
+			dm_io_client_destroy(lc->io_req.client);
 		vfree(lc->disk_header);
 		kfree(lc);
 		return -ENOMEM;
@@ -495,6 +513,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		vfree(lc->sync_bits);
 		if (!dev)
 			vfree(lc->clean_bits);
+		else
+			dm_io_client_destroy(lc->io_req.client);
 		vfree(lc->disk_header);
 		kfree(lc);
 		return -ENOMEM;
@@ -631,8 +651,10 @@ static int disk_resume(struct dm_dirty_log *log)
 	/* set the correct number of regions in the header */
 	lc->header.nr_regions = lc->region_count;
 
+	header_to_disk(&lc->header, lc->disk_header);
+
 	/* write the new header */
-	r = write_header(lc);
+	r = rw_header(lc, WRITE);
 	if (r) {
 		DMWARN("%s: Failed to write header on dirty region log device",
 		       lc->log_dev->name);
@@ -682,7 +704,7 @@ static int disk_flush(struct dm_dirty_log *log)
 	if (!lc->touched)
 		return 0;
 
-	r = write_header(lc);
+	r = rw_header(lc, WRITE);
 	if (r)
 		fail_log_device(lc);
 	else
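The new _check_region_size() tightens validation that previously accepted any value that merely parsed as an integer. Illustrative outcomes, assuming a hypothetical target with ti->len == 2097152 sectors (1 GiB):

	_check_region_size(ti, 1024);		/* 1: power of two within 2..ti->len */
	_check_region_size(ti, 3);		/* 0: not a power of two */
	_check_region_size(ti, 1);		/* 0: below the minimum of 2 */
	_check_region_size(ti, 4194304);	/* 0: exceeds ti->len */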
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 3d7f4923cd13..095f77bf9681 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -889,7 +889,7 @@ static int fail_path(struct pgpath *pgpath)
 	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
 		       pgpath->path.dev->name, m->nr_valid_paths);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 	queue_work(kmultipathd, &pgpath->deactivate_path);
 
 out:
@@ -932,7 +932,7 @@ static int reinstate_path(struct pgpath *pgpath)
 	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
 		       pgpath->path.dev->name, m->nr_valid_paths);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 
 out:
 	spin_unlock_irqrestore(&m->lock, flags);
@@ -976,7 +976,7 @@ static void bypass_pg(struct multipath *m, struct priority_group *pg,
 
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 }
 
 /*
@@ -1006,7 +1006,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
 	}
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 	return 0;
 }
 
@@ -1495,14 +1495,10 @@
 
 static void __exit dm_multipath_exit(void)
 {
-	int r;
-
 	destroy_workqueue(kmpath_handlerd);
 	destroy_workqueue(kmultipathd);
 
-	r = dm_unregister_target(&multipath_target);
-	if (r < 0)
-		DMERR("target unregister failed %d", r);
+	dm_unregister_target(&multipath_target);
 	kmem_cache_destroy(_mpio_cache);
 }
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ec43f9fa4b2a..4d6bc101962e 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -197,9 +197,6 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	struct mirror_set *ms = m->ms;
 	struct mirror *new;
 
-	if (!errors_handled(ms))
-		return;
-
 	/*
 	 * error_count is used for nothing more than a
 	 * simple way to tell if a device has encountered
@@ -210,6 +207,9 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	if (test_and_set_bit(error_type, &m->error_type))
 		return;
 
+	if (!errors_handled(ms))
+		return;
+
 	if (m != get_default_mirror(ms))
 		goto out;
 
@@ -808,12 +808,6 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
 	kfree(ms);
 }
 
-static inline int _check_region_size(struct dm_target *ti, uint32_t size)
-{
-	return !(size % (PAGE_SIZE >> 9) || !is_power_of_2(size) ||
-		 size > ti->len);
-}
-
 static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
 		      unsigned int mirror, char **argv)
 {
@@ -872,12 +866,6 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
 		return NULL;
 	}
 
-	if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
-		ti->error = "Invalid region size";
-		dm_dirty_log_destroy(dl);
-		return NULL;
-	}
-
 	return dl;
 }
 
@@ -1300,11 +1288,7 @@ static int __init dm_mirror_init(void)
 
 static void __exit dm_mirror_exit(void)
 {
-	int r;
-
-	r = dm_unregister_target(&mirror_target);
-	if (r < 0)
-		DMERR("unregister failed %d", r);
+	dm_unregister_target(&mirror_target);
 }
 
 /* Module hooks */
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
new file mode 100644
index 000000000000..936b34e0959f
--- /dev/null
+++ b/drivers/md/dm-snap-persistent.c
@@ -0,0 +1,704 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006-2008 Red Hat GmbH
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-exception-store.h"
+#include "dm-snap.h"
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/dm-io.h>
+
+#define DM_MSG_PREFIX "persistent snapshot"
+#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */
+
+/*-----------------------------------------------------------------
+ * Persistent snapshots, by persistent we mean that the snapshot
+ * will survive a reboot.
+ *---------------------------------------------------------------*/
+
+/*
+ * We need to store a record of which parts of the origin have
+ * been copied to the snapshot device.  The snapshot code
+ * requires that we copy exception chunks to chunk aligned areas
+ * of the COW store.  It makes sense therefore, to store the
+ * metadata in chunk size blocks.
+ *
+ * There is no backward or forward compatibility implemented,
+ * snapshots with different disk versions than the kernel will
+ * not be usable.  It is expected that "lvcreate" will blank out
+ * the start of a fresh COW device before calling the snapshot
+ * constructor.
+ *
+ * The first chunk of the COW device just contains the header.
+ * After this there is a chunk filled with exception metadata,
+ * followed by as many exception chunks as can fit in the
+ * metadata areas.
+ *
+ * All on disk structures are in little-endian format.  The end
+ * of the exceptions info is indicated by an exception with a
+ * new_chunk of 0, which is invalid since it would point to the
+ * header chunk.
+ */
+
+/*
+ * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
+ */
+#define SNAP_MAGIC 0x70416e53
+
+/*
+ * The on-disk version of the metadata.
+ */
+#define SNAPSHOT_DISK_VERSION 1
+
+struct disk_header {
+	uint32_t magic;
+
+	/*
+	 * Is this snapshot valid.  There is no way of recovering
+	 * an invalid snapshot.
+	 */
+	uint32_t valid;
+
+	/*
+	 * Simple, incrementing version. no backward
+	 * compatibility.
+	 */
+	uint32_t version;
+
+	/* In sectors */
+	uint32_t chunk_size;
+};
+
+struct disk_exception {
+	uint64_t old_chunk;
+	uint64_t new_chunk;
+};
+
+struct commit_callback {
+	void (*callback)(void *, int success);
+	void *context;
+};
+
+/*
+ * The top level structure for a persistent exception store.
+ */
+struct pstore {
+	struct dm_snapshot *snap;	/* up pointer to my snapshot */
+	int version;
+	int valid;
+	uint32_t exceptions_per_area;
+
+	/*
+	 * Now that we have an asynchronous kcopyd there is no
+	 * need for large chunk sizes, so it won't hurt to have a
+	 * whole chunks worth of metadata in memory at once.
+	 */
+	void *area;
+
+	/*
+	 * An area of zeros used to clear the next area.
+	 */
+	void *zero_area;
+
+	/*
+	 * Used to keep track of which metadata area the data in
+	 * 'chunk' refers to.
+	 */
+	chunk_t current_area;
+
+	/*
+	 * The next free chunk for an exception.
+	 */
+	chunk_t next_free;
+
+	/*
+	 * The index of next free exception in the current
+	 * metadata area.
+	 */
+	uint32_t current_committed;
+
+	atomic_t pending_count;
+	uint32_t callback_count;
+	struct commit_callback *callbacks;
+	struct dm_io_client *io_client;
+
+	struct workqueue_struct *metadata_wq;
+};
+
+static unsigned sectors_to_pages(unsigned sectors)
+{
+	return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
+}
+
+static int alloc_area(struct pstore *ps)
+{
+	int r = -ENOMEM;
+	size_t len;
+
+	len = ps->snap->chunk_size << SECTOR_SHIFT;
+
+	/*
+	 * Allocate the chunk_size block of memory that will hold
+	 * a single metadata area.
+	 */
+	ps->area = vmalloc(len);
+	if (!ps->area)
+		return r;
+
+	ps->zero_area = vmalloc(len);
+	if (!ps->zero_area) {
+		vfree(ps->area);
+		return r;
+	}
+	memset(ps->zero_area, 0, len);
+
+	return 0;
+}
+
+static void free_area(struct pstore *ps)
+{
+	vfree(ps->area);
+	ps->area = NULL;
+	vfree(ps->zero_area);
+	ps->zero_area = NULL;
+}
+
+struct mdata_req {
+	struct dm_io_region *where;
+	struct dm_io_request *io_req;
+	struct work_struct work;
+	int result;
+};
+
+static void do_metadata(struct work_struct *work)
+{
+	struct mdata_req *req = container_of(work, struct mdata_req, work);
+
+	req->result = dm_io(req->io_req, 1, req->where, NULL);
+}
+
+/*
+ * Read or write a chunk aligned and sized block of data from a device.
+ */
+static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
+{
+	struct dm_io_region where = {
+		.bdev = ps->snap->cow->bdev,
+		.sector = ps->snap->chunk_size * chunk,
+		.count = ps->snap->chunk_size,
+	};
+	struct dm_io_request io_req = {
+		.bi_rw = rw,
+		.mem.type = DM_IO_VMA,
+		.mem.ptr.vma = ps->area,
+		.client = ps->io_client,
+		.notify.fn = NULL,
+	};
+	struct mdata_req req;
+
+	if (!metadata)
+		return dm_io(&io_req, 1, &where, NULL);
+
+	req.where = &where;
+	req.io_req = &io_req;
+
+	/*
+	 * Issue the synchronous I/O from a different thread
+	 * to avoid generic_make_request recursion.
+	 */
+	INIT_WORK(&req.work, do_metadata);
+	queue_work(ps->metadata_wq, &req.work);
+	flush_workqueue(ps->metadata_wq);
+
+	return req.result;
+}
+
+/*
+ * Convert a metadata area index to a chunk index.
+ */
+static chunk_t area_location(struct pstore *ps, chunk_t area)
+{
+	return 1 + ((ps->exceptions_per_area + 1) * area);
+}
+
+/*
+ * Read or write a metadata area.  Remembering to skip the first
+ * chunk which holds the header.
+ */
+static int area_io(struct pstore *ps, int rw)
+{
+	int r;
+	chunk_t chunk;
+
+	chunk = area_location(ps, ps->current_area);
+
+	r = chunk_io(ps, chunk, rw, 0);
+	if (r)
+		return r;
+
+	return 0;
+}
+
+static void zero_memory_area(struct pstore *ps)
+{
+	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
+}
+
+static int zero_disk_area(struct pstore *ps, chunk_t area)
+{
+	struct dm_io_region where = {
+		.bdev = ps->snap->cow->bdev,
+		.sector = ps->snap->chunk_size * area_location(ps, area),
+		.count = ps->snap->chunk_size,
+	};
+	struct dm_io_request io_req = {
+		.bi_rw = WRITE,
+		.mem.type = DM_IO_VMA,
+		.mem.ptr.vma = ps->zero_area,
+		.client = ps->io_client,
+		.notify.fn = NULL,
+	};
+
+	return dm_io(&io_req, 1, &where, NULL);
+}
+
+static int read_header(struct pstore *ps, int *new_snapshot)
+{
+	int r;
+	struct disk_header *dh;
+	chunk_t chunk_size;
+	int chunk_size_supplied = 1;
+
+	/*
+	 * Use default chunk size (or hardsect_size, if larger) if none supplied
+	 */
+	if (!ps->snap->chunk_size) {
+		ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
+		    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
+		ps->snap->chunk_mask = ps->snap->chunk_size - 1;
+		ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
+		chunk_size_supplied = 0;
+	}
+
+	ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
+							     chunk_size));
+	if (IS_ERR(ps->io_client))
+		return PTR_ERR(ps->io_client);
+
+	r = alloc_area(ps);
+	if (r)
+		return r;
+
+	r = chunk_io(ps, 0, READ, 1);
+	if (r)
+		goto bad;
+
+	dh = (struct disk_header *) ps->area;
+
+	if (le32_to_cpu(dh->magic) == 0) {
+		*new_snapshot = 1;
+		return 0;
+	}
+
+	if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
+		DMWARN("Invalid or corrupt snapshot");
+		r = -ENXIO;
+		goto bad;
+	}
+
+	*new_snapshot = 0;
+	ps->valid = le32_to_cpu(dh->valid);
+	ps->version = le32_to_cpu(dh->version);
+	chunk_size = le32_to_cpu(dh->chunk_size);
+
+	if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
+		return 0;
+
+	DMWARN("chunk size %llu in device metadata overrides "
+	       "table chunk size of %llu.",
+	       (unsigned long long)chunk_size,
+	       (unsigned long long)ps->snap->chunk_size);
+
+	/* We had a bogus chunk_size. Fix stuff up. */
+	free_area(ps);
+
+	ps->snap->chunk_size = chunk_size;
+	ps->snap->chunk_mask = chunk_size - 1;
+	ps->snap->chunk_shift = ffs(chunk_size) - 1;
+
+	r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
+				ps->io_client);
+	if (r)
+		return r;
+
+	r = alloc_area(ps);
+	return r;
+
+bad:
+	free_area(ps);
+	return r;
+}
+
+static int write_header(struct pstore *ps)
+{
+	struct disk_header *dh;
+
+	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
+
+	dh = (struct disk_header *) ps->area;
+	dh->magic = cpu_to_le32(SNAP_MAGIC);
+	dh->valid = cpu_to_le32(ps->valid);
+	dh->version = cpu_to_le32(ps->version);
+	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
+
+	return chunk_io(ps, 0, WRITE, 1);
+}
+
+/*
+ * Access functions for the disk exceptions, these do the endian conversions.
+ */
+static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
+{
+	BUG_ON(index >= ps->exceptions_per_area);
+
+	return ((struct disk_exception *) ps->area) + index;
+}
+
+static void read_exception(struct pstore *ps,
+			   uint32_t index, struct disk_exception *result)
+{
+	struct disk_exception *e = get_exception(ps, index);
+
+	/* copy it */
+	result->old_chunk = le64_to_cpu(e->old_chunk);
+	result->new_chunk = le64_to_cpu(e->new_chunk);
+}
+
+static void write_exception(struct pstore *ps,
+			    uint32_t index, struct disk_exception *de)
+{
+	struct disk_exception *e = get_exception(ps, index);
+
+	/* copy it */
+	e->old_chunk = cpu_to_le64(de->old_chunk);
+	e->new_chunk = cpu_to_le64(de->new_chunk);
+}
+
+/*
+ * Registers the exceptions that are present in the current area.
+ * 'full' is filled in to indicate if the area has been
+ * filled.
+ */
+static int insert_exceptions(struct pstore *ps,
+			     int (*callback)(void *callback_context,
+					     chunk_t old, chunk_t new),
+			     void *callback_context,
+			     int *full)
+{
+	int r;
+	unsigned int i;
+	struct disk_exception de;
+
+	/* presume the area is full */
+	*full = 1;
+
+	for (i = 0; i < ps->exceptions_per_area; i++) {
+		read_exception(ps, i, &de);
+
+		/*
+		 * If the new_chunk is pointing at the start of
+		 * the COW device, where the first metadata area
+		 * is we know that we've hit the end of the
+		 * exceptions.  Therefore the area is not full.
+		 */
+		if (de.new_chunk == 0LL) {
+			ps->current_committed = i;
+			*full = 0;
+			break;
+		}
+
+		/*
+		 * Keep track of the start of the free chunks.
+		 */
+		if (ps->next_free <= de.new_chunk)
+			ps->next_free = de.new_chunk + 1;
+
+		/*
+		 * Otherwise we add the exception to the snapshot.
+		 */
+		r = callback(callback_context, de.old_chunk, de.new_chunk);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static int read_exceptions(struct pstore *ps,
+			   int (*callback)(void *callback_context, chunk_t old,
+					   chunk_t new),
+			   void *callback_context)
+{
+	int r, full = 1;
+
+	/*
+	 * Keep reading chunks and inserting exceptions until
+	 * we find a partially full area.
+	 */
+	for (ps->current_area = 0; full; ps->current_area++) {
+		r = area_io(ps, READ);
+		if (r)
+			return r;
+
+		r = insert_exceptions(ps, callback, callback_context, &full);
+		if (r)
+			return r;
+	}
+
+	ps->current_area--;
+
+	return 0;
+}
+
+static struct pstore *get_info(struct dm_exception_store *store)
+{
+	return (struct pstore *) store->context;
+}
+
+static void persistent_fraction_full(struct dm_exception_store *store,
+				     sector_t *numerator, sector_t *denominator)
+{
+	*numerator = get_info(store)->next_free * store->snap->chunk_size;
+	*denominator = get_dev_size(store->snap->cow->bdev);
+}
+
+static void persistent_destroy(struct dm_exception_store *store)
+{
+	struct pstore *ps = get_info(store);
+
+	destroy_workqueue(ps->metadata_wq);
+	dm_io_client_destroy(ps->io_client);
+	vfree(ps->callbacks);
+	free_area(ps);
+	kfree(ps);
+}
+
+static int persistent_read_metadata(struct dm_exception_store *store,
+				    int (*callback)(void *callback_context,
+						    chunk_t old, chunk_t new),
+				    void *callback_context)
+{
+	int r, uninitialized_var(new_snapshot);
+	struct pstore *ps = get_info(store);
+
+	/*
+	 * Read the snapshot header.
+	 */
+	r = read_header(ps, &new_snapshot);
+	if (r)
+		return r;
+
+	/*
+	 * Now we know correct chunk_size, complete the initialisation.
+	 */
+	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
+				  sizeof(struct disk_exception);
+	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
+				   sizeof(*ps->callbacks));
+	if (!ps->callbacks)
+		return -ENOMEM;
+
+	/*
+	 * Do we need to setup a new snapshot ?
+	 */
+	if (new_snapshot) {
+		r = write_header(ps);
+		if (r) {
+			DMWARN("write_header failed");
+			return r;
+		}
+
+		ps->current_area = 0;
+		zero_memory_area(ps);
+		r = zero_disk_area(ps, 0);
+		if (r) {
+			DMWARN("zero_disk_area(0) failed");
+			return r;
+		}
+	} else {
+		/*
+		 * Sanity checks.
+		 */
+		if (ps->version != SNAPSHOT_DISK_VERSION) {
+			DMWARN("unable to handle snapshot disk version %d",
+			       ps->version);
+			return -EINVAL;
+		}
+
+		/*
+		 * Metadata are valid, but snapshot is invalidated
+		 */
+		if (!ps->valid)
+			return 1;
+
+		/*
+		 * Read the metadata.
+		 */
+		r = read_exceptions(ps, callback, callback_context);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static int persistent_prepare_exception(struct dm_exception_store *store,
+					struct dm_snap_exception *e)
+{
+	struct pstore *ps = get_info(store);
+	uint32_t stride;
+	chunk_t next_free;
+	sector_t size = get_dev_size(store->snap->cow->bdev);
+
+	/* Is there enough room ? */
+	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
+		return -ENOSPC;
+
+	e->new_chunk = ps->next_free;
+
+	/*
+	 * Move onto the next free pending, making sure to take
+	 * into account the location of the metadata chunks.
+	 */
+	stride = (ps->exceptions_per_area + 1);
+	next_free = ++ps->next_free;
+	if (sector_div(next_free, stride) == 1)
+		ps->next_free++;
+
+	atomic_inc(&ps->pending_count);
+	return 0;
+}
+
+static void persistent_commit_exception(struct dm_exception_store *store,
+					struct dm_snap_exception *e,
+					void (*callback) (void *, int success),
+					void *callback_context)
+{
+	unsigned int i;
+	struct pstore *ps = get_info(store);
+	struct disk_exception de;
+	struct commit_callback *cb;
+
+	de.old_chunk = e->old_chunk;
+	de.new_chunk = e->new_chunk;
+	write_exception(ps, ps->current_committed++, &de);
+
+	/*
+	 * Add the callback to the back of the array.  This code
+	 * is the only place where the callback array is
+	 * manipulated, and we know that it will never be called
+	 * multiple times concurrently.
+	 */
+	cb = ps->callbacks + ps->callback_count++;
+	cb->callback = callback;
+	cb->context = callback_context;
+
+	/*
+	 * If there are exceptions in flight and we have not yet
+	 * filled this metadata area there's nothing more to do.
+	 */
+	if (!atomic_dec_and_test(&ps->pending_count) &&
+	    (ps->current_committed != ps->exceptions_per_area))
+		return;
+
+	/*
+	 * If we completely filled the current area, then wipe the next one.
+	 */
+	if ((ps->current_committed == ps->exceptions_per_area) &&
+	    zero_disk_area(ps, ps->current_area + 1))
+		ps->valid = 0;
+
+	/*
+	 * Commit exceptions to disk.
+	 */
+	if (ps->valid && area_io(ps, WRITE))
+		ps->valid = 0;
+
+	/*
+	 * Advance to the next area if this one is full.
+	 */
+	if (ps->current_committed == ps->exceptions_per_area) {
+		ps->current_committed = 0;
+		ps->current_area++;
+		zero_memory_area(ps);
+	}
+
+	for (i = 0; i < ps->callback_count; i++) {
+		cb = ps->callbacks + i;
+		cb->callback(cb->context, ps->valid);
+	}
+
+	ps->callback_count = 0;
+}
+
+static void persistent_drop_snapshot(struct dm_exception_store *store)
+{
+	struct pstore *ps = get_info(store);
+
+	ps->valid = 0;
+	if (write_header(ps))
+		DMWARN("write header failed");
+}
+
+int dm_create_persistent(struct dm_exception_store *store)
+{
+	struct pstore *ps;
+
+	/* allocate the pstore */
+	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		return -ENOMEM;
+
+	ps->snap = store->snap;
+	ps->valid = 1;
+	ps->version = SNAPSHOT_DISK_VERSION;
+	ps->area = NULL;
+	ps->next_free = 2;	/* skipping the header and first area */
+	ps->current_committed = 0;
+
+	ps->callback_count = 0;
+	atomic_set(&ps->pending_count, 0);
+	ps->callbacks = NULL;
+
+	ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
+	if (!ps->metadata_wq) {
+		kfree(ps);
+		DMERR("couldn't start header metadata update thread");
+		return -ENOMEM;
+	}
+
+	store->destroy = persistent_destroy;
+	store->read_metadata = persistent_read_metadata;
+	store->prepare_exception = persistent_prepare_exception;
+	store->commit_exception = persistent_commit_exception;
+	store->drop_snapshot = persistent_drop_snapshot;
+	store->fraction_full = persistent_fraction_full;
+	store->context = ps;
+
+	return 0;
+}
+
+int dm_persistent_snapshot_init(void)
+{
+	return 0;
+}
+
+void dm_persistent_snapshot_exit(void)
+{
+}
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
new file mode 100644
index 000000000000..7f6e2e6dcb0d
--- /dev/null
+++ b/drivers/md/dm-snap-transient.c
@@ -0,0 +1,98 @@
1/*
2 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2006-2008 Red Hat GmbH
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm-exception-store.h"
9#include "dm-snap.h"
10
11#include <linux/mm.h>
12#include <linux/pagemap.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15#include <linux/dm-io.h>
16
17#define DM_MSG_PREFIX "transient snapshot"
18
19/*-----------------------------------------------------------------
20 * Implementation of the store for non-persistent snapshots.
21 *---------------------------------------------------------------*/
22struct transient_c {
23 sector_t next_free;
24};
25
26static void transient_destroy(struct dm_exception_store *store)
27{
28 kfree(store->context);
29}
30
31static int transient_read_metadata(struct dm_exception_store *store,
32 int (*callback)(void *callback_context,
33 chunk_t old, chunk_t new),
34 void *callback_context)
35{
36 return 0;
37}
38
39static int transient_prepare_exception(struct dm_exception_store *store,
40 struct dm_snap_exception *e)
41{
42 struct transient_c *tc = (struct transient_c *) store->context;
43 sector_t size = get_dev_size(store->snap->cow->bdev);
44
45 if (size < (tc->next_free + store->snap->chunk_size))
46 return -1;
47
48 e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
49 tc->next_free += store->snap->chunk_size;
50
51 return 0;
52}
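
The transient allocator is a bare bump allocator over the COW device: next_free is a sector cursor that advances one chunk per exception, and preparation fails once the device is exhausted (note it returns -1 where the persistent store returns -ENOSPC). Modelled in userspace with example sizes:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t dev_size = 64;		/* sectors, hypothetical */
		uint64_t chunk_size = 16;	/* sectors per chunk */
		uint64_t next_free = 0;

		while (dev_size >= next_free + chunk_size) {
			printf("allocated chunk %llu at sector %llu\n",
			       (unsigned long long)(next_free / chunk_size),
			       (unsigned long long)next_free);
			next_free += chunk_size;
		}
		printf("store full\n");	/* prepare would now fail */
		return 0;
	}
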
53
54static void transient_commit_exception(struct dm_exception_store *store,
55 struct dm_snap_exception *e,
56 void (*callback) (void *, int success),
57 void *callback_context)
58{
59 /* Just succeed */
60 callback(callback_context, 1);
61}
62
63static void transient_fraction_full(struct dm_exception_store *store,
64 sector_t *numerator, sector_t *denominator)
65{
66 *numerator = ((struct transient_c *) store->context)->next_free;
67 *denominator = get_dev_size(store->snap->cow->bdev);
68}
69
70int dm_create_transient(struct dm_exception_store *store)
71{
72 struct transient_c *tc;
73
74 store->destroy = transient_destroy;
75 store->read_metadata = transient_read_metadata;
76 store->prepare_exception = transient_prepare_exception;
77 store->commit_exception = transient_commit_exception;
78 store->drop_snapshot = NULL;
79 store->fraction_full = transient_fraction_full;
80
81 tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
82 if (!tc)
83 return -ENOMEM;
84
85 tc->next_free = 0;
86 store->context = tc;
87
88 return 0;
89}
90
91int dm_transient_snapshot_init(void)
92{
93 return 0;
94}
95
96void dm_transient_snapshot_exit(void)
97{
98}
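
Both stores plug into the same function-pointer interface, so dm-snap.c drives them identically and only the dm_create_* constructor differs. A reduced userspace model of that dispatch pattern (these are not the kernel structs, just the shape of the technique):

	#include <stdio.h>

	struct store {
		int  (*prepare)(struct store *s);
		void (*commit)(struct store *s);
		void *context;
	};

	static int t_prepare(struct store *s) { puts("transient prepare"); return 0; }
	static void t_commit(struct store *s) { puts("transient commit"); }

	static int create_transient(struct store *s)
	{
		s->prepare = t_prepare;	/* backend fills in the ops */
		s->commit = t_commit;
		return 0;
	}

	int main(void)
	{
		struct store s;

		create_transient(&s);
		if (!s.prepare(&s))	/* caller is agnostic about the backend */
			s.commit(&s);
		return 0;
	}
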
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 6c96db26b87c..65ff82ff124e 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -9,6 +9,7 @@
9#include <linux/blkdev.h> 9#include <linux/blkdev.h>
10#include <linux/ctype.h> 10#include <linux/ctype.h>
11#include <linux/device-mapper.h> 11#include <linux/device-mapper.h>
12#include <linux/delay.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13#include <linux/init.h> 14#include <linux/init.h>
14#include <linux/kdev_t.h> 15#include <linux/kdev_t.h>
@@ -20,6 +21,7 @@
20#include <linux/log2.h> 21#include <linux/log2.h>
21#include <linux/dm-kcopyd.h> 22#include <linux/dm-kcopyd.h>
22 23
24#include "dm-exception-store.h"
23#include "dm-snap.h" 25#include "dm-snap.h"
24#include "dm-bio-list.h" 26#include "dm-bio-list.h"
25 27
@@ -428,8 +430,13 @@ out:
428 list_add(&new_e->hash_list, e ? &e->hash_list : l); 430 list_add(&new_e->hash_list, e ? &e->hash_list : l);
429} 431}
430 432
431int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) 433/*
434 * Callback used by the exception stores to load exceptions when
435 * initialising.
436 */
437static int dm_add_exception(void *context, chunk_t old, chunk_t new)
432{ 438{
439 struct dm_snapshot *s = context;
433 struct dm_snap_exception *e; 440 struct dm_snap_exception *e;
434 441
435 e = alloc_exception(); 442 e = alloc_exception();
@@ -658,7 +665,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
658 spin_lock_init(&s->tracked_chunk_lock); 665 spin_lock_init(&s->tracked_chunk_lock);
659 666
660 /* Metadata must only be loaded into one table at once */ 667 /* Metadata must only be loaded into one table at once */
661 r = s->store.read_metadata(&s->store); 668 r = s->store.read_metadata(&s->store, dm_add_exception, (void *)s);
662 if (r < 0) { 669 if (r < 0) {
663 ti->error = "Failed to read snapshot metadata"; 670 ti->error = "Failed to read snapshot metadata";
664 goto bad_load_and_register; 671 goto bad_load_and_register;
@@ -735,7 +742,7 @@ static void snapshot_dtr(struct dm_target *ti)
735 unregister_snapshot(s); 742 unregister_snapshot(s);
736 743
737 while (atomic_read(&s->pending_exceptions_count)) 744 while (atomic_read(&s->pending_exceptions_count))
738 yield(); 745 msleep(1);
739 /* 746 /*
740 * Ensure instructions in mempool_destroy aren't reordered 747 * Ensure instructions in mempool_destroy aren't reordered
741 * before atomic_read. 748 * before atomic_read.
@@ -888,10 +895,10 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
888 895
889 /* 896 /*
890 * Check for conflicting reads. This is extremely improbable, 897 * Check for conflicting reads. This is extremely improbable,
891 * so yield() is sufficient and there is no need for a wait queue. 898 * so msleep(1) is sufficient and there is no need for a wait queue.
892 */ 899 */
893 while (__chunk_is_tracked(s, pe->e.old_chunk)) 900 while (__chunk_is_tracked(s, pe->e.old_chunk))
894 yield(); 901 msleep(1);
895 902
896 /* 903 /*
897 * Add a proper exception, and remove the 904 * Add a proper exception, and remove the
@@ -1404,6 +1411,12 @@ static int __init dm_snapshot_init(void)
1404{ 1411{
1405 int r; 1412 int r;
1406 1413
1414 r = dm_exception_store_init();
1415 if (r) {
1416 DMERR("Failed to initialize exception stores");
1417 return r;
1418 }
1419
1407 r = dm_register_target(&snapshot_target); 1420 r = dm_register_target(&snapshot_target);
1408 if (r) { 1421 if (r) {
1409 DMERR("snapshot target register failed %d", r); 1422 DMERR("snapshot target register failed %d", r);
@@ -1452,39 +1465,34 @@ static int __init dm_snapshot_init(void)
1452 1465
1453 return 0; 1466 return 0;
1454 1467
1455 bad_pending_pool: 1468bad_pending_pool:
1456 kmem_cache_destroy(tracked_chunk_cache); 1469 kmem_cache_destroy(tracked_chunk_cache);
1457 bad5: 1470bad5:
1458 kmem_cache_destroy(pending_cache); 1471 kmem_cache_destroy(pending_cache);
1459 bad4: 1472bad4:
1460 kmem_cache_destroy(exception_cache); 1473 kmem_cache_destroy(exception_cache);
1461 bad3: 1474bad3:
1462 exit_origin_hash(); 1475 exit_origin_hash();
1463 bad2: 1476bad2:
1464 dm_unregister_target(&origin_target); 1477 dm_unregister_target(&origin_target);
1465 bad1: 1478bad1:
1466 dm_unregister_target(&snapshot_target); 1479 dm_unregister_target(&snapshot_target);
1467 return r; 1480 return r;
1468} 1481}
1469 1482
1470static void __exit dm_snapshot_exit(void) 1483static void __exit dm_snapshot_exit(void)
1471{ 1484{
1472 int r;
1473
1474 destroy_workqueue(ksnapd); 1485 destroy_workqueue(ksnapd);
1475 1486
1476 r = dm_unregister_target(&snapshot_target); 1487 dm_unregister_target(&snapshot_target);
1477 if (r) 1488 dm_unregister_target(&origin_target);
1478 DMERR("snapshot unregister failed %d", r);
1479
1480 r = dm_unregister_target(&origin_target);
1481 if (r)
1482 DMERR("origin unregister failed %d", r);
1483 1489
1484 exit_origin_hash(); 1490 exit_origin_hash();
1485 kmem_cache_destroy(pending_cache); 1491 kmem_cache_destroy(pending_cache);
1486 kmem_cache_destroy(exception_cache); 1492 kmem_cache_destroy(exception_cache);
1487 kmem_cache_destroy(tracked_chunk_cache); 1493 kmem_cache_destroy(tracked_chunk_cache);
1494
1495 dm_exception_store_exit();
1488} 1496}
1489 1497
1490/* Module hooks */ 1498/* Module hooks */
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
index 99c0106ede2d..d9e62b43cf85 100644
--- a/drivers/md/dm-snap.h
+++ b/drivers/md/dm-snap.h
@@ -1,6 +1,4 @@
1/* 1/*
2 * dm-snapshot.c
3 *
4 * Copyright (C) 2001-2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5 * 3 *
6 * This file is released under the GPL. 4 * This file is released under the GPL.
@@ -10,6 +8,7 @@
10#define DM_SNAPSHOT_H 8#define DM_SNAPSHOT_H
11 9
12#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
11#include "dm-exception-store.h"
13#include "dm-bio-list.h" 12#include "dm-bio-list.h"
14#include <linux/blkdev.h> 13#include <linux/blkdev.h>
15#include <linux/workqueue.h> 14#include <linux/workqueue.h>
@@ -20,116 +19,6 @@ struct exception_table {
20 struct list_head *table; 19 struct list_head *table;
21}; 20};
22 21
23/*
24 * The snapshot code deals with largish chunks of the disk at a
25 * time. Typically 32k - 512k.
26 */
27typedef sector_t chunk_t;
28
29/*
30 * An exception is used where an old chunk of data has been
31 * replaced by a new one.
32 * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number
33 * of chunks that follow contiguously. Remaining bits hold the number of the
34 * chunk within the device.
35 */
36struct dm_snap_exception {
37 struct list_head hash_list;
38
39 chunk_t old_chunk;
40 chunk_t new_chunk;
41};
42
43/*
44 * Functions to manipulate consecutive chunks
45 */
46# if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
47# define DM_CHUNK_CONSECUTIVE_BITS 8
48# define DM_CHUNK_NUMBER_BITS 56
49
50static inline chunk_t dm_chunk_number(chunk_t chunk)
51{
52 return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
53}
54
55static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
56{
57 return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
58}
59
60static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
61{
62 e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
63
64 BUG_ON(!dm_consecutive_chunk_count(e));
65}
66
67# else
68# define DM_CHUNK_CONSECUTIVE_BITS 0
69
70static inline chunk_t dm_chunk_number(chunk_t chunk)
71{
72 return chunk;
73}
74
75static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
76{
77 return 0;
78}
79
80static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
81{
82}
83
84# endif
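
These helpers (relocated, presumably into dm-exception-store.h, which dm-snap.h now includes) pack a run length into the top 8 bits of new_chunk on 64-bit builds. A worked example of the encoding:

	#include <stdio.h>
	#include <stdint.h>

	#define NUMBER_BITS 56	/* DM_CHUNK_NUMBER_BITS on 64-bit */

	int main(void)
	{
		uint64_t new_chunk = 100;	/* chunk number, run length 0 */

		/* two dm_consecutive_chunk_count_inc() calls */
		new_chunk += (1ULL << NUMBER_BITS);
		new_chunk += (1ULL << NUMBER_BITS);

		printf("chunk number: %llu\n",	/* dm_chunk_number() */
		       (unsigned long long)(new_chunk & ((1ULL << NUMBER_BITS) - 1)));
		printf("consecutive:  %llu\n",	/* dm_consecutive_chunk_count() */
		       (unsigned long long)(new_chunk >> NUMBER_BITS));
		return 0;
	}

This prints 100 and 2: one exception record now covers three contiguous chunks.
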
85
86/*
87 * Abstraction to handle the meta/layout of exception stores (the
88 * COW device).
89 */
90struct exception_store {
91
92 /*
93 * Destroys this object when you've finished with it.
94 */
95 void (*destroy) (struct exception_store *store);
96
97 /*
98 * The target shouldn't read the COW device until this is
99 * called.
100 */
101 int (*read_metadata) (struct exception_store *store);
102
103 /*
104 * Find somewhere to store the next exception.
105 */
106 int (*prepare_exception) (struct exception_store *store,
107 struct dm_snap_exception *e);
108
109 /*
110 * Update the metadata with this exception.
111 */
112 void (*commit_exception) (struct exception_store *store,
113 struct dm_snap_exception *e,
114 void (*callback) (void *, int success),
115 void *callback_context);
116
117 /*
118 * The snapshot is invalid, note this in the metadata.
119 */
120 void (*drop_snapshot) (struct exception_store *store);
121
122 /*
123 * Return how full the snapshot is.
124 */
125 void (*fraction_full) (struct exception_store *store,
126 sector_t *numerator,
127 sector_t *denominator);
128
129 struct dm_snapshot *snap;
130 void *context;
131};
132
133#define DM_TRACKED_CHUNK_HASH_SIZE 16 22#define DM_TRACKED_CHUNK_HASH_SIZE 16
134#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ 23#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
135 (DM_TRACKED_CHUNK_HASH_SIZE - 1)) 24 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
@@ -172,7 +61,7 @@ struct dm_snapshot {
172 spinlock_t pe_lock; 61 spinlock_t pe_lock;
173 62
174 /* The on disk metadata handler */ 63 /* The on disk metadata handler */
175 struct exception_store store; 64 struct dm_exception_store store;
176 65
177 struct dm_kcopyd_client *kcopyd_client; 66 struct dm_kcopyd_client *kcopyd_client;
178 67
@@ -187,20 +76,6 @@ struct dm_snapshot {
187}; 76};
188 77
189/* 78/*
190 * Used by the exception stores to load exceptions when
191 * initialising.
192 */
193int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
194
195/*
196 * Constructor and destructor for the default persistent
197 * store.
198 */
199int dm_create_persistent(struct exception_store *store);
200
201int dm_create_transient(struct exception_store *store);
202
203/*
204 * Return the number of sectors in the device. 79 * Return the number of sectors in the device.
205 */ 80 */
206static inline sector_t get_dev_size(struct block_device *bdev) 81static inline sector_t get_dev_size(struct block_device *bdev)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 9e4ef88d421e..41569bc60abc 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -337,9 +337,7 @@ int __init dm_stripe_init(void)
337 337
338void dm_stripe_exit(void) 338void dm_stripe_exit(void)
339{ 339{
340 if (dm_unregister_target(&stripe_target)) 340 dm_unregister_target(&stripe_target);
341 DMWARN("target unregistration failed");
342
343 destroy_workqueue(kstriped); 341 destroy_workqueue(kstriped);
344 342
345 return; 343 return;
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
new file mode 100644
index 000000000000..a2a45e6c7c8b
--- /dev/null
+++ b/drivers/md/dm-sysfs.c
@@ -0,0 +1,99 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include <linux/sysfs.h>
8#include <linux/dm-ioctl.h>
9#include "dm.h"
10
11struct dm_sysfs_attr {
12 struct attribute attr;
13 ssize_t (*show)(struct mapped_device *, char *);
14 ssize_t (*store)(struct mapped_device *, char *);
15};
16
17#define DM_ATTR_RO(_name) \
18struct dm_sysfs_attr dm_attr_##_name = \
19 __ATTR(_name, S_IRUGO, dm_attr_##_name##_show, NULL)
20
21static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr,
22 char *page)
23{
24 struct dm_sysfs_attr *dm_attr;
25 struct mapped_device *md;
26 ssize_t ret;
27
28 dm_attr = container_of(attr, struct dm_sysfs_attr, attr);
29 if (!dm_attr->show)
30 return -EIO;
31
32 md = dm_get_from_kobject(kobj);
33 if (!md)
34 return -EINVAL;
35
36 ret = dm_attr->show(md, page);
37 dm_put(md);
38
39 return ret;
40}
41
42static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf)
43{
44 if (dm_copy_name_and_uuid(md, buf, NULL))
45 return -EIO;
46
47 strcat(buf, "\n");
48 return strlen(buf);
49}
50
51static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
52{
53 if (dm_copy_name_and_uuid(md, NULL, buf))
54 return -EIO;
55
56 strcat(buf, "\n");
57 return strlen(buf);
58}
59
60static DM_ATTR_RO(name);
61static DM_ATTR_RO(uuid);
62
63static struct attribute *dm_attrs[] = {
64 &dm_attr_name.attr,
65 &dm_attr_uuid.attr,
66 NULL,
67};
68
69static struct sysfs_ops dm_sysfs_ops = {
70 .show = dm_attr_show,
71};
72
73/*
74 * The dm kobject is embedded in the mapped_device structure,
75 * so there is no need to define a release function here.
76 */
77static struct kobj_type dm_ktype = {
78 .sysfs_ops = &dm_sysfs_ops,
79 .default_attrs = dm_attrs,
80};
81
82/*
83 * Initialize kobj
84 * because nobody is using md yet, there is no need for an explicit dm_get/put
85 */
86int dm_sysfs_init(struct mapped_device *md)
87{
88 return kobject_init_and_add(dm_kobject(md), &dm_ktype,
89 &disk_to_dev(dm_disk(md))->kobj,
90 "%s", "dm");
91}
92
93/*
94 * Remove kobj, called after all references have been dropped
95 */
96void dm_sysfs_exit(struct mapped_device *md)
97{
98 kobject_put(dm_kobject(md));
99}
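
The show path relies on container_of to map the generic struct attribute back to its typed wrapper before calling the handler. The same pattern can be demonstrated in plain C (a sketch, stripped of the kobject machinery; name_show and "mydev" are placeholders). After this patch, reading /sys/block/dm-XXX/dm/name goes through exactly this kind of dispatch:

	#include <stdio.h>
	#include <stddef.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct attribute { const char *name; };

	struct dm_sysfs_attr_model {
		struct attribute attr;
		int (*show)(char *page);
	};

	static int name_show(char *page) { return sprintf(page, "mydev\n"); }

	static struct dm_sysfs_attr_model dm_attr_name = { { "name" }, name_show };

	static int dispatch(struct attribute *attr, char *page)
	{
		struct dm_sysfs_attr_model *da =
			container_of(attr, struct dm_sysfs_attr_model, attr);

		return da->show ? da->show(page) : -1;	/* -EIO in the kernel */
	}

	int main(void)
	{
		char page[64];

		if (dispatch(&dm_attr_name.attr, page) > 0)
			fputs(page, stdout);
		return 0;
	}
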
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 04e5fd742c2c..2fd66c30f7f8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001 Sistina Software (UK) Limited. 2 * Copyright (C) 2001 Sistina Software (UK) Limited.
3 * Copyright (C) 2004 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -15,6 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/mutex.h> 17#include <linux/mutex.h>
18#include <linux/delay.h>
18#include <asm/atomic.h> 19#include <asm/atomic.h>
19 20
20#define DM_MSG_PREFIX "table" 21#define DM_MSG_PREFIX "table"
@@ -24,6 +25,19 @@
24#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) 25#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
25#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) 26#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
26 27
28/*
29 * The table always has exactly one reference from either mapped_device->map
30 * or hash_cell->new_map. This reference is not counted in table->holders.
31 * A pair of dm_create_table/dm_destroy_table functions is used for table
32 * creation/destruction.
33 *
34 * Temporary references from the other code increase table->holders. A pair
35 * of dm_table_get/dm_table_put functions is used to manipulate it.
36 *
37 * When the table is about to be destroyed, we wait for table->holders to
38 * drop to zero.
39 */
40
27struct dm_table { 41struct dm_table {
28 struct mapped_device *md; 42 struct mapped_device *md;
29 atomic_t holders; 43 atomic_t holders;
@@ -38,6 +52,8 @@ struct dm_table {
38 sector_t *highs; 52 sector_t *highs;
39 struct dm_target *targets; 53 struct dm_target *targets;
40 54
55 unsigned barriers_supported:1;
56
41 /* 57 /*
42 * Indicates the rw permissions for the new logical 58 * Indicates the rw permissions for the new logical
43 * device. This should be a combination of FMODE_READ 59 * device. This should be a combination of FMODE_READ
@@ -226,7 +242,8 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
226 return -ENOMEM; 242 return -ENOMEM;
227 243
228 INIT_LIST_HEAD(&t->devices); 244 INIT_LIST_HEAD(&t->devices);
229 atomic_set(&t->holders, 1); 245 atomic_set(&t->holders, 0);
246 t->barriers_supported = 1;
230 247
231 if (!num_targets) 248 if (!num_targets)
232 num_targets = KEYS_PER_NODE; 249 num_targets = KEYS_PER_NODE;
@@ -256,10 +273,14 @@ static void free_devices(struct list_head *devices)
256 } 273 }
257} 274}
258 275
259static void table_destroy(struct dm_table *t) 276void dm_table_destroy(struct dm_table *t)
260{ 277{
261 unsigned int i; 278 unsigned int i;
262 279
280 while (atomic_read(&t->holders))
281 msleep(1);
282 smp_mb();
283
263 /* free the indexes (see dm_table_complete) */ 284 /* free the indexes (see dm_table_complete) */
264 if (t->depth >= 2) 285 if (t->depth >= 2)
265 vfree(t->index[t->depth - 2]); 286 vfree(t->index[t->depth - 2]);
@@ -297,8 +318,8 @@ void dm_table_put(struct dm_table *t)
297 if (!t) 318 if (!t)
298 return; 319 return;
299 320
300 if (atomic_dec_and_test(&t->holders)) 321 smp_mb__before_atomic_dec();
301 table_destroy(t); 322 atomic_dec(&t->holders);
302} 323}
303 324
304/* 325/*
@@ -728,6 +749,10 @@ int dm_table_add_target(struct dm_table *t, const char *type,
728 /* FIXME: the plan is to combine high here and then have 749 /* FIXME: the plan is to combine high here and then have
729 * the merge fn apply the target level restrictions. */ 750 * the merge fn apply the target level restrictions. */
730 combine_restrictions_low(&t->limits, &tgt->limits); 751 combine_restrictions_low(&t->limits, &tgt->limits);
752
753 if (!(tgt->type->features & DM_TARGET_SUPPORTS_BARRIERS))
754 t->barriers_supported = 0;
755
731 return 0; 756 return 0;
732 757
733 bad: 758 bad:
@@ -772,6 +797,12 @@ int dm_table_complete(struct dm_table *t)
772 797
773 check_for_valid_limits(&t->limits); 798 check_for_valid_limits(&t->limits);
774 799
800 /*
801 * We only support barriers if there is exactly one underlying device.
802 */
803 if (!list_is_singular(&t->devices))
804 t->barriers_supported = 0;
805
775 /* how many indexes will the btree have ? */ 806 /* how many indexes will the btree have ? */
776 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); 807 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
777 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); 808 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
@@ -986,6 +1017,12 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
986 return t->md; 1017 return t->md;
987} 1018}
988 1019
1020int dm_table_barrier_ok(struct dm_table *t)
1021{
1022 return t->barriers_supported;
1023}
1024EXPORT_SYMBOL(dm_table_barrier_ok);
1025
989EXPORT_SYMBOL(dm_vcalloc); 1026EXPORT_SYMBOL(dm_vcalloc);
990EXPORT_SYMBOL(dm_get_device); 1027EXPORT_SYMBOL(dm_get_device);
991EXPORT_SYMBOL(dm_put_device); 1028EXPORT_SYMBOL(dm_put_device);
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 835cf95b857f..7decf10006e4 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -130,26 +130,26 @@ int dm_register_target(struct target_type *t)
130 return rv; 130 return rv;
131} 131}
132 132
133int dm_unregister_target(struct target_type *t) 133void dm_unregister_target(struct target_type *t)
134{ 134{
135 struct tt_internal *ti; 135 struct tt_internal *ti;
136 136
137 down_write(&_lock); 137 down_write(&_lock);
138 if (!(ti = __find_target_type(t->name))) { 138 if (!(ti = __find_target_type(t->name))) {
139 up_write(&_lock); 139 DMCRIT("Unregistering unrecognised target: %s", t->name);
140 return -EINVAL; 140 BUG();
141 } 141 }
142 142
143 if (ti->use) { 143 if (ti->use) {
144 up_write(&_lock); 144 DMCRIT("Attempt to unregister target still in use: %s",
145 return -ETXTBSY; 145 t->name);
146 BUG();
146 } 147 }
147 148
148 list_del(&ti->list); 149 list_del(&ti->list);
149 kfree(ti); 150 kfree(ti);
150 151
151 up_write(&_lock); 152 up_write(&_lock);
152 return 0;
153} 153}
154 154
155/* 155/*
@@ -187,8 +187,7 @@ int __init dm_target_init(void)
187 187
188void dm_target_exit(void) 188void dm_target_exit(void)
189{ 189{
190 if (dm_unregister_target(&error_target)) 190 dm_unregister_target(&error_target);
191 DMWARN("error target unregistration failed");
192} 191}
193 192
194EXPORT_SYMBOL(dm_register_target); 193EXPORT_SYMBOL(dm_register_target);
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index cdbf126ec106..bbc97030c0c2 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -69,10 +69,7 @@ static int __init dm_zero_init(void)
69 69
70static void __exit dm_zero_exit(void) 70static void __exit dm_zero_exit(void)
71{ 71{
72 int r = dm_unregister_target(&zero_target); 72 dm_unregister_target(&zero_target);
73
74 if (r < 0)
75 DMERR("unregister failed %d", r);
76} 73}
77 74
78module_init(dm_zero_init) 75module_init(dm_zero_init)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 421c9f02d8ca..51ba1db4b3e7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -32,6 +32,7 @@ static unsigned int _major = 0;
32 32
33static DEFINE_SPINLOCK(_minor_lock); 33static DEFINE_SPINLOCK(_minor_lock);
34/* 34/*
35 * For bio-based dm.
35 * One of these is allocated per bio. 36 * One of these is allocated per bio.
36 */ 37 */
37struct dm_io { 38struct dm_io {
@@ -43,6 +44,7 @@ struct dm_io {
43}; 44};
44 45
45/* 46/*
47 * For bio-based dm.
46 * One of these is allocated per target within a bio. Hopefully 48 * One of these is allocated per target within a bio. Hopefully
47 * this will be simplified out one day. 49 * this will be simplified out one day.
48 */ 50 */
@@ -54,6 +56,27 @@ struct dm_target_io {
54 56
55DEFINE_TRACE(block_bio_complete); 57DEFINE_TRACE(block_bio_complete);
56 58
59/*
60 * For request-based dm.
61 * One of these is allocated per request.
62 */
63struct dm_rq_target_io {
64 struct mapped_device *md;
65 struct dm_target *ti;
66 struct request *orig, clone;
67 int error;
68 union map_info info;
69};
70
71/*
72 * For request-based dm.
73 * One of these is allocated per bio.
74 */
75struct dm_rq_clone_bio_info {
76 struct bio *orig;
77 struct request *rq;
78};
79
57union map_info *dm_get_mapinfo(struct bio *bio) 80union map_info *dm_get_mapinfo(struct bio *bio)
58{ 81{
59 if (bio && bio->bi_private) 82 if (bio && bio->bi_private)
@@ -144,11 +167,16 @@ struct mapped_device {
144 167
145 /* forced geometry settings */ 168 /* forced geometry settings */
146 struct hd_geometry geometry; 169 struct hd_geometry geometry;
170
171 /* sysfs handle */
172 struct kobject kobj;
147}; 173};
148 174
149#define MIN_IOS 256 175#define MIN_IOS 256
150static struct kmem_cache *_io_cache; 176static struct kmem_cache *_io_cache;
151static struct kmem_cache *_tio_cache; 177static struct kmem_cache *_tio_cache;
178static struct kmem_cache *_rq_tio_cache;
179static struct kmem_cache *_rq_bio_info_cache;
152 180
153static int __init local_init(void) 181static int __init local_init(void)
154{ 182{
@@ -164,9 +192,17 @@ static int __init local_init(void)
164 if (!_tio_cache) 192 if (!_tio_cache)
165 goto out_free_io_cache; 193 goto out_free_io_cache;
166 194
195 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
196 if (!_rq_tio_cache)
197 goto out_free_tio_cache;
198
199 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
200 if (!_rq_bio_info_cache)
201 goto out_free_rq_tio_cache;
202
167 r = dm_uevent_init(); 203 r = dm_uevent_init();
168 if (r) 204 if (r)
169 goto out_free_tio_cache; 205 goto out_free_rq_bio_info_cache;
170 206
171 _major = major; 207 _major = major;
172 r = register_blkdev(_major, _name); 208 r = register_blkdev(_major, _name);
@@ -180,6 +216,10 @@ static int __init local_init(void)
180 216
181out_uevent_exit: 217out_uevent_exit:
182 dm_uevent_exit(); 218 dm_uevent_exit();
219out_free_rq_bio_info_cache:
220 kmem_cache_destroy(_rq_bio_info_cache);
221out_free_rq_tio_cache:
222 kmem_cache_destroy(_rq_tio_cache);
183out_free_tio_cache: 223out_free_tio_cache:
184 kmem_cache_destroy(_tio_cache); 224 kmem_cache_destroy(_tio_cache);
185out_free_io_cache: 225out_free_io_cache:
@@ -190,6 +230,8 @@ out_free_io_cache:
190 230
191static void local_exit(void) 231static void local_exit(void)
192{ 232{
233 kmem_cache_destroy(_rq_bio_info_cache);
234 kmem_cache_destroy(_rq_tio_cache);
193 kmem_cache_destroy(_tio_cache); 235 kmem_cache_destroy(_tio_cache);
194 kmem_cache_destroy(_io_cache); 236 kmem_cache_destroy(_io_cache);
195 unregister_blkdev(_major, _name); 237 unregister_blkdev(_major, _name);
@@ -796,7 +838,11 @@ static int __split_bio(struct mapped_device *md, struct bio *bio)
796 ci.map = dm_get_table(md); 838 ci.map = dm_get_table(md);
797 if (unlikely(!ci.map)) 839 if (unlikely(!ci.map))
798 return -EIO; 840 return -EIO;
799 841 if (unlikely(bio_barrier(bio) && !dm_table_barrier_ok(ci.map))) {
842 dm_table_put(ci.map);
843 bio_endio(bio, -EOPNOTSUPP);
844 return 0;
845 }
800 ci.md = md; 846 ci.md = md;
801 ci.bio = bio; 847 ci.bio = bio;
802 ci.io = alloc_io(md); 848 ci.io = alloc_io(md);
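
The net effect of the table-side checks plus this one: a barrier bio is forwarded only if every target advertised DM_TARGET_SUPPORTS_BARRIERS and the table spans exactly one underlying device; otherwise it is completed with -EOPNOTSUPP here instead of being rejected unconditionally in dm_request() as before. Condensed into a userspace model (field names are illustrative):

	#include <stdio.h>

	struct table_model {
		int all_targets_support_barriers;
		int num_devices;
	};

	static int barrier_ok(const struct table_model *t)
	{
		return t->all_targets_support_barriers && t->num_devices == 1;
	}

	int main(void)
	{
		/* a striped table fails on device count alone, even if its
		 * targets claimed support */
		struct table_model linear = { 1, 1 }, striped = { 1, 2 };

		printf("linear:  %s\n", barrier_ok(&linear) ? "forward" : "-EOPNOTSUPP");
		printf("striped: %s\n", barrier_ok(&striped) ? "forward" : "-EOPNOTSUPP");
		return 0;
	}
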
@@ -880,15 +926,6 @@ static int dm_request(struct request_queue *q, struct bio *bio)
880 struct mapped_device *md = q->queuedata; 926 struct mapped_device *md = q->queuedata;
881 int cpu; 927 int cpu;
882 928
883 /*
884 * There is no use in forwarding any barrier request since we can't
885 * guarantee it is (or can be) handled by the targets correctly.
886 */
887 if (unlikely(bio_barrier(bio))) {
888 bio_endio(bio, -EOPNOTSUPP);
889 return 0;
890 }
891
892 down_read(&md->io_lock); 929 down_read(&md->io_lock);
893 930
894 cpu = part_stat_lock(); 931 cpu = part_stat_lock();
@@ -943,8 +980,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
943 struct mapped_device *md = congested_data; 980 struct mapped_device *md = congested_data;
944 struct dm_table *map; 981 struct dm_table *map;
945 982
946 atomic_inc(&md->pending);
947
948 if (!test_bit(DMF_BLOCK_IO, &md->flags)) { 983 if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
949 map = dm_get_table(md); 984 map = dm_get_table(md);
950 if (map) { 985 if (map) {
@@ -953,10 +988,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
953 } 988 }
954 } 989 }
955 990
956 if (!atomic_dec_return(&md->pending))
957 /* nudge anyone waiting on suspend queue */
958 wake_up(&md->wait);
959
960 return r; 991 return r;
961} 992}
962 993
@@ -1216,10 +1247,12 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
1216 1247
1217 if (md->suspended_bdev) 1248 if (md->suspended_bdev)
1218 __set_size(md, size); 1249 __set_size(md, size);
1219 if (size == 0) 1250
1251 if (!size) {
1252 dm_table_destroy(t);
1220 return 0; 1253 return 0;
1254 }
1221 1255
1222 dm_table_get(t);
1223 dm_table_event_callback(t, event_callback, md); 1256 dm_table_event_callback(t, event_callback, md);
1224 1257
1225 write_lock(&md->map_lock); 1258 write_lock(&md->map_lock);
@@ -1241,7 +1274,7 @@ static void __unbind(struct mapped_device *md)
1241 write_lock(&md->map_lock); 1274 write_lock(&md->map_lock);
1242 md->map = NULL; 1275 md->map = NULL;
1243 write_unlock(&md->map_lock); 1276 write_unlock(&md->map_lock);
1244 dm_table_put(map); 1277 dm_table_destroy(map);
1245} 1278}
1246 1279
1247/* 1280/*
@@ -1255,6 +1288,8 @@ int dm_create(int minor, struct mapped_device **result)
1255 if (!md) 1288 if (!md)
1256 return -ENXIO; 1289 return -ENXIO;
1257 1290
1291 dm_sysfs_init(md);
1292
1258 *result = md; 1293 *result = md;
1259 return 0; 1294 return 0;
1260} 1295}
@@ -1330,8 +1365,9 @@ void dm_put(struct mapped_device *md)
1330 dm_table_presuspend_targets(map); 1365 dm_table_presuspend_targets(map);
1331 dm_table_postsuspend_targets(map); 1366 dm_table_postsuspend_targets(map);
1332 } 1367 }
1333 __unbind(md); 1368 dm_sysfs_exit(md);
1334 dm_table_put(map); 1369 dm_table_put(map);
1370 __unbind(md);
1335 free_dev(md); 1371 free_dev(md);
1336 } 1372 }
1337} 1373}
@@ -1669,6 +1705,27 @@ struct gendisk *dm_disk(struct mapped_device *md)
1669 return md->disk; 1705 return md->disk;
1670} 1706}
1671 1707
1708struct kobject *dm_kobject(struct mapped_device *md)
1709{
1710 return &md->kobj;
1711}
1712
1713/*
1714 * struct mapped_device should not be exported outside of dm.c
1715 * so use this check to verify that kobj is part of md structure
1716 */
1717struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
1718{
1719 struct mapped_device *md;
1720
1721 md = container_of(kobj, struct mapped_device, kobj);
1722 if (&md->kobj != kobj)
1723 return NULL;
1724
1725 dm_get(md);
1726 return md;
1727}
1728
1672int dm_suspended(struct mapped_device *md) 1729int dm_suspended(struct mapped_device *md)
1673{ 1730{
1674 return test_bit(DMF_SUSPENDED, &md->flags); 1731 return test_bit(DMF_SUSPENDED, &md->flags);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 0ade60cdef42..20194e000c5a 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -36,6 +36,7 @@ struct dm_table;
36/*----------------------------------------------------------------- 36/*-----------------------------------------------------------------
37 * Internal table functions. 37 * Internal table functions.
38 *---------------------------------------------------------------*/ 38 *---------------------------------------------------------------*/
39void dm_table_destroy(struct dm_table *t);
39void dm_table_event_callback(struct dm_table *t, 40void dm_table_event_callback(struct dm_table *t,
40 void (*fn)(void *), void *context); 41 void (*fn)(void *), void *context);
41struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); 42struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
@@ -51,6 +52,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits);
51 * To check the return value from dm_table_find_target(). 52 * To check the return value from dm_table_find_target().
52 */ 53 */
53#define dm_target_is_valid(t) ((t)->table) 54#define dm_target_is_valid(t) ((t)->table)
55int dm_table_barrier_ok(struct dm_table *t);
54 56
55/*----------------------------------------------------------------- 57/*-----------------------------------------------------------------
56 * A registry of target types. 58 * A registry of target types.
@@ -72,6 +74,14 @@ int dm_interface_init(void);
72void dm_interface_exit(void); 74void dm_interface_exit(void);
73 75
74/* 76/*
77 * sysfs interface
78 */
79int dm_sysfs_init(struct mapped_device *md);
80void dm_sysfs_exit(struct mapped_device *md);
81struct kobject *dm_kobject(struct mapped_device *md);
82struct mapped_device *dm_get_from_kobject(struct kobject *kobj);
83
84/*
75 * Targets for linear and striped mappings 85 * Targets for linear and striped mappings
76 */ 86 */
77int dm_linear_init(void); 87int dm_linear_init(void);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index c17fd334e574..8209e08969f9 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -45,6 +45,8 @@ typedef void (*dm_dtr_fn) (struct dm_target *ti);
45 */ 45 */
46typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio, 46typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio,
47 union map_info *map_context); 47 union map_info *map_context);
48typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone,
49 union map_info *map_context);
48 50
49/* 51/*
50 * Returns: 52 * Returns:
@@ -57,6 +59,9 @@ typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio,
57typedef int (*dm_endio_fn) (struct dm_target *ti, 59typedef int (*dm_endio_fn) (struct dm_target *ti,
58 struct bio *bio, int error, 60 struct bio *bio, int error,
59 union map_info *map_context); 61 union map_info *map_context);
62typedef int (*dm_request_endio_fn) (struct dm_target *ti,
63 struct request *clone, int error,
64 union map_info *map_context);
60 65
61typedef void (*dm_flush_fn) (struct dm_target *ti); 66typedef void (*dm_flush_fn) (struct dm_target *ti);
62typedef void (*dm_presuspend_fn) (struct dm_target *ti); 67typedef void (*dm_presuspend_fn) (struct dm_target *ti);
@@ -75,6 +80,13 @@ typedef int (*dm_ioctl_fn) (struct dm_target *ti, unsigned int cmd,
75typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm, 80typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm,
76 struct bio_vec *biovec, int max_size); 81 struct bio_vec *biovec, int max_size);
77 82
83/*
84 * Returns:
85 * 0: The target can handle the next I/O immediately.
86 * 1: The target can't handle the next I/O immediately.
87 */
88typedef int (*dm_busy_fn) (struct dm_target *ti);
89
78void dm_error(const char *message); 90void dm_error(const char *message);
79 91
80/* 92/*
@@ -100,14 +112,23 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d);
100/* 112/*
101 * Information about a target type 113 * Information about a target type
102 */ 114 */
115
116/*
117 * Target features
118 */
119#define DM_TARGET_SUPPORTS_BARRIERS 0x00000001
120
103struct target_type { 121struct target_type {
122 uint64_t features;
104 const char *name; 123 const char *name;
105 struct module *module; 124 struct module *module;
106 unsigned version[3]; 125 unsigned version[3];
107 dm_ctr_fn ctr; 126 dm_ctr_fn ctr;
108 dm_dtr_fn dtr; 127 dm_dtr_fn dtr;
109 dm_map_fn map; 128 dm_map_fn map;
129 dm_map_request_fn map_rq;
110 dm_endio_fn end_io; 130 dm_endio_fn end_io;
131 dm_request_endio_fn rq_end_io;
111 dm_flush_fn flush; 132 dm_flush_fn flush;
112 dm_presuspend_fn presuspend; 133 dm_presuspend_fn presuspend;
113 dm_postsuspend_fn postsuspend; 134 dm_postsuspend_fn postsuspend;
@@ -117,6 +138,7 @@ struct target_type {
117 dm_message_fn message; 138 dm_message_fn message;
118 dm_ioctl_fn ioctl; 139 dm_ioctl_fn ioctl;
119 dm_merge_fn merge; 140 dm_merge_fn merge;
141 dm_busy_fn busy;
120}; 142};
121 143
122struct io_restrictions { 144struct io_restrictions {
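
For context, a hypothetical target declaration using the extended structure might look like the sketch below. The example_* stubs are made up; a bio-based target simply leaves the new request-based hooks (map_rq, rq_end_io, busy) NULL and may set the features word to opt in to barriers:

	static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
	{
		return 0;	/* no arguments to parse in this stub */
	}

	static void example_dtr(struct dm_target *ti)
	{
	}

	static int example_map(struct dm_target *ti, struct bio *bio,
			       union map_info *map_context)
	{
		return DM_MAPIO_REMAPPED;	/* pass the bio through unchanged */
	}

	static struct target_type example_target = {
		.features = DM_TARGET_SUPPORTS_BARRIERS,  /* opt in to barriers */
		.name	  = "example",
		.version  = {1, 0, 0},
		.module	  = THIS_MODULE,
		.ctr	  = example_ctr,
		.dtr	  = example_dtr,
		.map	  = example_map,
		/* .map_rq, .rq_end_io, .busy stay NULL for a bio-based target */
	};
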
@@ -157,8 +179,7 @@ struct dm_target {
157}; 179};
158 180
159int dm_register_target(struct target_type *t); 181int dm_register_target(struct target_type *t);
160int dm_unregister_target(struct target_type *t); 182void dm_unregister_target(struct target_type *t);
161
162 183
163/*----------------------------------------------------------------- 184/*-----------------------------------------------------------------
164 * Functions for creating and manipulating mapped devices. 185 * Functions for creating and manipulating mapped devices.
@@ -276,6 +297,9 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
276 *---------------------------------------------------------------*/ 297 *---------------------------------------------------------------*/
277#define DM_NAME "device-mapper" 298#define DM_NAME "device-mapper"
278 299
300#define DMCRIT(f, arg...) \
301 printk(KERN_CRIT DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
302
279#define DMERR(f, arg...) \ 303#define DMERR(f, arg...) \
280 printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) 304 printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
281#define DMERR_LIMIT(f, arg...) \ 305#define DMERR_LIMIT(f, arg...) \