Diffstat (limited to 'drivers/md/dm-exception-store.c')
-rw-r--r--  drivers/md/dm-exception-store.c  648
1 file changed, 648 insertions, 0 deletions
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
new file mode 100644
index 000000000000..17212b4201a1
--- /dev/null
+++ b/drivers/md/dm-exception-store.c
@@ -0,0 +1,648 @@
/*
 * dm-exception-store.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-snap.h"
#include "dm-io.h"
#include "kcopyd.h"

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>

/*-----------------------------------------------------------------
 * Persistent snapshots: by persistent we mean that the snapshot
 * will survive a reboot.
 *---------------------------------------------------------------*/

/*
 * We need to store a record of which parts of the origin have
 * been copied to the snapshot device.  The snapshot code
 * requires that we copy exception chunks to chunk-aligned areas
 * of the COW store.  It therefore makes sense to store the
 * metadata in chunk-sized blocks.
 *
 * There is no backward or forward compatibility implemented:
 * snapshots with a disk version different from the kernel's will
 * not be usable.  It is expected that "lvcreate" will blank out
 * the start of a fresh COW device before calling the snapshot
 * constructor.
 *
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception chunks as can fit in the
 * metadata area.
 *
 * All on-disk structures are in little-endian format.  The end
 * of the exceptions info is indicated by an exception with a
 * new_chunk of 0, which is invalid since it would point to the
 * header chunk.
 */
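
/*
 * A worked example of this layout (illustrative only, assuming
 * 32-sector (16KiB) chunks and the 16-byte struct disk_exception
 * defined below):
 *
 *	exceptions_per_area = 16384 / 16 = 1024
 *
 *	chunk 0			header
 *	chunk 1			metadata area 0
 *	chunks 2..1025		exception data for area 0
 *	chunk 1026		metadata area 1
 *	chunks 1027..2050	exception data for area 1
 *	...
 *
 * In general, metadata area 'a' lives in chunk
 * 1 + (exceptions_per_area + 1) * a, which is the calculation
 * area_io() below performs.
 */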

/*
 * Magic for persistent snapshots: "SnAp" - feeble, isn't it?
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata.
 */
#define SNAPSHOT_DISK_VERSION 1

struct disk_header {
	uint32_t magic;

	/*
	 * Is this snapshot valid?  There is no way of recovering
	 * an invalid snapshot.
	 */
	uint32_t valid;

	/*
	 * Simple, incrementing version.  No backward
	 * compatibility.
	 */
	uint32_t version;

	/* In sectors */
	uint32_t chunk_size;
};

struct disk_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};

struct commit_callback {
	void (*callback)(void *, int success);
	void *context;
};

/*
 * The top level structure for a persistent exception store.
 */
struct pstore {
	struct dm_snapshot *snap;	/* up pointer to my snapshot */
	int version;
	int valid;
	uint32_t chunk_size;
	uint32_t exceptions_per_area;

	/*
	 * Now that we have an asynchronous kcopyd there is no
	 * need for large chunk sizes, so it won't hurt to have a
	 * whole chunk's worth of metadata in memory at once.
	 */
	void *area;

	/*
	 * Used to keep track of which metadata area the data in
	 * 'area' refers to.
	 */
	uint32_t current_area;

	/*
	 * The next free chunk for an exception.
	 */
	uint32_t next_free;

	/*
	 * The index of the next free exception in the current
	 * metadata area.
	 */
	uint32_t current_committed;

	atomic_t pending_count;
	uint32_t callback_count;
	struct commit_callback *callbacks;
};

static inline unsigned int sectors_to_pages(unsigned int sectors)
{
	return sectors / (PAGE_SIZE >> 9);
}
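
/*
 * For example, with 4KiB pages PAGE_SIZE >> 9 is 8 sectors per page,
 * so a 32-sector chunk occupies 4 pages (figures illustrative only).
 */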

static int alloc_area(struct pstore *ps)
{
	int r = -ENOMEM;
	size_t len;

	len = ps->chunk_size << SECTOR_SHIFT;

	/*
	 * Allocate the chunk-sized block of memory that will hold
	 * a single metadata area.
	 */
	ps->area = vmalloc(len);
	if (!ps->area)
		return r;

	return 0;
}

static void free_area(struct pstore *ps)
{
	vfree(ps->area);
}

/*
 * Read or write a chunk-aligned and chunk-sized block of data
 * to or from a device.
 */
static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
{
	struct io_region where;
	unsigned long bits;

	where.bdev = ps->snap->cow->bdev;
	where.sector = ps->chunk_size * chunk;
	where.count = ps->chunk_size;

	return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
}
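
/*
 * For example (given the layout sketched above), chunk_io(ps, 0, READ)
 * reads the header chunk starting at sector 0, and chunk_io(ps, 1, WRITE)
 * writes out the first metadata area.
 */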

/*
 * Read or write a metadata area, remembering to skip the first
 * chunk, which holds the header.
 */
static int area_io(struct pstore *ps, uint32_t area, int rw)
{
	int r;
	uint32_t chunk;

	/* convert a metadata area index to a chunk index */
	chunk = 1 + ((ps->exceptions_per_area + 1) * area);

	r = chunk_io(ps, chunk, rw);
	if (r)
		return r;

	ps->current_area = area;
	return 0;
}

static int zero_area(struct pstore *ps, uint32_t area)
{
	memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
	return area_io(ps, area, WRITE);
}

static int read_header(struct pstore *ps, int *new_snapshot)
{
	int r;
	struct disk_header *dh;

	r = chunk_io(ps, 0, READ);
	if (r)
		return r;

	dh = (struct disk_header *) ps->area;

	if (le32_to_cpu(dh->magic) == 0) {
		*new_snapshot = 1;

	} else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
		*new_snapshot = 0;
		ps->valid = le32_to_cpu(dh->valid);
		ps->version = le32_to_cpu(dh->version);
		ps->chunk_size = le32_to_cpu(dh->chunk_size);

	} else {
		DMWARN("Invalid/corrupt snapshot");
		r = -ENXIO;
	}

	return r;
}

static int write_header(struct pstore *ps)
{
	struct disk_header *dh;

	memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);

	dh = (struct disk_header *) ps->area;
	dh->magic = cpu_to_le32(SNAP_MAGIC);
	dh->valid = cpu_to_le32(ps->valid);
	dh->version = cpu_to_le32(ps->version);
	dh->chunk_size = cpu_to_le32(ps->chunk_size);

	return chunk_io(ps, 0, WRITE);
}

/*
 * Access functions for the disk exceptions; these do the endian conversions.
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
	if (index >= ps->exceptions_per_area)
		return NULL;

	return ((struct disk_exception *) ps->area) + index;
}

static int read_exception(struct pstore *ps,
			  uint32_t index, struct disk_exception *result)
{
	struct disk_exception *e;

	e = get_exception(ps, index);
	if (!e)
		return -EINVAL;

	/* copy it */
	result->old_chunk = le64_to_cpu(e->old_chunk);
	result->new_chunk = le64_to_cpu(e->new_chunk);

	return 0;
}

static int write_exception(struct pstore *ps,
			   uint32_t index, struct disk_exception *de)
{
	struct disk_exception *e;

	e = get_exception(ps, index);
	if (!e)
		return -EINVAL;

	/* copy it */
	e->old_chunk = cpu_to_le64(de->old_chunk);
	e->new_chunk = cpu_to_le64(de->new_chunk);

	return 0;
}

/*
 * Registers the exceptions that are present in the current area.
 * 'full' is set to indicate whether the area was completely
 * filled.
 */
static int insert_exceptions(struct pstore *ps, int *full)
{
	int r;
	unsigned int i;
	struct disk_exception de;

	/* presume the area is full */
	*full = 1;

	for (i = 0; i < ps->exceptions_per_area; i++) {
		r = read_exception(ps, i, &de);

		if (r)
			return r;

		/*
		 * If new_chunk points at the start of the COW
		 * device, where the header chunk lives, we know
		 * that we've hit the end of the exceptions.
		 * Therefore the area is not full.
		 */
		if (de.new_chunk == 0LL) {
			ps->current_committed = i;
			*full = 0;
			break;
		}

		/*
		 * Keep track of the start of the free chunks.
		 */
		if (ps->next_free <= de.new_chunk)
			ps->next_free = de.new_chunk + 1;

		/*
		 * Add the exception to the snapshot.
		 */
		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
		if (r)
			return r;
	}

	return 0;
}

static int read_exceptions(struct pstore *ps)
{
	uint32_t area;
	int r, full = 1;

	/*
	 * Keep reading areas and inserting exceptions until
	 * we find a partially full area.
	 */
	for (area = 0; full; area++) {
		r = area_io(ps, area, READ);
		if (r)
			return r;

		r = insert_exceptions(ps, &full);
		if (r)
			return r;
	}

	return 0;
}

static inline struct pstore *get_info(struct exception_store *store)
{
	return (struct pstore *) store->context;
}

static void persistent_fraction_full(struct exception_store *store,
				     sector_t *numerator, sector_t *denominator)
{
	*numerator = get_info(store)->next_free * store->snap->chunk_size;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

static void persistent_destroy(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	dm_io_put(sectors_to_pages(ps->chunk_size));
	vfree(ps->callbacks);
	free_area(ps);
	kfree(ps);
}

static int persistent_read_metadata(struct exception_store *store)
{
	int r, new_snapshot;
	struct pstore *ps = get_info(store);

	/*
	 * Read the snapshot header.
	 */
	r = read_header(ps, &new_snapshot);
	if (r)
		return r;

	/*
	 * Do we need to set up a new snapshot?
	 */
	if (new_snapshot) {
		r = write_header(ps);
		if (r) {
			DMWARN("write_header failed");
			return r;
		}

		r = zero_area(ps, 0);
		if (r) {
			DMWARN("zero_area(0) failed");
			return r;
		}

	} else {
		/*
		 * Sanity checks.
		 */
		if (!ps->valid) {
			DMWARN("snapshot is marked invalid");
			return -EINVAL;
		}

		if (ps->version != SNAPSHOT_DISK_VERSION) {
			DMWARN("unable to handle snapshot disk version %d",
			       ps->version);
			return -EINVAL;
		}

		/*
		 * Read the metadata.
		 */
		r = read_exceptions(ps);
		if (r)
			return r;
	}

	return 0;
}

static int persistent_prepare(struct exception_store *store,
			      struct exception *e)
{
	struct pstore *ps = get_info(store);
	uint32_t stride;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	/* Is there enough room? */
	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = ps->next_free;

	/*
	 * Move on to the next free chunk, making sure to take
	 * into account the location of the metadata chunks.
	 */
	stride = (ps->exceptions_per_area + 1);
	if ((++ps->next_free % stride) == 1)
		ps->next_free++;

	atomic_inc(&ps->pending_count);
	return 0;
}
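
/*
 * A worked example of the stride logic above (illustrative, assuming
 * 1024 exceptions per area, so stride == 1025): next_free hands out
 * data chunks 2..1025, then the increment lands on 1026, which
 * satisfies 1026 % 1025 == 1 and is skipped, since chunk 1026 holds
 * metadata area 1.
 */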

static void persistent_commit(struct exception_store *store,
			      struct exception *e,
			      void (*callback) (void *, int success),
			      void *callback_context)
{
	int r;
	unsigned int i;
	struct pstore *ps = get_info(store);
	struct disk_exception de;
	struct commit_callback *cb;

	de.old_chunk = e->old_chunk;
	de.new_chunk = e->new_chunk;
	write_exception(ps, ps->current_committed++, &de);

	/*
	 * Add the callback to the back of the array.  This code
	 * is the only place where the callback array is
	 * manipulated, and we know that it will never be called
	 * multiple times concurrently.
	 */
	cb = ps->callbacks + ps->callback_count++;
	cb->callback = callback;
	cb->context = callback_context;

	/*
	 * If there are no more exceptions in flight, or we have
	 * filled this metadata area, we commit the exceptions to
	 * disk.
	 */
	if (atomic_dec_and_test(&ps->pending_count) ||
	    (ps->current_committed == ps->exceptions_per_area)) {
		r = area_io(ps, ps->current_area, WRITE);
		if (r)
			ps->valid = 0;

		for (i = 0; i < ps->callback_count; i++) {
			cb = ps->callbacks + i;
			cb->callback(cb->context, r == 0 ? 1 : 0);
		}

		ps->callback_count = 0;
	}

	/*
	 * Have we completely filled the current area?
	 */
	if (ps->current_committed == ps->exceptions_per_area) {
		ps->current_committed = 0;
		r = zero_area(ps, ps->current_area + 1);
		if (r)
			ps->valid = 0;
	}
}

static void persistent_drop(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	ps->valid = 0;
	if (write_header(ps))
		DMWARN("write header failed");
}

int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
{
	int r;
	struct pstore *ps;

	r = dm_io_get(sectors_to_pages(chunk_size));
	if (r)
		return r;

	/* allocate the pstore */
	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	if (!ps) {
		r = -ENOMEM;
		goto bad;
	}

	ps->snap = store->snap;
	ps->valid = 1;
	ps->version = SNAPSHOT_DISK_VERSION;
	ps->chunk_size = chunk_size;
	ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
				  sizeof(struct disk_exception);
	ps->next_free = 2;	/* skipping the header and first area */
	ps->current_committed = 0;

	r = alloc_area(ps);
	if (r)
		goto bad;

	/*
	 * Allocate space for all the callbacks.
	 */
	ps->callback_count = 0;
	atomic_set(&ps->pending_count, 0);
	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
				   sizeof(*ps->callbacks));

	if (!ps->callbacks) {
		r = -ENOMEM;
		goto bad;
	}

	store->destroy = persistent_destroy;
	store->read_metadata = persistent_read_metadata;
	store->prepare_exception = persistent_prepare;
	store->commit_exception = persistent_commit;
	store->drop_snapshot = persistent_drop;
	store->fraction_full = persistent_fraction_full;
	store->context = ps;

	return 0;

 bad:
	dm_io_put(sectors_to_pages(chunk_size));
	if (ps) {
		if (ps->area)
			free_area(ps);

		kfree(ps);
	}
	return r;
}

/*-----------------------------------------------------------------
 * Implementation of the store for non-persistent snapshots.
 *---------------------------------------------------------------*/
struct transient_c {
	sector_t next_free;
};

static void transient_destroy(struct exception_store *store)
{
	kfree(store->context);
}

static int transient_read_metadata(struct exception_store *store)
{
	return 0;
}

static int transient_prepare(struct exception_store *store, struct exception *e)
{
	struct transient_c *tc = (struct transient_c *) store->context;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	if (size < (tc->next_free + store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
	tc->next_free += store->snap->chunk_size;

	return 0;
}

static void transient_commit(struct exception_store *store,
			     struct exception *e,
			     void (*callback) (void *, int success),
			     void *callback_context)
{
	/* Just succeed */
	callback(callback_context, 1);
}

static void transient_fraction_full(struct exception_store *store,
				    sector_t *numerator, sector_t *denominator)
{
	*numerator = ((struct transient_c *) store->context)->next_free;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

int dm_create_transient(struct exception_store *store,
			struct dm_snapshot *s, int blocksize)
{
	struct transient_c *tc;

	memset(store, 0, sizeof(*store));
	store->destroy = transient_destroy;
	store->read_metadata = transient_read_metadata;
	store->prepare_exception = transient_prepare;
	store->commit_exception = transient_commit;
	store->fraction_full = transient_fraction_full;
	store->snap = s;

	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
	if (!tc)
		return -ENOMEM;

	tc->next_free = 0;
	store->context = tc;

	return 0;
}
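
/*
 * Typical use (a sketch only; the variable names here are hypothetical,
 * the real call site being the snapshot target constructor in
 * dm-snap.c): the caller fills in store->snap and then picks a store
 * type, e.g.
 *
 *	s->store.snap = s;
 *	if (persistent)
 *		r = dm_create_persistent(&s->store, s->chunk_size);
 *	else
 *		r = dm_create_transient(&s->store, s, blocksize);
 */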