aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/dm-thin-metadata.c
diff options
context:
space:
mode:
authorJoe Thornber <thornber@redhat.com>2011-10-31 16:21:18 -0400
committerAlasdair G Kergon <agk@redhat.com>2011-10-31 16:21:18 -0400
commit991d9fa02da0dd1f843dc011376965e0c8c6c9b5 (patch)
treea64c94710246b77bb74cd77634581cea3d32cfe1 /drivers/md/dm-thin-metadata.c
parent3241b1d3e0aaafbfcd320f4d71ade629728cc4f4 (diff)
dm: add thin provisioning target
Initial EXPERIMENTAL implementation of device-mapper thin provisioning with snapshot support. The 'thin' target is used to create instances of the virtual devices that are hosted in the 'thin-pool' target. The thin-pool target provides data sharing among devices. This sharing is made possible using the persistent-data library in the previous patch. The main highlight of this implementation, compared to the previous implementation of snapshots, is that it allows many virtual devices to be stored on the same data volume, simplifying administration and allowing sharing of data between volumes (thus reducing disk usage). Another big feature is support for arbitrary depth of recursive snapshots (snapshots of snapshots of snapshots ...). The previous implementation of snapshots did this by chaining together lookup tables, and so performance was O(depth). This new implementation uses a single data structure so we don't get this degradation with depth. For further information and examples of how to use this, please read Documentation/device-mapper/thin-provisioning.txt Signed-off-by: Joe Thornber <thornber@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Diffstat (limited to 'drivers/md/dm-thin-metadata.c')
-rw-r--r--drivers/md/dm-thin-metadata.c1391
1 files changed, 1391 insertions, 0 deletions
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
new file mode 100644
index 000000000000..59c4f0446ffa
--- /dev/null
+++ b/drivers/md/dm-thin-metadata.c
@@ -0,0 +1,1391 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-thin-metadata.h"
8#include "persistent-data/dm-btree.h"
9#include "persistent-data/dm-space-map.h"
10#include "persistent-data/dm-space-map-disk.h"
11#include "persistent-data/dm-transaction-manager.h"
12
13#include <linux/list.h>
14#include <linux/device-mapper.h>
15#include <linux/workqueue.h>
16
17/*--------------------------------------------------------------------------
18 * As far as the metadata goes, there is:
19 *
20 * - A superblock in block zero, taking up fewer than 512 bytes for
21 * atomic writes.
22 *
23 * - A space map managing the metadata blocks.
24 *
25 * - A space map managing the data blocks.
26 *
27 * - A btree mapping our internal thin dev ids onto struct disk_device_details.
28 *
29 * - A hierarchical btree, with 2 levels which effectively maps (thin
30 * dev id, virtual block) -> block_time. Block time is a 64-bit
31 * field holding the time in the low 24 bits, and block in the top 40
32 * bits.
33 *
34 * BTrees consist solely of btree_nodes, that fill a block. Some are
35 * internal nodes, as such their values are a __le64 pointing to other
36 * nodes. Leaf nodes can store data of any reasonable size (ie. much
37 * smaller than the block size). The nodes consist of the header,
38 * followed by an array of keys, followed by an array of values. We have
39 * to binary search on the keys so they're all held together to help the
40 * cpu cache.
41 *
42 * Space maps have 2 btrees:
43 *
44 * - One maps a uint64_t onto a struct index_entry. Which points to a
45 * bitmap block, and has some details about how many free entries there
46 * are etc.
47 *
48 * - The bitmap blocks have a header (for the checksum). Then the rest
49 * of the block is pairs of bits. With the meaning being:
50 *
51 * 0 - ref count is 0
52 * 1 - ref count is 1
53 * 2 - ref count is 2
54 * 3 - ref count is higher than 2
55 *
56 * - If the count is higher than 2 then the ref count is entered in a
57 * second btree that directly maps the block_address to a uint32_t ref
58 * count.
59 *
60 * The space map metadata variant doesn't have a bitmaps btree. Instead
61 * it has one single block's worth of index_entries. This avoids
62 * recursive issues with the bitmap btree needing to allocate space in
63 * order to insert. With a small data block size such as 64k the
64 * metadata can support data devices that are hundreds of terabytes.
65 *
66 * The space maps allocate space linearly from front to back. Space that
67 * is freed in a transaction is never recycled within that transaction.
68 * To try and avoid fragmenting _free_ space the allocator always goes
69 * back and fills in gaps.
70 *
71 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
72 * from the block manager.
73 *--------------------------------------------------------------------------*/
74
#define DM_MSG_PREFIX "thin metadata"

/* Value sb_check() expects to find in the superblock's magic field. */
#define THIN_SUPERBLOCK_MAGIC		27022010
/* Metadata block number at which the superblock is read and written. */
#define THIN_SUPERBLOCK_LOCATION	0
/* Stored in the superblock's version field when metadata is created. */
#define THIN_VERSION			1
/* Cache size handed to dm_block_manager_create(). */
#define THIN_METADATA_CACHE_SIZE	64
/* Shift used to turn the device size in sectors into a metadata block count. */
#define SECTOR_TO_BLOCK_SHIFT		3

/* Bytes reserved in the superblock for each space map root; should be plenty. */
#define SPACE_MAP_ROOT_SIZE		128
85
86/*
87 * Little endian on-disk superblock and device details.
88 */
89struct thin_disk_superblock {
90 __le32 csum; /* Checksum of superblock except for this field. */
91 __le32 flags;
92 __le64 blocknr; /* This block number, dm_block_t. */
93
94 __u8 uuid[16];
95 __le64 magic;
96 __le32 version;
97 __le32 time;
98
99 __le64 trans_id;
100
101 /*
102 * Root held by userspace transactions.
103 */
104 __le64 held_root;
105
106 __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
107 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
108
109 /*
110 * 2-level btree mapping (dev_id, (dev block, time)) -> data block
111 */
112 __le64 data_mapping_root;
113
114 /*
115 * Device detail root mapping dev_id -> device_details
116 */
117 __le64 device_details_root;
118
119 __le32 data_block_size; /* In 512-byte sectors. */
120
121 __le32 metadata_block_size; /* In 512-byte sectors. */
122 __le64 metadata_nr_blocks;
123
124 __le32 compat_flags;
125 __le32 compat_ro_flags;
126 __le32 incompat_flags;
127} __packed;
128
129struct disk_device_details {
130 __le64 mapped_blocks;
131 __le64 transaction_id; /* When created. */
132 __le32 creation_time;
133 __le32 snapshotted_time;
134} __packed;
135
136struct dm_pool_metadata {
137 struct hlist_node hash;
138
139 struct block_device *bdev;
140 struct dm_block_manager *bm;
141 struct dm_space_map *metadata_sm;
142 struct dm_space_map *data_sm;
143 struct dm_transaction_manager *tm;
144 struct dm_transaction_manager *nb_tm;
145
146 /*
147 * Two-level btree.
148 * First level holds thin_dev_t.
149 * Second level holds mappings.
150 */
151 struct dm_btree_info info;
152
153 /*
154 * Non-blocking version of the above.
155 */
156 struct dm_btree_info nb_info;
157
158 /*
159 * Just the top level for deleting whole devices.
160 */
161 struct dm_btree_info tl_info;
162
163 /*
164 * Just the bottom level for creating new devices.
165 */
166 struct dm_btree_info bl_info;
167
168 /*
169 * Describes the device details btree.
170 */
171 struct dm_btree_info details_info;
172
173 struct rw_semaphore root_lock;
174 uint32_t time;
175 int need_commit;
176 dm_block_t root;
177 dm_block_t details_root;
178 struct list_head thin_devices;
179 uint64_t trans_id;
180 unsigned long flags;
181 sector_t data_block_size;
182};
183
184struct dm_thin_device {
185 struct list_head list;
186 struct dm_pool_metadata *pmd;
187 dm_thin_id id;
188
189 int open_count;
190 int changed;
191 uint64_t mapped_blocks;
192 uint64_t transaction_id;
193 uint32_t creation_time;
194 uint32_t snapshotted_time;
195};
196
197/*----------------------------------------------------------------
198 * superblock validator
199 *--------------------------------------------------------------*/
200
201#define SUPERBLOCK_CSUM_XOR 160774
202
203static void sb_prepare_for_write(struct dm_block_validator *v,
204 struct dm_block *b,
205 size_t block_size)
206{
207 struct thin_disk_superblock *disk_super = dm_block_data(b);
208
209 disk_super->blocknr = cpu_to_le64(dm_block_location(b));
210 disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
211 block_size - sizeof(__le32),
212 SUPERBLOCK_CSUM_XOR));
213}
214
215static int sb_check(struct dm_block_validator *v,
216 struct dm_block *b,
217 size_t block_size)
218{
219 struct thin_disk_superblock *disk_super = dm_block_data(b);
220 __le32 csum_le;
221
222 if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
223 DMERR("sb_check failed: blocknr %llu: "
224 "wanted %llu", le64_to_cpu(disk_super->blocknr),
225 (unsigned long long)dm_block_location(b));
226 return -ENOTBLK;
227 }
228
229 if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
230 DMERR("sb_check failed: magic %llu: "
231 "wanted %llu", le64_to_cpu(disk_super->magic),
232 (unsigned long long)THIN_SUPERBLOCK_MAGIC);
233 return -EILSEQ;
234 }
235
236 csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
237 block_size - sizeof(__le32),
238 SUPERBLOCK_CSUM_XOR));
239 if (csum_le != disk_super->csum) {
240 DMERR("sb_check failed: csum %u: wanted %u",
241 le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
242 return -EILSEQ;
243 }
244
245 return 0;
246}
247
248static struct dm_block_validator sb_validator = {
249 .name = "superblock",
250 .prepare_for_write = sb_prepare_for_write,
251 .check = sb_check
252};
253
254/*----------------------------------------------------------------
255 * Methods for the btree value types
256 *--------------------------------------------------------------*/
257
258static uint64_t pack_block_time(dm_block_t b, uint32_t t)
259{
260 return (b << 24) | t;
261}
262
263static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
264{
265 *b = v >> 24;
266 *t = v & ((1 << 24) - 1);
267}
268
269static void data_block_inc(void *context, void *value_le)
270{
271 struct dm_space_map *sm = context;
272 __le64 v_le;
273 uint64_t b;
274 uint32_t t;
275
276 memcpy(&v_le, value_le, sizeof(v_le));
277 unpack_block_time(le64_to_cpu(v_le), &b, &t);
278 dm_sm_inc_block(sm, b);
279}
280
281static void data_block_dec(void *context, void *value_le)
282{
283 struct dm_space_map *sm = context;
284 __le64 v_le;
285 uint64_t b;
286 uint32_t t;
287
288 memcpy(&v_le, value_le, sizeof(v_le));
289 unpack_block_time(le64_to_cpu(v_le), &b, &t);
290 dm_sm_dec_block(sm, b);
291}
292
293static int data_block_equal(void *context, void *value1_le, void *value2_le)
294{
295 __le64 v1_le, v2_le;
296 uint64_t b1, b2;
297 uint32_t t;
298
299 memcpy(&v1_le, value1_le, sizeof(v1_le));
300 memcpy(&v2_le, value2_le, sizeof(v2_le));
301 unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
302 unpack_block_time(le64_to_cpu(v2_le), &b2, &t);
303
304 return b1 == b2;
305}
306
307static void subtree_inc(void *context, void *value)
308{
309 struct dm_btree_info *info = context;
310 __le64 root_le;
311 uint64_t root;
312
313 memcpy(&root_le, value, sizeof(root_le));
314 root = le64_to_cpu(root_le);
315 dm_tm_inc(info->tm, root);
316}
317
318static void subtree_dec(void *context, void *value)
319{
320 struct dm_btree_info *info = context;
321 __le64 root_le;
322 uint64_t root;
323
324 memcpy(&root_le, value, sizeof(root_le));
325 root = le64_to_cpu(root_le);
326 if (dm_btree_del(info, root))
327 DMERR("btree delete failed\n");
328}
329
330static int subtree_equal(void *context, void *value1_le, void *value2_le)
331{
332 __le64 v1_le, v2_le;
333 memcpy(&v1_le, value1_le, sizeof(v1_le));
334 memcpy(&v2_le, value2_le, sizeof(v2_le));
335
336 return v1_le == v2_le;
337}
338
339/*----------------------------------------------------------------*/
340
341static int superblock_all_zeroes(struct dm_block_manager *bm, int *result)
342{
343 int r;
344 unsigned i;
345 struct dm_block *b;
346 __le64 *data_le, zero = cpu_to_le64(0);
347 unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64);
348
349 /*
350 * We can't use a validator here - it may be all zeroes.
351 */
352 r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
353 if (r)
354 return r;
355
356 data_le = dm_block_data(b);
357 *result = 1;
358 for (i = 0; i < block_size; i++) {
359 if (data_le[i] != zero) {
360 *result = 0;
361 break;
362 }
363 }
364
365 return dm_bm_unlock(b);
366}
367
368static int init_pmd(struct dm_pool_metadata *pmd,
369 struct dm_block_manager *bm,
370 dm_block_t nr_blocks, int create)
371{
372 int r;
373 struct dm_space_map *sm, *data_sm;
374 struct dm_transaction_manager *tm;
375 struct dm_block *sblock;
376
377 if (create) {
378 r = dm_tm_create_with_sm(bm, THIN_SUPERBLOCK_LOCATION,
379 &sb_validator, &tm, &sm, &sblock);
380 if (r < 0) {
381 DMERR("tm_create_with_sm failed");
382 return r;
383 }
384
385 data_sm = dm_sm_disk_create(tm, nr_blocks);
386 if (IS_ERR(data_sm)) {
387 DMERR("sm_disk_create failed");
388 r = PTR_ERR(data_sm);
389 goto bad;
390 }
391 } else {
392 struct thin_disk_superblock *disk_super = NULL;
393 size_t space_map_root_offset =
394 offsetof(struct thin_disk_superblock, metadata_space_map_root);
395
396 r = dm_tm_open_with_sm(bm, THIN_SUPERBLOCK_LOCATION,
397 &sb_validator, space_map_root_offset,
398 SPACE_MAP_ROOT_SIZE, &tm, &sm, &sblock);
399 if (r < 0) {
400 DMERR("tm_open_with_sm failed");
401 return r;
402 }
403
404 disk_super = dm_block_data(sblock);
405 data_sm = dm_sm_disk_open(tm, disk_super->data_space_map_root,
406 sizeof(disk_super->data_space_map_root));
407 if (IS_ERR(data_sm)) {
408 DMERR("sm_disk_open failed");
409 r = PTR_ERR(data_sm);
410 goto bad;
411 }
412 }
413
414
415 r = dm_tm_unlock(tm, sblock);
416 if (r < 0) {
417 DMERR("couldn't unlock superblock");
418 goto bad_data_sm;
419 }
420
421 pmd->bm = bm;
422 pmd->metadata_sm = sm;
423 pmd->data_sm = data_sm;
424 pmd->tm = tm;
425 pmd->nb_tm = dm_tm_create_non_blocking_clone(tm);
426 if (!pmd->nb_tm) {
427 DMERR("could not create clone tm");
428 r = -ENOMEM;
429 goto bad_data_sm;
430 }
431
432 pmd->info.tm = tm;
433 pmd->info.levels = 2;
434 pmd->info.value_type.context = pmd->data_sm;
435 pmd->info.value_type.size = sizeof(__le64);
436 pmd->info.value_type.inc = data_block_inc;
437 pmd->info.value_type.dec = data_block_dec;
438 pmd->info.value_type.equal = data_block_equal;
439
440 memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
441 pmd->nb_info.tm = pmd->nb_tm;
442
443 pmd->tl_info.tm = tm;
444 pmd->tl_info.levels = 1;
445 pmd->tl_info.value_type.context = &pmd->info;
446 pmd->tl_info.value_type.size = sizeof(__le64);
447 pmd->tl_info.value_type.inc = subtree_inc;
448 pmd->tl_info.value_type.dec = subtree_dec;
449 pmd->tl_info.value_type.equal = subtree_equal;
450
451 pmd->bl_info.tm = tm;
452 pmd->bl_info.levels = 1;
453 pmd->bl_info.value_type.context = pmd->data_sm;
454 pmd->bl_info.value_type.size = sizeof(__le64);
455 pmd->bl_info.value_type.inc = data_block_inc;
456 pmd->bl_info.value_type.dec = data_block_dec;
457 pmd->bl_info.value_type.equal = data_block_equal;
458
459 pmd->details_info.tm = tm;
460 pmd->details_info.levels = 1;
461 pmd->details_info.value_type.context = NULL;
462 pmd->details_info.value_type.size = sizeof(struct disk_device_details);
463 pmd->details_info.value_type.inc = NULL;
464 pmd->details_info.value_type.dec = NULL;
465 pmd->details_info.value_type.equal = NULL;
466
467 pmd->root = 0;
468
469 init_rwsem(&pmd->root_lock);
470 pmd->time = 0;
471 pmd->need_commit = 0;
472 pmd->details_root = 0;
473 pmd->trans_id = 0;
474 pmd->flags = 0;
475 INIT_LIST_HEAD(&pmd->thin_devices);
476
477 return 0;
478
479bad_data_sm:
480 dm_sm_destroy(data_sm);
481bad:
482 dm_tm_destroy(tm);
483 dm_sm_destroy(sm);
484
485 return r;
486}
487
488static int __begin_transaction(struct dm_pool_metadata *pmd)
489{
490 int r;
491 u32 features;
492 struct thin_disk_superblock *disk_super;
493 struct dm_block *sblock;
494
495 /*
496 * __maybe_commit_transaction() resets these
497 */
498 WARN_ON(pmd->need_commit);
499
500 /*
501 * We re-read the superblock every time. Shouldn't need to do this
502 * really.
503 */
504 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
505 &sb_validator, &sblock);
506 if (r)
507 return r;
508
509 disk_super = dm_block_data(sblock);
510 pmd->time = le32_to_cpu(disk_super->time);
511 pmd->root = le64_to_cpu(disk_super->data_mapping_root);
512 pmd->details_root = le64_to_cpu(disk_super->device_details_root);
513 pmd->trans_id = le64_to_cpu(disk_super->trans_id);
514 pmd->flags = le32_to_cpu(disk_super->flags);
515 pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
516
517 features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
518 if (features) {
519 DMERR("could not access metadata due to "
520 "unsupported optional features (%lx).",
521 (unsigned long)features);
522 r = -EINVAL;
523 goto out;
524 }
525
526 /*
527 * Check for read-only metadata to skip the following RDWR checks.
528 */
529 if (get_disk_ro(pmd->bdev->bd_disk))
530 goto out;
531
532 features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
533 if (features) {
534 DMERR("could not access metadata RDWR due to "
535 "unsupported optional features (%lx).",
536 (unsigned long)features);
537 r = -EINVAL;
538 }
539
540out:
541 dm_bm_unlock(sblock);
542 return r;
543}
544
545static int __write_changed_details(struct dm_pool_metadata *pmd)
546{
547 int r;
548 struct dm_thin_device *td, *tmp;
549 struct disk_device_details details;
550 uint64_t key;
551
552 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
553 if (!td->changed)
554 continue;
555
556 key = td->id;
557
558 details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
559 details.transaction_id = cpu_to_le64(td->transaction_id);
560 details.creation_time = cpu_to_le32(td->creation_time);
561 details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
562 __dm_bless_for_disk(&details);
563
564 r = dm_btree_insert(&pmd->details_info, pmd->details_root,
565 &key, &details, &pmd->details_root);
566 if (r)
567 return r;
568
569 if (td->open_count)
570 td->changed = 0;
571 else {
572 list_del(&td->list);
573 kfree(td);
574 }
575
576 pmd->need_commit = 1;
577 }
578
579 return 0;
580}
581
582static int __commit_transaction(struct dm_pool_metadata *pmd)
583{
584 /*
585 * FIXME: Associated pool should be made read-only on failure.
586 */
587 int r;
588 size_t metadata_len, data_len;
589 struct thin_disk_superblock *disk_super;
590 struct dm_block *sblock;
591
592 /*
593 * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
594 */
595 BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
596
597 r = __write_changed_details(pmd);
598 if (r < 0)
599 goto out;
600
601 if (!pmd->need_commit)
602 goto out;
603
604 r = dm_sm_commit(pmd->data_sm);
605 if (r < 0)
606 goto out;
607
608 r = dm_tm_pre_commit(pmd->tm);
609 if (r < 0)
610 goto out;
611
612 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
613 if (r < 0)
614 goto out;
615
616 r = dm_sm_root_size(pmd->metadata_sm, &data_len);
617 if (r < 0)
618 goto out;
619
620 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
621 &sb_validator, &sblock);
622 if (r)
623 goto out;
624
625 disk_super = dm_block_data(sblock);
626 disk_super->time = cpu_to_le32(pmd->time);
627 disk_super->data_mapping_root = cpu_to_le64(pmd->root);
628 disk_super->device_details_root = cpu_to_le64(pmd->details_root);
629 disk_super->trans_id = cpu_to_le64(pmd->trans_id);
630 disk_super->flags = cpu_to_le32(pmd->flags);
631
632 r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
633 metadata_len);
634 if (r < 0)
635 goto out_locked;
636
637 r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
638 data_len);
639 if (r < 0)
640 goto out_locked;
641
642 r = dm_tm_commit(pmd->tm, sblock);
643 if (!r)
644 pmd->need_commit = 0;
645
646out:
647 return r;
648
649out_locked:
650 dm_bm_unlock(sblock);
651 return r;
652}
653
654struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
655 sector_t data_block_size)
656{
657 int r;
658 struct thin_disk_superblock *disk_super;
659 struct dm_pool_metadata *pmd;
660 sector_t bdev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
661 struct dm_block_manager *bm;
662 int create;
663 struct dm_block *sblock;
664
665 pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
666 if (!pmd) {
667 DMERR("could not allocate metadata struct");
668 return ERR_PTR(-ENOMEM);
669 }
670
671 /*
672 * Max hex locks:
673 * 3 for btree insert +
674 * 2 for btree lookup used within space map
675 */
676 bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE,
677 THIN_METADATA_CACHE_SIZE, 5);
678 if (!bm) {
679 DMERR("could not create block manager");
680 kfree(pmd);
681 return ERR_PTR(-ENOMEM);
682 }
683
684 r = superblock_all_zeroes(bm, &create);
685 if (r) {
686 dm_block_manager_destroy(bm);
687 kfree(pmd);
688 return ERR_PTR(r);
689 }
690
691
692 r = init_pmd(pmd, bm, 0, create);
693 if (r) {
694 dm_block_manager_destroy(bm);
695 kfree(pmd);
696 return ERR_PTR(r);
697 }
698 pmd->bdev = bdev;
699
700 if (!create) {
701 r = __begin_transaction(pmd);
702 if (r < 0)
703 goto bad;
704 return pmd;
705 }
706
707 /*
708 * Create.
709 */
710 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
711 &sb_validator, &sblock);
712 if (r)
713 goto bad;
714
715 disk_super = dm_block_data(sblock);
716 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
717 disk_super->version = cpu_to_le32(THIN_VERSION);
718 disk_super->time = 0;
719 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
720 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
721 disk_super->data_block_size = cpu_to_le32(data_block_size);
722
723 r = dm_bm_unlock(sblock);
724 if (r < 0)
725 goto bad;
726
727 r = dm_btree_empty(&pmd->info, &pmd->root);
728 if (r < 0)
729 goto bad;
730
731 r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
732 if (r < 0) {
733 DMERR("couldn't create devices root");
734 goto bad;
735 }
736
737 pmd->flags = 0;
738 pmd->need_commit = 1;
739 r = dm_pool_commit_metadata(pmd);
740 if (r < 0) {
741 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
742 __func__, r);
743 goto bad;
744 }
745
746 return pmd;
747
748bad:
749 if (dm_pool_metadata_close(pmd) < 0)
750 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
751 return ERR_PTR(r);
752}
753
754int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
755{
756 int r;
757 unsigned open_devices = 0;
758 struct dm_thin_device *td, *tmp;
759
760 down_read(&pmd->root_lock);
761 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
762 if (td->open_count)
763 open_devices++;
764 else {
765 list_del(&td->list);
766 kfree(td);
767 }
768 }
769 up_read(&pmd->root_lock);
770
771 if (open_devices) {
772 DMERR("attempt to close pmd when %u device(s) are still open",
773 open_devices);
774 return -EBUSY;
775 }
776
777 r = __commit_transaction(pmd);
778 if (r < 0)
779 DMWARN("%s: __commit_transaction() failed, error = %d",
780 __func__, r);
781
782 dm_tm_destroy(pmd->tm);
783 dm_tm_destroy(pmd->nb_tm);
784 dm_block_manager_destroy(pmd->bm);
785 dm_sm_destroy(pmd->metadata_sm);
786 dm_sm_destroy(pmd->data_sm);
787 kfree(pmd);
788
789 return 0;
790}
791
792static int __open_device(struct dm_pool_metadata *pmd,
793 dm_thin_id dev, int create,
794 struct dm_thin_device **td)
795{
796 int r, changed = 0;
797 struct dm_thin_device *td2;
798 uint64_t key = dev;
799 struct disk_device_details details_le;
800
801 /*
802 * Check the device isn't already open.
803 */
804 list_for_each_entry(td2, &pmd->thin_devices, list)
805 if (td2->id == dev) {
806 td2->open_count++;
807 *td = td2;
808 return 0;
809 }
810
811 /*
812 * Check the device exists.
813 */
814 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
815 &key, &details_le);
816 if (r) {
817 if (r != -ENODATA || !create)
818 return r;
819
820 changed = 1;
821 details_le.mapped_blocks = 0;
822 details_le.transaction_id = cpu_to_le64(pmd->trans_id);
823 details_le.creation_time = cpu_to_le32(pmd->time);
824 details_le.snapshotted_time = cpu_to_le32(pmd->time);
825 }
826
827 *td = kmalloc(sizeof(**td), GFP_NOIO);
828 if (!*td)
829 return -ENOMEM;
830
831 (*td)->pmd = pmd;
832 (*td)->id = dev;
833 (*td)->open_count = 1;
834 (*td)->changed = changed;
835 (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
836 (*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
837 (*td)->creation_time = le32_to_cpu(details_le.creation_time);
838 (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);
839
840 list_add(&(*td)->list, &pmd->thin_devices);
841
842 return 0;
843}
844
845static void __close_device(struct dm_thin_device *td)
846{
847 --td->open_count;
848}
849
850static int __create_thin(struct dm_pool_metadata *pmd,
851 dm_thin_id dev)
852{
853 int r;
854 dm_block_t dev_root;
855 uint64_t key = dev;
856 struct disk_device_details details_le;
857 struct dm_thin_device *td;
858 __le64 value;
859
860 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
861 &key, &details_le);
862 if (!r)
863 return -EEXIST;
864
865 /*
866 * Create an empty btree for the mappings.
867 */
868 r = dm_btree_empty(&pmd->bl_info, &dev_root);
869 if (r)
870 return r;
871
872 /*
873 * Insert it into the main mapping tree.
874 */
875 value = cpu_to_le64(dev_root);
876 __dm_bless_for_disk(&value);
877 r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
878 if (r) {
879 dm_btree_del(&pmd->bl_info, dev_root);
880 return r;
881 }
882
883 r = __open_device(pmd, dev, 1, &td);
884 if (r) {
885 __close_device(td);
886 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
887 dm_btree_del(&pmd->bl_info, dev_root);
888 return r;
889 }
890 td->changed = 1;
891 __close_device(td);
892
893 return r;
894}
895
896int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
897{
898 int r;
899
900 down_write(&pmd->root_lock);
901 r = __create_thin(pmd, dev);
902 up_write(&pmd->root_lock);
903
904 return r;
905}
906
907static int __set_snapshot_details(struct dm_pool_metadata *pmd,
908 struct dm_thin_device *snap,
909 dm_thin_id origin, uint32_t time)
910{
911 int r;
912 struct dm_thin_device *td;
913
914 r = __open_device(pmd, origin, 0, &td);
915 if (r)
916 return r;
917
918 td->changed = 1;
919 td->snapshotted_time = time;
920
921 snap->mapped_blocks = td->mapped_blocks;
922 snap->snapshotted_time = time;
923 __close_device(td);
924
925 return 0;
926}
927
928static int __create_snap(struct dm_pool_metadata *pmd,
929 dm_thin_id dev, dm_thin_id origin)
930{
931 int r;
932 dm_block_t origin_root;
933 uint64_t key = origin, dev_key = dev;
934 struct dm_thin_device *td;
935 struct disk_device_details details_le;
936 __le64 value;
937
938 /* check this device is unused */
939 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
940 &dev_key, &details_le);
941 if (!r)
942 return -EEXIST;
943
944 /* find the mapping tree for the origin */
945 r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
946 if (r)
947 return r;
948 origin_root = le64_to_cpu(value);
949
950 /* clone the origin, an inc will do */
951 dm_tm_inc(pmd->tm, origin_root);
952
953 /* insert into the main mapping tree */
954 value = cpu_to_le64(origin_root);
955 __dm_bless_for_disk(&value);
956 key = dev;
957 r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
958 if (r) {
959 dm_tm_dec(pmd->tm, origin_root);
960 return r;
961 }
962
963 pmd->time++;
964
965 r = __open_device(pmd, dev, 1, &td);
966 if (r)
967 goto bad;
968
969 r = __set_snapshot_details(pmd, td, origin, pmd->time);
970 if (r)
971 goto bad;
972
973 __close_device(td);
974 return 0;
975
976bad:
977 __close_device(td);
978 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
979 dm_btree_remove(&pmd->details_info, pmd->details_root,
980 &key, &pmd->details_root);
981 return r;
982}
983
984int dm_pool_create_snap(struct dm_pool_metadata *pmd,
985 dm_thin_id dev,
986 dm_thin_id origin)
987{
988 int r;
989
990 down_write(&pmd->root_lock);
991 r = __create_snap(pmd, dev, origin);
992 up_write(&pmd->root_lock);
993
994 return r;
995}
996
997static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
998{
999 int r;
1000 uint64_t key = dev;
1001 struct dm_thin_device *td;
1002
1003 /* TODO: failure should mark the transaction invalid */
1004 r = __open_device(pmd, dev, 0, &td);
1005 if (r)
1006 return r;
1007
1008 if (td->open_count > 1) {
1009 __close_device(td);
1010 return -EBUSY;
1011 }
1012
1013 list_del(&td->list);
1014 kfree(td);
1015 r = dm_btree_remove(&pmd->details_info, pmd->details_root,
1016 &key, &pmd->details_root);
1017 if (r)
1018 return r;
1019
1020 r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1021 if (r)
1022 return r;
1023
1024 pmd->need_commit = 1;
1025
1026 return 0;
1027}
1028
1029int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
1030 dm_thin_id dev)
1031{
1032 int r;
1033
1034 down_write(&pmd->root_lock);
1035 r = __delete_device(pmd, dev);
1036 up_write(&pmd->root_lock);
1037
1038 return r;
1039}
1040
1041int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
1042 uint64_t current_id,
1043 uint64_t new_id)
1044{
1045 down_write(&pmd->root_lock);
1046 if (pmd->trans_id != current_id) {
1047 up_write(&pmd->root_lock);
1048 DMERR("mismatched transaction id");
1049 return -EINVAL;
1050 }
1051
1052 pmd->trans_id = new_id;
1053 pmd->need_commit = 1;
1054 up_write(&pmd->root_lock);
1055
1056 return 0;
1057}
1058
1059int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
1060 uint64_t *result)
1061{
1062 down_read(&pmd->root_lock);
1063 *result = pmd->trans_id;
1064 up_read(&pmd->root_lock);
1065
1066 return 0;
1067}
1068
1069static int __get_held_metadata_root(struct dm_pool_metadata *pmd,
1070 dm_block_t *result)
1071{
1072 int r;
1073 struct thin_disk_superblock *disk_super;
1074 struct dm_block *sblock;
1075
1076 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
1077 &sb_validator, &sblock);
1078 if (r)
1079 return r;
1080
1081 disk_super = dm_block_data(sblock);
1082 *result = le64_to_cpu(disk_super->held_root);
1083
1084 return dm_bm_unlock(sblock);
1085}
1086
1087int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd,
1088 dm_block_t *result)
1089{
1090 int r;
1091
1092 down_read(&pmd->root_lock);
1093 r = __get_held_metadata_root(pmd, result);
1094 up_read(&pmd->root_lock);
1095
1096 return r;
1097}
1098
1099int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
1100 struct dm_thin_device **td)
1101{
1102 int r;
1103
1104 down_write(&pmd->root_lock);
1105 r = __open_device(pmd, dev, 0, td);
1106 up_write(&pmd->root_lock);
1107
1108 return r;
1109}
1110
1111int dm_pool_close_thin_device(struct dm_thin_device *td)
1112{
1113 down_write(&td->pmd->root_lock);
1114 __close_device(td);
1115 up_write(&td->pmd->root_lock);
1116
1117 return 0;
1118}
1119
1120dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1121{
1122 return td->id;
1123}
1124
1125static int __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1126{
1127 return td->snapshotted_time > time;
1128}
1129
1130int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
1131 int can_block, struct dm_thin_lookup_result *result)
1132{
1133 int r;
1134 uint64_t block_time = 0;
1135 __le64 value;
1136 struct dm_pool_metadata *pmd = td->pmd;
1137 dm_block_t keys[2] = { td->id, block };
1138
1139 if (can_block) {
1140 down_read(&pmd->root_lock);
1141 r = dm_btree_lookup(&pmd->info, pmd->root, keys, &value);
1142 if (!r)
1143 block_time = le64_to_cpu(value);
1144 up_read(&pmd->root_lock);
1145
1146 } else if (down_read_trylock(&pmd->root_lock)) {
1147 r = dm_btree_lookup(&pmd->nb_info, pmd->root, keys, &value);
1148 if (!r)
1149 block_time = le64_to_cpu(value);
1150 up_read(&pmd->root_lock);
1151
1152 } else
1153 return -EWOULDBLOCK;
1154
1155 if (!r) {
1156 dm_block_t exception_block;
1157 uint32_t exception_time;
1158 unpack_block_time(block_time, &exception_block,
1159 &exception_time);
1160 result->block = exception_block;
1161 result->shared = __snapshotted_since(td, exception_time);
1162 }
1163
1164 return r;
1165}
1166
1167static int __insert(struct dm_thin_device *td, dm_block_t block,
1168 dm_block_t data_block)
1169{
1170 int r, inserted;
1171 __le64 value;
1172 struct dm_pool_metadata *pmd = td->pmd;
1173 dm_block_t keys[2] = { td->id, block };
1174
1175 pmd->need_commit = 1;
1176 value = cpu_to_le64(pack_block_time(data_block, pmd->time));
1177 __dm_bless_for_disk(&value);
1178
1179 r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
1180 &pmd->root, &inserted);
1181 if (r)
1182 return r;
1183
1184 if (inserted) {
1185 td->mapped_blocks++;
1186 td->changed = 1;
1187 }
1188
1189 return 0;
1190}
1191
1192int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
1193 dm_block_t data_block)
1194{
1195 int r;
1196
1197 down_write(&td->pmd->root_lock);
1198 r = __insert(td, block, data_block);
1199 up_write(&td->pmd->root_lock);
1200
1201 return r;
1202}
1203
1204static int __remove(struct dm_thin_device *td, dm_block_t block)
1205{
1206 int r;
1207 struct dm_pool_metadata *pmd = td->pmd;
1208 dm_block_t keys[2] = { td->id, block };
1209
1210 r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root);
1211 if (r)
1212 return r;
1213
1214 pmd->need_commit = 1;
1215
1216 return 0;
1217}
1218
1219int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
1220{
1221 int r;
1222
1223 down_write(&td->pmd->root_lock);
1224 r = __remove(td, block);
1225 up_write(&td->pmd->root_lock);
1226
1227 return r;
1228}
1229
1230int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
1231{
1232 int r;
1233
1234 down_write(&pmd->root_lock);
1235
1236 r = dm_sm_new_block(pmd->data_sm, result);
1237 pmd->need_commit = 1;
1238
1239 up_write(&pmd->root_lock);
1240
1241 return r;
1242}
1243
1244int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
1245{
1246 int r;
1247
1248 down_write(&pmd->root_lock);
1249
1250 r = __commit_transaction(pmd);
1251 if (r <= 0)
1252 goto out;
1253
1254 /*
1255 * Open the next transaction.
1256 */
1257 r = __begin_transaction(pmd);
1258out:
1259 up_write(&pmd->root_lock);
1260 return r;
1261}
1262
1263int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
1264{
1265 int r;
1266
1267 down_read(&pmd->root_lock);
1268 r = dm_sm_get_nr_free(pmd->data_sm, result);
1269 up_read(&pmd->root_lock);
1270
1271 return r;
1272}
1273
1274int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
1275 dm_block_t *result)
1276{
1277 int r;
1278
1279 down_read(&pmd->root_lock);
1280 r = dm_sm_get_nr_free(pmd->metadata_sm, result);
1281 up_read(&pmd->root_lock);
1282
1283 return r;
1284}
1285
1286int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
1287 dm_block_t *result)
1288{
1289 int r;
1290
1291 down_read(&pmd->root_lock);
1292 r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
1293 up_read(&pmd->root_lock);
1294
1295 return r;
1296}
1297
1298int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result)
1299{
1300 down_read(&pmd->root_lock);
1301 *result = pmd->data_block_size;
1302 up_read(&pmd->root_lock);
1303
1304 return 0;
1305}
1306
1307int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
1308{
1309 int r;
1310
1311 down_read(&pmd->root_lock);
1312 r = dm_sm_get_nr_blocks(pmd->data_sm, result);
1313 up_read(&pmd->root_lock);
1314
1315 return r;
1316}
1317
1318int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
1319{
1320 struct dm_pool_metadata *pmd = td->pmd;
1321
1322 down_read(&pmd->root_lock);
1323 *result = td->mapped_blocks;
1324 up_read(&pmd->root_lock);
1325
1326 return 0;
1327}
1328
1329static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
1330{
1331 int r;
1332 __le64 value_le;
1333 dm_block_t thin_root;
1334 struct dm_pool_metadata *pmd = td->pmd;
1335
1336 r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
1337 if (r)
1338 return r;
1339
1340 thin_root = le64_to_cpu(value_le);
1341
1342 return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
1343}
1344
1345int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
1346 dm_block_t *result)
1347{
1348 int r;
1349 struct dm_pool_metadata *pmd = td->pmd;
1350
1351 down_read(&pmd->root_lock);
1352 r = __highest_block(td, result);
1353 up_read(&pmd->root_lock);
1354
1355 return r;
1356}
1357
1358static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1359{
1360 int r;
1361 dm_block_t old_count;
1362
1363 r = dm_sm_get_nr_blocks(pmd->data_sm, &old_count);
1364 if (r)
1365 return r;
1366
1367 if (new_count == old_count)
1368 return 0;
1369
1370 if (new_count < old_count) {
1371 DMERR("cannot reduce size of data device");
1372 return -EINVAL;
1373 }
1374
1375 r = dm_sm_extend(pmd->data_sm, new_count - old_count);
1376 if (!r)
1377 pmd->need_commit = 1;
1378
1379 return r;
1380}
1381
1382int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1383{
1384 int r;
1385
1386 down_write(&pmd->root_lock);
1387 r = __resize_data_dev(pmd, new_count);
1388 up_write(&pmd->root_lock);
1389
1390 return r;
1391}