diff options
author | Joe Thornber <ejt@redhat.com> | 2014-03-03 10:23:15 -0500 |
---|---|---|
committer | Mike Snitzer <snitzer@redhat.com> | 2014-03-27 16:56:23 -0400 |
commit | eec40579d84873dfb7021eb24c50360f073237c5 (patch) | |
tree | a294d43a2029ab02ceeab33396e7c948e374a571 /drivers/md/dm-era-target.c | |
parent | b098d6726bbfb94c06d6e1097466187afddae61f (diff) |
dm: add era target
dm-era is a target that behaves similarly to the linear target. In
addition it keeps track of which blocks were written within a user
defined period of time called an 'era'. Each era target instance
maintains the current era as a monotonically increasing 32-bit
counter.
Use cases include tracking changed blocks for backup software, and
partially invalidating the contents of a cache to restore cache
coherency after rolling back a vendor snapshot.
dm-era is primarily expected to be paired with the dm-cache target.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Diffstat (limited to 'drivers/md/dm-era-target.c')
-rw-r--r-- | drivers/md/dm-era-target.c | 1730 |
1 files changed, 1730 insertions, 0 deletions
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c new file mode 100644 index 000000000000..03d9560bfd95 --- /dev/null +++ b/drivers/md/dm-era-target.c | |||
@@ -0,0 +1,1730 @@ | |||
1 | #include "dm.h" | ||
2 | #include "persistent-data/dm-transaction-manager.h" | ||
3 | #include "persistent-data/dm-bitset.h" | ||
4 | #include "persistent-data/dm-space-map.h" | ||
5 | |||
6 | #include <linux/dm-io.h> | ||
7 | #include <linux/dm-kcopyd.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/mempool.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/vmalloc.h> | ||
13 | |||
14 | #define DM_MSG_PREFIX "era" | ||
15 | |||
16 | #define SUPERBLOCK_LOCATION 0 | ||
17 | #define SUPERBLOCK_MAGIC 2126579579 | ||
18 | #define SUPERBLOCK_CSUM_XOR 146538381 | ||
19 | #define MIN_ERA_VERSION 1 | ||
20 | #define MAX_ERA_VERSION 1 | ||
21 | #define INVALID_WRITESET_ROOT SUPERBLOCK_LOCATION | ||
22 | #define MIN_BLOCK_SIZE 8 | ||
23 | |||
24 | /*---------------------------------------------------------------- | ||
25 | * Writeset | ||
26 | *--------------------------------------------------------------*/ | ||
27 | struct writeset_metadata { | ||
28 | uint32_t nr_bits; | ||
29 | dm_block_t root; | ||
30 | }; | ||
31 | |||
32 | struct writeset { | ||
33 | struct writeset_metadata md; | ||
34 | |||
35 | /* | ||
36 | * An in core copy of the bits to save constantly doing look ups on | ||
37 | * disk. | ||
38 | */ | ||
39 | unsigned long *bits; | ||
40 | }; | ||
41 | |||
42 | /* | ||
43 | * This does not free off the on disk bitset as this will normally be done | ||
44 | * after digesting into the era array. | ||
45 | */ | ||
46 | static void writeset_free(struct writeset *ws) | ||
47 | { | ||
48 | vfree(ws->bits); | ||
49 | } | ||
50 | |||
51 | static int setup_on_disk_bitset(struct dm_disk_bitset *info, | ||
52 | unsigned nr_bits, dm_block_t *root) | ||
53 | { | ||
54 | int r; | ||
55 | |||
56 | r = dm_bitset_empty(info, root); | ||
57 | if (r) | ||
58 | return r; | ||
59 | |||
60 | return dm_bitset_resize(info, *root, 0, nr_bits, false, root); | ||
61 | } | ||
62 | |||
63 | static size_t bitset_size(unsigned nr_bits) | ||
64 | { | ||
65 | return sizeof(unsigned long) * dm_div_up(nr_bits, BITS_PER_LONG); | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * Allocates memory for the in core bitset. | ||
70 | */ | ||
71 | static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks) | ||
72 | { | ||
73 | ws->md.nr_bits = nr_blocks; | ||
74 | ws->md.root = INVALID_WRITESET_ROOT; | ||
75 | ws->bits = vzalloc(bitset_size(nr_blocks)); | ||
76 | if (!ws->bits) { | ||
77 | DMERR("%s: couldn't allocate in memory bitset", __func__); | ||
78 | return -ENOMEM; | ||
79 | } | ||
80 | |||
81 | return 0; | ||
82 | } | ||
83 | |||
84 | /* | ||
85 | * Wipes the in-core bitset, and creates a new on disk bitset. | ||
86 | */ | ||
87 | static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws) | ||
88 | { | ||
89 | int r; | ||
90 | |||
91 | memset(ws->bits, 0, bitset_size(ws->md.nr_bits)); | ||
92 | |||
93 | r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root); | ||
94 | if (r) { | ||
95 | DMERR("%s: setup_on_disk_bitset failed", __func__); | ||
96 | return r; | ||
97 | } | ||
98 | |||
99 | return 0; | ||
100 | } | ||
101 | |||
102 | static bool writeset_marked(struct writeset *ws, dm_block_t block) | ||
103 | { | ||
104 | return test_bit(block, ws->bits); | ||
105 | } | ||
106 | |||
107 | static int writeset_marked_on_disk(struct dm_disk_bitset *info, | ||
108 | struct writeset_metadata *m, dm_block_t block, | ||
109 | bool *result) | ||
110 | { | ||
111 | dm_block_t old = m->root; | ||
112 | |||
113 | /* | ||
114 | * The bitset was flushed when it was archived, so we know there'll | ||
115 | * be no change to the root. | ||
116 | */ | ||
117 | int r = dm_bitset_test_bit(info, m->root, block, &m->root, result); | ||
118 | if (r) { | ||
119 | DMERR("%s: dm_bitset_test_bit failed", __func__); | ||
120 | return r; | ||
121 | } | ||
122 | |||
123 | BUG_ON(m->root != old); | ||
124 | |||
125 | return r; | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * Returns < 0 on error, 0 if the bit wasn't previously set, 1 if it was. | ||
130 | */ | ||
131 | static int writeset_test_and_set(struct dm_disk_bitset *info, | ||
132 | struct writeset *ws, uint32_t block) | ||
133 | { | ||
134 | int r; | ||
135 | |||
136 | if (!test_and_set_bit(block, ws->bits)) { | ||
137 | r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root); | ||
138 | if (r) { | ||
139 | /* FIXME: fail mode */ | ||
140 | return r; | ||
141 | } | ||
142 | |||
143 | return 0; | ||
144 | } | ||
145 | |||
146 | return 1; | ||
147 | } | ||
148 | |||
149 | /*---------------------------------------------------------------- | ||
150 | * On disk metadata layout | ||
151 | *--------------------------------------------------------------*/ | ||
152 | #define SPACE_MAP_ROOT_SIZE 128 | ||
153 | #define UUID_LEN 16 | ||
154 | |||
155 | struct writeset_disk { | ||
156 | __le32 nr_bits; | ||
157 | __le64 root; | ||
158 | } __packed; | ||
159 | |||
160 | struct superblock_disk { | ||
161 | __le32 csum; | ||
162 | __le32 flags; | ||
163 | __le64 blocknr; | ||
164 | |||
165 | __u8 uuid[UUID_LEN]; | ||
166 | __le64 magic; | ||
167 | __le32 version; | ||
168 | |||
169 | __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; | ||
170 | |||
171 | __le32 data_block_size; | ||
172 | __le32 metadata_block_size; | ||
173 | __le32 nr_blocks; | ||
174 | |||
175 | __le32 current_era; | ||
176 | struct writeset_disk current_writeset; | ||
177 | |||
178 | /* | ||
179 | * Only these two fields are valid within the metadata snapshot. | ||
180 | */ | ||
181 | __le64 writeset_tree_root; | ||
182 | __le64 era_array_root; | ||
183 | |||
184 | __le64 metadata_snap; | ||
185 | } __packed; | ||
186 | |||
187 | /*---------------------------------------------------------------- | ||
188 | * Superblock validation | ||
189 | *--------------------------------------------------------------*/ | ||
190 | static void sb_prepare_for_write(struct dm_block_validator *v, | ||
191 | struct dm_block *b, | ||
192 | size_t sb_block_size) | ||
193 | { | ||
194 | struct superblock_disk *disk = dm_block_data(b); | ||
195 | |||
196 | disk->blocknr = cpu_to_le64(dm_block_location(b)); | ||
197 | disk->csum = cpu_to_le32(dm_bm_checksum(&disk->flags, | ||
198 | sb_block_size - sizeof(__le32), | ||
199 | SUPERBLOCK_CSUM_XOR)); | ||
200 | } | ||
201 | |||
202 | static int check_metadata_version(struct superblock_disk *disk) | ||
203 | { | ||
204 | uint32_t metadata_version = le32_to_cpu(disk->version); | ||
205 | if (metadata_version < MIN_ERA_VERSION || metadata_version > MAX_ERA_VERSION) { | ||
206 | DMERR("Era metadata version %u found, but only versions between %u and %u supported.", | ||
207 | metadata_version, MIN_ERA_VERSION, MAX_ERA_VERSION); | ||
208 | return -EINVAL; | ||
209 | } | ||
210 | |||
211 | return 0; | ||
212 | } | ||
213 | |||
214 | static int sb_check(struct dm_block_validator *v, | ||
215 | struct dm_block *b, | ||
216 | size_t sb_block_size) | ||
217 | { | ||
218 | struct superblock_disk *disk = dm_block_data(b); | ||
219 | __le32 csum_le; | ||
220 | |||
221 | if (dm_block_location(b) != le64_to_cpu(disk->blocknr)) { | ||
222 | DMERR("sb_check failed: blocknr %llu: wanted %llu", | ||
223 | le64_to_cpu(disk->blocknr), | ||
224 | (unsigned long long)dm_block_location(b)); | ||
225 | return -ENOTBLK; | ||
226 | } | ||
227 | |||
228 | if (le64_to_cpu(disk->magic) != SUPERBLOCK_MAGIC) { | ||
229 | DMERR("sb_check failed: magic %llu: wanted %llu", | ||
230 | le64_to_cpu(disk->magic), | ||
231 | (unsigned long long) SUPERBLOCK_MAGIC); | ||
232 | return -EILSEQ; | ||
233 | } | ||
234 | |||
235 | csum_le = cpu_to_le32(dm_bm_checksum(&disk->flags, | ||
236 | sb_block_size - sizeof(__le32), | ||
237 | SUPERBLOCK_CSUM_XOR)); | ||
238 | if (csum_le != disk->csum) { | ||
239 | DMERR("sb_check failed: csum %u: wanted %u", | ||
240 | le32_to_cpu(csum_le), le32_to_cpu(disk->csum)); | ||
241 | return -EILSEQ; | ||
242 | } | ||
243 | |||
244 | return check_metadata_version(disk); | ||
245 | } | ||
246 | |||
247 | static struct dm_block_validator sb_validator = { | ||
248 | .name = "superblock", | ||
249 | .prepare_for_write = sb_prepare_for_write, | ||
250 | .check = sb_check | ||
251 | }; | ||
252 | |||
253 | /*---------------------------------------------------------------- | ||
254 | * Low level metadata handling | ||
255 | *--------------------------------------------------------------*/ | ||
256 | #define DM_ERA_METADATA_BLOCK_SIZE 4096 | ||
257 | #define DM_ERA_METADATA_CACHE_SIZE 64 | ||
258 | #define ERA_MAX_CONCURRENT_LOCKS 5 | ||
259 | |||
260 | struct era_metadata { | ||
261 | struct block_device *bdev; | ||
262 | struct dm_block_manager *bm; | ||
263 | struct dm_space_map *sm; | ||
264 | struct dm_transaction_manager *tm; | ||
265 | |||
266 | dm_block_t block_size; | ||
267 | uint32_t nr_blocks; | ||
268 | |||
269 | uint32_t current_era; | ||
270 | |||
271 | /* | ||
272 | * We preallocate 2 writesets. When an era rolls over we | ||
273 | * switch between them. This means the allocation is done at | ||
274 | * preresume time, rather than on the io path. | ||
275 | */ | ||
276 | struct writeset writesets[2]; | ||
277 | struct writeset *current_writeset; | ||
278 | |||
279 | dm_block_t writeset_tree_root; | ||
280 | dm_block_t era_array_root; | ||
281 | |||
282 | struct dm_disk_bitset bitset_info; | ||
283 | struct dm_btree_info writeset_tree_info; | ||
284 | struct dm_array_info era_array_info; | ||
285 | |||
286 | dm_block_t metadata_snap; | ||
287 | |||
288 | /* | ||
289 | * A flag that is set whenever a writeset has been archived. | ||
290 | */ | ||
291 | bool archived_writesets; | ||
292 | }; | ||
293 | |||
294 | static int superblock_read_lock(struct era_metadata *md, | ||
295 | struct dm_block **sblock) | ||
296 | { | ||
297 | return dm_bm_read_lock(md->bm, SUPERBLOCK_LOCATION, | ||
298 | &sb_validator, sblock); | ||
299 | } | ||
300 | |||
301 | static int superblock_lock_zero(struct era_metadata *md, | ||
302 | struct dm_block **sblock) | ||
303 | { | ||
304 | return dm_bm_write_lock_zero(md->bm, SUPERBLOCK_LOCATION, | ||
305 | &sb_validator, sblock); | ||
306 | } | ||
307 | |||
308 | static int superblock_lock(struct era_metadata *md, | ||
309 | struct dm_block **sblock) | ||
310 | { | ||
311 | return dm_bm_write_lock(md->bm, SUPERBLOCK_LOCATION, | ||
312 | &sb_validator, sblock); | ||
313 | } | ||
314 | |||
315 | /* FIXME: duplication with cache and thin */ | ||
316 | static int superblock_all_zeroes(struct dm_block_manager *bm, bool *result) | ||
317 | { | ||
318 | int r; | ||
319 | unsigned i; | ||
320 | struct dm_block *b; | ||
321 | __le64 *data_le, zero = cpu_to_le64(0); | ||
322 | unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64); | ||
323 | |||
324 | /* | ||
325 | * We can't use a validator here - it may be all zeroes. | ||
326 | */ | ||
327 | r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &b); | ||
328 | if (r) | ||
329 | return r; | ||
330 | |||
331 | data_le = dm_block_data(b); | ||
332 | *result = true; | ||
333 | for (i = 0; i < sb_block_size; i++) { | ||
334 | if (data_le[i] != zero) { | ||
335 | *result = false; | ||
336 | break; | ||
337 | } | ||
338 | } | ||
339 | |||
340 | return dm_bm_unlock(b); | ||
341 | } | ||
342 | |||
343 | /*----------------------------------------------------------------*/ | ||
344 | |||
345 | static void ws_pack(const struct writeset_metadata *core, struct writeset_disk *disk) | ||
346 | { | ||
347 | disk->nr_bits = cpu_to_le32(core->nr_bits); | ||
348 | disk->root = cpu_to_le64(core->root); | ||
349 | } | ||
350 | |||
351 | static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata *core) | ||
352 | { | ||
353 | core->nr_bits = le32_to_cpu(disk->nr_bits); | ||
354 | core->root = le64_to_cpu(disk->root); | ||
355 | } | ||
356 | |||
357 | static void ws_inc(void *context, const void *value) | ||
358 | { | ||
359 | struct era_metadata *md = context; | ||
360 | struct writeset_disk ws_d; | ||
361 | dm_block_t b; | ||
362 | |||
363 | memcpy(&ws_d, value, sizeof(ws_d)); | ||
364 | b = le64_to_cpu(ws_d.root); | ||
365 | |||
366 | dm_tm_inc(md->tm, b); | ||
367 | } | ||
368 | |||
369 | static void ws_dec(void *context, const void *value) | ||
370 | { | ||
371 | struct era_metadata *md = context; | ||
372 | struct writeset_disk ws_d; | ||
373 | dm_block_t b; | ||
374 | |||
375 | memcpy(&ws_d, value, sizeof(ws_d)); | ||
376 | b = le64_to_cpu(ws_d.root); | ||
377 | |||
378 | dm_bitset_del(&md->bitset_info, b); | ||
379 | } | ||
380 | |||
381 | static int ws_eq(void *context, const void *value1, const void *value2) | ||
382 | { | ||
383 | return !memcmp(value1, value2, sizeof(struct writeset_metadata)); | ||
384 | } | ||
385 | |||
386 | /*----------------------------------------------------------------*/ | ||
387 | |||
388 | static void setup_writeset_tree_info(struct era_metadata *md) | ||
389 | { | ||
390 | struct dm_btree_value_type *vt = &md->writeset_tree_info.value_type; | ||
391 | md->writeset_tree_info.tm = md->tm; | ||
392 | md->writeset_tree_info.levels = 1; | ||
393 | vt->context = md; | ||
394 | vt->size = sizeof(struct writeset_disk); | ||
395 | vt->inc = ws_inc; | ||
396 | vt->dec = ws_dec; | ||
397 | vt->equal = ws_eq; | ||
398 | } | ||
399 | |||
400 | static void setup_era_array_info(struct era_metadata *md) | ||
401 | |||
402 | { | ||
403 | struct dm_btree_value_type vt; | ||
404 | vt.context = NULL; | ||
405 | vt.size = sizeof(__le32); | ||
406 | vt.inc = NULL; | ||
407 | vt.dec = NULL; | ||
408 | vt.equal = NULL; | ||
409 | |||
410 | dm_array_info_init(&md->era_array_info, md->tm, &vt); | ||
411 | } | ||
412 | |||
413 | static void setup_infos(struct era_metadata *md) | ||
414 | { | ||
415 | dm_disk_bitset_init(md->tm, &md->bitset_info); | ||
416 | setup_writeset_tree_info(md); | ||
417 | setup_era_array_info(md); | ||
418 | } | ||
419 | |||
420 | /*----------------------------------------------------------------*/ | ||
421 | |||
422 | static int create_fresh_metadata(struct era_metadata *md) | ||
423 | { | ||
424 | int r; | ||
425 | |||
426 | r = dm_tm_create_with_sm(md->bm, SUPERBLOCK_LOCATION, | ||
427 | &md->tm, &md->sm); | ||
428 | if (r < 0) { | ||
429 | DMERR("dm_tm_create_with_sm failed"); | ||
430 | return r; | ||
431 | } | ||
432 | |||
433 | setup_infos(md); | ||
434 | |||
435 | r = dm_btree_empty(&md->writeset_tree_info, &md->writeset_tree_root); | ||
436 | if (r) { | ||
437 | DMERR("couldn't create new writeset tree"); | ||
438 | goto bad; | ||
439 | } | ||
440 | |||
441 | r = dm_array_empty(&md->era_array_info, &md->era_array_root); | ||
442 | if (r) { | ||
443 | DMERR("couldn't create era array"); | ||
444 | goto bad; | ||
445 | } | ||
446 | |||
447 | return 0; | ||
448 | |||
449 | bad: | ||
450 | dm_sm_destroy(md->sm); | ||
451 | dm_tm_destroy(md->tm); | ||
452 | |||
453 | return r; | ||
454 | } | ||
455 | |||
456 | /* | ||
457 | * Writes a superblock, including the static fields that don't get updated | ||
458 | * with every commit (possible optimisation here). 'md' should be fully | ||
459 | * constructed when this is called. | ||
460 | */ | ||
461 | static int prepare_superblock(struct era_metadata *md, struct superblock_disk *disk) | ||
462 | { | ||
463 | int r; | ||
464 | size_t metadata_len; | ||
465 | |||
466 | disk->magic = cpu_to_le64(SUPERBLOCK_MAGIC); | ||
467 | disk->flags = cpu_to_le32(0ul); | ||
468 | |||
469 | /* FIXME: can't keep blanking the uuid (uuid is currently unused though) */ | ||
470 | memset(disk->uuid, 0, sizeof(disk->uuid)); | ||
471 | disk->version = cpu_to_le32(MAX_ERA_VERSION); | ||
472 | |||
473 | r = dm_sm_root_size(md->sm, &metadata_len); | ||
474 | if (r < 0) | ||
475 | return r; | ||
476 | |||
477 | r = dm_sm_copy_root(md->sm, &disk->metadata_space_map_root, | ||
478 | metadata_len); | ||
479 | if (r < 0) | ||
480 | return r; | ||
481 | |||
482 | disk->data_block_size = cpu_to_le32(md->block_size); | ||
483 | disk->metadata_block_size = cpu_to_le32(DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); | ||
484 | disk->nr_blocks = cpu_to_le32(md->nr_blocks); | ||
485 | disk->current_era = cpu_to_le32(md->current_era); | ||
486 | |||
487 | ws_pack(&md->current_writeset->md, &disk->current_writeset); | ||
488 | disk->writeset_tree_root = cpu_to_le64(md->writeset_tree_root); | ||
489 | disk->era_array_root = cpu_to_le64(md->era_array_root); | ||
490 | disk->metadata_snap = cpu_to_le64(md->metadata_snap); | ||
491 | |||
492 | return 0; | ||
493 | } | ||
494 | |||
495 | static int write_superblock(struct era_metadata *md) | ||
496 | { | ||
497 | int r; | ||
498 | struct dm_block *sblock; | ||
499 | struct superblock_disk *disk; | ||
500 | |||
501 | r = superblock_lock_zero(md, &sblock); | ||
502 | if (r) | ||
503 | return r; | ||
504 | |||
505 | disk = dm_block_data(sblock); | ||
506 | r = prepare_superblock(md, disk); | ||
507 | if (r) { | ||
508 | DMERR("%s: prepare_superblock failed", __func__); | ||
509 | dm_bm_unlock(sblock); /* FIXME: does this commit? */ | ||
510 | return r; | ||
511 | } | ||
512 | |||
513 | return dm_tm_commit(md->tm, sblock); | ||
514 | } | ||
515 | |||
516 | /* | ||
517 | * Assumes block_size and the infos are set. | ||
518 | */ | ||
519 | static int format_metadata(struct era_metadata *md) | ||
520 | { | ||
521 | int r; | ||
522 | |||
523 | r = create_fresh_metadata(md); | ||
524 | if (r) | ||
525 | return r; | ||
526 | |||
527 | r = write_superblock(md); | ||
528 | if (r) { | ||
529 | dm_sm_destroy(md->sm); | ||
530 | dm_tm_destroy(md->tm); | ||
531 | return r; | ||
532 | } | ||
533 | |||
534 | return 0; | ||
535 | } | ||
536 | |||
537 | static int open_metadata(struct era_metadata *md) | ||
538 | { | ||
539 | int r; | ||
540 | struct dm_block *sblock; | ||
541 | struct superblock_disk *disk; | ||
542 | |||
543 | r = superblock_read_lock(md, &sblock); | ||
544 | if (r) { | ||
545 | DMERR("couldn't read_lock superblock"); | ||
546 | return r; | ||
547 | } | ||
548 | |||
549 | disk = dm_block_data(sblock); | ||
550 | r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION, | ||
551 | disk->metadata_space_map_root, | ||
552 | sizeof(disk->metadata_space_map_root), | ||
553 | &md->tm, &md->sm); | ||
554 | if (r) { | ||
555 | DMERR("dm_tm_open_with_sm failed"); | ||
556 | goto bad; | ||
557 | } | ||
558 | |||
559 | setup_infos(md); | ||
560 | |||
561 | md->block_size = le32_to_cpu(disk->data_block_size); | ||
562 | md->nr_blocks = le32_to_cpu(disk->nr_blocks); | ||
563 | md->current_era = le32_to_cpu(disk->current_era); | ||
564 | |||
565 | md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root); | ||
566 | md->era_array_root = le64_to_cpu(disk->era_array_root); | ||
567 | md->metadata_snap = le64_to_cpu(disk->metadata_snap); | ||
568 | md->archived_writesets = true; | ||
569 | |||
570 | return dm_bm_unlock(sblock); | ||
571 | |||
572 | bad: | ||
573 | dm_bm_unlock(sblock); | ||
574 | return r; | ||
575 | } | ||
576 | |||
577 | static int open_or_format_metadata(struct era_metadata *md, | ||
578 | bool may_format) | ||
579 | { | ||
580 | int r; | ||
581 | bool unformatted = false; | ||
582 | |||
583 | r = superblock_all_zeroes(md->bm, &unformatted); | ||
584 | if (r) | ||
585 | return r; | ||
586 | |||
587 | if (unformatted) | ||
588 | return may_format ? format_metadata(md) : -EPERM; | ||
589 | |||
590 | return open_metadata(md); | ||
591 | } | ||
592 | |||
593 | static int create_persistent_data_objects(struct era_metadata *md, | ||
594 | bool may_format) | ||
595 | { | ||
596 | int r; | ||
597 | |||
598 | md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE, | ||
599 | DM_ERA_METADATA_CACHE_SIZE, | ||
600 | ERA_MAX_CONCURRENT_LOCKS); | ||
601 | if (IS_ERR(md->bm)) { | ||
602 | DMERR("could not create block manager"); | ||
603 | return PTR_ERR(md->bm); | ||
604 | } | ||
605 | |||
606 | r = open_or_format_metadata(md, may_format); | ||
607 | if (r) | ||
608 | dm_block_manager_destroy(md->bm); | ||
609 | |||
610 | return r; | ||
611 | } | ||
612 | |||
613 | static void destroy_persistent_data_objects(struct era_metadata *md) | ||
614 | { | ||
615 | dm_sm_destroy(md->sm); | ||
616 | dm_tm_destroy(md->tm); | ||
617 | dm_block_manager_destroy(md->bm); | ||
618 | } | ||
619 | |||
620 | /* | ||
621 | * This waits until all era_map threads have picked up the new filter. | ||
622 | */ | ||
623 | static void swap_writeset(struct era_metadata *md, struct writeset *new_writeset) | ||
624 | { | ||
625 | rcu_assign_pointer(md->current_writeset, new_writeset); | ||
626 | synchronize_rcu(); | ||
627 | } | ||
628 | |||
629 | /*---------------------------------------------------------------- | ||
630 | * Writesets get 'digested' into the main era array. | ||
631 | * | ||
632 | * We're using a coroutine here so the worker thread can do the digestion, | ||
633 | * thus avoiding synchronisation of the metadata. Digesting a whole | ||
634 | * writeset in one go would cause too much latency. | ||
635 | *--------------------------------------------------------------*/ | ||
636 | struct digest { | ||
637 | uint32_t era; | ||
638 | unsigned nr_bits, current_bit; | ||
639 | struct writeset_metadata writeset; | ||
640 | __le32 value; | ||
641 | struct dm_disk_bitset info; | ||
642 | |||
643 | int (*step)(struct era_metadata *, struct digest *); | ||
644 | }; | ||
645 | |||
646 | static int metadata_digest_lookup_writeset(struct era_metadata *md, | ||
647 | struct digest *d); | ||
648 | |||
649 | static int metadata_digest_remove_writeset(struct era_metadata *md, | ||
650 | struct digest *d) | ||
651 | { | ||
652 | int r; | ||
653 | uint64_t key = d->era; | ||
654 | |||
655 | r = dm_btree_remove(&md->writeset_tree_info, md->writeset_tree_root, | ||
656 | &key, &md->writeset_tree_root); | ||
657 | if (r) { | ||
658 | DMERR("%s: dm_btree_remove failed", __func__); | ||
659 | return r; | ||
660 | } | ||
661 | |||
662 | d->step = metadata_digest_lookup_writeset; | ||
663 | return 0; | ||
664 | } | ||
665 | |||
666 | #define INSERTS_PER_STEP 100 | ||
667 | |||
668 | static int metadata_digest_transcribe_writeset(struct era_metadata *md, | ||
669 | struct digest *d) | ||
670 | { | ||
671 | int r; | ||
672 | bool marked; | ||
673 | unsigned b, e = min(d->current_bit + INSERTS_PER_STEP, d->nr_bits); | ||
674 | |||
675 | for (b = d->current_bit; b < e; b++) { | ||
676 | r = writeset_marked_on_disk(&d->info, &d->writeset, b, &marked); | ||
677 | if (r) { | ||
678 | DMERR("%s: writeset_marked_on_disk failed", __func__); | ||
679 | return r; | ||
680 | } | ||
681 | |||
682 | if (!marked) | ||
683 | continue; | ||
684 | |||
685 | __dm_bless_for_disk(&d->value); | ||
686 | r = dm_array_set_value(&md->era_array_info, md->era_array_root, | ||
687 | b, &d->value, &md->era_array_root); | ||
688 | if (r) { | ||
689 | DMERR("%s: dm_array_set_value failed", __func__); | ||
690 | return r; | ||
691 | } | ||
692 | } | ||
693 | |||
694 | if (b == d->nr_bits) | ||
695 | d->step = metadata_digest_remove_writeset; | ||
696 | else | ||
697 | d->current_bit = b; | ||
698 | |||
699 | return 0; | ||
700 | } | ||
701 | |||
702 | static int metadata_digest_lookup_writeset(struct era_metadata *md, | ||
703 | struct digest *d) | ||
704 | { | ||
705 | int r; | ||
706 | uint64_t key; | ||
707 | struct writeset_disk disk; | ||
708 | |||
709 | r = dm_btree_find_lowest_key(&md->writeset_tree_info, | ||
710 | md->writeset_tree_root, &key); | ||
711 | if (r < 0) | ||
712 | return r; | ||
713 | |||
714 | d->era = key; | ||
715 | |||
716 | r = dm_btree_lookup(&md->writeset_tree_info, | ||
717 | md->writeset_tree_root, &key, &disk); | ||
718 | if (r) { | ||
719 | if (r == -ENODATA) { | ||
720 | d->step = NULL; | ||
721 | return 0; | ||
722 | } | ||
723 | |||
724 | DMERR("%s: dm_btree_lookup failed", __func__); | ||
725 | return r; | ||
726 | } | ||
727 | |||
728 | ws_unpack(&disk, &d->writeset); | ||
729 | d->value = cpu_to_le32(key); | ||
730 | |||
731 | d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks); | ||
732 | d->current_bit = 0; | ||
733 | d->step = metadata_digest_transcribe_writeset; | ||
734 | |||
735 | return 0; | ||
736 | } | ||
737 | |||
738 | static int metadata_digest_start(struct era_metadata *md, struct digest *d) | ||
739 | { | ||
740 | if (d->step) | ||
741 | return 0; | ||
742 | |||
743 | memset(d, 0, sizeof(*d)); | ||
744 | |||
745 | /* | ||
746 | * We initialise another bitset info to avoid any caching side | ||
747 | * effects with the previous one. | ||
748 | */ | ||
749 | dm_disk_bitset_init(md->tm, &d->info); | ||
750 | d->step = metadata_digest_lookup_writeset; | ||
751 | |||
752 | return 0; | ||
753 | } | ||
754 | |||
755 | /*---------------------------------------------------------------- | ||
756 | * High level metadata interface. Target methods should use these, and not | ||
757 | * the lower level ones. | ||
758 | *--------------------------------------------------------------*/ | ||
759 | static struct era_metadata *metadata_open(struct block_device *bdev, | ||
760 | sector_t block_size, | ||
761 | bool may_format) | ||
762 | { | ||
763 | int r; | ||
764 | struct era_metadata *md = kzalloc(sizeof(*md), GFP_KERNEL); | ||
765 | |||
766 | if (!md) | ||
767 | return NULL; | ||
768 | |||
769 | md->bdev = bdev; | ||
770 | md->block_size = block_size; | ||
771 | |||
772 | md->writesets[0].md.root = INVALID_WRITESET_ROOT; | ||
773 | md->writesets[1].md.root = INVALID_WRITESET_ROOT; | ||
774 | md->current_writeset = &md->writesets[0]; | ||
775 | |||
776 | r = create_persistent_data_objects(md, may_format); | ||
777 | if (r) { | ||
778 | kfree(md); | ||
779 | return ERR_PTR(r); | ||
780 | } | ||
781 | |||
782 | return md; | ||
783 | } | ||
784 | |||
785 | static void metadata_close(struct era_metadata *md) | ||
786 | { | ||
787 | destroy_persistent_data_objects(md); | ||
788 | kfree(md); | ||
789 | } | ||
790 | |||
791 | static bool valid_nr_blocks(dm_block_t n) | ||
792 | { | ||
793 | /* | ||
794 | * dm_bitset restricts us to 2^32. test_bit & co. restrict us | ||
795 | * further to 2^31 - 1 | ||
796 | */ | ||
797 | return n < (1ull << 31); | ||
798 | } | ||
799 | |||
800 | static int metadata_resize(struct era_metadata *md, void *arg) | ||
801 | { | ||
802 | int r; | ||
803 | dm_block_t *new_size = arg; | ||
804 | __le32 value; | ||
805 | |||
806 | if (!valid_nr_blocks(*new_size)) { | ||
807 | DMERR("Invalid number of origin blocks %llu", | ||
808 | (unsigned long long) *new_size); | ||
809 | return -EINVAL; | ||
810 | } | ||
811 | |||
812 | writeset_free(&md->writesets[0]); | ||
813 | writeset_free(&md->writesets[1]); | ||
814 | |||
815 | r = writeset_alloc(&md->writesets[0], *new_size); | ||
816 | if (r) { | ||
817 | DMERR("%s: writeset_alloc failed for writeset 0", __func__); | ||
818 | return r; | ||
819 | } | ||
820 | |||
821 | r = writeset_alloc(&md->writesets[1], *new_size); | ||
822 | if (r) { | ||
823 | DMERR("%s: writeset_alloc failed for writeset 1", __func__); | ||
824 | return r; | ||
825 | } | ||
826 | |||
827 | value = cpu_to_le32(0u); | ||
828 | __dm_bless_for_disk(&value); | ||
829 | r = dm_array_resize(&md->era_array_info, md->era_array_root, | ||
830 | md->nr_blocks, *new_size, | ||
831 | &value, &md->era_array_root); | ||
832 | if (r) { | ||
833 | DMERR("%s: dm_array_resize failed", __func__); | ||
834 | return r; | ||
835 | } | ||
836 | |||
837 | md->nr_blocks = *new_size; | ||
838 | return 0; | ||
839 | } | ||
840 | |||
841 | static int metadata_era_archive(struct era_metadata *md) | ||
842 | { | ||
843 | int r; | ||
844 | uint64_t keys[1]; | ||
845 | struct writeset_disk value; | ||
846 | |||
847 | r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root, | ||
848 | &md->current_writeset->md.root); | ||
849 | if (r) { | ||
850 | DMERR("%s: dm_bitset_flush failed", __func__); | ||
851 | return r; | ||
852 | } | ||
853 | |||
854 | ws_pack(&md->current_writeset->md, &value); | ||
855 | md->current_writeset->md.root = INVALID_WRITESET_ROOT; | ||
856 | |||
857 | keys[0] = md->current_era; | ||
858 | __dm_bless_for_disk(&value); | ||
859 | r = dm_btree_insert(&md->writeset_tree_info, md->writeset_tree_root, | ||
860 | keys, &value, &md->writeset_tree_root); | ||
861 | if (r) { | ||
862 | DMERR("%s: couldn't insert writeset into btree", __func__); | ||
863 | /* FIXME: fail mode */ | ||
864 | return r; | ||
865 | } | ||
866 | |||
867 | md->archived_writesets = true; | ||
868 | |||
869 | return 0; | ||
870 | } | ||
871 | |||
872 | static struct writeset *next_writeset(struct era_metadata *md) | ||
873 | { | ||
874 | return (md->current_writeset == &md->writesets[0]) ? | ||
875 | &md->writesets[1] : &md->writesets[0]; | ||
876 | } | ||
877 | |||
878 | static int metadata_new_era(struct era_metadata *md) | ||
879 | { | ||
880 | int r; | ||
881 | struct writeset *new_writeset = next_writeset(md); | ||
882 | |||
883 | r = writeset_init(&md->bitset_info, new_writeset); | ||
884 | if (r) { | ||
885 | DMERR("%s: writeset_init failed", __func__); | ||
886 | return r; | ||
887 | } | ||
888 | |||
889 | swap_writeset(md, new_writeset); | ||
890 | md->current_era++; | ||
891 | |||
892 | return 0; | ||
893 | } | ||
894 | |||
895 | static int metadata_era_rollover(struct era_metadata *md) | ||
896 | { | ||
897 | int r; | ||
898 | |||
899 | if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) { | ||
900 | r = metadata_era_archive(md); | ||
901 | if (r) { | ||
902 | DMERR("%s: metadata_archive_era failed", __func__); | ||
903 | /* FIXME: fail mode? */ | ||
904 | return r; | ||
905 | } | ||
906 | } | ||
907 | |||
908 | r = metadata_new_era(md); | ||
909 | if (r) { | ||
910 | DMERR("%s: new era failed", __func__); | ||
911 | /* FIXME: fail mode */ | ||
912 | return r; | ||
913 | } | ||
914 | |||
915 | return 0; | ||
916 | } | ||
917 | |||
918 | static bool metadata_current_marked(struct era_metadata *md, dm_block_t block) | ||
919 | { | ||
920 | bool r; | ||
921 | struct writeset *ws; | ||
922 | |||
923 | rcu_read_lock(); | ||
924 | ws = rcu_dereference(md->current_writeset); | ||
925 | r = writeset_marked(ws, block); | ||
926 | rcu_read_unlock(); | ||
927 | |||
928 | return r; | ||
929 | } | ||
930 | |||
931 | static int metadata_commit(struct era_metadata *md) | ||
932 | { | ||
933 | int r; | ||
934 | struct dm_block *sblock; | ||
935 | |||
936 | if (md->current_writeset->md.root != SUPERBLOCK_LOCATION) { | ||
937 | r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root, | ||
938 | &md->current_writeset->md.root); | ||
939 | if (r) { | ||
940 | DMERR("%s: bitset flush failed", __func__); | ||
941 | return r; | ||
942 | } | ||
943 | } | ||
944 | |||
945 | r = dm_tm_pre_commit(md->tm); | ||
946 | if (r) { | ||
947 | DMERR("%s: pre commit failed", __func__); | ||
948 | return r; | ||
949 | } | ||
950 | |||
951 | r = superblock_lock(md, &sblock); | ||
952 | if (r) { | ||
953 | DMERR("%s: superblock lock failed", __func__); | ||
954 | return r; | ||
955 | } | ||
956 | |||
957 | r = prepare_superblock(md, dm_block_data(sblock)); | ||
958 | if (r) { | ||
959 | DMERR("%s: prepare_superblock failed", __func__); | ||
960 | dm_bm_unlock(sblock); /* FIXME: does this commit? */ | ||
961 | return r; | ||
962 | } | ||
963 | |||
964 | return dm_tm_commit(md->tm, sblock); | ||
965 | } | ||
966 | |||
/*
 * Handler for the "checkpoint" message; currently a plain era rollover.
 */
static int metadata_checkpoint(struct era_metadata *md)
{
	int r;

	/*
	 * For now we always roll over.  Later a check should be added to
	 * skip this when the filter is still pretty fresh.
	 */
	r = metadata_era_rollover(md);

	return r;
}
975 | |||
976 | /* | ||
977 | * Metadata snapshots allow userland to access era data. | ||
978 | */ | ||
979 | static int metadata_take_snap(struct era_metadata *md) | ||
980 | { | ||
981 | int r, inc; | ||
982 | struct dm_block *clone; | ||
983 | |||
984 | if (md->metadata_snap != SUPERBLOCK_LOCATION) { | ||
985 | DMERR("%s: metadata snapshot already exists", __func__); | ||
986 | return -EINVAL; | ||
987 | } | ||
988 | |||
989 | r = metadata_era_rollover(md); | ||
990 | if (r) { | ||
991 | DMERR("%s: era rollover failed", __func__); | ||
992 | return r; | ||
993 | } | ||
994 | |||
995 | r = metadata_commit(md); | ||
996 | if (r) { | ||
997 | DMERR("%s: pre commit failed", __func__); | ||
998 | return r; | ||
999 | } | ||
1000 | |||
1001 | r = dm_sm_inc_block(md->sm, SUPERBLOCK_LOCATION); | ||
1002 | if (r) { | ||
1003 | DMERR("%s: couldn't increment superblock", __func__); | ||
1004 | return r; | ||
1005 | } | ||
1006 | |||
1007 | r = dm_tm_shadow_block(md->tm, SUPERBLOCK_LOCATION, | ||
1008 | &sb_validator, &clone, &inc); | ||
1009 | if (r) { | ||
1010 | DMERR("%s: couldn't shadow superblock", __func__); | ||
1011 | dm_sm_dec_block(md->sm, SUPERBLOCK_LOCATION); | ||
1012 | return r; | ||
1013 | } | ||
1014 | BUG_ON(!inc); | ||
1015 | |||
1016 | r = dm_sm_inc_block(md->sm, md->writeset_tree_root); | ||
1017 | if (r) { | ||
1018 | DMERR("%s: couldn't inc writeset tree root", __func__); | ||
1019 | dm_tm_unlock(md->tm, clone); | ||
1020 | return r; | ||
1021 | } | ||
1022 | |||
1023 | r = dm_sm_inc_block(md->sm, md->era_array_root); | ||
1024 | if (r) { | ||
1025 | DMERR("%s: couldn't inc era tree root", __func__); | ||
1026 | dm_sm_dec_block(md->sm, md->writeset_tree_root); | ||
1027 | dm_tm_unlock(md->tm, clone); | ||
1028 | return r; | ||
1029 | } | ||
1030 | |||
1031 | md->metadata_snap = dm_block_location(clone); | ||
1032 | |||
1033 | r = dm_tm_unlock(md->tm, clone); | ||
1034 | if (r) { | ||
1035 | DMERR("%s: couldn't unlock clone", __func__); | ||
1036 | md->metadata_snap = SUPERBLOCK_LOCATION; | ||
1037 | return r; | ||
1038 | } | ||
1039 | |||
1040 | return 0; | ||
1041 | } | ||
1042 | |||
1043 | static int metadata_drop_snap(struct era_metadata *md) | ||
1044 | { | ||
1045 | int r; | ||
1046 | dm_block_t location; | ||
1047 | struct dm_block *clone; | ||
1048 | struct superblock_disk *disk; | ||
1049 | |||
1050 | if (md->metadata_snap == SUPERBLOCK_LOCATION) { | ||
1051 | DMERR("%s: no snap to drop", __func__); | ||
1052 | return -EINVAL; | ||
1053 | } | ||
1054 | |||
1055 | r = dm_tm_read_lock(md->tm, md->metadata_snap, &sb_validator, &clone); | ||
1056 | if (r) { | ||
1057 | DMERR("%s: couldn't read lock superblock clone", __func__); | ||
1058 | return r; | ||
1059 | } | ||
1060 | |||
1061 | /* | ||
1062 | * Whatever happens now we'll commit with no record of the metadata | ||
1063 | * snap. | ||
1064 | */ | ||
1065 | md->metadata_snap = SUPERBLOCK_LOCATION; | ||
1066 | |||
1067 | disk = dm_block_data(clone); | ||
1068 | r = dm_btree_del(&md->writeset_tree_info, | ||
1069 | le64_to_cpu(disk->writeset_tree_root)); | ||
1070 | if (r) { | ||
1071 | DMERR("%s: error deleting writeset tree clone", __func__); | ||
1072 | dm_tm_unlock(md->tm, clone); | ||
1073 | return r; | ||
1074 | } | ||
1075 | |||
1076 | r = dm_array_del(&md->era_array_info, le64_to_cpu(disk->era_array_root)); | ||
1077 | if (r) { | ||
1078 | DMERR("%s: error deleting era array clone", __func__); | ||
1079 | dm_tm_unlock(md->tm, clone); | ||
1080 | return r; | ||
1081 | } | ||
1082 | |||
1083 | location = dm_block_location(clone); | ||
1084 | dm_tm_unlock(md->tm, clone); | ||
1085 | |||
1086 | return dm_sm_dec_block(md->sm, location); | ||
1087 | } | ||
1088 | |||
1089 | struct metadata_stats { | ||
1090 | dm_block_t used; | ||
1091 | dm_block_t total; | ||
1092 | dm_block_t snap; | ||
1093 | uint32_t era; | ||
1094 | }; | ||
1095 | |||
1096 | static int metadata_get_stats(struct era_metadata *md, void *ptr) | ||
1097 | { | ||
1098 | int r; | ||
1099 | struct metadata_stats *s = ptr; | ||
1100 | dm_block_t nr_free, nr_total; | ||
1101 | |||
1102 | r = dm_sm_get_nr_free(md->sm, &nr_free); | ||
1103 | if (r) { | ||
1104 | DMERR("dm_sm_get_nr_free returned %d", r); | ||
1105 | return r; | ||
1106 | } | ||
1107 | |||
1108 | r = dm_sm_get_nr_blocks(md->sm, &nr_total); | ||
1109 | if (r) { | ||
1110 | DMERR("dm_pool_get_metadata_dev_size returned %d", r); | ||
1111 | return r; | ||
1112 | } | ||
1113 | |||
1114 | s->used = nr_total - nr_free; | ||
1115 | s->total = nr_total; | ||
1116 | s->snap = md->metadata_snap; | ||
1117 | s->era = md->current_era; | ||
1118 | |||
1119 | return 0; | ||
1120 | } | ||
1121 | |||
1122 | /*----------------------------------------------------------------*/ | ||
1123 | |||
1124 | struct era { | ||
1125 | struct dm_target *ti; | ||
1126 | struct dm_target_callbacks callbacks; | ||
1127 | |||
1128 | struct dm_dev *metadata_dev; | ||
1129 | struct dm_dev *origin_dev; | ||
1130 | |||
1131 | dm_block_t nr_blocks; | ||
1132 | uint32_t sectors_per_block; | ||
1133 | int sectors_per_block_shift; | ||
1134 | struct era_metadata *md; | ||
1135 | |||
1136 | struct workqueue_struct *wq; | ||
1137 | struct work_struct worker; | ||
1138 | |||
1139 | spinlock_t deferred_lock; | ||
1140 | struct bio_list deferred_bios; | ||
1141 | |||
1142 | spinlock_t rpc_lock; | ||
1143 | struct list_head rpc_calls; | ||
1144 | |||
1145 | struct digest digest; | ||
1146 | atomic_t suspended; | ||
1147 | }; | ||
1148 | |||
1149 | struct rpc { | ||
1150 | struct list_head list; | ||
1151 | |||
1152 | int (*fn0)(struct era_metadata *); | ||
1153 | int (*fn1)(struct era_metadata *, void *); | ||
1154 | void *arg; | ||
1155 | int result; | ||
1156 | |||
1157 | struct completion complete; | ||
1158 | }; | ||
1159 | |||
1160 | /*---------------------------------------------------------------- | ||
1161 | * Remapping. | ||
1162 | *---------------------------------------------------------------*/ | ||
1163 | static bool block_size_is_power_of_two(struct era *era) | ||
1164 | { | ||
1165 | return era->sectors_per_block_shift >= 0; | ||
1166 | } | ||
1167 | |||
1168 | static dm_block_t get_block(struct era *era, struct bio *bio) | ||
1169 | { | ||
1170 | sector_t block_nr = bio->bi_iter.bi_sector; | ||
1171 | |||
1172 | if (!block_size_is_power_of_two(era)) | ||
1173 | (void) sector_div(block_nr, era->sectors_per_block); | ||
1174 | else | ||
1175 | block_nr >>= era->sectors_per_block_shift; | ||
1176 | |||
1177 | return block_nr; | ||
1178 | } | ||
1179 | |||
1180 | static void remap_to_origin(struct era *era, struct bio *bio) | ||
1181 | { | ||
1182 | bio->bi_bdev = era->origin_dev->bdev; | ||
1183 | } | ||
1184 | |||
1185 | /*---------------------------------------------------------------- | ||
1186 | * Worker thread | ||
1187 | *--------------------------------------------------------------*/ | ||
1188 | static void wake_worker(struct era *era) | ||
1189 | { | ||
1190 | if (!atomic_read(&era->suspended)) | ||
1191 | queue_work(era->wq, &era->worker); | ||
1192 | } | ||
1193 | |||
1194 | static void process_old_eras(struct era *era) | ||
1195 | { | ||
1196 | int r; | ||
1197 | |||
1198 | if (!era->digest.step) | ||
1199 | return; | ||
1200 | |||
1201 | r = era->digest.step(era->md, &era->digest); | ||
1202 | if (r < 0) { | ||
1203 | DMERR("%s: digest step failed, stopping digestion", __func__); | ||
1204 | era->digest.step = NULL; | ||
1205 | |||
1206 | } else if (era->digest.step) | ||
1207 | wake_worker(era); | ||
1208 | } | ||
1209 | |||
1210 | static void process_deferred_bios(struct era *era) | ||
1211 | { | ||
1212 | int r; | ||
1213 | struct bio_list deferred_bios, marked_bios; | ||
1214 | struct bio *bio; | ||
1215 | bool commit_needed = false; | ||
1216 | bool failed = false; | ||
1217 | |||
1218 | bio_list_init(&deferred_bios); | ||
1219 | bio_list_init(&marked_bios); | ||
1220 | |||
1221 | spin_lock(&era->deferred_lock); | ||
1222 | bio_list_merge(&deferred_bios, &era->deferred_bios); | ||
1223 | bio_list_init(&era->deferred_bios); | ||
1224 | spin_unlock(&era->deferred_lock); | ||
1225 | |||
1226 | while ((bio = bio_list_pop(&deferred_bios))) { | ||
1227 | r = writeset_test_and_set(&era->md->bitset_info, | ||
1228 | era->md->current_writeset, | ||
1229 | get_block(era, bio)); | ||
1230 | if (r < 0) { | ||
1231 | /* | ||
1232 | * This is bad news, we need to rollback. | ||
1233 | * FIXME: finish. | ||
1234 | */ | ||
1235 | failed = true; | ||
1236 | |||
1237 | } else if (r == 0) | ||
1238 | commit_needed = true; | ||
1239 | |||
1240 | bio_list_add(&marked_bios, bio); | ||
1241 | } | ||
1242 | |||
1243 | if (commit_needed) { | ||
1244 | r = metadata_commit(era->md); | ||
1245 | if (r) | ||
1246 | failed = true; | ||
1247 | } | ||
1248 | |||
1249 | if (failed) | ||
1250 | while ((bio = bio_list_pop(&marked_bios))) | ||
1251 | bio_io_error(bio); | ||
1252 | else | ||
1253 | while ((bio = bio_list_pop(&marked_bios))) | ||
1254 | generic_make_request(bio); | ||
1255 | } | ||
1256 | |||
1257 | static void process_rpc_calls(struct era *era) | ||
1258 | { | ||
1259 | int r; | ||
1260 | bool need_commit = false; | ||
1261 | struct list_head calls; | ||
1262 | struct rpc *rpc, *tmp; | ||
1263 | |||
1264 | INIT_LIST_HEAD(&calls); | ||
1265 | spin_lock(&era->rpc_lock); | ||
1266 | list_splice_init(&era->rpc_calls, &calls); | ||
1267 | spin_unlock(&era->rpc_lock); | ||
1268 | |||
1269 | list_for_each_entry_safe(rpc, tmp, &calls, list) { | ||
1270 | rpc->result = rpc->fn0 ? rpc->fn0(era->md) : rpc->fn1(era->md, rpc->arg); | ||
1271 | need_commit = true; | ||
1272 | } | ||
1273 | |||
1274 | if (need_commit) { | ||
1275 | r = metadata_commit(era->md); | ||
1276 | if (r) | ||
1277 | list_for_each_entry_safe(rpc, tmp, &calls, list) | ||
1278 | rpc->result = r; | ||
1279 | } | ||
1280 | |||
1281 | list_for_each_entry_safe(rpc, tmp, &calls, list) | ||
1282 | complete(&rpc->complete); | ||
1283 | } | ||
1284 | |||
1285 | static void kick_off_digest(struct era *era) | ||
1286 | { | ||
1287 | if (era->md->archived_writesets) { | ||
1288 | era->md->archived_writesets = false; | ||
1289 | metadata_digest_start(era->md, &era->digest); | ||
1290 | } | ||
1291 | } | ||
1292 | |||
1293 | static void do_work(struct work_struct *ws) | ||
1294 | { | ||
1295 | struct era *era = container_of(ws, struct era, worker); | ||
1296 | |||
1297 | kick_off_digest(era); | ||
1298 | process_old_eras(era); | ||
1299 | process_deferred_bios(era); | ||
1300 | process_rpc_calls(era); | ||
1301 | } | ||
1302 | |||
1303 | static void defer_bio(struct era *era, struct bio *bio) | ||
1304 | { | ||
1305 | spin_lock(&era->deferred_lock); | ||
1306 | bio_list_add(&era->deferred_bios, bio); | ||
1307 | spin_unlock(&era->deferred_lock); | ||
1308 | |||
1309 | wake_worker(era); | ||
1310 | } | ||
1311 | |||
1312 | /* | ||
1313 | * Make an rpc call to the worker to change the metadata. | ||
1314 | */ | ||
1315 | static int perform_rpc(struct era *era, struct rpc *rpc) | ||
1316 | { | ||
1317 | rpc->result = 0; | ||
1318 | init_completion(&rpc->complete); | ||
1319 | |||
1320 | spin_lock(&era->rpc_lock); | ||
1321 | list_add(&rpc->list, &era->rpc_calls); | ||
1322 | spin_unlock(&era->rpc_lock); | ||
1323 | |||
1324 | wake_worker(era); | ||
1325 | wait_for_completion(&rpc->complete); | ||
1326 | |||
1327 | return rpc->result; | ||
1328 | } | ||
1329 | |||
1330 | static int in_worker0(struct era *era, int (*fn)(struct era_metadata *)) | ||
1331 | { | ||
1332 | struct rpc rpc; | ||
1333 | rpc.fn0 = fn; | ||
1334 | rpc.fn1 = NULL; | ||
1335 | |||
1336 | return perform_rpc(era, &rpc); | ||
1337 | } | ||
1338 | |||
1339 | static int in_worker1(struct era *era, | ||
1340 | int (*fn)(struct era_metadata *, void *), void *arg) | ||
1341 | { | ||
1342 | struct rpc rpc; | ||
1343 | rpc.fn0 = NULL; | ||
1344 | rpc.fn1 = fn; | ||
1345 | rpc.arg = arg; | ||
1346 | |||
1347 | return perform_rpc(era, &rpc); | ||
1348 | } | ||
1349 | |||
1350 | static void start_worker(struct era *era) | ||
1351 | { | ||
1352 | atomic_set(&era->suspended, 0); | ||
1353 | } | ||
1354 | |||
1355 | static void stop_worker(struct era *era) | ||
1356 | { | ||
1357 | atomic_set(&era->suspended, 1); | ||
1358 | flush_workqueue(era->wq); | ||
1359 | } | ||
1360 | |||
1361 | /*---------------------------------------------------------------- | ||
1362 | * Target methods | ||
1363 | *--------------------------------------------------------------*/ | ||
1364 | static int dev_is_congested(struct dm_dev *dev, int bdi_bits) | ||
1365 | { | ||
1366 | struct request_queue *q = bdev_get_queue(dev->bdev); | ||
1367 | return bdi_congested(&q->backing_dev_info, bdi_bits); | ||
1368 | } | ||
1369 | |||
1370 | static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits) | ||
1371 | { | ||
1372 | struct era *era = container_of(cb, struct era, callbacks); | ||
1373 | return dev_is_congested(era->origin_dev, bdi_bits); | ||
1374 | } | ||
1375 | |||
1376 | static void era_destroy(struct era *era) | ||
1377 | { | ||
1378 | metadata_close(era->md); | ||
1379 | |||
1380 | if (era->wq) | ||
1381 | destroy_workqueue(era->wq); | ||
1382 | |||
1383 | if (era->origin_dev) | ||
1384 | dm_put_device(era->ti, era->origin_dev); | ||
1385 | |||
1386 | if (era->metadata_dev) | ||
1387 | dm_put_device(era->ti, era->metadata_dev); | ||
1388 | |||
1389 | kfree(era); | ||
1390 | } | ||
1391 | |||
1392 | static dm_block_t calc_nr_blocks(struct era *era) | ||
1393 | { | ||
1394 | return dm_sector_div_up(era->ti->len, era->sectors_per_block); | ||
1395 | } | ||
1396 | |||
1397 | static bool valid_block_size(dm_block_t block_size) | ||
1398 | { | ||
1399 | bool greater_than_zero = block_size > 0; | ||
1400 | bool multiple_of_min_block_size = (block_size & (MIN_BLOCK_SIZE - 1)) == 0; | ||
1401 | |||
1402 | return greater_than_zero && multiple_of_min_block_size; | ||
1403 | } | ||
1404 | |||
1405 | /* | ||
1406 | * <metadata dev> <data dev> <data block size (sectors)> | ||
1407 | */ | ||
1408 | static int era_ctr(struct dm_target *ti, unsigned argc, char **argv) | ||
1409 | { | ||
1410 | int r; | ||
1411 | char dummy; | ||
1412 | struct era *era; | ||
1413 | struct era_metadata *md; | ||
1414 | |||
1415 | if (argc != 3) { | ||
1416 | ti->error = "Invalid argument count"; | ||
1417 | return -EINVAL; | ||
1418 | } | ||
1419 | |||
1420 | era = kzalloc(sizeof(*era), GFP_KERNEL); | ||
1421 | if (!era) { | ||
1422 | ti->error = "Error allocating era structure"; | ||
1423 | return -ENOMEM; | ||
1424 | } | ||
1425 | |||
1426 | era->ti = ti; | ||
1427 | |||
1428 | r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &era->metadata_dev); | ||
1429 | if (r) { | ||
1430 | ti->error = "Error opening metadata device"; | ||
1431 | era_destroy(era); | ||
1432 | return -EINVAL; | ||
1433 | } | ||
1434 | |||
1435 | r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &era->origin_dev); | ||
1436 | if (r) { | ||
1437 | ti->error = "Error opening data device"; | ||
1438 | era_destroy(era); | ||
1439 | return -EINVAL; | ||
1440 | } | ||
1441 | |||
1442 | r = sscanf(argv[2], "%u%c", &era->sectors_per_block, &dummy); | ||
1443 | if (r != 1) { | ||
1444 | ti->error = "Error parsing block size"; | ||
1445 | era_destroy(era); | ||
1446 | return -EINVAL; | ||
1447 | } | ||
1448 | |||
1449 | r = dm_set_target_max_io_len(ti, era->sectors_per_block); | ||
1450 | if (r) { | ||
1451 | ti->error = "could not set max io len"; | ||
1452 | era_destroy(era); | ||
1453 | return -EINVAL; | ||
1454 | } | ||
1455 | |||
1456 | if (!valid_block_size(era->sectors_per_block)) { | ||
1457 | ti->error = "Invalid block size"; | ||
1458 | era_destroy(era); | ||
1459 | return -EINVAL; | ||
1460 | } | ||
1461 | if (era->sectors_per_block & (era->sectors_per_block - 1)) | ||
1462 | era->sectors_per_block_shift = -1; | ||
1463 | else | ||
1464 | era->sectors_per_block_shift = __ffs(era->sectors_per_block); | ||
1465 | |||
1466 | md = metadata_open(era->metadata_dev->bdev, era->sectors_per_block, true); | ||
1467 | if (IS_ERR(md)) { | ||
1468 | ti->error = "Error reading metadata"; | ||
1469 | era_destroy(era); | ||
1470 | return PTR_ERR(md); | ||
1471 | } | ||
1472 | era->md = md; | ||
1473 | |||
1474 | era->nr_blocks = calc_nr_blocks(era); | ||
1475 | |||
1476 | r = metadata_resize(era->md, &era->nr_blocks); | ||
1477 | if (r) { | ||
1478 | ti->error = "couldn't resize metadata"; | ||
1479 | era_destroy(era); | ||
1480 | return -ENOMEM; | ||
1481 | } | ||
1482 | |||
1483 | era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); | ||
1484 | if (!era->wq) { | ||
1485 | ti->error = "could not create workqueue for metadata object"; | ||
1486 | era_destroy(era); | ||
1487 | return -ENOMEM; | ||
1488 | } | ||
1489 | INIT_WORK(&era->worker, do_work); | ||
1490 | |||
1491 | spin_lock_init(&era->deferred_lock); | ||
1492 | bio_list_init(&era->deferred_bios); | ||
1493 | |||
1494 | spin_lock_init(&era->rpc_lock); | ||
1495 | INIT_LIST_HEAD(&era->rpc_calls); | ||
1496 | |||
1497 | ti->private = era; | ||
1498 | ti->num_flush_bios = 1; | ||
1499 | ti->flush_supported = true; | ||
1500 | |||
1501 | ti->num_discard_bios = 1; | ||
1502 | ti->discards_supported = true; | ||
1503 | era->callbacks.congested_fn = era_is_congested; | ||
1504 | dm_table_add_target_callbacks(ti->table, &era->callbacks); | ||
1505 | |||
1506 | return 0; | ||
1507 | } | ||
1508 | |||
1509 | static void era_dtr(struct dm_target *ti) | ||
1510 | { | ||
1511 | era_destroy(ti->private); | ||
1512 | } | ||
1513 | |||
1514 | static int era_map(struct dm_target *ti, struct bio *bio) | ||
1515 | { | ||
1516 | struct era *era = ti->private; | ||
1517 | dm_block_t block = get_block(era, bio); | ||
1518 | |||
1519 | /* | ||
1520 | * All bios get remapped to the origin device. We do this now, but | ||
1521 | * it may not get issued until later. Depending on whether the | ||
1522 | * block is marked in this era. | ||
1523 | */ | ||
1524 | remap_to_origin(era, bio); | ||
1525 | |||
1526 | /* | ||
1527 | * REQ_FLUSH bios carry no data, so we're not interested in them. | ||
1528 | */ | ||
1529 | if (!(bio->bi_rw & REQ_FLUSH) && | ||
1530 | (bio_data_dir(bio) == WRITE) && | ||
1531 | !metadata_current_marked(era->md, block)) { | ||
1532 | defer_bio(era, bio); | ||
1533 | return DM_MAPIO_SUBMITTED; | ||
1534 | } | ||
1535 | |||
1536 | return DM_MAPIO_REMAPPED; | ||
1537 | } | ||
1538 | |||
1539 | static void era_postsuspend(struct dm_target *ti) | ||
1540 | { | ||
1541 | int r; | ||
1542 | struct era *era = ti->private; | ||
1543 | |||
1544 | r = in_worker0(era, metadata_era_archive); | ||
1545 | if (r) { | ||
1546 | DMERR("%s: couldn't archive current era", __func__); | ||
1547 | /* FIXME: fail mode */ | ||
1548 | } | ||
1549 | |||
1550 | stop_worker(era); | ||
1551 | } | ||
1552 | |||
1553 | static int era_preresume(struct dm_target *ti) | ||
1554 | { | ||
1555 | int r; | ||
1556 | struct era *era = ti->private; | ||
1557 | dm_block_t new_size = calc_nr_blocks(era); | ||
1558 | |||
1559 | if (era->nr_blocks != new_size) { | ||
1560 | r = in_worker1(era, metadata_resize, &new_size); | ||
1561 | if (r) | ||
1562 | return r; | ||
1563 | |||
1564 | era->nr_blocks = new_size; | ||
1565 | } | ||
1566 | |||
1567 | start_worker(era); | ||
1568 | |||
1569 | r = in_worker0(era, metadata_new_era); | ||
1570 | if (r) { | ||
1571 | DMERR("%s: metadata_era_rollover failed", __func__); | ||
1572 | return r; | ||
1573 | } | ||
1574 | |||
1575 | return 0; | ||
1576 | } | ||
1577 | |||
1578 | /* | ||
1579 | * Status format: | ||
1580 | * | ||
1581 | * <metadata block size> <#used metadata blocks>/<#total metadata blocks> | ||
1582 | * <current era> <held metadata root | '-'> | ||
1583 | */ | ||
1584 | static void era_status(struct dm_target *ti, status_type_t type, | ||
1585 | unsigned status_flags, char *result, unsigned maxlen) | ||
1586 | { | ||
1587 | int r; | ||
1588 | struct era *era = ti->private; | ||
1589 | ssize_t sz = 0; | ||
1590 | struct metadata_stats stats; | ||
1591 | char buf[BDEVNAME_SIZE]; | ||
1592 | |||
1593 | switch (type) { | ||
1594 | case STATUSTYPE_INFO: | ||
1595 | r = in_worker1(era, metadata_get_stats, &stats); | ||
1596 | if (r) | ||
1597 | goto err; | ||
1598 | |||
1599 | DMEMIT("%u %llu/%llu %u", | ||
1600 | (unsigned) (DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT), | ||
1601 | (unsigned long long) stats.used, | ||
1602 | (unsigned long long) stats.total, | ||
1603 | (unsigned) stats.era); | ||
1604 | |||
1605 | if (stats.snap != SUPERBLOCK_LOCATION) | ||
1606 | DMEMIT(" %llu", stats.snap); | ||
1607 | else | ||
1608 | DMEMIT(" -"); | ||
1609 | break; | ||
1610 | |||
1611 | case STATUSTYPE_TABLE: | ||
1612 | format_dev_t(buf, era->metadata_dev->bdev->bd_dev); | ||
1613 | DMEMIT("%s ", buf); | ||
1614 | format_dev_t(buf, era->origin_dev->bdev->bd_dev); | ||
1615 | DMEMIT("%s %u", buf, era->sectors_per_block); | ||
1616 | break; | ||
1617 | } | ||
1618 | |||
1619 | return; | ||
1620 | |||
1621 | err: | ||
1622 | DMEMIT("Error"); | ||
1623 | } | ||
1624 | |||
1625 | static int era_message(struct dm_target *ti, unsigned argc, char **argv) | ||
1626 | { | ||
1627 | struct era *era = ti->private; | ||
1628 | |||
1629 | if (argc != 1) { | ||
1630 | DMERR("incorrect number of message arguments"); | ||
1631 | return -EINVAL; | ||
1632 | } | ||
1633 | |||
1634 | if (!strcasecmp(argv[0], "checkpoint")) | ||
1635 | return in_worker0(era, metadata_checkpoint); | ||
1636 | |||
1637 | if (!strcasecmp(argv[0], "take_metadata_snap")) | ||
1638 | return in_worker0(era, metadata_take_snap); | ||
1639 | |||
1640 | if (!strcasecmp(argv[0], "drop_metadata_snap")) | ||
1641 | return in_worker0(era, metadata_drop_snap); | ||
1642 | |||
1643 | DMERR("unsupported message '%s'", argv[0]); | ||
1644 | return -EINVAL; | ||
1645 | } | ||
1646 | |||
1647 | static sector_t get_dev_size(struct dm_dev *dev) | ||
1648 | { | ||
1649 | return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; | ||
1650 | } | ||
1651 | |||
1652 | static int era_iterate_devices(struct dm_target *ti, | ||
1653 | iterate_devices_callout_fn fn, void *data) | ||
1654 | { | ||
1655 | struct era *era = ti->private; | ||
1656 | return fn(ti, era->origin_dev, 0, get_dev_size(era->origin_dev), data); | ||
1657 | } | ||
1658 | |||
1659 | static int era_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | ||
1660 | struct bio_vec *biovec, int max_size) | ||
1661 | { | ||
1662 | struct era *era = ti->private; | ||
1663 | struct request_queue *q = bdev_get_queue(era->origin_dev->bdev); | ||
1664 | |||
1665 | if (!q->merge_bvec_fn) | ||
1666 | return max_size; | ||
1667 | |||
1668 | bvm->bi_bdev = era->origin_dev->bdev; | ||
1669 | |||
1670 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | ||
1671 | } | ||
1672 | |||
1673 | static void era_io_hints(struct dm_target *ti, struct queue_limits *limits) | ||
1674 | { | ||
1675 | struct era *era = ti->private; | ||
1676 | uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; | ||
1677 | |||
1678 | /* | ||
1679 | * If the system-determined stacked limits are compatible with the | ||
1680 | * era device's blocksize (io_opt is a factor) do not override them. | ||
1681 | */ | ||
1682 | if (io_opt_sectors < era->sectors_per_block || | ||
1683 | do_div(io_opt_sectors, era->sectors_per_block)) { | ||
1684 | blk_limits_io_min(limits, 0); | ||
1685 | blk_limits_io_opt(limits, era->sectors_per_block << SECTOR_SHIFT); | ||
1686 | } | ||
1687 | } | ||
1688 | |||
1689 | /*----------------------------------------------------------------*/ | ||
1690 | |||
1691 | static struct target_type era_target = { | ||
1692 | .name = "era", | ||
1693 | .version = {1, 0, 0}, | ||
1694 | .module = THIS_MODULE, | ||
1695 | .ctr = era_ctr, | ||
1696 | .dtr = era_dtr, | ||
1697 | .map = era_map, | ||
1698 | .postsuspend = era_postsuspend, | ||
1699 | .preresume = era_preresume, | ||
1700 | .status = era_status, | ||
1701 | .message = era_message, | ||
1702 | .iterate_devices = era_iterate_devices, | ||
1703 | .merge = era_merge, | ||
1704 | .io_hints = era_io_hints | ||
1705 | }; | ||
1706 | |||
1707 | static int __init dm_era_init(void) | ||
1708 | { | ||
1709 | int r; | ||
1710 | |||
1711 | r = dm_register_target(&era_target); | ||
1712 | if (r) { | ||
1713 | DMERR("era target registration failed: %d", r); | ||
1714 | return r; | ||
1715 | } | ||
1716 | |||
1717 | return 0; | ||
1718 | } | ||
1719 | |||
1720 | static void __exit dm_era_exit(void) | ||
1721 | { | ||
1722 | dm_unregister_target(&era_target); | ||
1723 | } | ||
1724 | |||
1725 | module_init(dm_era_init); | ||
1726 | module_exit(dm_era_exit); | ||
1727 | |||
1728 | MODULE_DESCRIPTION(DM_NAME " era target"); | ||
1729 | MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); | ||
1730 | MODULE_LICENSE("GPL"); | ||