diff options
author | Jan Schmidt <list.btrfs@jan-o-sch.net> | 2012-05-16 11:18:50 -0400 |
---|---|---|
committer | Jan Schmidt <list.btrfs@jan-o-sch.net> | 2012-05-30 09:17:01 -0400 |
commit | bd989ba359f2acb8bc5f5490e19010fc0a6f8356 (patch) | |
tree | 3689629e226678802bfe9b8a91ae560ef50a93f0 /fs/btrfs/ctree.c | |
parent | f29021b29a85701c08afadfd51d87163fb078059 (diff) |
Btrfs: add tree modification log functions
The tree mod log will log modifications made fs-tree nodes. Most
modifications are done by autobalance of the tree. Such changes are recorded
as long as a block entry exists. When released, the log is cleaned.
With the tree modification log, it's possible to reconstruct a consistent
old state of the tree. This is required to do backref walking on a busy
file system.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Diffstat (limited to 'fs/btrfs/ctree.c')
-rw-r--r-- | fs/btrfs/ctree.c | 408 |
1 files changed, 407 insertions, 1 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 56485b3b7c31..72b9f97e2fdc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c | |||
@@ -18,6 +18,7 @@ | |||
18 | 18 | ||
19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
20 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
21 | #include <linux/rbtree.h> | ||
21 | #include "ctree.h" | 22 | #include "ctree.h" |
22 | #include "disk-io.h" | 23 | #include "disk-io.h" |
23 | #include "transaction.h" | 24 | #include "transaction.h" |
@@ -288,6 +289,412 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, | |||
288 | return 0; | 289 | return 0; |
289 | } | 290 | } |
290 | 291 | ||
292 | enum mod_log_op { | ||
293 | MOD_LOG_KEY_REPLACE, | ||
294 | MOD_LOG_KEY_ADD, | ||
295 | MOD_LOG_KEY_REMOVE, | ||
296 | MOD_LOG_KEY_REMOVE_WHILE_FREEING, | ||
297 | MOD_LOG_KEY_REMOVE_WHILE_MOVING, | ||
298 | MOD_LOG_MOVE_KEYS, | ||
299 | MOD_LOG_ROOT_REPLACE, | ||
300 | }; | ||
301 | |||
302 | struct tree_mod_move { | ||
303 | int dst_slot; | ||
304 | int nr_items; | ||
305 | }; | ||
306 | |||
307 | struct tree_mod_root { | ||
308 | u64 logical; | ||
309 | u8 level; | ||
310 | }; | ||
311 | |||
312 | struct tree_mod_elem { | ||
313 | struct rb_node node; | ||
314 | u64 index; /* shifted logical */ | ||
315 | struct seq_list elem; | ||
316 | enum mod_log_op op; | ||
317 | |||
318 | /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ | ||
319 | int slot; | ||
320 | |||
321 | /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ | ||
322 | u64 generation; | ||
323 | |||
324 | /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ | ||
325 | struct btrfs_disk_key key; | ||
326 | u64 blockptr; | ||
327 | |||
328 | /* this is used for op == MOD_LOG_MOVE_KEYS */ | ||
329 | struct tree_mod_move move; | ||
330 | |||
331 | /* this is used for op == MOD_LOG_ROOT_REPLACE */ | ||
332 | struct tree_mod_root old_root; | ||
333 | }; | ||
334 | |||
335 | static inline void | ||
336 | __get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) | ||
337 | { | ||
338 | elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); | ||
339 | list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); | ||
340 | } | ||
341 | |||
342 | void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, | ||
343 | struct seq_list *elem) | ||
344 | { | ||
345 | elem->flags = 1; | ||
346 | spin_lock(&fs_info->tree_mod_seq_lock); | ||
347 | __get_tree_mod_seq(fs_info, elem); | ||
348 | spin_unlock(&fs_info->tree_mod_seq_lock); | ||
349 | } | ||
350 | |||
351 | void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, | ||
352 | struct seq_list *elem) | ||
353 | { | ||
354 | struct rb_root *tm_root; | ||
355 | struct rb_node *node; | ||
356 | struct rb_node *next; | ||
357 | struct seq_list *cur_elem; | ||
358 | struct tree_mod_elem *tm; | ||
359 | u64 min_seq = (u64)-1; | ||
360 | u64 seq_putting = elem->seq; | ||
361 | |||
362 | if (!seq_putting) | ||
363 | return; | ||
364 | |||
365 | BUG_ON(!(elem->flags & 1)); | ||
366 | spin_lock(&fs_info->tree_mod_seq_lock); | ||
367 | list_del(&elem->list); | ||
368 | |||
369 | list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { | ||
370 | if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { | ||
371 | if (seq_putting > cur_elem->seq) { | ||
372 | /* | ||
373 | * blocker with lower sequence number exists, we | ||
374 | * cannot remove anything from the log | ||
375 | */ | ||
376 | goto out; | ||
377 | } | ||
378 | min_seq = cur_elem->seq; | ||
379 | } | ||
380 | } | ||
381 | |||
382 | /* | ||
383 | * anything that's lower than the lowest existing (read: blocked) | ||
384 | * sequence number can be removed from the tree. | ||
385 | */ | ||
386 | write_lock(&fs_info->tree_mod_log_lock); | ||
387 | tm_root = &fs_info->tree_mod_log; | ||
388 | for (node = rb_first(tm_root); node; node = next) { | ||
389 | next = rb_next(node); | ||
390 | tm = container_of(node, struct tree_mod_elem, node); | ||
391 | if (tm->elem.seq > min_seq) | ||
392 | continue; | ||
393 | rb_erase(node, tm_root); | ||
394 | list_del(&tm->elem.list); | ||
395 | kfree(tm); | ||
396 | } | ||
397 | write_unlock(&fs_info->tree_mod_log_lock); | ||
398 | out: | ||
399 | spin_unlock(&fs_info->tree_mod_seq_lock); | ||
400 | } | ||
401 | |||
402 | /* | ||
403 | * key order of the log: | ||
404 | * index -> sequence | ||
405 | * | ||
406 | * the index is the shifted logical of the *new* root node for root replace | ||
407 | * operations, or the shifted logical of the affected block for all other | ||
408 | * operations. | ||
409 | */ | ||
410 | static noinline int | ||
411 | __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) | ||
412 | { | ||
413 | struct rb_root *tm_root; | ||
414 | struct rb_node **new; | ||
415 | struct rb_node *parent = NULL; | ||
416 | struct tree_mod_elem *cur; | ||
417 | int ret = 0; | ||
418 | |||
419 | BUG_ON(!tm || !tm->elem.seq); | ||
420 | |||
421 | write_lock(&fs_info->tree_mod_log_lock); | ||
422 | tm_root = &fs_info->tree_mod_log; | ||
423 | new = &tm_root->rb_node; | ||
424 | while (*new) { | ||
425 | cur = container_of(*new, struct tree_mod_elem, node); | ||
426 | parent = *new; | ||
427 | if (cur->index < tm->index) | ||
428 | new = &((*new)->rb_left); | ||
429 | else if (cur->index > tm->index) | ||
430 | new = &((*new)->rb_right); | ||
431 | else if (cur->elem.seq < tm->elem.seq) | ||
432 | new = &((*new)->rb_left); | ||
433 | else if (cur->elem.seq > tm->elem.seq) | ||
434 | new = &((*new)->rb_right); | ||
435 | else { | ||
436 | kfree(tm); | ||
437 | ret = -EEXIST; | ||
438 | goto unlock; | ||
439 | } | ||
440 | } | ||
441 | |||
442 | rb_link_node(&tm->node, parent, new); | ||
443 | rb_insert_color(&tm->node, tm_root); | ||
444 | unlock: | ||
445 | write_unlock(&fs_info->tree_mod_log_lock); | ||
446 | return ret; | ||
447 | } | ||
448 | |||
449 | int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, | ||
450 | struct tree_mod_elem **tm_ret) | ||
451 | { | ||
452 | struct tree_mod_elem *tm; | ||
453 | u64 seq = 0; | ||
454 | |||
455 | smp_mb(); | ||
456 | if (list_empty(&fs_info->tree_mod_seq_list)) | ||
457 | return 0; | ||
458 | |||
459 | tm = *tm_ret = kzalloc(sizeof(*tm), flags); | ||
460 | if (!tm) | ||
461 | return -ENOMEM; | ||
462 | |||
463 | __get_tree_mod_seq(fs_info, &tm->elem); | ||
464 | seq = tm->elem.seq; | ||
465 | tm->elem.flags = 0; | ||
466 | |||
467 | return seq; | ||
468 | } | ||
469 | |||
470 | static noinline int | ||
471 | tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, | ||
472 | struct extent_buffer *eb, int slot, | ||
473 | enum mod_log_op op, gfp_t flags) | ||
474 | { | ||
475 | struct tree_mod_elem *tm; | ||
476 | int ret; | ||
477 | |||
478 | ret = tree_mod_alloc(fs_info, flags, &tm); | ||
479 | if (ret <= 0) | ||
480 | return ret; | ||
481 | |||
482 | tm->index = eb->start >> PAGE_CACHE_SHIFT; | ||
483 | if (op != MOD_LOG_KEY_ADD) { | ||
484 | btrfs_node_key(eb, &tm->key, slot); | ||
485 | tm->blockptr = btrfs_node_blockptr(eb, slot); | ||
486 | } | ||
487 | tm->op = op; | ||
488 | tm->slot = slot; | ||
489 | tm->generation = btrfs_node_ptr_generation(eb, slot); | ||
490 | |||
491 | return __tree_mod_log_insert(fs_info, tm); | ||
492 | } | ||
493 | |||
494 | static noinline int | ||
495 | tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, | ||
496 | int slot, enum mod_log_op op) | ||
497 | { | ||
498 | return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); | ||
499 | } | ||
500 | |||
501 | static noinline int | ||
502 | tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, | ||
503 | struct extent_buffer *eb, int dst_slot, int src_slot, | ||
504 | int nr_items, gfp_t flags) | ||
505 | { | ||
506 | struct tree_mod_elem *tm; | ||
507 | int ret; | ||
508 | int i; | ||
509 | |||
510 | ret = tree_mod_alloc(fs_info, flags, &tm); | ||
511 | if (ret <= 0) | ||
512 | return ret; | ||
513 | |||
514 | for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { | ||
515 | ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, | ||
516 | MOD_LOG_KEY_REMOVE_WHILE_MOVING); | ||
517 | BUG_ON(ret < 0); | ||
518 | } | ||
519 | |||
520 | tm->index = eb->start >> PAGE_CACHE_SHIFT; | ||
521 | tm->slot = src_slot; | ||
522 | tm->move.dst_slot = dst_slot; | ||
523 | tm->move.nr_items = nr_items; | ||
524 | tm->op = MOD_LOG_MOVE_KEYS; | ||
525 | |||
526 | return __tree_mod_log_insert(fs_info, tm); | ||
527 | } | ||
528 | |||
529 | static noinline int | ||
530 | tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, | ||
531 | struct extent_buffer *old_root, | ||
532 | struct extent_buffer *new_root, gfp_t flags) | ||
533 | { | ||
534 | struct tree_mod_elem *tm; | ||
535 | int ret; | ||
536 | |||
537 | ret = tree_mod_alloc(fs_info, flags, &tm); | ||
538 | if (ret <= 0) | ||
539 | return ret; | ||
540 | |||
541 | tm->index = new_root->start >> PAGE_CACHE_SHIFT; | ||
542 | tm->old_root.logical = old_root->start; | ||
543 | tm->old_root.level = btrfs_header_level(old_root); | ||
544 | tm->generation = btrfs_header_generation(old_root); | ||
545 | tm->op = MOD_LOG_ROOT_REPLACE; | ||
546 | |||
547 | return __tree_mod_log_insert(fs_info, tm); | ||
548 | } | ||
549 | |||
550 | static struct tree_mod_elem * | ||
551 | __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, | ||
552 | int smallest) | ||
553 | { | ||
554 | struct rb_root *tm_root; | ||
555 | struct rb_node *node; | ||
556 | struct tree_mod_elem *cur = NULL; | ||
557 | struct tree_mod_elem *found = NULL; | ||
558 | u64 index = start >> PAGE_CACHE_SHIFT; | ||
559 | |||
560 | read_lock(&fs_info->tree_mod_log_lock); | ||
561 | tm_root = &fs_info->tree_mod_log; | ||
562 | node = tm_root->rb_node; | ||
563 | while (node) { | ||
564 | cur = container_of(node, struct tree_mod_elem, node); | ||
565 | if (cur->index < index) { | ||
566 | node = node->rb_left; | ||
567 | } else if (cur->index > index) { | ||
568 | node = node->rb_right; | ||
569 | } else if (cur->elem.seq < min_seq) { | ||
570 | node = node->rb_left; | ||
571 | } else if (!smallest) { | ||
572 | /* we want the node with the highest seq */ | ||
573 | if (found) | ||
574 | BUG_ON(found->elem.seq > cur->elem.seq); | ||
575 | found = cur; | ||
576 | node = node->rb_left; | ||
577 | } else if (cur->elem.seq > min_seq) { | ||
578 | /* we want the node with the smallest seq */ | ||
579 | if (found) | ||
580 | BUG_ON(found->elem.seq < cur->elem.seq); | ||
581 | found = cur; | ||
582 | node = node->rb_right; | ||
583 | } else { | ||
584 | found = cur; | ||
585 | break; | ||
586 | } | ||
587 | } | ||
588 | read_unlock(&fs_info->tree_mod_log_lock); | ||
589 | |||
590 | return found; | ||
591 | } | ||
592 | |||
593 | /* | ||
594 | * this returns the element from the log with the smallest time sequence | ||
595 | * value that's in the log (the oldest log item). any element with a time | ||
596 | * sequence lower than min_seq will be ignored. | ||
597 | */ | ||
598 | static struct tree_mod_elem * | ||
599 | tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, | ||
600 | u64 min_seq) | ||
601 | { | ||
602 | return __tree_mod_log_search(fs_info, start, min_seq, 1); | ||
603 | } | ||
604 | |||
605 | /* | ||
606 | * this returns the element from the log with the largest time sequence | ||
607 | * value that's in the log (the most recent log item). any element with | ||
608 | * a time sequence lower than min_seq will be ignored. | ||
609 | */ | ||
610 | static struct tree_mod_elem * | ||
611 | tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) | ||
612 | { | ||
613 | return __tree_mod_log_search(fs_info, start, min_seq, 0); | ||
614 | } | ||
615 | |||
616 | static inline void | ||
617 | tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, | ||
618 | struct extent_buffer *src, unsigned long dst_offset, | ||
619 | unsigned long src_offset, int nr_items) | ||
620 | { | ||
621 | int ret; | ||
622 | int i; | ||
623 | |||
624 | smp_mb(); | ||
625 | if (list_empty(&fs_info->tree_mod_seq_list)) | ||
626 | return; | ||
627 | |||
628 | if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) | ||
629 | return; | ||
630 | |||
631 | /* speed this up by single seq for all operations? */ | ||
632 | for (i = 0; i < nr_items; i++) { | ||
633 | ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, | ||
634 | MOD_LOG_KEY_REMOVE); | ||
635 | BUG_ON(ret < 0); | ||
636 | ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, | ||
637 | MOD_LOG_KEY_ADD); | ||
638 | BUG_ON(ret < 0); | ||
639 | } | ||
640 | } | ||
641 | |||
642 | static inline void | ||
643 | tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, | ||
644 | int dst_offset, int src_offset, int nr_items) | ||
645 | { | ||
646 | int ret; | ||
647 | ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, | ||
648 | nr_items, GFP_NOFS); | ||
649 | BUG_ON(ret < 0); | ||
650 | } | ||
651 | |||
652 | static inline void | ||
653 | tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, | ||
654 | struct extent_buffer *eb, | ||
655 | struct btrfs_disk_key *disk_key, int slot, int atomic) | ||
656 | { | ||
657 | int ret; | ||
658 | |||
659 | ret = tree_mod_log_insert_key_mask(fs_info, eb, slot, | ||
660 | MOD_LOG_KEY_REPLACE, | ||
661 | atomic ? GFP_ATOMIC : GFP_NOFS); | ||
662 | BUG_ON(ret < 0); | ||
663 | } | ||
664 | |||
665 | static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, | ||
666 | struct extent_buffer *eb) | ||
667 | { | ||
668 | int i; | ||
669 | int ret; | ||
670 | u32 nritems; | ||
671 | |||
672 | smp_mb(); | ||
673 | if (list_empty(&fs_info->tree_mod_seq_list)) | ||
674 | return; | ||
675 | |||
676 | if (btrfs_header_level(eb) == 0) | ||
677 | return; | ||
678 | |||
679 | nritems = btrfs_header_nritems(eb); | ||
680 | for (i = nritems - 1; i >= 0; i--) { | ||
681 | ret = tree_mod_log_insert_key(fs_info, eb, i, | ||
682 | MOD_LOG_KEY_REMOVE_WHILE_FREEING); | ||
683 | BUG_ON(ret < 0); | ||
684 | } | ||
685 | } | ||
686 | |||
687 | static inline void | ||
688 | tree_mod_log_set_root_pointer(struct btrfs_root *root, | ||
689 | struct extent_buffer *new_root_node) | ||
690 | { | ||
691 | int ret; | ||
692 | tree_mod_log_free_eb(root->fs_info, root->node); | ||
693 | ret = tree_mod_log_insert_root(root->fs_info, root->node, | ||
694 | new_root_node, GFP_NOFS); | ||
695 | BUG_ON(ret < 0); | ||
696 | } | ||
697 | |||
291 | /* | 698 | /* |
292 | * check if the tree block can be shared by multiple trees | 699 | * check if the tree block can be shared by multiple trees |
293 | */ | 700 | */ |
@@ -2271,7 +2678,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans, | |||
2271 | (unsigned long)btrfs_header_chunk_tree_uuid(split), | 2678 | (unsigned long)btrfs_header_chunk_tree_uuid(split), |
2272 | BTRFS_UUID_SIZE); | 2679 | BTRFS_UUID_SIZE); |
2273 | 2680 | ||
2274 | |||
2275 | copy_extent_buffer(split, c, | 2681 | copy_extent_buffer(split, c, |
2276 | btrfs_node_key_ptr_offset(0), | 2682 | btrfs_node_key_ptr_offset(0), |
2277 | btrfs_node_key_ptr_offset(mid), | 2683 | btrfs_node_key_ptr_offset(mid), |