diff options
author | Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | 2008-10-16 10:14:27 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2008-10-16 10:14:27 -0400 |
commit | c894058d66637c7720569fbe12957f4de64d9991 (patch) | |
tree | f13472d7fd76155f1365550515997a24aff611c9 | |
parent | c2774d84fd6cab2bfa2a2fae0b1ca8d8ebde48a2 (diff) |
ext4: Use an rbtree for tracking blocks freed during transaction.
With this patch we track the block freed during a transaction using
red-black tree. We also make sure contiguous blocks freed are collected
in one node in the tree.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
-rw-r--r-- | fs/ext4/mballoc.c | 184 | ||||
-rw-r--r-- | fs/ext4/mballoc.h | 26 |
2 files changed, 133 insertions, 77 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 154f8dec97e..bd9b011941a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2300 | } | 2300 | } |
2301 | 2301 | ||
2302 | INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); | 2302 | INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); |
2303 | meta_group_info[i]->bb_free_root.rb_node = NULL;; | ||
2303 | 2304 | ||
2304 | #ifdef DOUBLE_CHECK | 2305 | #ifdef DOUBLE_CHECK |
2305 | { | 2306 | { |
@@ -2647,13 +2648,11 @@ int ext4_mb_release(struct super_block *sb) | |||
2647 | static noinline_for_stack void | 2648 | static noinline_for_stack void |
2648 | ext4_mb_free_committed_blocks(struct super_block *sb) | 2649 | ext4_mb_free_committed_blocks(struct super_block *sb) |
2649 | { | 2650 | { |
2650 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2651 | int err; | ||
2652 | int i; | ||
2653 | int count = 0; | ||
2654 | int count2 = 0; | ||
2655 | struct ext4_free_metadata *md; | ||
2656 | struct ext4_buddy e4b; | 2651 | struct ext4_buddy e4b; |
2652 | struct ext4_group_info *db; | ||
2653 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2654 | int err, count = 0, count2 = 0; | ||
2655 | struct ext4_free_data *entry; | ||
2657 | 2656 | ||
2658 | if (list_empty(&sbi->s_committed_transaction)) | 2657 | if (list_empty(&sbi->s_committed_transaction)) |
2659 | return; | 2658 | return; |
@@ -2661,44 +2660,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb) | |||
2661 | /* there is committed blocks to be freed yet */ | 2660 | /* there is committed blocks to be freed yet */ |
2662 | do { | 2661 | do { |
2663 | /* get next array of blocks */ | 2662 | /* get next array of blocks */ |
2664 | md = NULL; | 2663 | entry = NULL; |
2665 | spin_lock(&sbi->s_md_lock); | 2664 | spin_lock(&sbi->s_md_lock); |
2666 | if (!list_empty(&sbi->s_committed_transaction)) { | 2665 | if (!list_empty(&sbi->s_committed_transaction)) { |
2667 | md = list_entry(sbi->s_committed_transaction.next, | 2666 | entry = list_entry(sbi->s_committed_transaction.next, |
2668 | struct ext4_free_metadata, list); | 2667 | struct ext4_free_data, list); |
2669 | list_del(&md->list); | 2668 | list_del(&entry->list); |
2670 | } | 2669 | } |
2671 | spin_unlock(&sbi->s_md_lock); | 2670 | spin_unlock(&sbi->s_md_lock); |
2672 | 2671 | ||
2673 | if (md == NULL) | 2672 | if (entry == NULL) |
2674 | break; | 2673 | break; |
2675 | 2674 | ||
2676 | mb_debug("gonna free %u blocks in group %lu (0x%p):", | 2675 | mb_debug("gonna free %u blocks in group %lu (0x%p):", |
2677 | md->num, md->group, md); | 2676 | entry->count, entry->group, entry); |
2678 | 2677 | ||
2679 | err = ext4_mb_load_buddy(sb, md->group, &e4b); | 2678 | err = ext4_mb_load_buddy(sb, entry->group, &e4b); |
2680 | /* we expect to find existing buddy because it's pinned */ | 2679 | /* we expect to find existing buddy because it's pinned */ |
2681 | BUG_ON(err != 0); | 2680 | BUG_ON(err != 0); |
2682 | 2681 | ||
2682 | db = e4b.bd_info; | ||
2683 | /* there are blocks to put in buddy to make them really free */ | 2683 | /* there are blocks to put in buddy to make them really free */ |
2684 | count += md->num; | 2684 | count += entry->count; |
2685 | count2++; | 2685 | count2++; |
2686 | ext4_lock_group(sb, md->group); | 2686 | ext4_lock_group(sb, entry->group); |
2687 | for (i = 0; i < md->num; i++) { | 2687 | /* Take it out of per group rb tree */ |
2688 | mb_debug(" %u", md->blocks[i]); | 2688 | rb_erase(&entry->node, &(db->bb_free_root)); |
2689 | mb_free_blocks(NULL, &e4b, md->blocks[i], 1); | 2689 | mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); |
2690 | |||
2691 | if (!db->bb_free_root.rb_node) { | ||
2692 | /* No more items in the per group rb tree | ||
2693 | * balance refcounts from ext4_mb_free_metadata() | ||
2694 | */ | ||
2695 | page_cache_release(e4b.bd_buddy_page); | ||
2696 | page_cache_release(e4b.bd_bitmap_page); | ||
2690 | } | 2697 | } |
2691 | mb_debug("\n"); | 2698 | ext4_unlock_group(sb, entry->group); |
2692 | ext4_unlock_group(sb, md->group); | ||
2693 | |||
2694 | /* balance refcounts from ext4_mb_free_metadata() */ | ||
2695 | page_cache_release(e4b.bd_buddy_page); | ||
2696 | page_cache_release(e4b.bd_bitmap_page); | ||
2697 | 2699 | ||
2698 | kfree(md); | 2700 | kmem_cache_free(ext4_free_ext_cachep, entry); |
2699 | ext4_mb_release_desc(&e4b); | 2701 | ext4_mb_release_desc(&e4b); |
2700 | 2702 | } while (1); | |
2701 | } while (md); | ||
2702 | 2703 | ||
2703 | mb_debug("freed %u blocks in %u structures\n", count, count2); | 2704 | mb_debug("freed %u blocks in %u structures\n", count, count2); |
2704 | } | 2705 | } |
@@ -2771,6 +2772,16 @@ int __init init_ext4_mballoc(void) | |||
2771 | kmem_cache_destroy(ext4_pspace_cachep); | 2772 | kmem_cache_destroy(ext4_pspace_cachep); |
2772 | return -ENOMEM; | 2773 | return -ENOMEM; |
2773 | } | 2774 | } |
2775 | |||
2776 | ext4_free_ext_cachep = | ||
2777 | kmem_cache_create("ext4_free_block_extents", | ||
2778 | sizeof(struct ext4_free_data), | ||
2779 | 0, SLAB_RECLAIM_ACCOUNT, NULL); | ||
2780 | if (ext4_free_ext_cachep == NULL) { | ||
2781 | kmem_cache_destroy(ext4_pspace_cachep); | ||
2782 | kmem_cache_destroy(ext4_ac_cachep); | ||
2783 | return -ENOMEM; | ||
2784 | } | ||
2774 | return 0; | 2785 | return 0; |
2775 | } | 2786 | } |
2776 | 2787 | ||
@@ -2779,6 +2790,7 @@ void exit_ext4_mballoc(void) | |||
2779 | /* XXX: synchronize_rcu(); */ | 2790 | /* XXX: synchronize_rcu(); */ |
2780 | kmem_cache_destroy(ext4_pspace_cachep); | 2791 | kmem_cache_destroy(ext4_pspace_cachep); |
2781 | kmem_cache_destroy(ext4_ac_cachep); | 2792 | kmem_cache_destroy(ext4_ac_cachep); |
2793 | kmem_cache_destroy(ext4_free_ext_cachep); | ||
2782 | } | 2794 | } |
2783 | 2795 | ||
2784 | 2796 | ||
@@ -4415,6 +4427,21 @@ static void ext4_mb_poll_new_transaction(struct super_block *sb, | |||
4415 | ext4_mb_free_committed_blocks(sb); | 4427 | ext4_mb_free_committed_blocks(sb); |
4416 | } | 4428 | } |
4417 | 4429 | ||
4430 | /* | ||
4431 | * We can merge two free data extents only if the physical blocks | ||
4432 | * are contiguous, AND the extents were freed by the same transaction, | ||
4433 | * AND the blocks are associated with the same group. | ||
4434 | */ | ||
4435 | static int can_merge(struct ext4_free_data *entry1, | ||
4436 | struct ext4_free_data *entry2) | ||
4437 | { | ||
4438 | if ((entry1->t_tid == entry2->t_tid) && | ||
4439 | (entry1->group == entry2->group) && | ||
4440 | ((entry1->start_blk + entry1->count) == entry2->start_blk)) | ||
4441 | return 1; | ||
4442 | return 0; | ||
4443 | } | ||
4444 | |||
4418 | static noinline_for_stack int | 4445 | static noinline_for_stack int |
4419 | ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | 4446 | ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, |
4420 | ext4_group_t group, ext4_grpblk_t block, int count) | 4447 | ext4_group_t group, ext4_grpblk_t block, int count) |
@@ -4422,57 +4449,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | |||
4422 | struct ext4_group_info *db = e4b->bd_info; | 4449 | struct ext4_group_info *db = e4b->bd_info; |
4423 | struct super_block *sb = e4b->bd_sb; | 4450 | struct super_block *sb = e4b->bd_sb; |
4424 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 4451 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
4425 | struct ext4_free_metadata *md; | 4452 | struct ext4_free_data *entry, *new_entry; |
4426 | int i; | 4453 | struct rb_node **n = &db->bb_free_root.rb_node, *node; |
4454 | struct rb_node *parent = NULL, *new_node; | ||
4455 | |||
4427 | 4456 | ||
4428 | BUG_ON(e4b->bd_bitmap_page == NULL); | 4457 | BUG_ON(e4b->bd_bitmap_page == NULL); |
4429 | BUG_ON(e4b->bd_buddy_page == NULL); | 4458 | BUG_ON(e4b->bd_buddy_page == NULL); |
4430 | 4459 | ||
4460 | new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); | ||
4461 | new_entry->start_blk = block; | ||
4462 | new_entry->group = group; | ||
4463 | new_entry->count = count; | ||
4464 | new_entry->t_tid = handle->h_transaction->t_tid; | ||
4465 | new_node = &new_entry->node; | ||
4466 | |||
4431 | ext4_lock_group(sb, group); | 4467 | ext4_lock_group(sb, group); |
4432 | for (i = 0; i < count; i++) { | 4468 | if (!*n) { |
4433 | md = db->bb_md_cur; | 4469 | /* first free block exent. We need to |
4434 | if (md && db->bb_tid != handle->h_transaction->t_tid) { | 4470 | protect buddy cache from being freed, |
4435 | db->bb_md_cur = NULL; | 4471 | * otherwise we'll refresh it from |
4436 | md = NULL; | 4472 | * on-disk bitmap and lose not-yet-available |
4473 | * blocks */ | ||
4474 | page_cache_get(e4b->bd_buddy_page); | ||
4475 | page_cache_get(e4b->bd_bitmap_page); | ||
4476 | } | ||
4477 | while (*n) { | ||
4478 | parent = *n; | ||
4479 | entry = rb_entry(parent, struct ext4_free_data, node); | ||
4480 | if (block < entry->start_blk) | ||
4481 | n = &(*n)->rb_left; | ||
4482 | else if (block >= (entry->start_blk + entry->count)) | ||
4483 | n = &(*n)->rb_right; | ||
4484 | else { | ||
4485 | ext4_error(sb, __func__, | ||
4486 | "Double free of blocks %d (%d %d)\n", | ||
4487 | block, entry->start_blk, entry->count); | ||
4488 | return 0; | ||
4437 | } | 4489 | } |
4490 | } | ||
4438 | 4491 | ||
4439 | if (md == NULL) { | 4492 | rb_link_node(new_node, parent, n); |
4440 | ext4_unlock_group(sb, group); | 4493 | rb_insert_color(new_node, &db->bb_free_root); |
4441 | md = kmalloc(sizeof(*md), GFP_NOFS); | 4494 | |
4442 | if (md == NULL) | 4495 | /* Now try to see the extent can be merged to left and right */ |
4443 | return -ENOMEM; | 4496 | node = rb_prev(new_node); |
4444 | md->num = 0; | 4497 | if (node) { |
4445 | md->group = group; | 4498 | entry = rb_entry(node, struct ext4_free_data, node); |
4446 | 4499 | if (can_merge(entry, new_entry)) { | |
4447 | ext4_lock_group(sb, group); | 4500 | new_entry->start_blk = entry->start_blk; |
4448 | if (db->bb_md_cur == NULL) { | 4501 | new_entry->count += entry->count; |
4449 | spin_lock(&sbi->s_md_lock); | 4502 | rb_erase(node, &(db->bb_free_root)); |
4450 | list_add(&md->list, &sbi->s_active_transaction); | 4503 | spin_lock(&sbi->s_md_lock); |
4451 | spin_unlock(&sbi->s_md_lock); | 4504 | list_del(&entry->list); |
4452 | /* protect buddy cache from being freed, | 4505 | spin_unlock(&sbi->s_md_lock); |
4453 | * otherwise we'll refresh it from | 4506 | kmem_cache_free(ext4_free_ext_cachep, entry); |
4454 | * on-disk bitmap and lose not-yet-available | ||
4455 | * blocks */ | ||
4456 | page_cache_get(e4b->bd_buddy_page); | ||
4457 | page_cache_get(e4b->bd_bitmap_page); | ||
4458 | db->bb_md_cur = md; | ||
4459 | db->bb_tid = handle->h_transaction->t_tid; | ||
4460 | mb_debug("new md 0x%p for group %lu\n", | ||
4461 | md, md->group); | ||
4462 | } else { | ||
4463 | kfree(md); | ||
4464 | md = db->bb_md_cur; | ||
4465 | } | ||
4466 | } | 4507 | } |
4508 | } | ||
4467 | 4509 | ||
4468 | BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); | 4510 | node = rb_next(new_node); |
4469 | md->blocks[md->num] = block + i; | 4511 | if (node) { |
4470 | md->num++; | 4512 | entry = rb_entry(node, struct ext4_free_data, node); |
4471 | if (md->num == EXT4_BB_MAX_BLOCKS) { | 4513 | if (can_merge(new_entry, entry)) { |
4472 | /* no more space, put full container on a sb's list */ | 4514 | new_entry->count += entry->count; |
4473 | db->bb_md_cur = NULL; | 4515 | rb_erase(node, &(db->bb_free_root)); |
4516 | spin_lock(&sbi->s_md_lock); | ||
4517 | list_del(&entry->list); | ||
4518 | spin_unlock(&sbi->s_md_lock); | ||
4519 | kmem_cache_free(ext4_free_ext_cachep, entry); | ||
4474 | } | 4520 | } |
4475 | } | 4521 | } |
4522 | /* Add the extent to active_transaction list */ | ||
4523 | spin_lock(&sbi->s_md_lock); | ||
4524 | list_add(&new_entry->list, &sbi->s_active_transaction); | ||
4525 | spin_unlock(&sbi->s_md_lock); | ||
4476 | ext4_unlock_group(sb, group); | 4526 | ext4_unlock_group(sb, group); |
4477 | return 0; | 4527 | return 0; |
4478 | } | 4528 | } |
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b3b4828f8b8..9e815c4e37d 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h | |||
@@ -98,23 +98,29 @@ | |||
98 | 98 | ||
99 | static struct kmem_cache *ext4_pspace_cachep; | 99 | static struct kmem_cache *ext4_pspace_cachep; |
100 | static struct kmem_cache *ext4_ac_cachep; | 100 | static struct kmem_cache *ext4_ac_cachep; |
101 | static struct kmem_cache *ext4_free_ext_cachep; | ||
101 | 102 | ||
102 | #ifdef EXT4_BB_MAX_BLOCKS | 103 | struct ext4_free_data { |
103 | #undef EXT4_BB_MAX_BLOCKS | 104 | /* this links the free block information from group_info */ |
104 | #endif | 105 | struct rb_node node; |
105 | #define EXT4_BB_MAX_BLOCKS 30 | ||
106 | 106 | ||
107 | struct ext4_free_metadata { | 107 | /* this links the free block information from ext4_sb_info */ |
108 | ext4_group_t group; | ||
109 | unsigned short num; | ||
110 | ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; | ||
111 | struct list_head list; | 108 | struct list_head list; |
109 | |||
110 | /* group which free block extent belongs */ | ||
111 | ext4_group_t group; | ||
112 | |||
113 | /* free block extent */ | ||
114 | ext4_grpblk_t start_blk; | ||
115 | ext4_grpblk_t count; | ||
116 | |||
117 | /* transaction which freed this extent */ | ||
118 | tid_t t_tid; | ||
112 | }; | 119 | }; |
113 | 120 | ||
114 | struct ext4_group_info { | 121 | struct ext4_group_info { |
115 | unsigned long bb_state; | 122 | unsigned long bb_state; |
116 | unsigned long bb_tid; | 123 | struct rb_root bb_free_root; |
117 | struct ext4_free_metadata *bb_md_cur; | ||
118 | unsigned short bb_first_free; | 124 | unsigned short bb_first_free; |
119 | unsigned short bb_free; | 125 | unsigned short bb_free; |
120 | unsigned short bb_fragments; | 126 | unsigned short bb_fragments; |