diff options
author | Yan, Zheng <zheng.yan@oracle.com> | 2009-09-24 09:17:31 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2009-09-24 09:17:31 -0400 |
commit | 0257bb82d21bedff26541bcf12f1461c23f9ed61 (patch) | |
tree | 0b79fd9bf377094f6fa6f5f923fe192515672af3 /fs | |
parent | f679a84034be6f7da123be786bbd8838bf3e9207 (diff) |
Btrfs: relocate file extents in clusters
The extent relocation code copy file extents one by one when
relocating data block group. This is inefficient if file
extents are small. This patch makes the relocation code copy
file extents in clusters. So we can can make better use of
read-ahead.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/relocation.c | 237 |
1 files changed, 148 insertions, 89 deletions
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 48a504260635..361ad323faac 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c | |||
@@ -121,6 +121,15 @@ struct inodevec { | |||
121 | int nr; | 121 | int nr; |
122 | }; | 122 | }; |
123 | 123 | ||
124 | #define MAX_EXTENTS 128 | ||
125 | |||
126 | struct file_extent_cluster { | ||
127 | u64 start; | ||
128 | u64 end; | ||
129 | u64 boundary[MAX_EXTENTS]; | ||
130 | unsigned int nr; | ||
131 | }; | ||
132 | |||
124 | struct reloc_control { | 133 | struct reloc_control { |
125 | /* block group to relocate */ | 134 | /* block group to relocate */ |
126 | struct btrfs_block_group_cache *block_group; | 135 | struct btrfs_block_group_cache *block_group; |
@@ -2529,56 +2538,94 @@ out: | |||
2529 | } | 2538 | } |
2530 | 2539 | ||
2531 | static noinline_for_stack | 2540 | static noinline_for_stack |
2532 | int relocate_inode_pages(struct inode *inode, u64 start, u64 len) | 2541 | int setup_extent_mapping(struct inode *inode, u64 start, u64 end, |
2542 | u64 block_start) | ||
2543 | { | ||
2544 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
2545 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
2546 | struct extent_map *em; | ||
2547 | int ret = 0; | ||
2548 | |||
2549 | em = alloc_extent_map(GFP_NOFS); | ||
2550 | if (!em) | ||
2551 | return -ENOMEM; | ||
2552 | |||
2553 | em->start = start; | ||
2554 | em->len = end + 1 - start; | ||
2555 | em->block_len = em->len; | ||
2556 | em->block_start = block_start; | ||
2557 | em->bdev = root->fs_info->fs_devices->latest_bdev; | ||
2558 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | ||
2559 | |||
2560 | lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); | ||
2561 | while (1) { | ||
2562 | write_lock(&em_tree->lock); | ||
2563 | ret = add_extent_mapping(em_tree, em); | ||
2564 | write_unlock(&em_tree->lock); | ||
2565 | if (ret != -EEXIST) { | ||
2566 | free_extent_map(em); | ||
2567 | break; | ||
2568 | } | ||
2569 | btrfs_drop_extent_cache(inode, start, end, 0); | ||
2570 | } | ||
2571 | unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); | ||
2572 | return ret; | ||
2573 | } | ||
2574 | |||
2575 | static int relocate_file_extent_cluster(struct inode *inode, | ||
2576 | struct file_extent_cluster *cluster) | ||
2533 | { | 2577 | { |
2534 | u64 page_start; | 2578 | u64 page_start; |
2535 | u64 page_end; | 2579 | u64 page_end; |
2536 | unsigned long i; | 2580 | u64 offset = BTRFS_I(inode)->index_cnt; |
2537 | unsigned long first_index; | 2581 | unsigned long index; |
2538 | unsigned long last_index; | 2582 | unsigned long last_index; |
2539 | unsigned int total_read = 0; | 2583 | unsigned int dirty_page = 0; |
2540 | unsigned int total_dirty = 0; | ||
2541 | struct page *page; | 2584 | struct page *page; |
2542 | struct file_ra_state *ra; | 2585 | struct file_ra_state *ra; |
2543 | struct btrfs_ordered_extent *ordered; | 2586 | int nr = 0; |
2544 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
2545 | int ret = 0; | 2587 | int ret = 0; |
2546 | 2588 | ||
2589 | if (!cluster->nr) | ||
2590 | return 0; | ||
2591 | |||
2547 | ra = kzalloc(sizeof(*ra), GFP_NOFS); | 2592 | ra = kzalloc(sizeof(*ra), GFP_NOFS); |
2548 | if (!ra) | 2593 | if (!ra) |
2549 | return -ENOMEM; | 2594 | return -ENOMEM; |
2550 | 2595 | ||
2596 | index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; | ||
2597 | last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; | ||
2598 | |||
2551 | mutex_lock(&inode->i_mutex); | 2599 | mutex_lock(&inode->i_mutex); |
2552 | first_index = start >> PAGE_CACHE_SHIFT; | ||
2553 | last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; | ||
2554 | 2600 | ||
2555 | /* make sure the dirty trick played by the caller work */ | 2601 | i_size_write(inode, cluster->end + 1 - offset); |
2556 | while (1) { | 2602 | ret = setup_extent_mapping(inode, cluster->start - offset, |
2557 | ret = invalidate_inode_pages2_range(inode->i_mapping, | 2603 | cluster->end - offset, cluster->start); |
2558 | first_index, last_index); | ||
2559 | if (ret != -EBUSY) | ||
2560 | break; | ||
2561 | schedule_timeout(HZ/10); | ||
2562 | } | ||
2563 | if (ret) | 2604 | if (ret) |
2564 | goto out_unlock; | 2605 | goto out_unlock; |
2565 | 2606 | ||
2566 | file_ra_state_init(ra, inode->i_mapping); | 2607 | file_ra_state_init(ra, inode->i_mapping); |
2567 | 2608 | ||
2568 | for (i = first_index ; i <= last_index; i++) { | 2609 | WARN_ON(cluster->start != cluster->boundary[0]); |
2569 | if (total_read % ra->ra_pages == 0) { | 2610 | while (index <= last_index) { |
2570 | btrfs_force_ra(inode->i_mapping, ra, NULL, i, | 2611 | page = find_lock_page(inode->i_mapping, index); |
2571 | min(last_index, ra->ra_pages + i - 1)); | ||
2572 | } | ||
2573 | total_read++; | ||
2574 | again: | ||
2575 | if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) | ||
2576 | BUG_ON(1); | ||
2577 | page = grab_cache_page(inode->i_mapping, i); | ||
2578 | if (!page) { | 2612 | if (!page) { |
2579 | ret = -ENOMEM; | 2613 | page_cache_sync_readahead(inode->i_mapping, |
2580 | goto out_unlock; | 2614 | ra, NULL, index, |
2615 | last_index + 1 - index); | ||
2616 | page = grab_cache_page(inode->i_mapping, index); | ||
2617 | if (!page) { | ||
2618 | ret = -ENOMEM; | ||
2619 | goto out_unlock; | ||
2620 | } | ||
2621 | } | ||
2622 | |||
2623 | if (PageReadahead(page)) { | ||
2624 | page_cache_async_readahead(inode->i_mapping, | ||
2625 | ra, NULL, page, index, | ||
2626 | last_index + 1 - index); | ||
2581 | } | 2627 | } |
2628 | |||
2582 | if (!PageUptodate(page)) { | 2629 | if (!PageUptodate(page)) { |
2583 | btrfs_readpage(NULL, page); | 2630 | btrfs_readpage(NULL, page); |
2584 | lock_page(page); | 2631 | lock_page(page); |
@@ -2589,75 +2636,79 @@ again: | |||
2589 | goto out_unlock; | 2636 | goto out_unlock; |
2590 | } | 2637 | } |
2591 | } | 2638 | } |
2592 | wait_on_page_writeback(page); | ||
2593 | 2639 | ||
2594 | page_start = (u64)page->index << PAGE_CACHE_SHIFT; | 2640 | page_start = (u64)page->index << PAGE_CACHE_SHIFT; |
2595 | page_end = page_start + PAGE_CACHE_SIZE - 1; | 2641 | page_end = page_start + PAGE_CACHE_SIZE - 1; |
2596 | lock_extent(io_tree, page_start, page_end, GFP_NOFS); | 2642 | |
2597 | 2643 | lock_extent(&BTRFS_I(inode)->io_tree, | |
2598 | ordered = btrfs_lookup_ordered_extent(inode, page_start); | 2644 | page_start, page_end, GFP_NOFS); |
2599 | if (ordered) { | 2645 | |
2600 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
2601 | unlock_page(page); | ||
2602 | page_cache_release(page); | ||
2603 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
2604 | btrfs_put_ordered_extent(ordered); | ||
2605 | goto again; | ||
2606 | } | ||
2607 | set_page_extent_mapped(page); | 2646 | set_page_extent_mapped(page); |
2608 | 2647 | ||
2609 | if (i == first_index) | 2648 | if (nr < cluster->nr && |
2610 | set_extent_bits(io_tree, page_start, page_end, | 2649 | page_start + offset == cluster->boundary[nr]) { |
2650 | set_extent_bits(&BTRFS_I(inode)->io_tree, | ||
2651 | page_start, page_end, | ||
2611 | EXTENT_BOUNDARY, GFP_NOFS); | 2652 | EXTENT_BOUNDARY, GFP_NOFS); |
2653 | nr++; | ||
2654 | } | ||
2612 | btrfs_set_extent_delalloc(inode, page_start, page_end); | 2655 | btrfs_set_extent_delalloc(inode, page_start, page_end); |
2613 | 2656 | ||
2614 | set_page_dirty(page); | 2657 | set_page_dirty(page); |
2615 | total_dirty++; | 2658 | dirty_page++; |
2616 | 2659 | ||
2617 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | 2660 | unlock_extent(&BTRFS_I(inode)->io_tree, |
2661 | page_start, page_end, GFP_NOFS); | ||
2618 | unlock_page(page); | 2662 | unlock_page(page); |
2619 | page_cache_release(page); | 2663 | page_cache_release(page); |
2664 | |||
2665 | index++; | ||
2666 | if (nr < cluster->nr && | ||
2667 | page_end + 1 + offset == cluster->boundary[nr]) { | ||
2668 | balance_dirty_pages_ratelimited_nr(inode->i_mapping, | ||
2669 | dirty_page); | ||
2670 | dirty_page = 0; | ||
2671 | } | ||
2672 | } | ||
2673 | if (dirty_page) { | ||
2674 | balance_dirty_pages_ratelimited_nr(inode->i_mapping, | ||
2675 | dirty_page); | ||
2620 | } | 2676 | } |
2677 | WARN_ON(nr != cluster->nr); | ||
2621 | out_unlock: | 2678 | out_unlock: |
2622 | mutex_unlock(&inode->i_mutex); | 2679 | mutex_unlock(&inode->i_mutex); |
2623 | kfree(ra); | 2680 | kfree(ra); |
2624 | balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); | ||
2625 | return ret; | 2681 | return ret; |
2626 | } | 2682 | } |
2627 | 2683 | ||
2628 | static noinline_for_stack | 2684 | static noinline_for_stack |
2629 | int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) | 2685 | int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, |
2686 | struct file_extent_cluster *cluster) | ||
2630 | { | 2687 | { |
2631 | struct btrfs_root *root = BTRFS_I(inode)->root; | 2688 | int ret; |
2632 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
2633 | struct extent_map *em; | ||
2634 | u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; | ||
2635 | u64 end = start + extent_key->offset - 1; | ||
2636 | |||
2637 | em = alloc_extent_map(GFP_NOFS); | ||
2638 | em->start = start; | ||
2639 | em->len = extent_key->offset; | ||
2640 | em->block_len = extent_key->offset; | ||
2641 | em->block_start = extent_key->objectid; | ||
2642 | em->bdev = root->fs_info->fs_devices->latest_bdev; | ||
2643 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | ||
2644 | 2689 | ||
2645 | /* setup extent map to cheat btrfs_readpage */ | 2690 | if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { |
2646 | lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); | 2691 | ret = relocate_file_extent_cluster(inode, cluster); |
2647 | while (1) { | 2692 | if (ret) |
2648 | int ret; | 2693 | return ret; |
2649 | write_lock(&em_tree->lock); | 2694 | cluster->nr = 0; |
2650 | ret = add_extent_mapping(em_tree, em); | ||
2651 | write_unlock(&em_tree->lock); | ||
2652 | if (ret != -EEXIST) { | ||
2653 | free_extent_map(em); | ||
2654 | break; | ||
2655 | } | ||
2656 | btrfs_drop_extent_cache(inode, start, end, 0); | ||
2657 | } | 2695 | } |
2658 | unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); | ||
2659 | 2696 | ||
2660 | return relocate_inode_pages(inode, start, extent_key->offset); | 2697 | if (!cluster->nr) |
2698 | cluster->start = extent_key->objectid; | ||
2699 | else | ||
2700 | BUG_ON(cluster->nr >= MAX_EXTENTS); | ||
2701 | cluster->end = extent_key->objectid + extent_key->offset - 1; | ||
2702 | cluster->boundary[cluster->nr] = extent_key->objectid; | ||
2703 | cluster->nr++; | ||
2704 | |||
2705 | if (cluster->nr >= MAX_EXTENTS) { | ||
2706 | ret = relocate_file_extent_cluster(inode, cluster); | ||
2707 | if (ret) | ||
2708 | return ret; | ||
2709 | cluster->nr = 0; | ||
2710 | } | ||
2711 | return 0; | ||
2661 | } | 2712 | } |
2662 | 2713 | ||
2663 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | 2714 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 |
@@ -3208,6 +3259,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) | |||
3208 | { | 3259 | { |
3209 | struct rb_root blocks = RB_ROOT; | 3260 | struct rb_root blocks = RB_ROOT; |
3210 | struct btrfs_key key; | 3261 | struct btrfs_key key; |
3262 | struct file_extent_cluster *cluster; | ||
3211 | struct btrfs_trans_handle *trans = NULL; | 3263 | struct btrfs_trans_handle *trans = NULL; |
3212 | struct btrfs_path *path; | 3264 | struct btrfs_path *path; |
3213 | struct btrfs_extent_item *ei; | 3265 | struct btrfs_extent_item *ei; |
@@ -3217,6 +3269,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) | |||
3217 | int ret; | 3269 | int ret; |
3218 | int err = 0; | 3270 | int err = 0; |
3219 | 3271 | ||
3272 | cluster = kzalloc(sizeof(*cluster), GFP_NOFS); | ||
3273 | if (!cluster) | ||
3274 | return -ENOMEM; | ||
3275 | |||
3220 | path = btrfs_alloc_path(); | 3276 | path = btrfs_alloc_path(); |
3221 | if (!path) | 3277 | if (!path) |
3222 | return -ENOMEM; | 3278 | return -ENOMEM; |
@@ -3310,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) | |||
3310 | } | 3366 | } |
3311 | 3367 | ||
3312 | nr = trans->blocks_used; | 3368 | nr = trans->blocks_used; |
3313 | btrfs_end_transaction_throttle(trans, rc->extent_root); | 3369 | btrfs_end_transaction(trans, rc->extent_root); |
3314 | trans = NULL; | 3370 | trans = NULL; |
3315 | btrfs_btree_balance_dirty(rc->extent_root, nr); | 3371 | btrfs_btree_balance_dirty(rc->extent_root, nr); |
3316 | 3372 | ||
3317 | if (rc->stage == MOVE_DATA_EXTENTS && | 3373 | if (rc->stage == MOVE_DATA_EXTENTS && |
3318 | (flags & BTRFS_EXTENT_FLAG_DATA)) { | 3374 | (flags & BTRFS_EXTENT_FLAG_DATA)) { |
3319 | rc->found_file_extent = 1; | 3375 | rc->found_file_extent = 1; |
3320 | ret = relocate_data_extent(rc->data_inode, &key); | 3376 | ret = relocate_data_extent(rc->data_inode, |
3377 | &key, cluster); | ||
3321 | if (ret < 0) { | 3378 | if (ret < 0) { |
3322 | err = ret; | 3379 | err = ret; |
3323 | break; | 3380 | break; |
@@ -3332,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) | |||
3332 | btrfs_btree_balance_dirty(rc->extent_root, nr); | 3389 | btrfs_btree_balance_dirty(rc->extent_root, nr); |
3333 | } | 3390 | } |
3334 | 3391 | ||
3392 | if (!err) { | ||
3393 | ret = relocate_file_extent_cluster(rc->data_inode, cluster); | ||
3394 | if (ret < 0) | ||
3395 | err = ret; | ||
3396 | } | ||
3397 | |||
3398 | kfree(cluster); | ||
3399 | |||
3335 | rc->create_reloc_root = 0; | 3400 | rc->create_reloc_root = 0; |
3336 | smp_mb(); | 3401 | smp_mb(); |
3337 | 3402 | ||
@@ -3352,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) | |||
3352 | } | 3417 | } |
3353 | 3418 | ||
3354 | static int __insert_orphan_inode(struct btrfs_trans_handle *trans, | 3419 | static int __insert_orphan_inode(struct btrfs_trans_handle *trans, |
3355 | struct btrfs_root *root, | 3420 | struct btrfs_root *root, u64 objectid) |
3356 | u64 objectid, u64 size) | ||
3357 | { | 3421 | { |
3358 | struct btrfs_path *path; | 3422 | struct btrfs_path *path; |
3359 | struct btrfs_inode_item *item; | 3423 | struct btrfs_inode_item *item; |
@@ -3372,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, | |||
3372 | item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); | 3436 | item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); |
3373 | memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); | 3437 | memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); |
3374 | btrfs_set_inode_generation(leaf, item, 1); | 3438 | btrfs_set_inode_generation(leaf, item, 1); |
3375 | btrfs_set_inode_size(leaf, item, size); | 3439 | btrfs_set_inode_size(leaf, item, 0); |
3376 | btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); | 3440 | btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); |
3377 | btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); | 3441 | btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); |
3378 | btrfs_mark_buffer_dirty(leaf); | 3442 | btrfs_mark_buffer_dirty(leaf); |
@@ -3408,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, | |||
3408 | if (err) | 3472 | if (err) |
3409 | goto out; | 3473 | goto out; |
3410 | 3474 | ||
3411 | err = __insert_orphan_inode(trans, root, objectid, group->key.offset); | 3475 | err = __insert_orphan_inode(trans, root, objectid); |
3412 | BUG_ON(err); | ||
3413 | |||
3414 | err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, | ||
3415 | group->key.offset, 0, group->key.offset, | ||
3416 | 0, 0, 0); | ||
3417 | BUG_ON(err); | 3476 | BUG_ON(err); |
3418 | 3477 | ||
3419 | key.objectid = objectid; | 3478 | key.objectid = objectid; |
@@ -3519,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) | |||
3519 | } | 3578 | } |
3520 | } | 3579 | } |
3521 | 3580 | ||
3522 | filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, | 3581 | filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, |
3523 | rc->block_group->key.objectid, | 3582 | rc->block_group->key.objectid, |
3524 | rc->block_group->key.objectid + | 3583 | rc->block_group->key.objectid + |
3525 | rc->block_group->key.offset - 1); | 3584 | rc->block_group->key.offset - 1); |
3526 | 3585 | ||
3527 | WARN_ON(rc->block_group->pinned > 0); | 3586 | WARN_ON(rc->block_group->pinned > 0); |
3528 | WARN_ON(rc->block_group->reserved > 0); | 3587 | WARN_ON(rc->block_group->reserved > 0); |