aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
author: Filipe David Borba Manana <fdmanana@gmail.com> 2013-11-19 17:29:35 -0500
committer: Chris Mason <clm@fb.com> 2014-01-28 16:19:44 -0500
commit: 131e404a2a54d30f894425ef723f9867a43bff4c (patch)
tree: 2cdc62a1df81a1c7a0253c2cc99680ab8fef3c90 /fs/btrfs/inode.c
parent: 0647bf564f1e35975e84f152dcba1a1ad54fbe7e (diff)
Btrfs: fix very slow inode eviction and fs unmount
The inode eviction can be very slow, because during eviction we tell the VFS to truncate all of the inode's pages. This results in calls to btrfs_invalidatepage(), which in turn calls lock_extent_bits() and clear_extent_bit(). These calls result in too many merges and splits of extent_state structures, which consume a lot of time and CPU when the inode has many pages. In some scenarios I have experienced umount times higher than 15 minutes, even when there's no pending IO (after a btrfs fs sync).

A quick way to reproduce this issue:

$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ cd /mnt/btrfs
$ sysbench --test=fileio --file-num=128 --file-total-size=16G \
    --file-test-mode=seqwr --num-threads=128 \
    --file-block-size=16384 --max-time=60 --max-requests=0 run
$ time btrfs fi sync .
FSSync '.'

real 0m25.457s
user 0m0.000s
sys 0m0.092s
$ cd ..
$ time umount /mnt/btrfs

real 1m38.234s
user 0m0.000s
sys 1m25.760s

The same test on ext4 runs much faster:

$ mkfs.ext4 /dev/sdb3
$ mount /dev/sdb3 /mnt/ext4
$ cd /mnt/ext4
$ sysbench --test=fileio --file-num=128 --file-total-size=16G \
    --file-test-mode=seqwr --num-threads=128 \
    --file-block-size=16384 --max-time=60 --max-requests=0 run
$ sync
$ cd ..
$ time umount /mnt/ext4

real 0m3.626s
user 0m0.004s
sys 0m3.012s

After this patch, the unmount (inode evictions) is much faster:

$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ cd /mnt/btrfs
$ sysbench --test=fileio --file-num=128 --file-total-size=16G \
    --file-test-mode=seqwr --num-threads=128 \
    --file-block-size=16384 --max-time=60 --max-requests=0 run
$ time btrfs fi sync .
FSSync '.'

real 0m26.774s
user 0m0.000s
sys 0m0.084s
$ cd ..
$ time umount /mnt/btrfs

real 0m1.811s
user 0m0.000s
sys 0m1.564s

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- fs/btrfs/inode.c | 98
1 files changed, 84 insertions, 14 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5a5de36d39fc..e889779c9b37 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4488,6 +4488,62 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
4488 return err; 4488 return err;
4489} 4489}
4490 4490
4491/*
4492 * While truncating the inode pages during eviction, we get the VFS calling
4493 * btrfs_invalidatepage() against each page of the inode. This is slow because
4494 * the calls to btrfs_invalidatepage() result in a huge amount of calls to
4495 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
4496 * extent_state structures over and over, wasting lots of time.
4497 *
4498 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
4499 * those expensive operations on a per page basis and do only the ordered io
4500 * finishing, while we release here the extent_map and extent_state structures,
4501 * without the excessive merging and splitting.
4502 */
4503static void evict_inode_truncate_pages(struct inode *inode)
4504{
4505 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4506 struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
4507 struct rb_node *node;
4508
4509 ASSERT(inode->i_state & I_FREEING);
4510 truncate_inode_pages(&inode->i_data, 0);
4511
4512 write_lock(&map_tree->lock);
4513 while (!RB_EMPTY_ROOT(&map_tree->map)) {
4514 struct extent_map *em;
4515
4516 node = rb_first(&map_tree->map);
4517 em = rb_entry(node, struct extent_map, rb_node);
4518 remove_extent_mapping(map_tree, em);
4519 free_extent_map(em);
4520 }
4521 write_unlock(&map_tree->lock);
4522
4523 spin_lock(&io_tree->lock);
4524 while (!RB_EMPTY_ROOT(&io_tree->state)) {
4525 struct extent_state *state;
4526 struct extent_state *cached_state = NULL;
4527
4528 node = rb_first(&io_tree->state);
4529 state = rb_entry(node, struct extent_state, rb_node);
4530 atomic_inc(&state->refs);
4531 spin_unlock(&io_tree->lock);
4532
4533 lock_extent_bits(io_tree, state->start, state->end,
4534 0, &cached_state);
4535 clear_extent_bit(io_tree, state->start, state->end,
4536 EXTENT_LOCKED | EXTENT_DIRTY |
4537 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
4538 EXTENT_DEFRAG, 1, 1,
4539 &cached_state, GFP_NOFS);
4540 free_extent_state(state);
4541
4542 spin_lock(&io_tree->lock);
4543 }
4544 spin_unlock(&io_tree->lock);
4545}
4546
4491void btrfs_evict_inode(struct inode *inode) 4547void btrfs_evict_inode(struct inode *inode)
4492{ 4548{
4493 struct btrfs_trans_handle *trans; 4549 struct btrfs_trans_handle *trans;
@@ -4498,7 +4554,8 @@ void btrfs_evict_inode(struct inode *inode)
4498 4554
4499 trace_btrfs_inode_evict(inode); 4555 trace_btrfs_inode_evict(inode);
4500 4556
4501 truncate_inode_pages(&inode->i_data, 0); 4557 evict_inode_truncate_pages(inode);
4558
4502 if (inode->i_nlink && 4559 if (inode->i_nlink &&
4503 ((btrfs_root_refs(&root->root_item) != 0 && 4560 ((btrfs_root_refs(&root->root_item) != 0 &&
4504 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || 4561 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
@@ -7379,6 +7436,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7379 struct extent_state *cached_state = NULL; 7436 struct extent_state *cached_state = NULL;
7380 u64 page_start = page_offset(page); 7437 u64 page_start = page_offset(page);
7381 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 7438 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
7439 int inode_evicting = inode->i_state & I_FREEING;
7382 7440
7383 /* 7441 /*
7384 * we have the page locked, so new writeback can't start, 7442 * we have the page locked, so new writeback can't start,
@@ -7394,17 +7452,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7394 btrfs_releasepage(page, GFP_NOFS); 7452 btrfs_releasepage(page, GFP_NOFS);
7395 return; 7453 return;
7396 } 7454 }
7397 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 7455
7398 ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); 7456 if (!inode_evicting)
7457 lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
7458 ordered = btrfs_lookup_ordered_extent(inode, page_start);
7399 if (ordered) { 7459 if (ordered) {
7400 /* 7460 /*
7401 * IO on this page will never be started, so we need 7461 * IO on this page will never be started, so we need
7402 * to account for any ordered extents now 7462 * to account for any ordered extents now
7403 */ 7463 */
7404 clear_extent_bit(tree, page_start, page_end, 7464 if (!inode_evicting)
7405 EXTENT_DIRTY | EXTENT_DELALLOC | 7465 clear_extent_bit(tree, page_start, page_end,
7406 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 7466 EXTENT_DIRTY | EXTENT_DELALLOC |
7407 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); 7467 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
7468 EXTENT_DEFRAG, 1, 0, &cached_state,
7469 GFP_NOFS);
7408 /* 7470 /*
7409 * whoever cleared the private bit is responsible 7471 * whoever cleared the private bit is responsible
7410 * for the finish_ordered_io 7472 * for the finish_ordered_io
@@ -7428,14 +7490,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7428 btrfs_finish_ordered_io(ordered); 7490 btrfs_finish_ordered_io(ordered);
7429 } 7491 }
7430 btrfs_put_ordered_extent(ordered); 7492 btrfs_put_ordered_extent(ordered);
7431 cached_state = NULL; 7493 if (!inode_evicting) {
7432 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 7494 cached_state = NULL;
7495 lock_extent_bits(tree, page_start, page_end, 0,
7496 &cached_state);
7497 }
7498 }
7499
7500 if (!inode_evicting) {
7501 clear_extent_bit(tree, page_start, page_end,
7502 EXTENT_LOCKED | EXTENT_DIRTY |
7503 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
7504 EXTENT_DEFRAG, 1, 1,
7505 &cached_state, GFP_NOFS);
7506
7507 __btrfs_releasepage(page, GFP_NOFS);
7433 } 7508 }
7434 clear_extent_bit(tree, page_start, page_end,
7435 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
7436 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
7437 &cached_state, GFP_NOFS);
7438 __btrfs_releasepage(page, GFP_NOFS);
7439 7509
7440 ClearPageChecked(page); 7510 ClearPageChecked(page);
7441 if (PagePrivate(page)) { 7511 if (PagePrivate(page)) {