From 551ebb2d34304ee2abfe6b00d39ec65d5e4e8266 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 23 Apr 2012 14:41:09 -0400 Subject: Btrfs: remove useless waiting and extra filemap work In btrfs_wait_ordered_range we have been calling filemap_fdata_write() twice because compression does strange things and then waiting. Then we look up ordered extents and if we find any we will always schedule_timeout(); once and then loop back around and do it all again. We will even check to see if there is delalloc pages on this range and loop again. So this patch gets rid of the multipe fdata_write() calls and just does filemap_write_and_wait(). In the case of compression we will still find the ordered extents and start those individually if we need to so that is ok, but in the normal buffered case we avoid all this weird overhead. Then in the case of the schedule_timeout(1), we don't need it. All callers either 1) don't care, they just want to make sure what they just wrote maeks it to disk or 2) are doing the lock()->lookup ordered->unlock->flush thing in which case it will lock and check for ordered extents _anyway_ so get back to them as quickly as possible. The delaloc check is simply not needed, this only catches the case where we write to the file again since doing the filemap_write_and_wait() and if the caller truly cares about that it will take care of everything itself. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'fs/btrfs/ordered-data.c') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index bbf6d0d9aebe..9807750c6255 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -621,19 +621,11 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) if (orig_end > INT_LIMIT(loff_t)) orig_end = INT_LIMIT(loff_t); } -again: + /* start IO across the range first to instantiate any delalloc * extents */ - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - - /* The compression code will leave pages locked but return from - * writepage without setting the page writeback. Starting again - * with WB_SYNC_ALL will end up waiting for the IO to actually start. - */ - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - - filemap_fdatawait_range(inode->i_mapping, start, orig_end); + filemap_write_and_wait_range(inode->i_mapping, start, orig_end); end = orig_end; found = 0; @@ -657,11 +649,6 @@ again: break; end--; } - if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, - EXTENT_DELALLOC, 0, NULL)) { - schedule_timeout(1); - goto again; - } } /* -- cgit v1.2.2 From 4e89915220e2f1341c757b610d0f0c3821f3a65f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 2 May 2012 13:52:09 -0400 Subject: Btrfs: do not check delalloc when updating disk_i_size We are checking delalloc to see if it is ok to update the i_size. There are 2 cases it stops us from updating 1) If there is delalloc between our current disk_i_size and this ordered extent 2) If there is delalloc between our current ordered extent and the next ordered extent These tests are racy however since we can set delalloc for these ranges at any time. Also for the first case if we notice there is delalloc between disk_i_size and our ordered extent we will not update disk_i_size and assume that when that delalloc bit gets written out it will update everything properly. However if we crash before that we will have file extents outside of our i_size, which is not good, so this test is dangerous as well as racy. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'fs/btrfs/ordered-data.c') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9807750c6255..9565c0289164 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -751,7 +751,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered) { struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; u64 disk_i_size; u64 new_i_size; u64 i_size_test; @@ -784,14 +783,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, goto out; } - /* - * we can't update the disk_isize if there are delalloc bytes - * between disk_i_size and this ordered extent - */ - if (test_range_bit(io_tree, disk_i_size, offset - 1, - EXTENT_DELALLOC, 0, NULL)) { - goto out; - } /* * walk backward from this ordered extent to disk_i_size. * if we find an ordered extent then we can't update disk i_size @@ -853,15 +844,11 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, /* * i_size_test is the end of a region after this ordered - * extent where there are no ordered extents. As long as there - * are no delalloc bytes in this area, it is safe to update - * disk_i_size to the end of the region. + * extent where there are no ordered extents, we can safely set + * disk_i_size to this. */ - if (i_size_test > offset && - !test_range_bit(io_tree, offset, i_size_test - 1, - EXTENT_DELALLOC, 0, NULL)) { + if (i_size_test > offset) new_i_size = min_t(u64, i_size_test, i_size); - } BTRFS_I(inode)->disk_i_size = new_i_size; ret = 0; out: -- cgit v1.2.2 From 5fd02043553b02867b29de1ac9fff2ec16b84def Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 2 May 2012 14:00:54 -0400 Subject: Btrfs: finish ordered extents in their own thread We noticed that the ordered extent completion doesn't really rely on having a page and that it could be done independantly of ending the writeback on a page. This patch makes us not do the threaded endio stuff for normal buffered writes and direct writes so we can end page writeback as soon as possible (in irq context) and only start threads to do the ordered work when it is actually done. Compression needs to be reworked some to take advantage of this as well, but atm it has to do a find_get_page in its endio handler so it must be done in its own thread. This makes direct writes quite a bit faster. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 129 +++++++++++++++++++++++++++--------------------- 1 file changed, 72 insertions(+), 57 deletions(-) (limited to 'fs/btrfs/ordered-data.c') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9565c0289164..9e138cdc36c5 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -196,7 +196,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->len = len; entry->disk_len = disk_len; entry->bytes_left = len; - entry->inode = inode; + entry->inode = igrab(inode); entry->compress_type = compress_type; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -212,12 +212,12 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, trace_btrfs_ordered_extent_add(inode, entry); - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_insert(&tree->tree, file_offset, &entry->rb_node); if (node) ordered_data_tree_panic(inode, -EEXIST, file_offset); - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_add_tail(&entry->root_extent_list, @@ -264,9 +264,9 @@ void btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_inode_tree *tree; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); } /* @@ -283,18 +283,19 @@ void btrfs_add_ordered_sum(struct inode *inode, */ int btrfs_dec_test_first_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 *file_offset, u64 io_size) + u64 *file_offset, u64 io_size, int uptodate) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; int ret; + unsigned long flags; u64 dec_end; u64 dec_start; u64 to_dec; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irqsave(&tree->lock, flags); node = tree_search(tree, *file_offset); if (!node) { ret = 1; @@ -323,6 +324,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, (unsigned long long)to_dec); } entry->bytes_left -= to_dec; + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + if (entry->bytes_left == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); else @@ -332,7 +336,7 @@ out: *cached = entry; atomic_inc(&entry->refs); } - spin_unlock(&tree->lock); + spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; } @@ -347,15 +351,21 @@ out: */ int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size) + u64 file_offset, u64 io_size, int uptodate) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; + unsigned long flags; int ret; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irqsave(&tree->lock, flags); + if (cached && *cached) { + entry = *cached; + goto have_entry; + } + node = tree_search(tree, file_offset); if (!node) { ret = 1; @@ -363,6 +373,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, } entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); +have_entry: if (!offset_in_entry(entry, file_offset)) { ret = 1; goto out; @@ -374,6 +385,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, (unsigned long long)io_size); } entry->bytes_left -= io_size; + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + if (entry->bytes_left == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); else @@ -383,7 +397,7 @@ out: *cached = entry; atomic_inc(&entry->refs); } - spin_unlock(&tree->lock); + spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; } @@ -399,6 +413,8 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); if (atomic_dec_and_test(&entry->refs)) { + if (entry->inode) + btrfs_add_delayed_iput(entry->inode); while (!list_empty(&entry->list)) { cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); @@ -411,21 +427,22 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) /* * remove an ordered extent from the tree. No references are dropped - * and you must wake_up entry->wait. You must hold the tree lock - * while you call this function. + * and waiters are woken up. */ -static void __btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) +void btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = BTRFS_I(inode)->root; struct rb_node *node; tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + spin_unlock_irq(&tree->lock); spin_lock(&root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); @@ -442,21 +459,6 @@ static void __btrfs_remove_ordered_extent(struct inode *inode, list_del_init(&BTRFS_I(inode)->ordered_operations); } spin_unlock(&root->fs_info->ordered_extent_lock); -} - -/* - * remove an ordered extent from the tree. No references are dropped - * but any waiters are woken. - */ -void btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) -{ - struct btrfs_ordered_inode_tree *tree; - - tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); - __btrfs_remove_ordered_extent(inode, entry); - spin_unlock(&tree->lock); wake_up(&entry->wait); } @@ -663,7 +665,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) goto out; @@ -674,7 +676,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, if (entry) atomic_inc(&entry->refs); out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); return entry; } @@ -690,7 +692,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) { node = tree_search(tree, file_offset + len); @@ -715,7 +717,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, out: if (entry) atomic_inc(&entry->refs); - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); return entry; } @@ -731,7 +733,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) goto out; @@ -739,7 +741,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); atomic_inc(&entry->refs); out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); return entry; } @@ -765,7 +767,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, else offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); disk_i_size = BTRFS_I(inode)->disk_i_size; /* truncate file */ @@ -803,15 +805,18 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, } node = prev; } - while (node) { + for (; node; node = rb_prev(node)) { test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + + /* We treat this entry as if it doesnt exist */ + if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) + continue; if (test->file_offset + test->len <= disk_i_size) break; if (test->file_offset >= i_size) break; if (test->file_offset >= disk_i_size) goto out; - node = rb_prev(node); } new_i_size = min_t(u64, offset, i_size); @@ -829,17 +834,27 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, else node = rb_first(&tree->tree); } - i_size_test = 0; - if (node) { - /* - * do we have an area where IO might have finished - * between our ordered extent and the next one. - */ + + /* + * We are looking for an area between our current extent and the next + * ordered extent to update the i_size to. There are 3 cases here + * + * 1) We don't actually have anything and we can update to i_size. + * 2) We have stuff but they already did their i_size update so again we + * can just update to i_size. + * 3) We have an outstanding ordered extent so the most we can update + * our disk_i_size to is the start of the next offset. + */ + i_size_test = i_size; + for (; node; node = rb_next(node)) { test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (test->file_offset > offset) + + if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) + continue; + if (test->file_offset > offset) { i_size_test = test->file_offset; - } else { - i_size_test = i_size; + break; + } } /* @@ -853,15 +868,15 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, ret = 0; out: /* - * we need to remove the ordered extent with the tree lock held - * so that other people calling this function don't find our fully - * processed ordered entry and skip updating the i_size + * We need to do this because we can't remove ordered extents until + * after the i_disk_size has been updated and then the inode has been + * updated to reflect the change, so we need to tell anybody who finds + * this ordered extent that we've already done all the real work, we + * just haven't completed all the other work. */ if (ordered) - __btrfs_remove_ordered_extent(inode, ordered); - spin_unlock(&tree->lock); - if (ordered) - wake_up(&ordered->wait); + set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags); + spin_unlock_irq(&tree->lock); return ret; } @@ -886,7 +901,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, if (!ordered) return 1; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { if (disk_bytenr >= ordered_sum->bytenr) { num_sectors = ordered_sum->len / sectorsize; @@ -901,7 +916,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, } } out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); btrfs_put_ordered_extent(ordered); return ret; } -- cgit v1.2.2