diff options
author | Chris Mason <chris.mason@oracle.com> | 2009-09-02 16:53:46 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2009-09-11 13:31:07 -0400 |
commit | 8b62b72b26bcd72082c4a69d179dd906bcc22200 (patch) | |
tree | ceee20dfebe45654cb3a25d8916c195836cdbabf /fs/btrfs/inode.c | |
parent | 9655d2982b53fdb38a9e0f2f11315b99b92d66e2 (diff) |
Btrfs: Use PagePrivate2 to track pages in the data=ordered code.
Btrfs writes go through delalloc to the data=ordered code. This
makes sure that all of the data is on disk before the metadata
that references it. The tracking means that we have to make sure
each page in an extent is fully written before we add that extent into
the on-disk btree.
This was done in the past by setting the EXTENT_ORDERED bit for the
range of an extent when it was added to the data=ordered code, and then
clearing the EXTENT_ORDERED bit in the extent state tree as each page
finished IO.
One of the reasons we had to do this was because sometimes pages are
magically dirtied without page_mkwrite being called. The EXTENT_ORDERED
bit is checked at writepage time, and if it isn't there, our page became
dirty without going through the proper path.
These bit operations make for a number of rbtree searches for each page,
and can cause considerable lock contention.
This commit switches from the EXTENT_ORDERED bit to use PagePrivate2.
As pages go into the ordered code, PagePrivate2 is set on each one.
This is a cheap operation because we already have all the pages locked
and ready to go.
As IO finishes, the PagePrivate2 bit is cleared and the ordered
accounting is updated for each page.
At writepage time, if the PagePrivate2 bit is missing, we go into the
writepage fixup code to handle improperly dirtied pages.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 47 |
1 files changed, 30 insertions, 17 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3f8e93de2989..739a245e25d6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -426,7 +426,7 @@ again: | |||
426 | extent_clear_unlock_delalloc(inode, | 426 | extent_clear_unlock_delalloc(inode, |
427 | &BTRFS_I(inode)->io_tree, | 427 | &BTRFS_I(inode)->io_tree, |
428 | start, end, NULL, 1, 0, | 428 | start, end, NULL, 1, 0, |
429 | 0, 1, 1, 1); | 429 | 0, 1, 1, 1, 0); |
430 | ret = 0; | 430 | ret = 0; |
431 | goto free_pages_out; | 431 | goto free_pages_out; |
432 | } | 432 | } |
@@ -641,7 +641,7 @@ static noinline int submit_compressed_extents(struct inode *inode, | |||
641 | async_extent->start, | 641 | async_extent->start, |
642 | async_extent->start + | 642 | async_extent->start + |
643 | async_extent->ram_size - 1, | 643 | async_extent->ram_size - 1, |
644 | NULL, 1, 1, 0, 1, 1, 0); | 644 | NULL, 1, 1, 0, 1, 1, 0, 0); |
645 | 645 | ||
646 | ret = btrfs_submit_compressed_write(inode, | 646 | ret = btrfs_submit_compressed_write(inode, |
647 | async_extent->start, | 647 | async_extent->start, |
@@ -714,7 +714,7 @@ static noinline int cow_file_range(struct inode *inode, | |||
714 | extent_clear_unlock_delalloc(inode, | 714 | extent_clear_unlock_delalloc(inode, |
715 | &BTRFS_I(inode)->io_tree, | 715 | &BTRFS_I(inode)->io_tree, |
716 | start, end, NULL, 1, 1, | 716 | start, end, NULL, 1, 1, |
717 | 1, 1, 1, 1); | 717 | 1, 1, 1, 1, 0); |
718 | *nr_written = *nr_written + | 718 | *nr_written = *nr_written + |
719 | (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; | 719 | (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; |
720 | *page_started = 1; | 720 | *page_started = 1; |
@@ -777,11 +777,14 @@ static noinline int cow_file_range(struct inode *inode, | |||
777 | /* we're not doing compressed IO, don't unlock the first | 777 | /* we're not doing compressed IO, don't unlock the first |
778 | * page (which the caller expects to stay locked), don't | 778 | * page (which the caller expects to stay locked), don't |
779 | * clear any dirty bits and don't set any writeback bits | 779 | * clear any dirty bits and don't set any writeback bits |
780 | * | ||
781 | * Do set the Private2 bit so we know this page was properly | ||
782 | * setup for writepage | ||
780 | */ | 783 | */ |
781 | extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, | 784 | extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, |
782 | start, start + ram_size - 1, | 785 | start, start + ram_size - 1, |
783 | locked_page, unlock, 1, | 786 | locked_page, unlock, 1, |
784 | 1, 0, 0, 0); | 787 | 1, 0, 0, 0, 1); |
785 | disk_num_bytes -= cur_alloc_size; | 788 | disk_num_bytes -= cur_alloc_size; |
786 | num_bytes -= cur_alloc_size; | 789 | num_bytes -= cur_alloc_size; |
787 | alloc_hint = ins.objectid + ins.offset; | 790 | alloc_hint = ins.objectid + ins.offset; |
@@ -1102,7 +1105,7 @@ out_check: | |||
1102 | 1105 | ||
1103 | extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, | 1106 | extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, |
1104 | cur_offset, cur_offset + num_bytes - 1, | 1107 | cur_offset, cur_offset + num_bytes - 1, |
1105 | locked_page, 1, 1, 1, 0, 0, 0); | 1108 | locked_page, 1, 1, 1, 0, 0, 0, 1); |
1106 | cur_offset = extent_end; | 1109 | cur_offset = extent_end; |
1107 | if (cur_offset > end) | 1110 | if (cur_offset > end) |
1108 | break; | 1111 | break; |
@@ -1375,10 +1378,8 @@ again: | |||
1375 | lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); | 1378 | lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); |
1376 | 1379 | ||
1377 | /* already ordered? We're done */ | 1380 | /* already ordered? We're done */ |
1378 | if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, | 1381 | if (PagePrivate2(page)) |
1379 | EXTENT_ORDERED, 0, NULL)) { | ||
1380 | goto out; | 1382 | goto out; |
1381 | } | ||
1382 | 1383 | ||
1383 | ordered = btrfs_lookup_ordered_extent(inode, page_start); | 1384 | ordered = btrfs_lookup_ordered_extent(inode, page_start); |
1384 | if (ordered) { | 1385 | if (ordered) { |
@@ -1414,11 +1415,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) | |||
1414 | struct inode *inode = page->mapping->host; | 1415 | struct inode *inode = page->mapping->host; |
1415 | struct btrfs_writepage_fixup *fixup; | 1416 | struct btrfs_writepage_fixup *fixup; |
1416 | struct btrfs_root *root = BTRFS_I(inode)->root; | 1417 | struct btrfs_root *root = BTRFS_I(inode)->root; |
1417 | int ret; | ||
1418 | 1418 | ||
1419 | ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, | 1419 | /* this page is properly in the ordered list */ |
1420 | EXTENT_ORDERED, 0, NULL); | 1420 | if (TestClearPagePrivate2(page)) |
1421 | if (ret) | ||
1422 | return 0; | 1421 | return 0; |
1423 | 1422 | ||
1424 | if (PageChecked(page)) | 1423 | if (PageChecked(page)) |
@@ -1624,6 +1623,7 @@ nocow: | |||
1624 | static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, | 1623 | static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, |
1625 | struct extent_state *state, int uptodate) | 1624 | struct extent_state *state, int uptodate) |
1626 | { | 1625 | { |
1626 | ClearPagePrivate2(page); | ||
1627 | return btrfs_finish_ordered_io(page->mapping->host, start, end); | 1627 | return btrfs_finish_ordered_io(page->mapping->host, start, end); |
1628 | } | 1628 | } |
1629 | 1629 | ||
@@ -4403,13 +4403,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) | |||
4403 | u64 page_start = page_offset(page); | 4403 | u64 page_start = page_offset(page); |
4404 | u64 page_end = page_start + PAGE_CACHE_SIZE - 1; | 4404 | u64 page_end = page_start + PAGE_CACHE_SIZE - 1; |
4405 | 4405 | ||
4406 | |||
4407 | /* | ||
4408 | * we have the page locked, so new writeback can't start, | ||
4409 | * and the dirty bit won't be cleared while we are here. | ||
4410 | * | ||
4411 | * Wait for IO on this page so that we can safely clear | ||
4412 | * the PagePrivate2 bit and do ordered accounting | ||
4413 | */ | ||
4406 | wait_on_page_writeback(page); | 4414 | wait_on_page_writeback(page); |
4415 | |||
4407 | tree = &BTRFS_I(page->mapping->host)->io_tree; | 4416 | tree = &BTRFS_I(page->mapping->host)->io_tree; |
4408 | if (offset) { | 4417 | if (offset) { |
4409 | btrfs_releasepage(page, GFP_NOFS); | 4418 | btrfs_releasepage(page, GFP_NOFS); |
4410 | return; | 4419 | return; |
4411 | } | 4420 | } |
4412 | |||
4413 | lock_extent(tree, page_start, page_end, GFP_NOFS); | 4421 | lock_extent(tree, page_start, page_end, GFP_NOFS); |
4414 | ordered = btrfs_lookup_ordered_extent(page->mapping->host, | 4422 | ordered = btrfs_lookup_ordered_extent(page->mapping->host, |
4415 | page_offset(page)); | 4423 | page_offset(page)); |
@@ -4421,14 +4429,19 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) | |||
4421 | clear_extent_bit(tree, page_start, page_end, | 4429 | clear_extent_bit(tree, page_start, page_end, |
4422 | EXTENT_DIRTY | EXTENT_DELALLOC | | 4430 | EXTENT_DIRTY | EXTENT_DELALLOC | |
4423 | EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); | 4431 | EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); |
4424 | btrfs_finish_ordered_io(page->mapping->host, | 4432 | /* |
4425 | page_start, page_end); | 4433 | * whoever cleared the private bit is responsible |
4434 | * for the finish_ordered_io | ||
4435 | */ | ||
4436 | if (TestClearPagePrivate2(page)) { | ||
4437 | btrfs_finish_ordered_io(page->mapping->host, | ||
4438 | page_start, page_end); | ||
4439 | } | ||
4426 | btrfs_put_ordered_extent(ordered); | 4440 | btrfs_put_ordered_extent(ordered); |
4427 | lock_extent(tree, page_start, page_end, GFP_NOFS); | 4441 | lock_extent(tree, page_start, page_end, GFP_NOFS); |
4428 | } | 4442 | } |
4429 | clear_extent_bit(tree, page_start, page_end, | 4443 | clear_extent_bit(tree, page_start, page_end, |
4430 | EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | | 4444 | EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, |
4431 | EXTENT_ORDERED, | ||
4432 | 1, 1, NULL, GFP_NOFS); | 4445 | 1, 1, NULL, GFP_NOFS); |
4433 | __btrfs_releasepage(page, GFP_NOFS); | 4446 | __btrfs_releasepage(page, GFP_NOFS); |
4434 | 4447 | ||