diff options
author | Chris Mason <chris.mason@oracle.com> | 2008-07-17 12:53:50 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2008-09-25 11:04:04 -0400 |
commit | e6dcd2dc9c489108648e2ed543315dd134d50a9a (patch) | |
tree | cddf6f588b65e28c5feb8bff89b22d8ff70f8a50 /fs/btrfs/file.c | |
parent | 77a41afb7d0dd0f27b6f2f1a5bc701929c7034de (diff) |
Btrfs: New data=ordered implementation
The old data=ordered code would force commit to wait until
all the data extents from the transaction were fully on disk. This
introduced large latencies into the commit and stalled new writers
in the transaction for a long time.
The new code changes the way data allocations and extents work:
* When delayed allocation is filled, data extents are reserved, and
the extent bit EXTENT_ORDERED is set on the entire range of the extent.
A struct btrfs_ordered_extent is allocated and inserted into a per-inode
rbtree to track the pending extents.
* As each page is written, EXTENT_ORDERED is cleared on the bytes corresponding
to that page.
* When all of the bytes corresponding to a single struct btrfs_ordered_extent
are written, the previously reserved extent is inserted into the FS
btree and into the extent allocation trees. The checksums for the file
data are also updated.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r-- | fs/btrfs/file.c | 67 |
1 files changed, 45 insertions, 22 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8037792f8789..12e765f7e0d4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -34,7 +34,6 @@ | |||
34 | #include "disk-io.h" | 34 | #include "disk-io.h" |
35 | #include "transaction.h" | 35 | #include "transaction.h" |
36 | #include "btrfs_inode.h" | 36 | #include "btrfs_inode.h" |
37 | #include "ordered-data.h" | ||
38 | #include "ioctl.h" | 37 | #include "ioctl.h" |
39 | #include "print-tree.h" | 38 | #include "print-tree.h" |
40 | #include "compat.h" | 39 | #include "compat.h" |
@@ -273,7 +272,9 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, | |||
273 | u64 mask = root->sectorsize - 1; | 272 | u64 mask = root->sectorsize - 1; |
274 | last_pos_in_file = (isize + mask) & ~mask; | 273 | last_pos_in_file = (isize + mask) & ~mask; |
275 | hole_size = (start_pos - last_pos_in_file + mask) & ~mask; | 274 | hole_size = (start_pos - last_pos_in_file + mask) & ~mask; |
276 | if (last_pos_in_file < start_pos) { | 275 | if (hole_size > 0) { |
276 | btrfs_wait_ordered_range(inode, last_pos_in_file, | ||
277 | last_pos_in_file + hole_size); | ||
277 | err = btrfs_drop_extents(trans, root, inode, | 278 | err = btrfs_drop_extents(trans, root, inode, |
278 | last_pos_in_file, | 279 | last_pos_in_file, |
279 | last_pos_in_file + hole_size, | 280 | last_pos_in_file + hole_size, |
@@ -303,19 +304,17 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, | |||
303 | inline_size > root->fs_info->max_inline || | 304 | inline_size > root->fs_info->max_inline || |
304 | (inline_size & (root->sectorsize -1)) == 0 || | 305 | (inline_size & (root->sectorsize -1)) == 0 || |
305 | inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) { | 306 | inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) { |
306 | u64 last_end; | 307 | /* check for reserved extents on each page, we don't want |
307 | 308 | * to reset the delalloc bit on things that already have | |
309 | * extents reserved. | ||
310 | */ | ||
311 | set_extent_delalloc(io_tree, start_pos, | ||
312 | end_of_last_block, GFP_NOFS); | ||
308 | for (i = 0; i < num_pages; i++) { | 313 | for (i = 0; i < num_pages; i++) { |
309 | struct page *p = pages[i]; | 314 | struct page *p = pages[i]; |
310 | SetPageUptodate(p); | 315 | SetPageUptodate(p); |
311 | set_page_dirty(p); | 316 | set_page_dirty(p); |
312 | } | 317 | } |
313 | last_end = (u64)(pages[num_pages -1]->index) << | ||
314 | PAGE_CACHE_SHIFT; | ||
315 | last_end += PAGE_CACHE_SIZE - 1; | ||
316 | set_extent_delalloc(io_tree, start_pos, end_of_last_block, | ||
317 | GFP_NOFS); | ||
318 | btrfs_add_ordered_inode(inode); | ||
319 | } else { | 318 | } else { |
320 | u64 aligned_end; | 319 | u64 aligned_end; |
321 | /* step one, delete the existing extents in this range */ | 320 | /* step one, delete the existing extents in this range */ |
@@ -350,10 +349,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end) | |||
350 | struct extent_map *split = NULL; | 349 | struct extent_map *split = NULL; |
351 | struct extent_map *split2 = NULL; | 350 | struct extent_map *split2 = NULL; |
352 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | 351 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; |
352 | struct extent_map *tmp; | ||
353 | u64 len = end - start + 1; | 353 | u64 len = end - start + 1; |
354 | u64 next_start; | ||
354 | int ret; | 355 | int ret; |
355 | int testend = 1; | 356 | int testend = 1; |
356 | 357 | ||
358 | WARN_ON(end < start); | ||
357 | if (end == (u64)-1) { | 359 | if (end == (u64)-1) { |
358 | len = (u64)-1; | 360 | len = (u64)-1; |
359 | testend = 0; | 361 | testend = 0; |
@@ -370,6 +372,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end) | |||
370 | spin_unlock(&em_tree->lock); | 372 | spin_unlock(&em_tree->lock); |
371 | break; | 373 | break; |
372 | } | 374 | } |
375 | tmp = rb_entry(&em->rb_node, struct extent_map, rb_node); | ||
376 | next_start = tmp->start; | ||
373 | remove_extent_mapping(em_tree, em); | 377 | remove_extent_mapping(em_tree, em); |
374 | 378 | ||
375 | if (em->block_start < EXTENT_MAP_LAST_BYTE && | 379 | if (em->block_start < EXTENT_MAP_LAST_BYTE && |
@@ -778,37 +782,58 @@ static int prepare_pages(struct btrfs_root *root, struct file *file, | |||
778 | struct inode *inode = fdentry(file)->d_inode; | 782 | struct inode *inode = fdentry(file)->d_inode; |
779 | int err = 0; | 783 | int err = 0; |
780 | u64 start_pos; | 784 | u64 start_pos; |
785 | u64 last_pos; | ||
781 | 786 | ||
782 | start_pos = pos & ~((u64)root->sectorsize - 1); | 787 | start_pos = pos & ~((u64)root->sectorsize - 1); |
788 | last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; | ||
783 | 789 | ||
784 | memset(pages, 0, num_pages * sizeof(struct page *)); | 790 | memset(pages, 0, num_pages * sizeof(struct page *)); |
785 | 791 | again: | |
786 | for (i = 0; i < num_pages; i++) { | 792 | for (i = 0; i < num_pages; i++) { |
787 | pages[i] = grab_cache_page(inode->i_mapping, index + i); | 793 | pages[i] = grab_cache_page(inode->i_mapping, index + i); |
788 | if (!pages[i]) { | 794 | if (!pages[i]) { |
789 | err = -ENOMEM; | 795 | err = -ENOMEM; |
790 | BUG_ON(1); | 796 | BUG_ON(1); |
791 | } | 797 | } |
792 | #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) | ||
793 | ClearPageDirty(pages[i]); | ||
794 | #else | ||
795 | cancel_dirty_page(pages[i], PAGE_CACHE_SIZE); | ||
796 | #endif | ||
797 | wait_on_page_writeback(pages[i]); | 798 | wait_on_page_writeback(pages[i]); |
798 | set_page_extent_mapped(pages[i]); | ||
799 | WARN_ON(!PageLocked(pages[i])); | ||
800 | } | 799 | } |
801 | if (start_pos < inode->i_size) { | 800 | if (start_pos < inode->i_size) { |
802 | u64 last_pos; | 801 | struct btrfs_ordered_extent *ordered; |
803 | last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; | ||
804 | lock_extent(&BTRFS_I(inode)->io_tree, | 802 | lock_extent(&BTRFS_I(inode)->io_tree, |
805 | start_pos, last_pos - 1, GFP_NOFS); | 803 | start_pos, last_pos - 1, GFP_NOFS); |
804 | ordered = btrfs_lookup_first_ordered_extent(inode, last_pos -1); | ||
805 | if (ordered && | ||
806 | ordered->file_offset + ordered->len > start_pos && | ||
807 | ordered->file_offset < last_pos) { | ||
808 | btrfs_put_ordered_extent(ordered); | ||
809 | unlock_extent(&BTRFS_I(inode)->io_tree, | ||
810 | start_pos, last_pos - 1, GFP_NOFS); | ||
811 | for (i = 0; i < num_pages; i++) { | ||
812 | unlock_page(pages[i]); | ||
813 | page_cache_release(pages[i]); | ||
814 | } | ||
815 | btrfs_wait_ordered_range(inode, start_pos, | ||
816 | last_pos - start_pos); | ||
817 | goto again; | ||
818 | } | ||
819 | if (ordered) | ||
820 | btrfs_put_ordered_extent(ordered); | ||
821 | |||
806 | clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, | 822 | clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, |
807 | last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, | 823 | last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, |
808 | GFP_NOFS); | 824 | GFP_NOFS); |
809 | unlock_extent(&BTRFS_I(inode)->io_tree, | 825 | unlock_extent(&BTRFS_I(inode)->io_tree, |
810 | start_pos, last_pos - 1, GFP_NOFS); | 826 | start_pos, last_pos - 1, GFP_NOFS); |
811 | } | 827 | } |
828 | for (i = 0; i < num_pages; i++) { | ||
829 | #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) | ||
830 | ClearPageDirty(pages[i]); | ||
831 | #else | ||
832 | cancel_dirty_page(pages[i], PAGE_CACHE_SIZE); | ||
833 | #endif | ||
834 | set_page_extent_mapped(pages[i]); | ||
835 | WARN_ON(!PageLocked(pages[i])); | ||
836 | } | ||
812 | return 0; | 837 | return 0; |
813 | } | 838 | } |
814 | 839 | ||
@@ -969,13 +994,11 @@ out_nolock: | |||
969 | (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); | 994 | (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); |
970 | } | 995 | } |
971 | current->backing_dev_info = NULL; | 996 | current->backing_dev_info = NULL; |
972 | btrfs_ordered_throttle(root, inode); | ||
973 | return num_written ? num_written : err; | 997 | return num_written ? num_written : err; |
974 | } | 998 | } |
975 | 999 | ||
976 | int btrfs_release_file(struct inode * inode, struct file * filp) | 1000 | int btrfs_release_file(struct inode * inode, struct file * filp) |
977 | { | 1001 | { |
978 | btrfs_del_ordered_inode(inode, 0); | ||
979 | if (filp->private_data) | 1002 | if (filp->private_data) |
980 | btrfs_ioctl_trans_end(filp); | 1003 | btrfs_ioctl_trans_end(filp); |
981 | return 0; | 1004 | return 0; |