aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/ctree.h
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2009-03-31 13:27:11 -0400
committerChris Mason <chris.mason@oracle.com>2009-03-31 14:27:58 -0400
commit5a3f23d515a2ebf0c750db80579ca57b28cbce6d (patch)
treee0ffb43dd35f1c3def9a74ec7a6f4470902c9761 /fs/btrfs/ctree.h
parent1a81af4d1d9c60d4313309f937a1fc5567205a87 (diff)
Btrfs: add extra flushing for renames and truncates
Renames and truncates are both common ways to replace old data with new data. The filesystem can make an effort to make sure the new data is on disk before actually replacing the old data. This is especially important for rename, which many applications use as though it were atomic for both the data and the metadata involved. The current btrfs code will happily replace a file that is fully on disk with one that was just created and still has pending IO. If we crash after transaction commit but before the IO is done, we'll end up replacing a good file with a zero length file. The solution used here is to create a list of inodes that need special ordering and force them to disk before the commit is done. This is similar to the ext3 style data=ordering, except it is only done on selected files. Btrfs is able to get away with this because it does not wait on commits very often, even for fsync (which uses a sub-commit). For renames, we order the file when it wasn't already on disk and when it is replacing an existing file. Larger files are sent to filemap_flush right away (before the transaction handle is opened). For truncates, we order if the file goes from non-zero size down to zero size. This is a little different, because at the time of the truncate the file has no dirty bytes to order. But, we flag the inode so that it is added to the ordered list on close (via release method). We also immediately add it to the ordered list of the current transaction so that we can try to flush down any writes the application sneaks in before commit. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/ctree.h')
-rw-r--r--fs/btrfs/ctree.h35
1 files changed, 35 insertions, 0 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2737facbd341..f48905ee5240 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
45 45
46#define BTRFS_MAX_LEVEL 8 46#define BTRFS_MAX_LEVEL 8
47 47
48/*
49 * files bigger than this get some pre-flushing when they are added
50 * to the ordered operations list. That way we limit the total
51 * work done by the commit
52 */
53#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
54
48/* holds pointers to all of the tree roots */ 55/* holds pointers to all of the tree roots */
49#define BTRFS_ROOT_TREE_OBJECTID 1ULL 56#define BTRFS_ROOT_TREE_OBJECTID 1ULL
50 57
@@ -727,6 +734,15 @@ struct btrfs_fs_info {
727 struct mutex volume_mutex; 734 struct mutex volume_mutex;
728 struct mutex tree_reloc_mutex; 735 struct mutex tree_reloc_mutex;
729 736
737 /*
738 * this protects the ordered operations list only while we are
739 * processing all of the entries on it. This way we make
740 * sure the commit code doesn't find the list temporarily empty
741 * because another function happens to be doing non-waiting preflush
742 * before jumping into the main commit.
743 */
744 struct mutex ordered_operations_mutex;
745
730 struct list_head trans_list; 746 struct list_head trans_list;
731 struct list_head hashers; 747 struct list_head hashers;
732 struct list_head dead_roots; 748 struct list_head dead_roots;
@@ -741,10 +757,29 @@ struct btrfs_fs_info {
741 * ordered extents 757 * ordered extents
742 */ 758 */
743 spinlock_t ordered_extent_lock; 759 spinlock_t ordered_extent_lock;
760
761 /*
762 * all of the data=ordered extents pending writeback
763 * these can span multiple transactions and basically include
764 * every dirty data page that isn't from nodatacow
765 */
744 struct list_head ordered_extents; 766 struct list_head ordered_extents;
767
768 /*
769 * all of the inodes that have delalloc bytes. It is possible for
770 * this list to be empty even when there are still dirty data=ordered
771 * extents waiting to finish IO.
772 */
745 struct list_head delalloc_inodes; 773 struct list_head delalloc_inodes;
746 774
747 /* 775 /*
776 * special rename and truncate targets that must be on disk before
777 * we're allowed to commit. This is basically the ext3 style
778 * data=ordered list.
779 */
780 struct list_head ordered_operations;
781
782 /*
748 * there is a pool of worker threads for checksumming during writes 783 * there is a pool of worker threads for checksumming during writes
749 * and a pool for checksumming after reads. This is because readers 784 * and a pool for checksumming after reads. This is because readers
750 * can run with FS locks held, and the writers may be waiting for 785 * can run with FS locks held, and the writers may be waiting for