aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2009-03-24 10:24:20 -0400
committerChris Mason <chris.mason@oracle.com>2009-03-24 16:14:52 -0400
commit12fcfd22fe5bf4fe74710232098bc101af497995 (patch)
treeb01ba82147ea76c89149e54d475ed97121387261
parenta74ac3220774d33db967088906dc3351829e2d3a (diff)
Btrfs: tree logging unlink/rename fixes
The tree logging code allows individual files or directories to be logged without including operations on other files and directories in the FS. It tries to commit the minimal set of changes to disk in order to fsync the single file or directory that was sent to fsync or O_SYNC.

The tree logging code was allowing files and directories to be unlinked if they were part of a rename operation where only one directory in the rename was in the fsync log. This patch adds a few new rules to the tree logging.

1) on rename or unlink, if the inode being unlinked isn't in the fsync log, we must force a full commit before doing an fsync of the directory where the unlink was done. The commit isn't done during the unlink, but it is forced the next time we try to log the parent directory.

Solution: record transid of last unlink/rename per directory when the directory wasn't already logged. For renames this is only done when renaming to a different directory.

    mkdir foo/some_dir
    normal commit
    rename foo/some_dir foo2/some_dir
    mkdir foo/some_dir
    fsync foo/some_dir/some_file

The fsync above will unlink the original some_dir without recording it in its new location (foo2). After a crash, some_dir will be gone unless the fsync of some_file forces a full commit.

2) we must log any new names for any file or dir that is in the fsync log. This way we make sure not to lose files that are unlinked during the same transaction.

2a) we must log any new names for any file or dir during rename when the directory they are being removed from was logged.

2a is actually the more important variant. Without the extra logging a crash might unlink the old name without recreating the new one.

3) after a crash, we must go through any directories with a link count of zero and redo the rm -rf

    mkdir f1/foo
    normal commit
    rm -rf f1/foo
    fsync(f1)

The directory f1/foo was fully removed from the FS, but fsync was never called on f1/foo, only its parent dir (f1). After a crash the rm -rf must be replayed. This must be able to recurse down the entire directory tree. The inode link count fixup code takes care of the ugly details.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--fs/btrfs/btrfs_inode.h13
-rw-r--r--fs/btrfs/ctree.h7
-rw-r--r--fs/btrfs/extent-tree.c2
-rw-r--r--fs/btrfs/file.c14
-rw-r--r--fs/btrfs/inode.c28
-rw-r--r--fs/btrfs/tree-log.c389
-rw-r--r--fs/btrfs/tree-log.h17
7 files changed, 372 insertions, 98 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 72677ce2b74f..3af4cfb5654c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -86,12 +86,6 @@ struct btrfs_inode {
86 */ 86 */
87 u64 logged_trans; 87 u64 logged_trans;
88 88
89 /*
90 * trans that last made a change that should be fully fsync'd. This
91 * gets reset to zero each time the inode is logged
92 */
93 u64 log_dirty_trans;
94
95 /* total number of bytes pending delalloc, used by stat to calc the 89 /* total number of bytes pending delalloc, used by stat to calc the
96 * real block usage of the file 90 * real block usage of the file
97 */ 91 */
@@ -121,6 +115,13 @@ struct btrfs_inode {
121 /* the start of block group preferred for allocations. */ 115 /* the start of block group preferred for allocations. */
122 u64 block_group; 116 u64 block_group;
123 117
118 /* the fsync log has some corner cases that mean we have to check
119 * directories to see if any unlinks have been done before
120 * the directory was logged. See tree-log.c for all the
121 * details
122 */
123 u64 last_unlink_trans;
124
124 struct inode vfs_inode; 125 struct inode vfs_inode;
125}; 126};
126 127
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4ddce91cf3f9..2737facbd341 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -695,7 +695,12 @@ struct btrfs_fs_info {
695 695
696 u64 generation; 696 u64 generation;
697 u64 last_trans_committed; 697 u64 last_trans_committed;
698 u64 last_trans_new_blockgroup; 698
699 /*
700 * this is updated to the current trans every time a full commit
701 * is required instead of the faster short fsync log commits
702 */
703 u64 last_trans_log_full_commit;
699 u64 open_ioctl_trans; 704 u64 open_ioctl_trans;
700 unsigned long mount_opt; 705 unsigned long mount_opt;
701 u64 max_extent; 706 u64 max_extent;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8933d15a240f..0c482e0d7c43 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5897,7 +5897,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5897 5897
5898 extent_root = root->fs_info->extent_root; 5898 extent_root = root->fs_info->extent_root;
5899 5899
5900 root->fs_info->last_trans_new_blockgroup = trans->transid; 5900 root->fs_info->last_trans_log_full_commit = trans->transid;
5901 5901
5902 cache = kzalloc(sizeof(*cache), GFP_NOFS); 5902 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5903 if (!cache) 5903 if (!cache)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f06c275644b7..32d10a617613 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1173,8 +1173,11 @@ out_nolock:
1173 ret = btrfs_log_dentry_safe(trans, root, 1173 ret = btrfs_log_dentry_safe(trans, root,
1174 file->f_dentry); 1174 file->f_dentry);
1175 if (ret == 0) { 1175 if (ret == 0) {
1176 btrfs_sync_log(trans, root); 1176 ret = btrfs_sync_log(trans, root);
1177 btrfs_end_transaction(trans, root); 1177 if (ret == 0)
1178 btrfs_end_transaction(trans, root);
1179 else
1180 btrfs_commit_transaction(trans, root);
1178 } else { 1181 } else {
1179 btrfs_commit_transaction(trans, root); 1182 btrfs_commit_transaction(trans, root);
1180 } 1183 }
@@ -1266,8 +1269,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1266 if (ret > 0) { 1269 if (ret > 0) {
1267 ret = btrfs_commit_transaction(trans, root); 1270 ret = btrfs_commit_transaction(trans, root);
1268 } else { 1271 } else {
1269 btrfs_sync_log(trans, root); 1272 ret = btrfs_sync_log(trans, root);
1270 ret = btrfs_end_transaction(trans, root); 1273 if (ret == 0)
1274 ret = btrfs_end_transaction(trans, root);
1275 else
1276 ret = btrfs_commit_transaction(trans, root);
1271 } 1277 }
1272 mutex_lock(&dentry->d_inode->i_mutex); 1278 mutex_lock(&dentry->d_inode->i_mutex);
1273out: 1279out:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9b4faac50c18..bffd79faffb5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2246,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2246 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2246 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2247 inode, dir->i_ino); 2247 inode, dir->i_ino);
2248 BUG_ON(ret != 0 && ret != -ENOENT); 2248 BUG_ON(ret != 0 && ret != -ENOENT);
2249 if (ret != -ENOENT)
2250 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2251 2249
2252 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2250 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2253 dir, index); 2251 dir, index);
@@ -2280,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2280 trans = btrfs_start_transaction(root, 1); 2278 trans = btrfs_start_transaction(root, 1);
2281 2279
2282 btrfs_set_trans_block_group(trans, dir); 2280 btrfs_set_trans_block_group(trans, dir);
2281
2282 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2283
2283 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2284 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2284 dentry->d_name.name, dentry->d_name.len); 2285 dentry->d_name.name, dentry->d_name.len);
2285 2286
@@ -3042,7 +3043,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3042 bi->disk_i_size = 0; 3043 bi->disk_i_size = 0;
3043 bi->flags = 0; 3044 bi->flags = 0;
3044 bi->index_cnt = (u64)-1; 3045 bi->index_cnt = (u64)-1;
3045 bi->log_dirty_trans = 0; 3046 bi->last_unlink_trans = 0;
3046 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3047 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3047 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3048 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3048 inode->i_mapping, GFP_NOFS); 3049 inode->i_mapping, GFP_NOFS);
@@ -3786,6 +3787,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3786 drop_inode = 1; 3787 drop_inode = 1;
3787 3788
3788 nr = trans->blocks_used; 3789 nr = trans->blocks_used;
3790
3791 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
3789 btrfs_end_transaction_throttle(trans, root); 3792 btrfs_end_transaction_throttle(trans, root);
3790fail: 3793fail:
3791 if (drop_inode) { 3794 if (drop_inode) {
@@ -4666,6 +4669,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4666 4669
4667 trans = btrfs_start_transaction(root, 1); 4670 trans = btrfs_start_transaction(root, 1);
4668 4671
4672 /*
4673 * this is an ugly little race, but the rename is required to make
4674 * sure that if we crash, the inode is either at the old name
4675 * or the new one. pinning the log transaction lets us make sure
4676 * we don't allow a log commit to come in after we unlink the
4677 * name but before we add the new name back in.
4678 */
4679 btrfs_pin_log_trans(root);
4680
4669 btrfs_set_trans_block_group(trans, new_dir); 4681 btrfs_set_trans_block_group(trans, new_dir);
4670 4682
4671 btrfs_inc_nlink(old_dentry->d_inode); 4683 btrfs_inc_nlink(old_dentry->d_inode);
@@ -4673,6 +4685,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4673 new_dir->i_ctime = new_dir->i_mtime = ctime; 4685 new_dir->i_ctime = new_dir->i_mtime = ctime;
4674 old_inode->i_ctime = ctime; 4686 old_inode->i_ctime = ctime;
4675 4687
4688 if (old_dentry->d_parent != new_dentry->d_parent)
4689 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
4690
4676 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, 4691 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4677 old_dentry->d_name.name, 4692 old_dentry->d_name.name,
4678 old_dentry->d_name.len); 4693 old_dentry->d_name.len);
@@ -4704,7 +4719,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4704 if (ret) 4719 if (ret)
4705 goto out_fail; 4720 goto out_fail;
4706 4721
4722 btrfs_log_new_name(trans, old_inode, old_dir,
4723 new_dentry->d_parent);
4707out_fail: 4724out_fail:
4725
4726 /* this btrfs_end_log_trans just allows the current
4727 * log-sub transaction to complete
4728 */
4729 btrfs_end_log_trans(root);
4708 btrfs_end_transaction_throttle(trans, root); 4730 btrfs_end_transaction_throttle(trans, root);
4709out_unlock: 4731out_unlock:
4710 return ret; 4732 return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 405439ca4c45..1b7f04a8f168 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
35#define LOG_INODE_EXISTS 1 35#define LOG_INODE_EXISTS 1
36 36
37/* 37/*
38 * directory trouble cases
39 *
40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
41 * log, we must force a full commit before doing an fsync of the directory
42 * where the unlink was done.
43 * ---> record transid of last unlink/rename per directory
44 *
45 * mkdir foo/some_dir
46 * normal commit
47 * rename foo/some_dir foo2/some_dir
48 * mkdir foo/some_dir
49 * fsync foo/some_dir/some_file
50 *
51 * The fsync above will unlink the original some_dir without recording
52 * it in its new location (foo2). After a crash, some_dir will be gone
53 * unless the fsync of some_file forces a full commit
54 *
55 * 2) we must log any new names for any file or dir that is in the fsync
56 * log. ---> check inode while renaming/linking.
57 *
58 * 2a) we must log any new names for any file or dir during rename
59 * when the directory they are being removed from was logged.
60 * ---> check inode and old parent dir during rename
61 *
62 * 2a is actually the more important variant. Without the extra logging
63 * a crash might unlink the old name without recreating the new one
64 *
65 * 3) after a crash, we must go through any directories with a link count
66 * of zero and redo the rm -rf
67 *
68 * mkdir f1/foo
69 * normal commit
70 * rm -rf f1/foo
71 * fsync(f1)
72 *
73 * The directory f1/foo was fully removed from the FS, but fsync was never
74 * called on f1/foo, only its parent dir (f1). After a crash the rm -rf must
75 * be replayed. This must be able to recurse down the entire
76 * directory tree. The inode link count fixup code takes care of the
77 * ugly details.
78 */
79
80/*
38 * stages for the tree walking. The first 81 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find 82 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes 83 * the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
47#define LOG_WALK_REPLAY_INODES 1 90#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2 91#define LOG_WALK_REPLAY_ALL 2
49 92
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 93static int btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode, 94 struct btrfs_root *root, struct inode *inode,
52 int inode_only); 95 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 96static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, 97 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid); 98 struct btrfs_path *path, u64 objectid);
99static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
100 struct btrfs_root *root,
101 struct btrfs_root *log,
102 struct btrfs_path *path,
103 u64 dirid, int del_all);
56 104
57/* 105/*
58 * tree logging is a special write ahead log used to make sure that 106 * tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
133} 181}
134 182
135/* 183/*
184 * This either makes the current running log transaction wait
185 * until you call btrfs_end_log_trans() or it makes any future
186 * log transactions wait until you call btrfs_end_log_trans()
187 */
188int btrfs_pin_log_trans(struct btrfs_root *root)
189{
190 int ret = -ENOENT;
191
192 mutex_lock(&root->log_mutex);
193 atomic_inc(&root->log_writers);
194 mutex_unlock(&root->log_mutex);
195 return ret;
196}
197
198/*
136 * indicate we're done making changes to the log tree 199 * indicate we're done making changes to the log tree
137 * and wake up anyone waiting to do a sync 200 * and wake up anyone waiting to do a sync
138 */ 201 */
139static int end_log_trans(struct btrfs_root *root) 202int btrfs_end_log_trans(struct btrfs_root *root)
140{ 203{
141 if (atomic_dec_and_test(&root->log_writers)) { 204 if (atomic_dec_and_test(&root->log_writers)) {
142 smp_mb(); 205 smp_mb();
@@ -602,6 +665,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
602 665
603 ret = link_to_fixup_dir(trans, root, path, location.objectid); 666 ret = link_to_fixup_dir(trans, root, path, location.objectid);
604 BUG_ON(ret); 667 BUG_ON(ret);
668
605 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 669 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
606 BUG_ON(ret); 670 BUG_ON(ret);
607 kfree(name); 671 kfree(name);
@@ -803,6 +867,7 @@ conflict_again:
803 victim_name_len)) { 867 victim_name_len)) {
804 btrfs_inc_nlink(inode); 868 btrfs_inc_nlink(inode);
805 btrfs_release_path(root, path); 869 btrfs_release_path(root, path);
870
806 ret = btrfs_unlink_inode(trans, root, dir, 871 ret = btrfs_unlink_inode(trans, root, dir,
807 inode, victim_name, 872 inode, victim_name,
808 victim_name_len); 873 victim_name_len);
@@ -921,13 +986,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
921 key.offset--; 986 key.offset--;
922 btrfs_release_path(root, path); 987 btrfs_release_path(root, path);
923 } 988 }
924 btrfs_free_path(path); 989 btrfs_release_path(root, path);
925 if (nlink != inode->i_nlink) { 990 if (nlink != inode->i_nlink) {
926 inode->i_nlink = nlink; 991 inode->i_nlink = nlink;
927 btrfs_update_inode(trans, root, inode); 992 btrfs_update_inode(trans, root, inode);
928 } 993 }
929 BTRFS_I(inode)->index_cnt = (u64)-1; 994 BTRFS_I(inode)->index_cnt = (u64)-1;
930 995
996 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
997 ret = replay_dir_deletes(trans, root, NULL, path,
998 inode->i_ino, 1);
999 BUG_ON(ret);
1000 }
1001 btrfs_free_path(path);
1002
931 return 0; 1003 return 0;
932} 1004}
933 1005
@@ -970,9 +1042,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
970 1042
971 iput(inode); 1043 iput(inode);
972 1044
973 if (key.offset == 0) 1045 /*
974 break; 1046 * fixup on a directory may create new entries,
975 key.offset--; 1047 * make sure we always look for the highest possible
1048 * offset
1049 */
1050 key.offset = (u64)-1;
976 } 1051 }
977 btrfs_release_path(root, path); 1052 btrfs_release_path(root, path);
978 return 0; 1053 return 0;
@@ -1312,11 +1387,11 @@ again:
1312 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1387 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1313 name_len); 1388 name_len);
1314 log_di = NULL; 1389 log_di = NULL;
1315 if (dir_key->type == BTRFS_DIR_ITEM_KEY) { 1390 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1316 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1391 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1317 dir_key->objectid, 1392 dir_key->objectid,
1318 name, name_len, 0); 1393 name, name_len, 0);
1319 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { 1394 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1320 log_di = btrfs_lookup_dir_index_item(trans, log, 1395 log_di = btrfs_lookup_dir_index_item(trans, log,
1321 log_path, 1396 log_path,
1322 dir_key->objectid, 1397 dir_key->objectid,
@@ -1377,7 +1452,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1377 struct btrfs_root *root, 1452 struct btrfs_root *root,
1378 struct btrfs_root *log, 1453 struct btrfs_root *log,
1379 struct btrfs_path *path, 1454 struct btrfs_path *path,
1380 u64 dirid) 1455 u64 dirid, int del_all)
1381{ 1456{
1382 u64 range_start; 1457 u64 range_start;
1383 u64 range_end; 1458 u64 range_end;
@@ -1407,10 +1482,14 @@ again:
1407 range_start = 0; 1482 range_start = 0;
1408 range_end = 0; 1483 range_end = 0;
1409 while (1) { 1484 while (1) {
1410 ret = find_dir_range(log, path, dirid, key_type, 1485 if (del_all)
1411 &range_start, &range_end); 1486 range_end = (u64)-1;
1412 if (ret != 0) 1487 else {
1413 break; 1488 ret = find_dir_range(log, path, dirid, key_type,
1489 &range_start, &range_end);
1490 if (ret != 0)
1491 break;
1492 }
1414 1493
1415 dir_key.offset = range_start; 1494 dir_key.offset = range_start;
1416 while (1) { 1495 while (1) {
@@ -1436,7 +1515,8 @@ again:
1436 break; 1515 break;
1437 1516
1438 ret = check_item_in_log(trans, root, log, path, 1517 ret = check_item_in_log(trans, root, log, path,
1439 log_path, dir, &found_key); 1518 log_path, dir,
1519 &found_key);
1440 BUG_ON(ret); 1520 BUG_ON(ret);
1441 if (found_key.offset == (u64)-1) 1521 if (found_key.offset == (u64)-1)
1442 break; 1522 break;
@@ -1513,7 +1593,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1513 mode = btrfs_inode_mode(eb, inode_item); 1593 mode = btrfs_inode_mode(eb, inode_item);
1514 if (S_ISDIR(mode)) { 1594 if (S_ISDIR(mode)) {
1515 ret = replay_dir_deletes(wc->trans, 1595 ret = replay_dir_deletes(wc->trans,
1516 root, log, path, key.objectid); 1596 root, log, path, key.objectid, 0);
1517 BUG_ON(ret); 1597 BUG_ON(ret);
1518 } 1598 }
1519 ret = overwrite_item(wc->trans, root, path, 1599 ret = overwrite_item(wc->trans, root, path,
@@ -1850,7 +1930,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
1850 return ret; 1930 return ret;
1851} 1931}
1852 1932
1853static int wait_log_commit(struct btrfs_root *root, unsigned long transid) 1933static int wait_log_commit(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root, unsigned long transid)
1854{ 1935{
1855 DEFINE_WAIT(wait); 1936 DEFINE_WAIT(wait);
1856 int index = transid % 2; 1937 int index = transid % 2;
@@ -1864,9 +1945,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1864 prepare_to_wait(&root->log_commit_wait[index], 1945 prepare_to_wait(&root->log_commit_wait[index],
1865 &wait, TASK_UNINTERRUPTIBLE); 1946 &wait, TASK_UNINTERRUPTIBLE);
1866 mutex_unlock(&root->log_mutex); 1947 mutex_unlock(&root->log_mutex);
1867 if (root->log_transid < transid + 2 && 1948
1949 if (root->fs_info->last_trans_log_full_commit !=
1950 trans->transid && root->log_transid < transid + 2 &&
1868 atomic_read(&root->log_commit[index])) 1951 atomic_read(&root->log_commit[index]))
1869 schedule(); 1952 schedule();
1953
1870 finish_wait(&root->log_commit_wait[index], &wait); 1954 finish_wait(&root->log_commit_wait[index], &wait);
1871 mutex_lock(&root->log_mutex); 1955 mutex_lock(&root->log_mutex);
1872 } while (root->log_transid < transid + 2 && 1956 } while (root->log_transid < transid + 2 &&
@@ -1874,14 +1958,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1874 return 0; 1958 return 0;
1875} 1959}
1876 1960
1877static int wait_for_writer(struct btrfs_root *root) 1961static int wait_for_writer(struct btrfs_trans_handle *trans,
1962 struct btrfs_root *root)
1878{ 1963{
1879 DEFINE_WAIT(wait); 1964 DEFINE_WAIT(wait);
1880 while (atomic_read(&root->log_writers)) { 1965 while (atomic_read(&root->log_writers)) {
1881 prepare_to_wait(&root->log_writer_wait, 1966 prepare_to_wait(&root->log_writer_wait,
1882 &wait, TASK_UNINTERRUPTIBLE); 1967 &wait, TASK_UNINTERRUPTIBLE);
1883 mutex_unlock(&root->log_mutex); 1968 mutex_unlock(&root->log_mutex);
1884 if (atomic_read(&root->log_writers)) 1969 if (root->fs_info->last_trans_log_full_commit !=
1970 trans->transid && atomic_read(&root->log_writers))
1885 schedule(); 1971 schedule();
1886 mutex_lock(&root->log_mutex); 1972 mutex_lock(&root->log_mutex);
1887 finish_wait(&root->log_writer_wait, &wait); 1973 finish_wait(&root->log_writer_wait, &wait);
@@ -1892,7 +1978,14 @@ static int wait_for_writer(struct btrfs_root *root)
1892/* 1978/*
1893 * btrfs_sync_log sends a given tree log down to the disk and 1979 * btrfs_sync_log sends a given tree log down to the disk and
1894 * updates the super blocks to record it. When this call is done, 1980 * updates the super blocks to record it. When this call is done,
1895 * you know that any inodes previously logged are safely on disk 1981 * you know that any inodes previously logged are safely on disk only
1982 * if it returns 0.
1983 *
1984 * Any other return value means you need to call btrfs_commit_transaction.
1985 * Some of the edge cases for fsyncing directories that have had unlinks
1986 * or renames done in the past mean that sometimes the only safe
1987 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
1988 * that has happened.
1896 */ 1989 */
1897int btrfs_sync_log(struct btrfs_trans_handle *trans, 1990int btrfs_sync_log(struct btrfs_trans_handle *trans,
1898 struct btrfs_root *root) 1991 struct btrfs_root *root)
@@ -1906,7 +1999,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1906 mutex_lock(&root->log_mutex); 1999 mutex_lock(&root->log_mutex);
1907 index1 = root->log_transid % 2; 2000 index1 = root->log_transid % 2;
1908 if (atomic_read(&root->log_commit[index1])) { 2001 if (atomic_read(&root->log_commit[index1])) {
1909 wait_log_commit(root, root->log_transid); 2002 wait_log_commit(trans, root, root->log_transid);
1910 mutex_unlock(&root->log_mutex); 2003 mutex_unlock(&root->log_mutex);
1911 return 0; 2004 return 0;
1912 } 2005 }
@@ -1914,18 +2007,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1914 2007
1915 /* wait for previous tree log sync to complete */ 2008 /* wait for previous tree log sync to complete */
1916 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2009 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1917 wait_log_commit(root, root->log_transid - 1); 2010 wait_log_commit(trans, root, root->log_transid - 1);
1918 2011
1919 while (1) { 2012 while (1) {
1920 unsigned long batch = root->log_batch; 2013 unsigned long batch = root->log_batch;
1921 mutex_unlock(&root->log_mutex); 2014 mutex_unlock(&root->log_mutex);
1922 schedule_timeout_uninterruptible(1); 2015 schedule_timeout_uninterruptible(1);
1923 mutex_lock(&root->log_mutex); 2016 mutex_lock(&root->log_mutex);
1924 wait_for_writer(root); 2017
2018 wait_for_writer(trans, root);
1925 if (batch == root->log_batch) 2019 if (batch == root->log_batch)
1926 break; 2020 break;
1927 } 2021 }
1928 2022
2023 /* bail out if we need to do a full commit */
2024 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2025 ret = -EAGAIN;
2026 mutex_unlock(&root->log_mutex);
2027 goto out;
2028 }
2029
1929 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2030 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1930 BUG_ON(ret); 2031 BUG_ON(ret);
1931 2032
@@ -1961,16 +2062,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1961 2062
1962 index2 = log_root_tree->log_transid % 2; 2063 index2 = log_root_tree->log_transid % 2;
1963 if (atomic_read(&log_root_tree->log_commit[index2])) { 2064 if (atomic_read(&log_root_tree->log_commit[index2])) {
1964 wait_log_commit(log_root_tree, log_root_tree->log_transid); 2065 wait_log_commit(trans, log_root_tree,
2066 log_root_tree->log_transid);
1965 mutex_unlock(&log_root_tree->log_mutex); 2067 mutex_unlock(&log_root_tree->log_mutex);
1966 goto out; 2068 goto out;
1967 } 2069 }
1968 atomic_set(&log_root_tree->log_commit[index2], 1); 2070 atomic_set(&log_root_tree->log_commit[index2], 1);
1969 2071
1970 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) 2072 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
1971 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); 2073 wait_log_commit(trans, log_root_tree,
2074 log_root_tree->log_transid - 1);
2075 }
2076
2077 wait_for_writer(trans, log_root_tree);
1972 2078
1973 wait_for_writer(log_root_tree); 2079 /*
2080 * now that we've moved on to the tree of log tree roots,
2081 * check the full commit flag again
2082 */
2083 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2084 mutex_unlock(&log_root_tree->log_mutex);
2085 ret = -EAGAIN;
2086 goto out_wake_log_root;
2087 }
1974 2088
1975 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2089 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1976 &log_root_tree->dirty_log_pages); 2090 &log_root_tree->dirty_log_pages);
@@ -1995,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1995 * in and cause problems either. 2109 * in and cause problems either.
1996 */ 2110 */
1997 write_ctree_super(trans, root->fs_info->tree_root, 2); 2111 write_ctree_super(trans, root->fs_info->tree_root, 2);
2112 ret = 0;
1998 2113
2114out_wake_log_root:
1999 atomic_set(&log_root_tree->log_commit[index2], 0); 2115 atomic_set(&log_root_tree->log_commit[index2], 0);
2000 smp_mb(); 2116 smp_mb();
2001 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2117 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -2008,7 +2124,8 @@ out:
2008 return 0; 2124 return 0;
2009} 2125}
2010 2126
2011/* * free all the extents used by the tree log. This should be called 2127/*
2128 * free all the extents used by the tree log. This should be called
2012 * at commit time of the full transaction 2129 * at commit time of the full transaction
2013 */ 2130 */
2014int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2131int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2142,7 +2259,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2142 2259
2143 btrfs_free_path(path); 2260 btrfs_free_path(path);
2144 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2261 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2145 end_log_trans(root); 2262 btrfs_end_log_trans(root);
2146 2263
2147 return 0; 2264 return 0;
2148} 2265}
@@ -2169,7 +2286,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2169 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2286 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2170 dirid, &index); 2287 dirid, &index);
2171 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2288 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2172 end_log_trans(root); 2289 btrfs_end_log_trans(root);
2173 2290
2174 return ret; 2291 return ret;
2175} 2292}
@@ -2569,7 +2686,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2569 * 2686 *
2570 * This handles both files and directories. 2687 * This handles both files and directories.
2571 */ 2688 */
2572static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 2689static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2573 struct btrfs_root *root, struct inode *inode, 2690 struct btrfs_root *root, struct inode *inode,
2574 int inode_only) 2691 int inode_only)
2575{ 2692{
@@ -2595,28 +2712,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2595 min_key.offset = 0; 2712 min_key.offset = 0;
2596 2713
2597 max_key.objectid = inode->i_ino; 2714 max_key.objectid = inode->i_ino;
2715
2716 /* today the code can only do partial logging of directories */
2717 if (!S_ISDIR(inode->i_mode))
2718 inode_only = LOG_INODE_ALL;
2719
2598 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2720 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2599 max_key.type = BTRFS_XATTR_ITEM_KEY; 2721 max_key.type = BTRFS_XATTR_ITEM_KEY;
2600 else 2722 else
2601 max_key.type = (u8)-1; 2723 max_key.type = (u8)-1;
2602 max_key.offset = (u64)-1; 2724 max_key.offset = (u64)-1;
2603 2725
2604 /*
2605 * if this inode has already been logged and we're in inode_only
2606 * mode, we don't want to delete the things that have already
2607 * been written to the log.
2608 *
2609 * But, if the inode has been through an inode_only log,
2610 * the logged_trans field is not set. This allows us to catch
2611 * any new names for this inode in the backrefs by logging it
2612 * again
2613 */
2614 if (inode_only == LOG_INODE_EXISTS &&
2615 BTRFS_I(inode)->logged_trans == trans->transid) {
2616 btrfs_free_path(path);
2617 btrfs_free_path(dst_path);
2618 goto out;
2619 }
2620 mutex_lock(&BTRFS_I(inode)->log_mutex); 2726 mutex_lock(&BTRFS_I(inode)->log_mutex);
2621 2727
2622 /* 2728 /*
@@ -2703,7 +2809,6 @@ next_slot:
2703 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2809 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2704 btrfs_release_path(root, path); 2810 btrfs_release_path(root, path);
2705 btrfs_release_path(log, dst_path); 2811 btrfs_release_path(log, dst_path);
2706 BTRFS_I(inode)->log_dirty_trans = 0;
2707 ret = log_directory_changes(trans, root, inode, path, dst_path); 2812 ret = log_directory_changes(trans, root, inode, path, dst_path);
2708 BUG_ON(ret); 2813 BUG_ON(ret);
2709 } 2814 }
@@ -2712,19 +2817,58 @@ next_slot:
2712 2817
2713 btrfs_free_path(path); 2818 btrfs_free_path(path);
2714 btrfs_free_path(dst_path); 2819 btrfs_free_path(dst_path);
2715out:
2716 return 0; 2820 return 0;
2717} 2821}
2718 2822
2719int btrfs_log_inode(struct btrfs_trans_handle *trans, 2823/*
2720 struct btrfs_root *root, struct inode *inode, 2824 * follow the dentry parent pointers up the chain and see if any
2721 int inode_only) 2825 * of the directories in it require a full commit before they can
2826 * be logged. Returns zero if nothing special needs to be done or 1 if
2827 * a full commit is required.
2828 */
2829static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2830 struct inode *inode,
2831 struct dentry *parent,
2832 struct super_block *sb,
2833 u64 last_committed)
2722{ 2834{
2723 int ret; 2835 int ret = 0;
2836 struct btrfs_root *root;
2724 2837
2725 start_log_trans(trans, root); 2838 if (!S_ISDIR(inode->i_mode)) {
2726 ret = __btrfs_log_inode(trans, root, inode, inode_only); 2839 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2727 end_log_trans(root); 2840 goto out;
2841 inode = parent->d_inode;
2842 }
2843
2844 while (1) {
2845 BTRFS_I(inode)->logged_trans = trans->transid;
2846 smp_mb();
2847
2848 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
2849 root = BTRFS_I(inode)->root;
2850
2851 /*
2852 * make sure any commits to the log are forced
2853 * to be full commits
2854 */
2855 root->fs_info->last_trans_log_full_commit =
2856 trans->transid;
2857 ret = 1;
2858 break;
2859 }
2860
2861 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2862 break;
2863
2864 if (parent == sb->s_root)
2865 break;
2866
2867 parent = parent->d_parent;
2868 inode = parent->d_inode;
2869
2870 }
2871out:
2728 return ret; 2872 return ret;
2729} 2873}
2730 2874
@@ -2734,31 +2878,53 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
2734 * only logging is done of any parent directories that are older than 2878 * only logging is done of any parent directories that are older than
2735 * the last committed transaction 2879 * the last committed transaction
2736 */ 2880 */
2737int btrfs_log_dentry(struct btrfs_trans_handle *trans, 2881int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2738 struct btrfs_root *root, struct dentry *dentry) 2882 struct btrfs_root *root, struct inode *inode,
2883 struct dentry *parent, int exists_only)
2739{ 2884{
2740 int inode_only = LOG_INODE_ALL; 2885 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2741 struct super_block *sb; 2886 struct super_block *sb;
2742 int ret; 2887 int ret = 0;
2888 u64 last_committed = root->fs_info->last_trans_committed;
2889
2890 sb = inode->i_sb;
2891
2892 if (root->fs_info->last_trans_log_full_commit >
2893 root->fs_info->last_trans_committed) {
2894 ret = 1;
2895 goto end_no_trans;
2896 }
2897
2898 ret = check_parent_dirs_for_sync(trans, inode, parent,
2899 sb, last_committed);
2900 if (ret)
2901 goto end_no_trans;
2743 2902
2744 start_log_trans(trans, root); 2903 start_log_trans(trans, root);
2745 sb = dentry->d_inode->i_sb;
2746 while (1) {
2747 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2748 inode_only);
2749 BUG_ON(ret);
2750 inode_only = LOG_INODE_EXISTS;
2751 2904
2752 dentry = dentry->d_parent; 2905 ret = btrfs_log_inode(trans, root, inode, inode_only);
2753 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) 2906 BUG_ON(ret);
2907 inode_only = LOG_INODE_EXISTS;
2908
2909 while (1) {
2910 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2754 break; 2911 break;
2755 2912
2756 if (BTRFS_I(dentry->d_inode)->generation <= 2913 inode = parent->d_inode;
2757 root->fs_info->last_trans_committed) 2914 if (BTRFS_I(inode)->generation >
2915 root->fs_info->last_trans_committed) {
2916 ret = btrfs_log_inode(trans, root, inode, inode_only);
2917 BUG_ON(ret);
2918 }
2919 if (parent == sb->s_root)
2758 break; 2920 break;
2921
2922 parent = parent->d_parent;
2759 } 2923 }
2760 end_log_trans(root); 2924 ret = 0;
2761 return 0; 2925 btrfs_end_log_trans(root);
2926end_no_trans:
2927 return ret;
2762} 2928}
2763 2929
2764/* 2930/*
@@ -2770,12 +2936,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2770int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2936int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2771 struct btrfs_root *root, struct dentry *dentry) 2937 struct btrfs_root *root, struct dentry *dentry)
2772{ 2938{
2773 u64 gen; 2939 return btrfs_log_inode_parent(trans, root, dentry->d_inode,
2774 gen = root->fs_info->last_trans_new_blockgroup; 2940 dentry->d_parent, 0);
2775 if (gen > root->fs_info->last_trans_committed)
2776 return 1;
2777 else
2778 return btrfs_log_dentry(trans, root, dentry);
2779} 2941}
2780 2942
2781/* 2943/*
@@ -2894,3 +3056,74 @@ again:
2894 kfree(log_root_tree); 3056 kfree(log_root_tree);
2895 return 0; 3057 return 0;
2896} 3058}
3059
3060/*
3061 * there are some corner cases where we want to force a full
3062 * commit instead of allowing a directory to be logged.
3063 *
3064 * They revolve around files that were unlinked from the directory, and
3065 * this function updates the parent directory so that a full commit is
3066 * properly done if it is fsync'd later after the unlinks are done.
3067 */
3068void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3069 struct inode *dir, struct inode *inode,
3070 int for_rename)
3071{
3072 /*
3073 * if this directory was already logged any new
3074 * names for this file/dir will get recorded
3075 */
3076 smp_mb();
3077 if (BTRFS_I(dir)->logged_trans == trans->transid)
3078 return;
3079
3080 /*
3081 * if the inode we're about to unlink was logged,
3082 * the log will be properly updated for any new names
3083 */
3084 if (BTRFS_I(inode)->logged_trans == trans->transid)
3085 return;
3086
3087 /*
3088 * when renaming files across directories, if the directory
3089 * that we're unlinking from gets fsync'd later on, there's
3090 * no way to find the destination directory later and fsync it
3091 * properly. So, we have to be conservative and force commits
3092 * so the new name gets discovered.
3093 */
3094 if (for_rename)
3095 goto record;
3096
3097 /* we can safely do the unlink without any special recording */
3098 return;
3099
3100record:
3101 BTRFS_I(dir)->last_unlink_trans = trans->transid;
3102}
3103
3104/*
3105 * Call this after adding a new name for a file and it will properly
3106 * update the log to reflect the new name.
3107 *
3108 * It will return zero if all goes well, and it will return 1 if a
3109 * full transaction commit is required.
3110 */
3111int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3112 struct inode *inode, struct inode *old_dir,
3113 struct dentry *parent)
3114{
3115 struct btrfs_root * root = BTRFS_I(inode)->root;
3116
3117 /*
3118 * if this inode hasn't been logged and the directory we're renaming it
3119 * from hasn't been logged, we don't need to log it
3120 */
3121 if (BTRFS_I(inode)->logged_trans <=
3122 root->fs_info->last_trans_committed &&
3123 (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3124 root->fs_info->last_trans_committed))
3125 return 0;
3126
3127 return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3128}
3129
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed02..d09c7609e16b 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
22int btrfs_sync_log(struct btrfs_trans_handle *trans, 22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root); 23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root); 25int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 26int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry); 27 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 28int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, 29 struct btrfs_root *root,
35 const char *name, int name_len, 30 const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root, 33 struct btrfs_root *root,
39 const char *name, int name_len, 34 const char *name, int name_len,
40 struct inode *inode, u64 dirid); 35 struct inode *inode, u64 dirid);
36int btrfs_join_running_log_trans(struct btrfs_root *root);
37int btrfs_end_log_trans(struct btrfs_root *root);
38int btrfs_pin_log_trans(struct btrfs_root *root);
39int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
40 struct btrfs_root *root, struct inode *inode,
41 struct dentry *parent, int exists_only);
42void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
43 struct inode *dir, struct inode *inode,
44 int for_rename);
45int btrfs_log_new_name(struct btrfs_trans_handle *trans,
46 struct inode *inode, struct inode *old_dir,
47 struct dentry *parent);
41#endif 48#endif