aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/btrfs/btrfs_inode.h13
-rw-r--r--fs/btrfs/ctree.h7
-rw-r--r--fs/btrfs/extent-tree.c2
-rw-r--r--fs/btrfs/file.c14
-rw-r--r--fs/btrfs/inode.c28
-rw-r--r--fs/btrfs/tree-log.c389
-rw-r--r--fs/btrfs/tree-log.h17
7 files changed, 372 insertions, 98 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 72677ce2b74f..3af4cfb5654c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -86,12 +86,6 @@ struct btrfs_inode {
86 */ 86 */
87 u64 logged_trans; 87 u64 logged_trans;
88 88
89 /*
90 * trans that last made a change that should be fully fsync'd. This
91 * gets reset to zero each time the inode is logged
92 */
93 u64 log_dirty_trans;
94
95 /* total number of bytes pending delalloc, used by stat to calc the 89 /* total number of bytes pending delalloc, used by stat to calc the
96 * real block usage of the file 90 * real block usage of the file
97 */ 91 */
@@ -121,6 +115,13 @@ struct btrfs_inode {
121 /* the start of block group preferred for allocations. */ 115 /* the start of block group preferred for allocations. */
122 u64 block_group; 116 u64 block_group;
123 117
118 /* the fsync log has some corner cases that mean we have to check
119 * directories to see if any unlinks have been done before
120 * the directory was logged. See tree-log.c for all the
121 * details
122 */
123 u64 last_unlink_trans;
124
124 struct inode vfs_inode; 125 struct inode vfs_inode;
125}; 126};
126 127
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4ddce91cf3f9..2737facbd341 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -695,7 +695,12 @@ struct btrfs_fs_info {
695 695
696 u64 generation; 696 u64 generation;
697 u64 last_trans_committed; 697 u64 last_trans_committed;
698 u64 last_trans_new_blockgroup; 698
699 /*
700 * this is updated to the current trans every time a full commit
701 * is required instead of the faster short fsync log commits
702 */
703 u64 last_trans_log_full_commit;
699 u64 open_ioctl_trans; 704 u64 open_ioctl_trans;
700 unsigned long mount_opt; 705 unsigned long mount_opt;
701 u64 max_extent; 706 u64 max_extent;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8933d15a240f..0c482e0d7c43 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5897,7 +5897,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5897 5897
5898 extent_root = root->fs_info->extent_root; 5898 extent_root = root->fs_info->extent_root;
5899 5899
5900 root->fs_info->last_trans_new_blockgroup = trans->transid; 5900 root->fs_info->last_trans_log_full_commit = trans->transid;
5901 5901
5902 cache = kzalloc(sizeof(*cache), GFP_NOFS); 5902 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5903 if (!cache) 5903 if (!cache)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f06c275644b7..32d10a617613 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1173,8 +1173,11 @@ out_nolock:
1173 ret = btrfs_log_dentry_safe(trans, root, 1173 ret = btrfs_log_dentry_safe(trans, root,
1174 file->f_dentry); 1174 file->f_dentry);
1175 if (ret == 0) { 1175 if (ret == 0) {
1176 btrfs_sync_log(trans, root); 1176 ret = btrfs_sync_log(trans, root);
1177 btrfs_end_transaction(trans, root); 1177 if (ret == 0)
1178 btrfs_end_transaction(trans, root);
1179 else
1180 btrfs_commit_transaction(trans, root);
1178 } else { 1181 } else {
1179 btrfs_commit_transaction(trans, root); 1182 btrfs_commit_transaction(trans, root);
1180 } 1183 }
@@ -1266,8 +1269,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1266 if (ret > 0) { 1269 if (ret > 0) {
1267 ret = btrfs_commit_transaction(trans, root); 1270 ret = btrfs_commit_transaction(trans, root);
1268 } else { 1271 } else {
1269 btrfs_sync_log(trans, root); 1272 ret = btrfs_sync_log(trans, root);
1270 ret = btrfs_end_transaction(trans, root); 1273 if (ret == 0)
1274 ret = btrfs_end_transaction(trans, root);
1275 else
1276 ret = btrfs_commit_transaction(trans, root);
1271 } 1277 }
1272 mutex_lock(&dentry->d_inode->i_mutex); 1278 mutex_lock(&dentry->d_inode->i_mutex);
1273out: 1279out:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9b4faac50c18..bffd79faffb5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2246,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2246 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2246 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2247 inode, dir->i_ino); 2247 inode, dir->i_ino);
2248 BUG_ON(ret != 0 && ret != -ENOENT); 2248 BUG_ON(ret != 0 && ret != -ENOENT);
2249 if (ret != -ENOENT)
2250 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2251 2249
2252 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2250 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2253 dir, index); 2251 dir, index);
@@ -2280,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2280 trans = btrfs_start_transaction(root, 1); 2278 trans = btrfs_start_transaction(root, 1);
2281 2279
2282 btrfs_set_trans_block_group(trans, dir); 2280 btrfs_set_trans_block_group(trans, dir);
2281
2282 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2283
2283 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2284 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2284 dentry->d_name.name, dentry->d_name.len); 2285 dentry->d_name.name, dentry->d_name.len);
2285 2286
@@ -3042,7 +3043,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3042 bi->disk_i_size = 0; 3043 bi->disk_i_size = 0;
3043 bi->flags = 0; 3044 bi->flags = 0;
3044 bi->index_cnt = (u64)-1; 3045 bi->index_cnt = (u64)-1;
3045 bi->log_dirty_trans = 0; 3046 bi->last_unlink_trans = 0;
3046 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3047 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3047 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3048 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3048 inode->i_mapping, GFP_NOFS); 3049 inode->i_mapping, GFP_NOFS);
@@ -3786,6 +3787,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3786 drop_inode = 1; 3787 drop_inode = 1;
3787 3788
3788 nr = trans->blocks_used; 3789 nr = trans->blocks_used;
3790
3791 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
3789 btrfs_end_transaction_throttle(trans, root); 3792 btrfs_end_transaction_throttle(trans, root);
3790fail: 3793fail:
3791 if (drop_inode) { 3794 if (drop_inode) {
@@ -4666,6 +4669,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4666 4669
4667 trans = btrfs_start_transaction(root, 1); 4670 trans = btrfs_start_transaction(root, 1);
4668 4671
4672 /*
4673 * this is an ugly little race, but the rename is required to make
4674 * sure that if we crash, the inode is either at the old name
4675 * or the new one. pinning the log transaction lets us make sure
4676 * we don't allow a log commit to come in after we unlink the
4677 * name but before we add the new name back in.
4678 */
4679 btrfs_pin_log_trans(root);
4680
4669 btrfs_set_trans_block_group(trans, new_dir); 4681 btrfs_set_trans_block_group(trans, new_dir);
4670 4682
4671 btrfs_inc_nlink(old_dentry->d_inode); 4683 btrfs_inc_nlink(old_dentry->d_inode);
@@ -4673,6 +4685,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4673 new_dir->i_ctime = new_dir->i_mtime = ctime; 4685 new_dir->i_ctime = new_dir->i_mtime = ctime;
4674 old_inode->i_ctime = ctime; 4686 old_inode->i_ctime = ctime;
4675 4687
4688 if (old_dentry->d_parent != new_dentry->d_parent)
4689 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
4690
4676 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, 4691 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4677 old_dentry->d_name.name, 4692 old_dentry->d_name.name,
4678 old_dentry->d_name.len); 4693 old_dentry->d_name.len);
@@ -4704,7 +4719,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4704 if (ret) 4719 if (ret)
4705 goto out_fail; 4720 goto out_fail;
4706 4721
4722 btrfs_log_new_name(trans, old_inode, old_dir,
4723 new_dentry->d_parent);
4707out_fail: 4724out_fail:
4725
4726 /* this btrfs_end_log_trans just allows the current
4727 * log-sub transaction to complete
4728 */
4729 btrfs_end_log_trans(root);
4708 btrfs_end_transaction_throttle(trans, root); 4730 btrfs_end_transaction_throttle(trans, root);
4709out_unlock: 4731out_unlock:
4710 return ret; 4732 return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 405439ca4c45..1b7f04a8f168 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
35#define LOG_INODE_EXISTS 1 35#define LOG_INODE_EXISTS 1
36 36
37/* 37/*
38 * directory trouble cases
39 *
40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
41 * log, we must force a full commit before doing an fsync of the directory
42 * where the unlink was done.
43 * ---> record transid of last unlink/rename per directory
44 *
45 * mkdir foo/some_dir
46 * normal commit
47 * rename foo/some_dir foo2/some_dir
48 * mkdir foo/some_dir
49 * fsync foo/some_dir/some_file
50 *
51 * The fsync above will unlink the original some_dir without recording
52 * it in its new location (foo2). After a crash, some_dir will be gone
53 * unless the fsync of some_file forces a full commit
54 *
55 * 2) we must log any new names for any file or dir that is in the fsync
56 * log. ---> check inode while renaming/linking.
57 *
58 * 2a) we must log any new names for any file or dir during rename
59 * when the directory they are being removed from was logged.
60 * ---> check inode and old parent dir during rename
61 *
62 * 2a is actually the more important variant. Without the extra logging
63 * a crash might unlink the old name without recreating the new one
64 *
65 * 3) after a crash, we must go through any directories with a link count
66 * of zero and redo the rm -rf
67 *
68 * mkdir f1/foo
69 * normal commit
70 * rm -rf f1/foo
71 * fsync(f1)
72 *
73 * The directory foo was fully removed from the FS, but fsync was never
74 * called on foo, only its parent dir (f1). After a crash the rm -rf must
75 * be replayed. This must be able to recurse down the entire
76 * directory tree. The inode link count fixup code takes care of the
77 * ugly details.
78 */
79
80/*
38 * stages for the tree walking. The first 81 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find 82 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes 83 * the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
47#define LOG_WALK_REPLAY_INODES 1 90#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2 91#define LOG_WALK_REPLAY_ALL 2
49 92
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 93static int btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode, 94 struct btrfs_root *root, struct inode *inode,
52 int inode_only); 95 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 96static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, 97 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid); 98 struct btrfs_path *path, u64 objectid);
99static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
100 struct btrfs_root *root,
101 struct btrfs_root *log,
102 struct btrfs_path *path,
103 u64 dirid, int del_all);
56 104
57/* 105/*
58 * tree logging is a special write ahead log used to make sure that 106 * tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
133} 181}
134 182
135/* 183/*
184 * This either makes the current running log transaction wait
185 * until you call btrfs_end_log_trans() or it makes any future
186 * log transactions wait until you call btrfs_end_log_trans()
187 */
188int btrfs_pin_log_trans(struct btrfs_root *root)
189{
190 int ret = -ENOENT;
191
192 mutex_lock(&root->log_mutex);
193 atomic_inc(&root->log_writers);
194 mutex_unlock(&root->log_mutex);
195 return ret;
196}
197
198/*
136 * indicate we're done making changes to the log tree 199 * indicate we're done making changes to the log tree
137 * and wake up anyone waiting to do a sync 200 * and wake up anyone waiting to do a sync
138 */ 201 */
139static int end_log_trans(struct btrfs_root *root) 202int btrfs_end_log_trans(struct btrfs_root *root)
140{ 203{
141 if (atomic_dec_and_test(&root->log_writers)) { 204 if (atomic_dec_and_test(&root->log_writers)) {
142 smp_mb(); 205 smp_mb();
@@ -602,6 +665,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
602 665
603 ret = link_to_fixup_dir(trans, root, path, location.objectid); 666 ret = link_to_fixup_dir(trans, root, path, location.objectid);
604 BUG_ON(ret); 667 BUG_ON(ret);
668
605 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 669 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
606 BUG_ON(ret); 670 BUG_ON(ret);
607 kfree(name); 671 kfree(name);
@@ -803,6 +867,7 @@ conflict_again:
803 victim_name_len)) { 867 victim_name_len)) {
804 btrfs_inc_nlink(inode); 868 btrfs_inc_nlink(inode);
805 btrfs_release_path(root, path); 869 btrfs_release_path(root, path);
870
806 ret = btrfs_unlink_inode(trans, root, dir, 871 ret = btrfs_unlink_inode(trans, root, dir,
807 inode, victim_name, 872 inode, victim_name,
808 victim_name_len); 873 victim_name_len);
@@ -921,13 +986,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
921 key.offset--; 986 key.offset--;
922 btrfs_release_path(root, path); 987 btrfs_release_path(root, path);
923 } 988 }
924 btrfs_free_path(path); 989 btrfs_release_path(root, path);
925 if (nlink != inode->i_nlink) { 990 if (nlink != inode->i_nlink) {
926 inode->i_nlink = nlink; 991 inode->i_nlink = nlink;
927 btrfs_update_inode(trans, root, inode); 992 btrfs_update_inode(trans, root, inode);
928 } 993 }
929 BTRFS_I(inode)->index_cnt = (u64)-1; 994 BTRFS_I(inode)->index_cnt = (u64)-1;
930 995
996 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
997 ret = replay_dir_deletes(trans, root, NULL, path,
998 inode->i_ino, 1);
999 BUG_ON(ret);
1000 }
1001 btrfs_free_path(path);
1002
931 return 0; 1003 return 0;
932} 1004}
933 1005
@@ -970,9 +1042,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
970 1042
971 iput(inode); 1043 iput(inode);
972 1044
973 if (key.offset == 0) 1045 /*
974 break; 1046 * fixup on a directory may create new entries,
975 key.offset--; 1047 * make sure we always look for the highest possible
1048 * offset
1049 */
1050 key.offset = (u64)-1;
976 } 1051 }
977 btrfs_release_path(root, path); 1052 btrfs_release_path(root, path);
978 return 0; 1053 return 0;
@@ -1312,11 +1387,11 @@ again:
1312 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1387 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1313 name_len); 1388 name_len);
1314 log_di = NULL; 1389 log_di = NULL;
1315 if (dir_key->type == BTRFS_DIR_ITEM_KEY) { 1390 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1316 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1391 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1317 dir_key->objectid, 1392 dir_key->objectid,
1318 name, name_len, 0); 1393 name, name_len, 0);
1319 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { 1394 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1320 log_di = btrfs_lookup_dir_index_item(trans, log, 1395 log_di = btrfs_lookup_dir_index_item(trans, log,
1321 log_path, 1396 log_path,
1322 dir_key->objectid, 1397 dir_key->objectid,
@@ -1377,7 +1452,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1377 struct btrfs_root *root, 1452 struct btrfs_root *root,
1378 struct btrfs_root *log, 1453 struct btrfs_root *log,
1379 struct btrfs_path *path, 1454 struct btrfs_path *path,
1380 u64 dirid) 1455 u64 dirid, int del_all)
1381{ 1456{
1382 u64 range_start; 1457 u64 range_start;
1383 u64 range_end; 1458 u64 range_end;
@@ -1407,10 +1482,14 @@ again:
1407 range_start = 0; 1482 range_start = 0;
1408 range_end = 0; 1483 range_end = 0;
1409 while (1) { 1484 while (1) {
1410 ret = find_dir_range(log, path, dirid, key_type, 1485 if (del_all)
1411 &range_start, &range_end); 1486 range_end = (u64)-1;
1412 if (ret != 0) 1487 else {
1413 break; 1488 ret = find_dir_range(log, path, dirid, key_type,
1489 &range_start, &range_end);
1490 if (ret != 0)
1491 break;
1492 }
1414 1493
1415 dir_key.offset = range_start; 1494 dir_key.offset = range_start;
1416 while (1) { 1495 while (1) {
@@ -1436,7 +1515,8 @@ again:
1436 break; 1515 break;
1437 1516
1438 ret = check_item_in_log(trans, root, log, path, 1517 ret = check_item_in_log(trans, root, log, path,
1439 log_path, dir, &found_key); 1518 log_path, dir,
1519 &found_key);
1440 BUG_ON(ret); 1520 BUG_ON(ret);
1441 if (found_key.offset == (u64)-1) 1521 if (found_key.offset == (u64)-1)
1442 break; 1522 break;
@@ -1513,7 +1593,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1513 mode = btrfs_inode_mode(eb, inode_item); 1593 mode = btrfs_inode_mode(eb, inode_item);
1514 if (S_ISDIR(mode)) { 1594 if (S_ISDIR(mode)) {
1515 ret = replay_dir_deletes(wc->trans, 1595 ret = replay_dir_deletes(wc->trans,
1516 root, log, path, key.objectid); 1596 root, log, path, key.objectid, 0);
1517 BUG_ON(ret); 1597 BUG_ON(ret);
1518 } 1598 }
1519 ret = overwrite_item(wc->trans, root, path, 1599 ret = overwrite_item(wc->trans, root, path,
@@ -1850,7 +1930,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
1850 return ret; 1930 return ret;
1851} 1931}
1852 1932
1853static int wait_log_commit(struct btrfs_root *root, unsigned long transid) 1933static int wait_log_commit(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root, unsigned long transid)
1854{ 1935{
1855 DEFINE_WAIT(wait); 1936 DEFINE_WAIT(wait);
1856 int index = transid % 2; 1937 int index = transid % 2;
@@ -1864,9 +1945,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1864 prepare_to_wait(&root->log_commit_wait[index], 1945 prepare_to_wait(&root->log_commit_wait[index],
1865 &wait, TASK_UNINTERRUPTIBLE); 1946 &wait, TASK_UNINTERRUPTIBLE);
1866 mutex_unlock(&root->log_mutex); 1947 mutex_unlock(&root->log_mutex);
1867 if (root->log_transid < transid + 2 && 1948
1949 if (root->fs_info->last_trans_log_full_commit !=
1950 trans->transid && root->log_transid < transid + 2 &&
1868 atomic_read(&root->log_commit[index])) 1951 atomic_read(&root->log_commit[index]))
1869 schedule(); 1952 schedule();
1953
1870 finish_wait(&root->log_commit_wait[index], &wait); 1954 finish_wait(&root->log_commit_wait[index], &wait);
1871 mutex_lock(&root->log_mutex); 1955 mutex_lock(&root->log_mutex);
1872 } while (root->log_transid < transid + 2 && 1956 } while (root->log_transid < transid + 2 &&
@@ -1874,14 +1958,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1874 return 0; 1958 return 0;
1875} 1959}
1876 1960
1877static int wait_for_writer(struct btrfs_root *root) 1961static int wait_for_writer(struct btrfs_trans_handle *trans,
1962 struct btrfs_root *root)
1878{ 1963{
1879 DEFINE_WAIT(wait); 1964 DEFINE_WAIT(wait);
1880 while (atomic_read(&root->log_writers)) { 1965 while (atomic_read(&root->log_writers)) {
1881 prepare_to_wait(&root->log_writer_wait, 1966 prepare_to_wait(&root->log_writer_wait,
1882 &wait, TASK_UNINTERRUPTIBLE); 1967 &wait, TASK_UNINTERRUPTIBLE);
1883 mutex_unlock(&root->log_mutex); 1968 mutex_unlock(&root->log_mutex);
1884 if (atomic_read(&root->log_writers)) 1969 if (root->fs_info->last_trans_log_full_commit !=
1970 trans->transid && atomic_read(&root->log_writers))
1885 schedule(); 1971 schedule();
1886 mutex_lock(&root->log_mutex); 1972 mutex_lock(&root->log_mutex);
1887 finish_wait(&root->log_writer_wait, &wait); 1973 finish_wait(&root->log_writer_wait, &wait);
@@ -1892,7 +1978,14 @@ static int wait_for_writer(struct btrfs_root *root)
1892/* 1978/*
1893 * btrfs_sync_log sends a given tree log down to the disk and 1979 * btrfs_sync_log sends a given tree log down to the disk and
1894 * updates the super blocks to record it. When this call is done, 1980 * updates the super blocks to record it. When this call is done,
1895 * you know that any inodes previously logged are safely on disk 1981 * you know that any inodes previously logged are safely on disk only
1982 * if it returns 0.
1983 *
1984 * Any other return value means you need to call btrfs_commit_transaction.
1985 * Some of the edge cases for fsyncing directories that have had unlinks
1986 * or renames done in the past mean that sometimes the only safe
1987 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
1988 * that has happened.
1896 */ 1989 */
1897int btrfs_sync_log(struct btrfs_trans_handle *trans, 1990int btrfs_sync_log(struct btrfs_trans_handle *trans,
1898 struct btrfs_root *root) 1991 struct btrfs_root *root)
@@ -1906,7 +1999,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1906 mutex_lock(&root->log_mutex); 1999 mutex_lock(&root->log_mutex);
1907 index1 = root->log_transid % 2; 2000 index1 = root->log_transid % 2;
1908 if (atomic_read(&root->log_commit[index1])) { 2001 if (atomic_read(&root->log_commit[index1])) {
1909 wait_log_commit(root, root->log_transid); 2002 wait_log_commit(trans, root, root->log_transid);
1910 mutex_unlock(&root->log_mutex); 2003 mutex_unlock(&root->log_mutex);
1911 return 0; 2004 return 0;
1912 } 2005 }
@@ -1914,18 +2007,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1914 2007
1915 /* wait for previous tree log sync to complete */ 2008 /* wait for previous tree log sync to complete */
1916 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2009 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1917 wait_log_commit(root, root->log_transid - 1); 2010 wait_log_commit(trans, root, root->log_transid - 1);
1918 2011
1919 while (1) { 2012 while (1) {
1920 unsigned long batch = root->log_batch; 2013 unsigned long batch = root->log_batch;
1921 mutex_unlock(&root->log_mutex); 2014 mutex_unlock(&root->log_mutex);
1922 schedule_timeout_uninterruptible(1); 2015 schedule_timeout_uninterruptible(1);
1923 mutex_lock(&root->log_mutex); 2016 mutex_lock(&root->log_mutex);
1924 wait_for_writer(root); 2017
2018 wait_for_writer(trans, root);
1925 if (batch == root->log_batch) 2019 if (batch == root->log_batch)
1926 break; 2020 break;
1927 } 2021 }
1928 2022
2023 /* bail out if we need to do a full commit */
2024 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2025 ret = -EAGAIN;
2026 mutex_unlock(&root->log_mutex);
2027 goto out;
2028 }
2029
1929 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2030 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1930 BUG_ON(ret); 2031 BUG_ON(ret);
1931 2032
@@ -1961,16 +2062,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1961 2062
1962 index2 = log_root_tree->log_transid % 2; 2063 index2 = log_root_tree->log_transid % 2;
1963 if (atomic_read(&log_root_tree->log_commit[index2])) { 2064 if (atomic_read(&log_root_tree->log_commit[index2])) {
1964 wait_log_commit(log_root_tree, log_root_tree->log_transid); 2065 wait_log_commit(trans, log_root_tree,
2066 log_root_tree->log_transid);
1965 mutex_unlock(&log_root_tree->log_mutex); 2067 mutex_unlock(&log_root_tree->log_mutex);
1966 goto out; 2068 goto out;
1967 } 2069 }
1968 atomic_set(&log_root_tree->log_commit[index2], 1); 2070 atomic_set(&log_root_tree->log_commit[index2], 1);
1969 2071
1970 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) 2072 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
1971 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); 2073 wait_log_commit(trans, log_root_tree,
2074 log_root_tree->log_transid - 1);
2075 }
2076
2077 wait_for_writer(trans, log_root_tree);
1972 2078
1973 wait_for_writer(log_root_tree); 2079 /*
2080 * now that we've moved on to the tree of log tree roots,
2081 * check the full commit flag again
2082 */
2083 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2084 mutex_unlock(&log_root_tree->log_mutex);
2085 ret = -EAGAIN;
2086 goto out_wake_log_root;
2087 }
1974 2088
1975 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2089 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1976 &log_root_tree->dirty_log_pages); 2090 &log_root_tree->dirty_log_pages);
@@ -1995,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1995 * in and cause problems either. 2109 * in and cause problems either.
1996 */ 2110 */
1997 write_ctree_super(trans, root->fs_info->tree_root, 2); 2111 write_ctree_super(trans, root->fs_info->tree_root, 2);
2112 ret = 0;
1998 2113
2114out_wake_log_root:
1999 atomic_set(&log_root_tree->log_commit[index2], 0); 2115 atomic_set(&log_root_tree->log_commit[index2], 0);
2000 smp_mb(); 2116 smp_mb();
2001 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2117 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -2008,7 +2124,8 @@ out:
2008 return 0; 2124 return 0;
2009} 2125}
2010 2126
2011/* * free all the extents used by the tree log. This should be called 2127/*
2128 * free all the extents used by the tree log. This should be called
2012 * at commit time of the full transaction 2129 * at commit time of the full transaction
2013 */ 2130 */
2014int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2131int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2142,7 +2259,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2142 2259
2143 btrfs_free_path(path); 2260 btrfs_free_path(path);
2144 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2261 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2145 end_log_trans(root); 2262 btrfs_end_log_trans(root);
2146 2263
2147 return 0; 2264 return 0;
2148} 2265}
@@ -2169,7 +2286,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2169 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2286 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2170 dirid, &index); 2287 dirid, &index);
2171 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2288 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2172 end_log_trans(root); 2289 btrfs_end_log_trans(root);
2173 2290
2174 return ret; 2291 return ret;
2175} 2292}
@@ -2569,7 +2686,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2569 * 2686 *
2570 * This handles both files and directories. 2687 * This handles both files and directories.
2571 */ 2688 */
2572static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 2689static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2573 struct btrfs_root *root, struct inode *inode, 2690 struct btrfs_root *root, struct inode *inode,
2574 int inode_only) 2691 int inode_only)
2575{ 2692{
@@ -2595,28 +2712,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2595 min_key.offset = 0; 2712 min_key.offset = 0;
2596 2713
2597 max_key.objectid = inode->i_ino; 2714 max_key.objectid = inode->i_ino;
2715
2716 /* today the code can only do partial logging of directories */
2717 if (!S_ISDIR(inode->i_mode))
2718 inode_only = LOG_INODE_ALL;
2719
2598 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2720 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2599 max_key.type = BTRFS_XATTR_ITEM_KEY; 2721 max_key.type = BTRFS_XATTR_ITEM_KEY;
2600 else 2722 else
2601 max_key.type = (u8)-1; 2723 max_key.type = (u8)-1;
2602 max_key.offset = (u64)-1; 2724 max_key.offset = (u64)-1;
2603 2725
2604 /*
2605 * if this inode has already been logged and we're in inode_only
2606 * mode, we don't want to delete the things that have already
2607 * been written to the log.
2608 *
2609 * But, if the inode has been through an inode_only log,
2610 * the logged_trans field is not set. This allows us to catch
2611 * any new names for this inode in the backrefs by logging it
2612 * again
2613 */
2614 if (inode_only == LOG_INODE_EXISTS &&
2615 BTRFS_I(inode)->logged_trans == trans->transid) {
2616 btrfs_free_path(path);
2617 btrfs_free_path(dst_path);
2618 goto out;
2619 }
2620 mutex_lock(&BTRFS_I(inode)->log_mutex); 2726 mutex_lock(&BTRFS_I(inode)->log_mutex);
2621 2727
2622 /* 2728 /*
@@ -2703,7 +2809,6 @@ next_slot:
2703 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2809 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2704 btrfs_release_path(root, path); 2810 btrfs_release_path(root, path);
2705 btrfs_release_path(log, dst_path); 2811 btrfs_release_path(log, dst_path);
2706 BTRFS_I(inode)->log_dirty_trans = 0;
2707 ret = log_directory_changes(trans, root, inode, path, dst_path); 2812 ret = log_directory_changes(trans, root, inode, path, dst_path);
2708 BUG_ON(ret); 2813 BUG_ON(ret);
2709 } 2814 }
@@ -2712,19 +2817,58 @@ next_slot:
2712 2817
2713 btrfs_free_path(path); 2818 btrfs_free_path(path);
2714 btrfs_free_path(dst_path); 2819 btrfs_free_path(dst_path);
2715out:
2716 return 0; 2820 return 0;
2717} 2821}
2718 2822
2719int btrfs_log_inode(struct btrfs_trans_handle *trans, 2823/*
2720 struct btrfs_root *root, struct inode *inode, 2824 * follow the dentry parent pointers up the chain and see if any
2721 int inode_only) 2825 * of the directories in it require a full commit before they can
2826 * be logged. Returns zero if nothing special needs to be done or 1 if
2827 * a full commit is required.
2828 */
2829static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2830 struct inode *inode,
2831 struct dentry *parent,
2832 struct super_block *sb,
2833 u64 last_committed)
2722{ 2834{
2723 int ret; 2835 int ret = 0;
2836 struct btrfs_root *root;
2724 2837
2725 start_log_trans(trans, root); 2838 if (!S_ISDIR(inode->i_mode)) {
2726 ret = __btrfs_log_inode(trans, root, inode, inode_only); 2839 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2727 end_log_trans(root); 2840 goto out;
2841 inode = parent->d_inode;
2842 }
2843
2844 while (1) {
2845 BTRFS_I(inode)->logged_trans = trans->transid;
2846 smp_mb();
2847
2848 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
2849 root = BTRFS_I(inode)->root;
2850
2851 /*
2852 * make sure any commits to the log are forced
2853 * to be full commits
2854 */
2855 root->fs_info->last_trans_log_full_commit =
2856 trans->transid;
2857 ret = 1;
2858 break;
2859 }
2860
2861 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2862 break;
2863
2864 if (parent == sb->s_root)
2865 break;
2866
2867 parent = parent->d_parent;
2868 inode = parent->d_inode;
2869
2870 }
2871out:
2728 return ret; 2872 return ret;
2729} 2873}
2730 2874
@@ -2734,31 +2878,53 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
2734 * only logging is done of any parent directories that are older than 2878 * only logging is done of any parent directories that are older than
2735 * the last committed transaction 2879 * the last committed transaction
2736 */ 2880 */
2737int btrfs_log_dentry(struct btrfs_trans_handle *trans, 2881int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2738 struct btrfs_root *root, struct dentry *dentry) 2882 struct btrfs_root *root, struct inode *inode,
2883 struct dentry *parent, int exists_only)
2739{ 2884{
2740 int inode_only = LOG_INODE_ALL; 2885 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2741 struct super_block *sb; 2886 struct super_block *sb;
2742 int ret; 2887 int ret = 0;
2888 u64 last_committed = root->fs_info->last_trans_committed;
2889
2890 sb = inode->i_sb;
2891
2892 if (root->fs_info->last_trans_log_full_commit >
2893 root->fs_info->last_trans_committed) {
2894 ret = 1;
2895 goto end_no_trans;
2896 }
2897
2898 ret = check_parent_dirs_for_sync(trans, inode, parent,
2899 sb, last_committed);
2900 if (ret)
2901 goto end_no_trans;
2743 2902
2744 start_log_trans(trans, root); 2903 start_log_trans(trans, root);
2745 sb = dentry->d_inode->i_sb;
2746 while (1) {
2747 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2748 inode_only);
2749 BUG_ON(ret);
2750 inode_only = LOG_INODE_EXISTS;
2751 2904
2752 dentry = dentry->d_parent; 2905 ret = btrfs_log_inode(trans, root, inode, inode_only);
2753 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) 2906 BUG_ON(ret);
2907 inode_only = LOG_INODE_EXISTS;
2908
2909 while (1) {
2910 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2754 break; 2911 break;
2755 2912
2756 if (BTRFS_I(dentry->d_inode)->generation <= 2913 inode = parent->d_inode;
2757 root->fs_info->last_trans_committed) 2914 if (BTRFS_I(inode)->generation >
2915 root->fs_info->last_trans_committed) {
2916 ret = btrfs_log_inode(trans, root, inode, inode_only);
2917 BUG_ON(ret);
2918 }
2919 if (parent == sb->s_root)
2758 break; 2920 break;
2921
2922 parent = parent->d_parent;
2759 } 2923 }
2760 end_log_trans(root); 2924 ret = 0;
2761 return 0; 2925 btrfs_end_log_trans(root);
2926end_no_trans:
2927 return ret;
2762} 2928}
2763 2929
2764/* 2930/*
@@ -2770,12 +2936,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2770int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2936int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2771 struct btrfs_root *root, struct dentry *dentry) 2937 struct btrfs_root *root, struct dentry *dentry)
2772{ 2938{
2773 u64 gen; 2939 return btrfs_log_inode_parent(trans, root, dentry->d_inode,
2774 gen = root->fs_info->last_trans_new_blockgroup; 2940 dentry->d_parent, 0);
2775 if (gen > root->fs_info->last_trans_committed)
2776 return 1;
2777 else
2778 return btrfs_log_dentry(trans, root, dentry);
2779} 2941}
2780 2942
2781/* 2943/*
@@ -2894,3 +3056,74 @@ again:
2894 kfree(log_root_tree); 3056 kfree(log_root_tree);
2895 return 0; 3057 return 0;
2896} 3058}
3059
3060/*
3061 * there are some corner cases where we want to force a full
3062 * commit instead of allowing a directory to be logged.
3063 *
3064 * They revolve around files that were unlinked from the directory, and
3065 * this function updates the parent directory so that a full commit is
3066 * properly done if it is fsync'd later after the unlinks are done.
3067 */
3068void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3069 struct inode *dir, struct inode *inode,
3070 int for_rename)
3071{
3072 /*
3073 * if this directory was already logged any new
3074 * names for this file/dir will get recorded
3075 */
3076 smp_mb();
3077 if (BTRFS_I(dir)->logged_trans == trans->transid)
3078 return;
3079
3080 /*
3081 * if the inode we're about to unlink was logged,
3082 * the log will be properly updated for any new names
3083 */
3084 if (BTRFS_I(inode)->logged_trans == trans->transid)
3085 return;
3086
3087 /*
3088 * when renaming files across directories, if the directory
3089 * that we're unlinking from gets fsync'd later on, there's
3090 * no way to find the destination directory later and fsync it
3091 * properly. So, we have to be conservative and force commits
3092 * so the new name gets discovered.
3093 */
3094 if (for_rename)
3095 goto record;
3096
3097 /* we can safely do the unlink without any special recording */
3098 return;
3099
3100record:
3101 BTRFS_I(dir)->last_unlink_trans = trans->transid;
3102}
3103
3104/*
3105 * Call this after adding a new name for a file and it will properly
3106 * update the log to reflect the new name.
3107 *
3108 * It will return zero if all goes well, and it will return 1 if a
3109 * full transaction commit is required.
3110 */
3111int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3112 struct inode *inode, struct inode *old_dir,
3113 struct dentry *parent)
3114{
3115 struct btrfs_root * root = BTRFS_I(inode)->root;
3116
3117 /*
3117 * if this inode hasn't been logged and the directory we're renaming it
3119 * from hasn't been logged, we don't need to log it
3120 */
3121 if (BTRFS_I(inode)->logged_trans <=
3122 root->fs_info->last_trans_committed &&
3123 (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3124 root->fs_info->last_trans_committed))
3125 return 0;
3126
3127 return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3128}
3129
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed02..d09c7609e16b 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
22int btrfs_sync_log(struct btrfs_trans_handle *trans, 22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root); 23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root); 25int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 26int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry); 27 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 28int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, 29 struct btrfs_root *root,
35 const char *name, int name_len, 30 const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root, 33 struct btrfs_root *root,
39 const char *name, int name_len, 34 const char *name, int name_len,
40 struct inode *inode, u64 dirid); 35 struct inode *inode, u64 dirid);
36int btrfs_join_running_log_trans(struct btrfs_root *root);
37int btrfs_end_log_trans(struct btrfs_root *root);
38int btrfs_pin_log_trans(struct btrfs_root *root);
39int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
40 struct btrfs_root *root, struct inode *inode,
41 struct dentry *parent, int exists_only);
42void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
43 struct inode *dir, struct inode *inode,
44 int for_rename);
45int btrfs_log_new_name(struct btrfs_trans_handle *trans,
46 struct inode *inode, struct inode *old_dir,
47 struct dentry *parent);
41#endif 48#endif