aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c444
1 files changed, 365 insertions, 79 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c462fbd60fa..fc9b87a7975b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
35#define LOG_INODE_EXISTS 1 35#define LOG_INODE_EXISTS 1
36 36
37/* 37/*
38 * directory trouble cases
39 *
40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
41 * log, we must force a full commit before doing an fsync of the directory
42 * where the unlink was done.
43 * ---> record transid of last unlink/rename per directory
44 *
45 * mkdir foo/some_dir
46 * normal commit
47 * rename foo/some_dir foo2/some_dir
48 * mkdir foo/some_dir
49 * fsync foo/some_dir/some_file
50 *
51 * The fsync above will unlink the original some_dir without recording
52 * it in its new location (foo2). After a crash, some_dir will be gone
53 * unless the fsync of some_file forces a full commit
54 *
55 * 2) we must log any new names for any file or dir that is in the fsync
56 * log. ---> check inode while renaming/linking.
57 *
58 * 2a) we must log any new names for any file or dir during rename
59 * when the directory they are being removed from was logged.
60 * ---> check inode and old parent dir during rename
61 *
62 * 2a is actually the more important variant. Without the extra logging
63 * a crash might unlink the old name without recreating the new one
64 *
65 * 3) after a crash, we must go through any directories with a link count
66 * of zero and redo the rm -rf
67 *
68 * mkdir f1/foo
69 * normal commit
70 * rm -rf f1/foo
71 * fsync(f1)
72 *
73 * The directory f1/foo was fully removed from the FS, but fsync was never
74 * called on f1/foo, only its parent dir. After a crash the rm -rf must
75 * be replayed. This must be able to recurse down the entire
76 * directory tree. The inode link count fixup code takes care of the
77 * ugly details.
78 */
79
80/*
38 * stages for the tree walking. The first 81 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find 82 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes 83 * the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
47#define LOG_WALK_REPLAY_INODES 1 90#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2 91#define LOG_WALK_REPLAY_ALL 2
49 92
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 93static int btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode, 94 struct btrfs_root *root, struct inode *inode,
52 int inode_only); 95 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 96static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, 97 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid); 98 struct btrfs_path *path, u64 objectid);
99static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
100 struct btrfs_root *root,
101 struct btrfs_root *log,
102 struct btrfs_path *path,
103 u64 dirid, int del_all);
56 104
57/* 105/*
58 * tree logging is a special write ahead log used to make sure that 106 * tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
133} 181}
134 182
135/* 183/*
184 * This either makes the current running log transaction wait
185 * until you call btrfs_end_log_trans() or it makes any future
186 * log transactions wait until you call btrfs_end_log_trans()
187 */
188int btrfs_pin_log_trans(struct btrfs_root *root)
189{
190 int ret = -ENOENT;
191
192 mutex_lock(&root->log_mutex);
193 atomic_inc(&root->log_writers);
194 mutex_unlock(&root->log_mutex);
195 return ret;
196}
197
198/*
136 * indicate we're done making changes to the log tree 199 * indicate we're done making changes to the log tree
137 * and wake up anyone waiting to do a sync 200 * and wake up anyone waiting to do a sync
138 */ 201 */
139static int end_log_trans(struct btrfs_root *root) 202int btrfs_end_log_trans(struct btrfs_root *root)
140{ 203{
141 if (atomic_dec_and_test(&root->log_writers)) { 204 if (atomic_dec_and_test(&root->log_writers)) {
142 smp_mb(); 205 smp_mb();
@@ -203,7 +266,6 @@ static int process_one_buffer(struct btrfs_root *log,
203 mutex_lock(&log->fs_info->pinned_mutex); 266 mutex_lock(&log->fs_info->pinned_mutex);
204 btrfs_update_pinned_extents(log->fs_info->extent_root, 267 btrfs_update_pinned_extents(log->fs_info->extent_root,
205 eb->start, eb->len, 1); 268 eb->start, eb->len, 1);
206 mutex_unlock(&log->fs_info->pinned_mutex);
207 } 269 }
208 270
209 if (btrfs_buffer_uptodate(eb, gen)) { 271 if (btrfs_buffer_uptodate(eb, gen)) {
@@ -603,6 +665,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
603 665
604 ret = link_to_fixup_dir(trans, root, path, location.objectid); 666 ret = link_to_fixup_dir(trans, root, path, location.objectid);
605 BUG_ON(ret); 667 BUG_ON(ret);
668
606 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 669 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
607 BUG_ON(ret); 670 BUG_ON(ret);
608 kfree(name); 671 kfree(name);
@@ -804,6 +867,7 @@ conflict_again:
804 victim_name_len)) { 867 victim_name_len)) {
805 btrfs_inc_nlink(inode); 868 btrfs_inc_nlink(inode);
806 btrfs_release_path(root, path); 869 btrfs_release_path(root, path);
870
807 ret = btrfs_unlink_inode(trans, root, dir, 871 ret = btrfs_unlink_inode(trans, root, dir,
808 inode, victim_name, 872 inode, victim_name,
809 victim_name_len); 873 victim_name_len);
@@ -922,13 +986,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
922 key.offset--; 986 key.offset--;
923 btrfs_release_path(root, path); 987 btrfs_release_path(root, path);
924 } 988 }
925 btrfs_free_path(path); 989 btrfs_release_path(root, path);
926 if (nlink != inode->i_nlink) { 990 if (nlink != inode->i_nlink) {
927 inode->i_nlink = nlink; 991 inode->i_nlink = nlink;
928 btrfs_update_inode(trans, root, inode); 992 btrfs_update_inode(trans, root, inode);
929 } 993 }
930 BTRFS_I(inode)->index_cnt = (u64)-1; 994 BTRFS_I(inode)->index_cnt = (u64)-1;
931 995
996 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
997 ret = replay_dir_deletes(trans, root, NULL, path,
998 inode->i_ino, 1);
999 BUG_ON(ret);
1000 }
1001 btrfs_free_path(path);
1002
932 return 0; 1003 return 0;
933} 1004}
934 1005
@@ -971,9 +1042,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
971 1042
972 iput(inode); 1043 iput(inode);
973 1044
974 if (key.offset == 0) 1045 /*
975 break; 1046 * fixup on a directory may create new entries,
976 key.offset--; 1047 * make sure we always look for the highest possible
1048 * offset
1049 */
1050 key.offset = (u64)-1;
977 } 1051 }
978 btrfs_release_path(root, path); 1052 btrfs_release_path(root, path);
979 return 0; 1053 return 0;
@@ -1313,11 +1387,11 @@ again:
1313 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1387 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1314 name_len); 1388 name_len);
1315 log_di = NULL; 1389 log_di = NULL;
1316 if (dir_key->type == BTRFS_DIR_ITEM_KEY) { 1390 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1317 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1391 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1318 dir_key->objectid, 1392 dir_key->objectid,
1319 name, name_len, 0); 1393 name, name_len, 0);
1320 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { 1394 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1321 log_di = btrfs_lookup_dir_index_item(trans, log, 1395 log_di = btrfs_lookup_dir_index_item(trans, log,
1322 log_path, 1396 log_path,
1323 dir_key->objectid, 1397 dir_key->objectid,
@@ -1378,7 +1452,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1378 struct btrfs_root *root, 1452 struct btrfs_root *root,
1379 struct btrfs_root *log, 1453 struct btrfs_root *log,
1380 struct btrfs_path *path, 1454 struct btrfs_path *path,
1381 u64 dirid) 1455 u64 dirid, int del_all)
1382{ 1456{
1383 u64 range_start; 1457 u64 range_start;
1384 u64 range_end; 1458 u64 range_end;
@@ -1408,10 +1482,14 @@ again:
1408 range_start = 0; 1482 range_start = 0;
1409 range_end = 0; 1483 range_end = 0;
1410 while (1) { 1484 while (1) {
1411 ret = find_dir_range(log, path, dirid, key_type, 1485 if (del_all)
1412 &range_start, &range_end); 1486 range_end = (u64)-1;
1413 if (ret != 0) 1487 else {
1414 break; 1488 ret = find_dir_range(log, path, dirid, key_type,
1489 &range_start, &range_end);
1490 if (ret != 0)
1491 break;
1492 }
1415 1493
1416 dir_key.offset = range_start; 1494 dir_key.offset = range_start;
1417 while (1) { 1495 while (1) {
@@ -1437,7 +1515,8 @@ again:
1437 break; 1515 break;
1438 1516
1439 ret = check_item_in_log(trans, root, log, path, 1517 ret = check_item_in_log(trans, root, log, path,
1440 log_path, dir, &found_key); 1518 log_path, dir,
1519 &found_key);
1441 BUG_ON(ret); 1520 BUG_ON(ret);
1442 if (found_key.offset == (u64)-1) 1521 if (found_key.offset == (u64)-1)
1443 break; 1522 break;
@@ -1514,7 +1593,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1514 mode = btrfs_inode_mode(eb, inode_item); 1593 mode = btrfs_inode_mode(eb, inode_item);
1515 if (S_ISDIR(mode)) { 1594 if (S_ISDIR(mode)) {
1516 ret = replay_dir_deletes(wc->trans, 1595 ret = replay_dir_deletes(wc->trans,
1517 root, log, path, key.objectid); 1596 root, log, path, key.objectid, 0);
1518 BUG_ON(ret); 1597 BUG_ON(ret);
1519 } 1598 }
1520 ret = overwrite_item(wc->trans, root, path, 1599 ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1612,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1533 root, inode, inode->i_size, 1612 root, inode, inode->i_size,
1534 BTRFS_EXTENT_DATA_KEY); 1613 BTRFS_EXTENT_DATA_KEY);
1535 BUG_ON(ret); 1614 BUG_ON(ret);
1615
1616 /* if the nlink count is zero here, the iput
1617 * will free the inode. We bump it to make
1618 * sure it doesn't get freed until the link
1619 * count fixup is done
1620 */
1621 if (inode->i_nlink == 0) {
1622 btrfs_inc_nlink(inode);
1623 btrfs_update_inode(wc->trans,
1624 root, inode);
1625 }
1536 iput(inode); 1626 iput(inode);
1537 } 1627 }
1538 ret = link_to_fixup_dir(wc->trans, root, 1628 ret = link_to_fixup_dir(wc->trans, root,
@@ -1840,7 +1930,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
1840 return ret; 1930 return ret;
1841} 1931}
1842 1932
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid) 1933static int wait_log_commit(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root, unsigned long transid)
1844{ 1935{
1845 DEFINE_WAIT(wait); 1936 DEFINE_WAIT(wait);
1846 int index = transid % 2; 1937 int index = transid % 2;
@@ -1854,9 +1945,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1854 prepare_to_wait(&root->log_commit_wait[index], 1945 prepare_to_wait(&root->log_commit_wait[index],
1855 &wait, TASK_UNINTERRUPTIBLE); 1946 &wait, TASK_UNINTERRUPTIBLE);
1856 mutex_unlock(&root->log_mutex); 1947 mutex_unlock(&root->log_mutex);
1857 if (root->log_transid < transid + 2 && 1948
1949 if (root->fs_info->last_trans_log_full_commit !=
1950 trans->transid && root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index])) 1951 atomic_read(&root->log_commit[index]))
1859 schedule(); 1952 schedule();
1953
1860 finish_wait(&root->log_commit_wait[index], &wait); 1954 finish_wait(&root->log_commit_wait[index], &wait);
1861 mutex_lock(&root->log_mutex); 1955 mutex_lock(&root->log_mutex);
1862 } while (root->log_transid < transid + 2 && 1956 } while (root->log_transid < transid + 2 &&
@@ -1864,14 +1958,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1864 return 0; 1958 return 0;
1865} 1959}
1866 1960
1867static int wait_for_writer(struct btrfs_root *root) 1961static int wait_for_writer(struct btrfs_trans_handle *trans,
1962 struct btrfs_root *root)
1868{ 1963{
1869 DEFINE_WAIT(wait); 1964 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) { 1965 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait, 1966 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE); 1967 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex); 1968 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers)) 1969 if (root->fs_info->last_trans_log_full_commit !=
1970 trans->transid && atomic_read(&root->log_writers))
1875 schedule(); 1971 schedule();
1876 mutex_lock(&root->log_mutex); 1972 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait); 1973 finish_wait(&root->log_writer_wait, &wait);
@@ -1882,7 +1978,14 @@ static int wait_for_writer(struct btrfs_root *root)
1882/* 1978/*
1883 * btrfs_sync_log sends a given tree log down to the disk and 1979 * btrfs_sync_log sends a given tree log down to the disk and
1884 * updates the super blocks to record it. When this call is done, 1980 * updates the super blocks to record it. When this call is done,
1885 * you know that any inodes previously logged are safely on disk 1981 * you know that any inodes previously logged are safely on disk only
1982 * if it returns 0.
1983 *
1984 * Any other return value means you need to call btrfs_commit_transaction.
1985 * Some of the edge cases for fsyncing directories that have had unlinks
1986 * or renames done in the past mean that sometimes the only safe
1987 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
1988 * that has happened.
1886 */ 1989 */
1887int btrfs_sync_log(struct btrfs_trans_handle *trans, 1990int btrfs_sync_log(struct btrfs_trans_handle *trans,
1888 struct btrfs_root *root) 1991 struct btrfs_root *root)
@@ -1896,7 +1999,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1896 mutex_lock(&root->log_mutex); 1999 mutex_lock(&root->log_mutex);
1897 index1 = root->log_transid % 2; 2000 index1 = root->log_transid % 2;
1898 if (atomic_read(&root->log_commit[index1])) { 2001 if (atomic_read(&root->log_commit[index1])) {
1899 wait_log_commit(root, root->log_transid); 2002 wait_log_commit(trans, root, root->log_transid);
1900 mutex_unlock(&root->log_mutex); 2003 mutex_unlock(&root->log_mutex);
1901 return 0; 2004 return 0;
1902 } 2005 }
@@ -1904,18 +2007,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1904 2007
1905 /* wait for previous tree log sync to complete */ 2008 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2009 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1); 2010 wait_log_commit(trans, root, root->log_transid - 1);
1908 2011
1909 while (1) { 2012 while (1) {
1910 unsigned long batch = root->log_batch; 2013 unsigned long batch = root->log_batch;
1911 mutex_unlock(&root->log_mutex); 2014 mutex_unlock(&root->log_mutex);
1912 schedule_timeout_uninterruptible(1); 2015 schedule_timeout_uninterruptible(1);
1913 mutex_lock(&root->log_mutex); 2016 mutex_lock(&root->log_mutex);
1914 wait_for_writer(root); 2017
2018 wait_for_writer(trans, root);
1915 if (batch == root->log_batch) 2019 if (batch == root->log_batch)
1916 break; 2020 break;
1917 } 2021 }
1918 2022
2023 /* bail out if we need to do a full commit */
2024 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2025 ret = -EAGAIN;
2026 mutex_unlock(&root->log_mutex);
2027 goto out;
2028 }
2029
1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2030 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1920 BUG_ON(ret); 2031 BUG_ON(ret);
1921 2032
@@ -1951,16 +2062,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1951 2062
1952 index2 = log_root_tree->log_transid % 2; 2063 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) { 2064 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid); 2065 wait_log_commit(trans, log_root_tree,
2066 log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex); 2067 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out; 2068 goto out;
1957 } 2069 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1); 2070 atomic_set(&log_root_tree->log_commit[index2], 1);
1959 2071
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) 2072 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); 2073 wait_log_commit(trans, log_root_tree,
2074 log_root_tree->log_transid - 1);
2075 }
2076
2077 wait_for_writer(trans, log_root_tree);
1962 2078
1963 wait_for_writer(log_root_tree); 2079 /*
2080 * now that we've moved on to the tree of log tree roots,
2081 * check the full commit flag again
2082 */
2083 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2084 mutex_unlock(&log_root_tree->log_mutex);
2085 ret = -EAGAIN;
2086 goto out_wake_log_root;
2087 }
1964 2088
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2089 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages); 2090 &log_root_tree->dirty_log_pages);
@@ -1985,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1985 * in and cause problems either. 2109 * in and cause problems either.
1986 */ 2110 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2); 2111 write_ctree_super(trans, root->fs_info->tree_root, 2);
2112 ret = 0;
1988 2113
2114out_wake_log_root:
1989 atomic_set(&log_root_tree->log_commit[index2], 0); 2115 atomic_set(&log_root_tree->log_commit[index2], 0);
1990 smp_mb(); 2116 smp_mb();
1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2117 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -1998,7 +2124,8 @@ out:
1998 return 0; 2124 return 0;
1999} 2125}
2000 2126
2001/* * free all the extents used by the tree log. This should be called 2127/*
2128 * free all the extents used by the tree log. This should be called
2002 * at commit time of the full transaction 2129 * at commit time of the full transaction
2003 */ 2130 */
2004int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2131int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2259,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2132 2259
2133 btrfs_free_path(path); 2260 btrfs_free_path(path);
2134 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2261 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2135 end_log_trans(root); 2262 btrfs_end_log_trans(root);
2136 2263
2137 return 0; 2264 return 0;
2138} 2265}
@@ -2159,7 +2286,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2159 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2286 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2160 dirid, &index); 2287 dirid, &index);
2161 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2288 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2162 end_log_trans(root); 2289 btrfs_end_log_trans(root);
2163 2290
2164 return ret; 2291 return ret;
2165} 2292}
@@ -2559,7 +2686,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2559 * 2686 *
2560 * This handles both files and directories. 2687 * This handles both files and directories.
2561 */ 2688 */
2562static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 2689static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root, struct inode *inode, 2690 struct btrfs_root *root, struct inode *inode,
2564 int inode_only) 2691 int inode_only)
2565{ 2692{
@@ -2585,28 +2712,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2585 min_key.offset = 0; 2712 min_key.offset = 0;
2586 2713
2587 max_key.objectid = inode->i_ino; 2714 max_key.objectid = inode->i_ino;
2715
2716 /* today the code can only do partial logging of directories */
2717 if (!S_ISDIR(inode->i_mode))
2718 inode_only = LOG_INODE_ALL;
2719
2588 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2720 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2589 max_key.type = BTRFS_XATTR_ITEM_KEY; 2721 max_key.type = BTRFS_XATTR_ITEM_KEY;
2590 else 2722 else
2591 max_key.type = (u8)-1; 2723 max_key.type = (u8)-1;
2592 max_key.offset = (u64)-1; 2724 max_key.offset = (u64)-1;
2593 2725
2594 /*
2595 * if this inode has already been logged and we're in inode_only
2596 * mode, we don't want to delete the things that have already
2597 * been written to the log.
2598 *
2599 * But, if the inode has been through an inode_only log,
2600 * the logged_trans field is not set. This allows us to catch
2601 * any new names for this inode in the backrefs by logging it
2602 * again
2603 */
2604 if (inode_only == LOG_INODE_EXISTS &&
2605 BTRFS_I(inode)->logged_trans == trans->transid) {
2606 btrfs_free_path(path);
2607 btrfs_free_path(dst_path);
2608 goto out;
2609 }
2610 mutex_lock(&BTRFS_I(inode)->log_mutex); 2726 mutex_lock(&BTRFS_I(inode)->log_mutex);
2611 2727
2612 /* 2728 /*
@@ -2693,7 +2809,6 @@ next_slot:
2693 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2809 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2694 btrfs_release_path(root, path); 2810 btrfs_release_path(root, path);
2695 btrfs_release_path(log, dst_path); 2811 btrfs_release_path(log, dst_path);
2696 BTRFS_I(inode)->log_dirty_trans = 0;
2697 ret = log_directory_changes(trans, root, inode, path, dst_path); 2812 ret = log_directory_changes(trans, root, inode, path, dst_path);
2698 BUG_ON(ret); 2813 BUG_ON(ret);
2699 } 2814 }
@@ -2702,19 +2817,69 @@ next_slot:
2702 2817
2703 btrfs_free_path(path); 2818 btrfs_free_path(path);
2704 btrfs_free_path(dst_path); 2819 btrfs_free_path(dst_path);
2705out:
2706 return 0; 2820 return 0;
2707} 2821}
2708 2822
2709int btrfs_log_inode(struct btrfs_trans_handle *trans, 2823/*
2710 struct btrfs_root *root, struct inode *inode, 2824 * follow the dentry parent pointers up the chain and see if any
2711 int inode_only) 2825 * of the directories in it require a full commit before they can
2826 * be logged. Returns zero if nothing special needs to be done or 1 if
2827 * a full commit is required.
2828 */
2829static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2830 struct inode *inode,
2831 struct dentry *parent,
2832 struct super_block *sb,
2833 u64 last_committed)
2712{ 2834{
2713 int ret; 2835 int ret = 0;
2836 struct btrfs_root *root;
2714 2837
2715 start_log_trans(trans, root); 2838 /*
2716 ret = __btrfs_log_inode(trans, root, inode, inode_only); 2839 * for regular files, if its inode is already on disk, we don't
2717 end_log_trans(root); 2840 * have to worry about the parents at all. This is because
2841 * we can use the last_unlink_trans field to record renames
2842 * and other fun in this file.
2843 */
2844 if (S_ISREG(inode->i_mode) &&
2845 BTRFS_I(inode)->generation <= last_committed &&
2846 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2847 goto out;
2848
2849 if (!S_ISDIR(inode->i_mode)) {
2850 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2851 goto out;
2852 inode = parent->d_inode;
2853 }
2854
2855 while (1) {
2856 BTRFS_I(inode)->logged_trans = trans->transid;
2857 smp_mb();
2858
2859 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
2860 root = BTRFS_I(inode)->root;
2861
2862 /*
2863 * make sure any commits to the log are forced
2864 * to be full commits
2865 */
2866 root->fs_info->last_trans_log_full_commit =
2867 trans->transid;
2868 ret = 1;
2869 break;
2870 }
2871
2872 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2873 break;
2874
2875 if (parent == sb->s_root)
2876 break;
2877
2878 parent = parent->d_parent;
2879 inode = parent->d_inode;
2880
2881 }
2882out:
2718 return ret; 2883 return ret;
2719} 2884}
2720 2885
@@ -2724,31 +2889,65 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
2724 * only logging is done of any parent directories that are older than 2889 * only logging is done of any parent directories that are older than
2725 * the last committed transaction 2890 * the last committed transaction
2726 */ 2891 */
2727int btrfs_log_dentry(struct btrfs_trans_handle *trans, 2892int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2728 struct btrfs_root *root, struct dentry *dentry) 2893 struct btrfs_root *root, struct inode *inode,
2894 struct dentry *parent, int exists_only)
2729{ 2895{
2730 int inode_only = LOG_INODE_ALL; 2896 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2731 struct super_block *sb; 2897 struct super_block *sb;
2732 int ret; 2898 int ret = 0;
2899 u64 last_committed = root->fs_info->last_trans_committed;
2900
2901 sb = inode->i_sb;
2902
2903 if (root->fs_info->last_trans_log_full_commit >
2904 root->fs_info->last_trans_committed) {
2905 ret = 1;
2906 goto end_no_trans;
2907 }
2908
2909 ret = check_parent_dirs_for_sync(trans, inode, parent,
2910 sb, last_committed);
2911 if (ret)
2912 goto end_no_trans;
2733 2913
2734 start_log_trans(trans, root); 2914 start_log_trans(trans, root);
2735 sb = dentry->d_inode->i_sb;
2736 while (1) {
2737 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2738 inode_only);
2739 BUG_ON(ret);
2740 inode_only = LOG_INODE_EXISTS;
2741 2915
2742 dentry = dentry->d_parent; 2916 ret = btrfs_log_inode(trans, root, inode, inode_only);
2743 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) 2917 BUG_ON(ret);
2918
2919 /*
2920 * for regular files, if its inode is already on disk, we don't
2921 * have to worry about the parents at all. This is because
2922 * we can use the last_unlink_trans field to record renames
2923 * and other fun in this file.
2924 */
2925 if (S_ISREG(inode->i_mode) &&
2926 BTRFS_I(inode)->generation <= last_committed &&
2927 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2928 goto no_parent;
2929
2930 inode_only = LOG_INODE_EXISTS;
2931 while (1) {
2932 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2744 break; 2933 break;
2745 2934
2746 if (BTRFS_I(dentry->d_inode)->generation <= 2935 inode = parent->d_inode;
2747 root->fs_info->last_trans_committed) 2936 if (BTRFS_I(inode)->generation >
2937 root->fs_info->last_trans_committed) {
2938 ret = btrfs_log_inode(trans, root, inode, inode_only);
2939 BUG_ON(ret);
2940 }
2941 if (parent == sb->s_root)
2748 break; 2942 break;
2943
2944 parent = parent->d_parent;
2749 } 2945 }
2750 end_log_trans(root); 2946no_parent:
2751 return 0; 2947 ret = 0;
2948 btrfs_end_log_trans(root);
2949end_no_trans:
2950 return ret;
2752} 2951}
2753 2952
2754/* 2953/*
@@ -2760,12 +2959,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2760int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2959int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2761 struct btrfs_root *root, struct dentry *dentry) 2960 struct btrfs_root *root, struct dentry *dentry)
2762{ 2961{
2763 u64 gen; 2962 return btrfs_log_inode_parent(trans, root, dentry->d_inode,
2764 gen = root->fs_info->last_trans_new_blockgroup; 2963 dentry->d_parent, 0);
2765 if (gen > root->fs_info->last_trans_committed)
2766 return 1;
2767 else
2768 return btrfs_log_dentry(trans, root, dentry);
2769} 2964}
2770 2965
2771/* 2966/*
@@ -2884,3 +3079,94 @@ again:
2884 kfree(log_root_tree); 3079 kfree(log_root_tree);
2885 return 0; 3080 return 0;
2886} 3081}
3082
3083/*
3084 * there are some corner cases where we want to force a full
3085 * commit instead of allowing a directory to be logged.
3086 *
3087 * They revolve around files that were unlinked from the directory, and
3088 * this function updates the parent directory so that a full commit is
3089 * properly done if it is fsync'd later after the unlinks are done.
3090 */
3091void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3092 struct inode *dir, struct inode *inode,
3093 int for_rename)
3094{
3095 /*
3096 * when we're logging a file, if it hasn't been renamed
3097 * or unlinked, and its inode is fully committed on disk,
3098 * we don't have to worry about walking up the directory chain
3099 * to log its parents.
3100 *
3101 * So, we use the last_unlink_trans field to put this transid
3102 * into the file. When the file is logged we check it and
3103 * don't log the parents if the file is fully on disk.
3104 */
3105 if (S_ISREG(inode->i_mode))
3106 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3107
3108 /*
3109 * if this directory was already logged any new
3110 * names for this file/dir will get recorded
3111 */
3112 smp_mb();
3113 if (BTRFS_I(dir)->logged_trans == trans->transid)
3114 return;
3115
3116 /*
3117 * if the inode we're about to unlink was logged,
3118 * the log will be properly updated for any new names
3119 */
3120 if (BTRFS_I(inode)->logged_trans == trans->transid)
3121 return;
3122
3123 /*
3124 * when renaming files across directories, if the directory
3125 * that we're unlinking from gets fsync'd later on, there's
3126 * no way to find the destination directory later and fsync it
3127 * properly. So, we have to be conservative and force commits
3128 * so the new name gets discovered.
3129 */
3130 if (for_rename)
3131 goto record;
3132
3133 /* we can safely do the unlink without any special recording */
3134 return;
3135
3136record:
3137 BTRFS_I(dir)->last_unlink_trans = trans->transid;
3138}
3139
3140/*
3141 * Call this after adding a new name for a file and it will properly
3142 * update the log to reflect the new name.
3143 *
3144 * It will return zero if all goes well, and it will return 1 if a
3145 * full transaction commit is required.
3146 */
3147int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3148 struct inode *inode, struct inode *old_dir,
3149 struct dentry *parent)
3150{
3151 struct btrfs_root * root = BTRFS_I(inode)->root;
3152
3153 /*
3154 * this will force the logging code to walk the dentry chain
3155 * up for the file
3156 */
3157 if (S_ISREG(inode->i_mode))
3158 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3159
3160 /*
3161 * if this inode hasn't been logged and directory we're renaming it
3162 * from hasn't been logged, we don't need to log it
3163 */
3164 if (BTRFS_I(inode)->logged_trans <=
3165 root->fs_info->last_trans_committed &&
3166 (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3167 root->fs_info->last_trans_committed))
3168 return 0;
3169
3170 return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3171}
3172