aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c456
1 files changed, 372 insertions, 84 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c462fbd60fa..25f20ea11f27 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
35#define LOG_INODE_EXISTS 1 35#define LOG_INODE_EXISTS 1
36 36
37/* 37/*
38 * directory trouble cases
39 *
40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
41 * log, we must force a full commit before doing an fsync of the directory
42 * where the unlink was done.
43 * ---> record transid of last unlink/rename per directory
44 *
45 * mkdir foo/some_dir
46 * normal commit
47 * rename foo/some_dir foo2/some_dir
48 * mkdir foo/some_dir
49 * fsync foo/some_dir/some_file
50 *
51 * The fsync above will unlink the original some_dir without recording
52 * it in its new location (foo2). After a crash, some_dir will be gone
53 * unless the fsync of some_file forces a full commit
54 *
55 * 2) we must log any new names for any file or dir that is in the fsync
56 * log. ---> check inode while renaming/linking.
57 *
58 * 2a) we must log any new names for any file or dir during rename
59 * when the directory they are being removed from was logged.
60 * ---> check inode and old parent dir during rename
61 *
62 * 2a is actually the more important variant. Without the extra logging
63 * a crash might unlink the old name without recreating the new one
64 *
65 * 3) after a crash, we must go through any directories with a link count
66 * of zero and redo the rm -rf
67 *
68 * mkdir f1/foo
69 * normal commit
70 * rm -rf f1/foo
71 * fsync(f1)
72 *
73 * The directory f1/foo was fully removed from the FS, but fsync was never
74 * called on f1/foo, only its parent dir (f1). After a crash the rm -rf must
75 * be replayed. This must be able to recurse down the entire
76 * directory tree. The inode link count fixup code takes care of the
77 * ugly details.
78 */
79
80/*
38 * stages for the tree walking. The first 81 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find 82 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes 83 * the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
47#define LOG_WALK_REPLAY_INODES 1 90#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2 91#define LOG_WALK_REPLAY_ALL 2
49 92
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 93static int btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode, 94 struct btrfs_root *root, struct inode *inode,
52 int inode_only); 95 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 96static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, 97 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid); 98 struct btrfs_path *path, u64 objectid);
99static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
100 struct btrfs_root *root,
101 struct btrfs_root *log,
102 struct btrfs_path *path,
103 u64 dirid, int del_all);
56 104
57/* 105/*
58 * tree logging is a special write ahead log used to make sure that 106 * tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
133} 181}
134 182
135/* 183/*
184 * This either makes the current running log transaction wait
185 * until you call btrfs_end_log_trans() or it makes any future
186 * log transactions wait until you call btrfs_end_log_trans()
187 */
188int btrfs_pin_log_trans(struct btrfs_root *root)
189{
190 int ret = -ENOENT;
191
192 mutex_lock(&root->log_mutex);
193 atomic_inc(&root->log_writers);
194 mutex_unlock(&root->log_mutex);
195 return ret;
196}
197
198/*
136 * indicate we're done making changes to the log tree 199 * indicate we're done making changes to the log tree
137 * and wake up anyone waiting to do a sync 200 * and wake up anyone waiting to do a sync
138 */ 201 */
139static int end_log_trans(struct btrfs_root *root) 202int btrfs_end_log_trans(struct btrfs_root *root)
140{ 203{
141 if (atomic_dec_and_test(&root->log_writers)) { 204 if (atomic_dec_and_test(&root->log_writers)) {
142 smp_mb(); 205 smp_mb();
@@ -199,12 +262,9 @@ static int process_one_buffer(struct btrfs_root *log,
199 struct extent_buffer *eb, 262 struct extent_buffer *eb,
200 struct walk_control *wc, u64 gen) 263 struct walk_control *wc, u64 gen)
201{ 264{
202 if (wc->pin) { 265 if (wc->pin)
203 mutex_lock(&log->fs_info->pinned_mutex);
204 btrfs_update_pinned_extents(log->fs_info->extent_root, 266 btrfs_update_pinned_extents(log->fs_info->extent_root,
205 eb->start, eb->len, 1); 267 eb->start, eb->len, 1);
206 mutex_unlock(&log->fs_info->pinned_mutex);
207 }
208 268
209 if (btrfs_buffer_uptodate(eb, gen)) { 269 if (btrfs_buffer_uptodate(eb, gen)) {
210 if (wc->write) 270 if (wc->write)
@@ -603,6 +663,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
603 663
604 ret = link_to_fixup_dir(trans, root, path, location.objectid); 664 ret = link_to_fixup_dir(trans, root, path, location.objectid);
605 BUG_ON(ret); 665 BUG_ON(ret);
666
606 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 667 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
607 BUG_ON(ret); 668 BUG_ON(ret);
608 kfree(name); 669 kfree(name);
@@ -804,6 +865,7 @@ conflict_again:
804 victim_name_len)) { 865 victim_name_len)) {
805 btrfs_inc_nlink(inode); 866 btrfs_inc_nlink(inode);
806 btrfs_release_path(root, path); 867 btrfs_release_path(root, path);
868
807 ret = btrfs_unlink_inode(trans, root, dir, 869 ret = btrfs_unlink_inode(trans, root, dir,
808 inode, victim_name, 870 inode, victim_name,
809 victim_name_len); 871 victim_name_len);
@@ -922,13 +984,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
922 key.offset--; 984 key.offset--;
923 btrfs_release_path(root, path); 985 btrfs_release_path(root, path);
924 } 986 }
925 btrfs_free_path(path); 987 btrfs_release_path(root, path);
926 if (nlink != inode->i_nlink) { 988 if (nlink != inode->i_nlink) {
927 inode->i_nlink = nlink; 989 inode->i_nlink = nlink;
928 btrfs_update_inode(trans, root, inode); 990 btrfs_update_inode(trans, root, inode);
929 } 991 }
930 BTRFS_I(inode)->index_cnt = (u64)-1; 992 BTRFS_I(inode)->index_cnt = (u64)-1;
931 993
994 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
995 ret = replay_dir_deletes(trans, root, NULL, path,
996 inode->i_ino, 1);
997 BUG_ON(ret);
998 }
999 btrfs_free_path(path);
1000
932 return 0; 1001 return 0;
933} 1002}
934 1003
@@ -971,9 +1040,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
971 1040
972 iput(inode); 1041 iput(inode);
973 1042
974 if (key.offset == 0) 1043 /*
975 break; 1044 * fixup on a directory may create new entries,
976 key.offset--; 1045 * make sure we always look for the highest possible
1046 * offset
1047 */
1048 key.offset = (u64)-1;
977 } 1049 }
978 btrfs_release_path(root, path); 1050 btrfs_release_path(root, path);
979 return 0; 1051 return 0;
@@ -1150,8 +1222,7 @@ insert:
1150 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1222 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1151 name, name_len, log_type, &log_key); 1223 name, name_len, log_type, &log_key);
1152 1224
1153 if (ret && ret != -ENOENT) 1225 BUG_ON(ret && ret != -ENOENT);
1154 BUG();
1155 goto out; 1226 goto out;
1156} 1227}
1157 1228
@@ -1313,11 +1384,11 @@ again:
1313 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1384 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1314 name_len); 1385 name_len);
1315 log_di = NULL; 1386 log_di = NULL;
1316 if (dir_key->type == BTRFS_DIR_ITEM_KEY) { 1387 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1317 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1388 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1318 dir_key->objectid, 1389 dir_key->objectid,
1319 name, name_len, 0); 1390 name, name_len, 0);
1320 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { 1391 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1321 log_di = btrfs_lookup_dir_index_item(trans, log, 1392 log_di = btrfs_lookup_dir_index_item(trans, log,
1322 log_path, 1393 log_path,
1323 dir_key->objectid, 1394 dir_key->objectid,
@@ -1378,7 +1449,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1378 struct btrfs_root *root, 1449 struct btrfs_root *root,
1379 struct btrfs_root *log, 1450 struct btrfs_root *log,
1380 struct btrfs_path *path, 1451 struct btrfs_path *path,
1381 u64 dirid) 1452 u64 dirid, int del_all)
1382{ 1453{
1383 u64 range_start; 1454 u64 range_start;
1384 u64 range_end; 1455 u64 range_end;
@@ -1408,10 +1479,14 @@ again:
1408 range_start = 0; 1479 range_start = 0;
1409 range_end = 0; 1480 range_end = 0;
1410 while (1) { 1481 while (1) {
1411 ret = find_dir_range(log, path, dirid, key_type, 1482 if (del_all)
1412 &range_start, &range_end); 1483 range_end = (u64)-1;
1413 if (ret != 0) 1484 else {
1414 break; 1485 ret = find_dir_range(log, path, dirid, key_type,
1486 &range_start, &range_end);
1487 if (ret != 0)
1488 break;
1489 }
1415 1490
1416 dir_key.offset = range_start; 1491 dir_key.offset = range_start;
1417 while (1) { 1492 while (1) {
@@ -1437,7 +1512,8 @@ again:
1437 break; 1512 break;
1438 1513
1439 ret = check_item_in_log(trans, root, log, path, 1514 ret = check_item_in_log(trans, root, log, path,
1440 log_path, dir, &found_key); 1515 log_path, dir,
1516 &found_key);
1441 BUG_ON(ret); 1517 BUG_ON(ret);
1442 if (found_key.offset == (u64)-1) 1518 if (found_key.offset == (u64)-1)
1443 break; 1519 break;
@@ -1514,7 +1590,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1514 mode = btrfs_inode_mode(eb, inode_item); 1590 mode = btrfs_inode_mode(eb, inode_item);
1515 if (S_ISDIR(mode)) { 1591 if (S_ISDIR(mode)) {
1516 ret = replay_dir_deletes(wc->trans, 1592 ret = replay_dir_deletes(wc->trans,
1517 root, log, path, key.objectid); 1593 root, log, path, key.objectid, 0);
1518 BUG_ON(ret); 1594 BUG_ON(ret);
1519 } 1595 }
1520 ret = overwrite_item(wc->trans, root, path, 1596 ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1609,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1533 root, inode, inode->i_size, 1609 root, inode, inode->i_size,
1534 BTRFS_EXTENT_DATA_KEY); 1610 BTRFS_EXTENT_DATA_KEY);
1535 BUG_ON(ret); 1611 BUG_ON(ret);
1612
1613 /* if the nlink count is zero here, the iput
1614 * will free the inode. We bump it to make
1615 * sure it doesn't get freed until the link
1616 * count fixup is done
1617 */
1618 if (inode->i_nlink == 0) {
1619 btrfs_inc_nlink(inode);
1620 btrfs_update_inode(wc->trans,
1621 root, inode);
1622 }
1536 iput(inode); 1623 iput(inode);
1537 } 1624 }
1538 ret = link_to_fixup_dir(wc->trans, root, 1625 ret = link_to_fixup_dir(wc->trans, root,
@@ -1840,7 +1927,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
1840 return ret; 1927 return ret;
1841} 1928}
1842 1929
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid) 1930static int wait_log_commit(struct btrfs_trans_handle *trans,
1931 struct btrfs_root *root, unsigned long transid)
1844{ 1932{
1845 DEFINE_WAIT(wait); 1933 DEFINE_WAIT(wait);
1846 int index = transid % 2; 1934 int index = transid % 2;
@@ -1854,9 +1942,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1854 prepare_to_wait(&root->log_commit_wait[index], 1942 prepare_to_wait(&root->log_commit_wait[index],
1855 &wait, TASK_UNINTERRUPTIBLE); 1943 &wait, TASK_UNINTERRUPTIBLE);
1856 mutex_unlock(&root->log_mutex); 1944 mutex_unlock(&root->log_mutex);
1857 if (root->log_transid < transid + 2 && 1945
1946 if (root->fs_info->last_trans_log_full_commit !=
1947 trans->transid && root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index])) 1948 atomic_read(&root->log_commit[index]))
1859 schedule(); 1949 schedule();
1950
1860 finish_wait(&root->log_commit_wait[index], &wait); 1951 finish_wait(&root->log_commit_wait[index], &wait);
1861 mutex_lock(&root->log_mutex); 1952 mutex_lock(&root->log_mutex);
1862 } while (root->log_transid < transid + 2 && 1953 } while (root->log_transid < transid + 2 &&
@@ -1864,14 +1955,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1864 return 0; 1955 return 0;
1865} 1956}
1866 1957
1867static int wait_for_writer(struct btrfs_root *root) 1958static int wait_for_writer(struct btrfs_trans_handle *trans,
1959 struct btrfs_root *root)
1868{ 1960{
1869 DEFINE_WAIT(wait); 1961 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) { 1962 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait, 1963 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE); 1964 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex); 1965 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers)) 1966 if (root->fs_info->last_trans_log_full_commit !=
1967 trans->transid && atomic_read(&root->log_writers))
1875 schedule(); 1968 schedule();
1876 mutex_lock(&root->log_mutex); 1969 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait); 1970 finish_wait(&root->log_writer_wait, &wait);
@@ -1882,7 +1975,14 @@ static int wait_for_writer(struct btrfs_root *root)
1882/* 1975/*
1883 * btrfs_sync_log sends a given tree log down to the disk and 1976 * btrfs_sync_log sends a given tree log down to the disk and
1884 * updates the super blocks to record it. When this call is done, 1977 * updates the super blocks to record it. When this call is done,
1885 * you know that any inodes previously logged are safely on disk 1978 * you know that any inodes previously logged are safely on disk only
1979 * if it returns 0.
1980 *
1981 * Any other return value means you need to call btrfs_commit_transaction.
1982 * Some of the edge cases for fsyncing directories that have had unlinks
1983 * or renames done in the past mean that sometimes the only safe
1984 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
1985 * that has happened.
1886 */ 1986 */
1887int btrfs_sync_log(struct btrfs_trans_handle *trans, 1987int btrfs_sync_log(struct btrfs_trans_handle *trans,
1888 struct btrfs_root *root) 1988 struct btrfs_root *root)
@@ -1896,7 +1996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1896 mutex_lock(&root->log_mutex); 1996 mutex_lock(&root->log_mutex);
1897 index1 = root->log_transid % 2; 1997 index1 = root->log_transid % 2;
1898 if (atomic_read(&root->log_commit[index1])) { 1998 if (atomic_read(&root->log_commit[index1])) {
1899 wait_log_commit(root, root->log_transid); 1999 wait_log_commit(trans, root, root->log_transid);
1900 mutex_unlock(&root->log_mutex); 2000 mutex_unlock(&root->log_mutex);
1901 return 0; 2001 return 0;
1902 } 2002 }
@@ -1904,18 +2004,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1904 2004
1905 /* wait for previous tree log sync to complete */ 2005 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2006 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1); 2007 wait_log_commit(trans, root, root->log_transid - 1);
1908 2008
1909 while (1) { 2009 while (1) {
1910 unsigned long batch = root->log_batch; 2010 unsigned long batch = root->log_batch;
1911 mutex_unlock(&root->log_mutex); 2011 mutex_unlock(&root->log_mutex);
1912 schedule_timeout_uninterruptible(1); 2012 schedule_timeout_uninterruptible(1);
1913 mutex_lock(&root->log_mutex); 2013 mutex_lock(&root->log_mutex);
1914 wait_for_writer(root); 2014
2015 wait_for_writer(trans, root);
1915 if (batch == root->log_batch) 2016 if (batch == root->log_batch)
1916 break; 2017 break;
1917 } 2018 }
1918 2019
2020 /* bail out if we need to do a full commit */
2021 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2022 ret = -EAGAIN;
2023 mutex_unlock(&root->log_mutex);
2024 goto out;
2025 }
2026
1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2027 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1920 BUG_ON(ret); 2028 BUG_ON(ret);
1921 2029
@@ -1951,16 +2059,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1951 2059
1952 index2 = log_root_tree->log_transid % 2; 2060 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) { 2061 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid); 2062 wait_log_commit(trans, log_root_tree,
2063 log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex); 2064 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out; 2065 goto out;
1957 } 2066 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1); 2067 atomic_set(&log_root_tree->log_commit[index2], 1);
1959 2068
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) 2069 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); 2070 wait_log_commit(trans, log_root_tree,
2071 log_root_tree->log_transid - 1);
2072 }
2073
2074 wait_for_writer(trans, log_root_tree);
1962 2075
1963 wait_for_writer(log_root_tree); 2076 /*
2077 * now that we've moved on to the tree of log tree roots,
2078 * check the full commit flag again
2079 */
2080 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2081 mutex_unlock(&log_root_tree->log_mutex);
2082 ret = -EAGAIN;
2083 goto out_wake_log_root;
2084 }
1964 2085
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2086 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages); 2087 &log_root_tree->dirty_log_pages);
@@ -1985,7 +2106,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1985 * in and cause problems either. 2106 * in and cause problems either.
1986 */ 2107 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2); 2108 write_ctree_super(trans, root->fs_info->tree_root, 2);
2109 ret = 0;
1988 2110
2111out_wake_log_root:
1989 atomic_set(&log_root_tree->log_commit[index2], 0); 2112 atomic_set(&log_root_tree->log_commit[index2], 0);
1990 smp_mb(); 2113 smp_mb();
1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2114 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -1998,7 +2121,8 @@ out:
1998 return 0; 2121 return 0;
1999} 2122}
2000 2123
2001/* * free all the extents used by the tree log. This should be called 2124/*
2125 * free all the extents used by the tree log. This should be called
2002 * at commit time of the full transaction 2126 * at commit time of the full transaction
2003 */ 2127 */
2004int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2128int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2256,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2132 2256
2133 btrfs_free_path(path); 2257 btrfs_free_path(path);
2134 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2258 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2135 end_log_trans(root); 2259 btrfs_end_log_trans(root);
2136 2260
2137 return 0; 2261 return 0;
2138} 2262}
@@ -2159,7 +2283,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2159 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2283 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2160 dirid, &index); 2284 dirid, &index);
2161 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2285 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2162 end_log_trans(root); 2286 btrfs_end_log_trans(root);
2163 2287
2164 return ret; 2288 return ret;
2165} 2289}
@@ -2559,7 +2683,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2559 * 2683 *
2560 * This handles both files and directories. 2684 * This handles both files and directories.
2561 */ 2685 */
2562static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 2686static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root, struct inode *inode, 2687 struct btrfs_root *root, struct inode *inode,
2564 int inode_only) 2688 int inode_only)
2565{ 2689{
@@ -2585,28 +2709,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2585 min_key.offset = 0; 2709 min_key.offset = 0;
2586 2710
2587 max_key.objectid = inode->i_ino; 2711 max_key.objectid = inode->i_ino;
2712
2713 /* today the code can only do partial logging of directories */
2714 if (!S_ISDIR(inode->i_mode))
2715 inode_only = LOG_INODE_ALL;
2716
2588 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2717 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2589 max_key.type = BTRFS_XATTR_ITEM_KEY; 2718 max_key.type = BTRFS_XATTR_ITEM_KEY;
2590 else 2719 else
2591 max_key.type = (u8)-1; 2720 max_key.type = (u8)-1;
2592 max_key.offset = (u64)-1; 2721 max_key.offset = (u64)-1;
2593 2722
2594 /*
2595 * if this inode has already been logged and we're in inode_only
2596 * mode, we don't want to delete the things that have already
2597 * been written to the log.
2598 *
2599 * But, if the inode has been through an inode_only log,
2600 * the logged_trans field is not set. This allows us to catch
2601 * any new names for this inode in the backrefs by logging it
2602 * again
2603 */
2604 if (inode_only == LOG_INODE_EXISTS &&
2605 BTRFS_I(inode)->logged_trans == trans->transid) {
2606 btrfs_free_path(path);
2607 btrfs_free_path(dst_path);
2608 goto out;
2609 }
2610 mutex_lock(&BTRFS_I(inode)->log_mutex); 2723 mutex_lock(&BTRFS_I(inode)->log_mutex);
2611 2724
2612 /* 2725 /*
@@ -2693,7 +2806,6 @@ next_slot:
2693 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2806 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2694 btrfs_release_path(root, path); 2807 btrfs_release_path(root, path);
2695 btrfs_release_path(log, dst_path); 2808 btrfs_release_path(log, dst_path);
2696 BTRFS_I(inode)->log_dirty_trans = 0;
2697 ret = log_directory_changes(trans, root, inode, path, dst_path); 2809 ret = log_directory_changes(trans, root, inode, path, dst_path);
2698 BUG_ON(ret); 2810 BUG_ON(ret);
2699 } 2811 }
@@ -2702,19 +2814,69 @@ next_slot:
2702 2814
2703 btrfs_free_path(path); 2815 btrfs_free_path(path);
2704 btrfs_free_path(dst_path); 2816 btrfs_free_path(dst_path);
2705out:
2706 return 0; 2817 return 0;
2707} 2818}
2708 2819
2709int btrfs_log_inode(struct btrfs_trans_handle *trans, 2820/*
2710 struct btrfs_root *root, struct inode *inode, 2821 * follow the dentry parent pointers up the chain and see if any
2711 int inode_only) 2822 * of the directories in it require a full commit before they can
2823 * be logged. Returns zero if nothing special needs to be done or 1 if
2824 * a full commit is required.
2825 */
2826static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2827 struct inode *inode,
2828 struct dentry *parent,
2829 struct super_block *sb,
2830 u64 last_committed)
2712{ 2831{
2713 int ret; 2832 int ret = 0;
2833 struct btrfs_root *root;
2714 2834
2715 start_log_trans(trans, root); 2835 /*
2716 ret = __btrfs_log_inode(trans, root, inode, inode_only); 2836 * for regular files, if its inode is already on disk, we don't
2717 end_log_trans(root); 2837 * have to worry about the parents at all. This is because
2838 * we can use the last_unlink_trans field to record renames
2839 * and other fun in this file.
2840 */
2841 if (S_ISREG(inode->i_mode) &&
2842 BTRFS_I(inode)->generation <= last_committed &&
2843 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2844 goto out;
2845
2846 if (!S_ISDIR(inode->i_mode)) {
2847 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2848 goto out;
2849 inode = parent->d_inode;
2850 }
2851
2852 while (1) {
2853 BTRFS_I(inode)->logged_trans = trans->transid;
2854 smp_mb();
2855
2856 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
2857 root = BTRFS_I(inode)->root;
2858
2859 /*
2860 * make sure any commits to the log are forced
2861 * to be full commits
2862 */
2863 root->fs_info->last_trans_log_full_commit =
2864 trans->transid;
2865 ret = 1;
2866 break;
2867 }
2868
2869 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2870 break;
2871
2872 if (parent == sb->s_root)
2873 break;
2874
2875 parent = parent->d_parent;
2876 inode = parent->d_inode;
2877
2878 }
2879out:
2718 return ret; 2880 return ret;
2719} 2881}
2720 2882
@@ -2724,31 +2886,70 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
2724 * only logging is done of any parent directories that are older than 2886 * only logging is done of any parent directories that are older than
2725 * the last committed transaction 2887 * the last committed transaction
2726 */ 2888 */
2727int btrfs_log_dentry(struct btrfs_trans_handle *trans, 2889int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2728 struct btrfs_root *root, struct dentry *dentry) 2890 struct btrfs_root *root, struct inode *inode,
2891 struct dentry *parent, int exists_only)
2729{ 2892{
2730 int inode_only = LOG_INODE_ALL; 2893 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2731 struct super_block *sb; 2894 struct super_block *sb;
2732 int ret; 2895 int ret = 0;
2896 u64 last_committed = root->fs_info->last_trans_committed;
2897
2898 sb = inode->i_sb;
2899
2900 if (btrfs_test_opt(root, NOTREELOG)) {
2901 ret = 1;
2902 goto end_no_trans;
2903 }
2904
2905 if (root->fs_info->last_trans_log_full_commit >
2906 root->fs_info->last_trans_committed) {
2907 ret = 1;
2908 goto end_no_trans;
2909 }
2910
2911 ret = check_parent_dirs_for_sync(trans, inode, parent,
2912 sb, last_committed);
2913 if (ret)
2914 goto end_no_trans;
2733 2915
2734 start_log_trans(trans, root); 2916 start_log_trans(trans, root);
2735 sb = dentry->d_inode->i_sb;
2736 while (1) {
2737 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2738 inode_only);
2739 BUG_ON(ret);
2740 inode_only = LOG_INODE_EXISTS;
2741 2917
2742 dentry = dentry->d_parent; 2918 ret = btrfs_log_inode(trans, root, inode, inode_only);
2743 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) 2919 BUG_ON(ret);
2920
2921 /*
2922 * for regular files, if its inode is already on disk, we don't
2923 * have to worry about the parents at all. This is because
2924 * we can use the last_unlink_trans field to record renames
2925 * and other fun in this file.
2926 */
2927 if (S_ISREG(inode->i_mode) &&
2928 BTRFS_I(inode)->generation <= last_committed &&
2929 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2930 goto no_parent;
2931
2932 inode_only = LOG_INODE_EXISTS;
2933 while (1) {
2934 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2744 break; 2935 break;
2745 2936
2746 if (BTRFS_I(dentry->d_inode)->generation <= 2937 inode = parent->d_inode;
2747 root->fs_info->last_trans_committed) 2938 if (BTRFS_I(inode)->generation >
2939 root->fs_info->last_trans_committed) {
2940 ret = btrfs_log_inode(trans, root, inode, inode_only);
2941 BUG_ON(ret);
2942 }
2943 if (parent == sb->s_root)
2748 break; 2944 break;
2945
2946 parent = parent->d_parent;
2749 } 2947 }
2750 end_log_trans(root); 2948no_parent:
2751 return 0; 2949 ret = 0;
2950 btrfs_end_log_trans(root);
2951end_no_trans:
2952 return ret;
2752} 2953}
2753 2954
2754/* 2955/*
@@ -2760,12 +2961,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2760int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2961int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2761 struct btrfs_root *root, struct dentry *dentry) 2962 struct btrfs_root *root, struct dentry *dentry)
2762{ 2963{
2763 u64 gen; 2964 return btrfs_log_inode_parent(trans, root, dentry->d_inode,
2764 gen = root->fs_info->last_trans_new_blockgroup; 2965 dentry->d_parent, 0);
2765 if (gen > root->fs_info->last_trans_committed)
2766 return 1;
2767 else
2768 return btrfs_log_dentry(trans, root, dentry);
2769} 2966}
2770 2967
2771/* 2968/*
@@ -2884,3 +3081,94 @@ again:
2884 kfree(log_root_tree); 3081 kfree(log_root_tree);
2885 return 0; 3082 return 0;
2886} 3083}
3084
3085/*
3086 * there are some corner cases where we want to force a full
3087 * commit instead of allowing a directory to be logged.
3088 *
3089 * They revolve around files that were unlinked from the directory, and
3090 * this function updates the parent directory so that a full commit is
3091 * properly done if it is fsync'd later after the unlinks are done.
3092 */
3093void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3094 struct inode *dir, struct inode *inode,
3095 int for_rename)
3096{
3097 /*
3098 * when we're logging a file, if it hasn't been renamed
3099 * or unlinked, and its inode is fully committed on disk,
3100 * we don't have to worry about walking up the directory chain
3101 * to log its parents.
3102 *
3103 * So, we use the last_unlink_trans field to put this transid
3104 * into the file. When the file is logged we check it and
3105 * don't log the parents if the file is fully on disk.
3106 */
3107 if (S_ISREG(inode->i_mode))
3108 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3109
3110 /*
3111 * if this directory was already logged any new
3112 * names for this file/dir will get recorded
3113 */
3114 smp_mb();
3115 if (BTRFS_I(dir)->logged_trans == trans->transid)
3116 return;
3117
3118 /*
3119 * if the inode we're about to unlink was logged,
3120 * the log will be properly updated for any new names
3121 */
3122 if (BTRFS_I(inode)->logged_trans == trans->transid)
3123 return;
3124
3125 /*
3126 * when renaming files across directories, if the directory
3127 * we're unlinking from gets fsync'd later on, there's
3128 * no way to find the destination directory later and fsync it
3129 * properly. So, we have to be conservative and force commits
3130 * so the new name gets discovered.
3131 */
3132 if (for_rename)
3133 goto record;
3134
3135 /* we can safely do the unlink without any special recording */
3136 return;
3137
3138record:
3139 BTRFS_I(dir)->last_unlink_trans = trans->transid;
3140}
3141
3142/*
3143 * Call this after adding a new name for a file and it will properly
3144 * update the log to reflect the new name.
3145 *
3146 * It will return zero if all goes well, and it will return 1 if a
3147 * full transaction commit is required.
3148 */
3149int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3150 struct inode *inode, struct inode *old_dir,
3151 struct dentry *parent)
3152{
3153 struct btrfs_root * root = BTRFS_I(inode)->root;
3154
3155 /*
3156 * this will force the logging code to walk the dentry chain
3157 * up for the file
3158 */
3159 if (S_ISREG(inode->i_mode))
3160 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3161
3162 /*
3163 * if this inode hasn't been logged and the directory we're renaming it
3164 * from hasn't been logged, we don't need to log it
3165 */
3166 if (BTRFS_I(inode)->logged_trans <=
3167 root->fs_info->last_trans_committed &&
3168 (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3169 root->fs_info->last_trans_committed))
3170 return 0;
3171
3172 return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3173}
3174