aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorYan Zheng <zheng.yan@oracle.com>2009-01-21 12:54:03 -0500
committerChris Mason <chris.mason@oracle.com>2009-01-21 12:54:03 -0500
commit7237f1833601dcc435a64176c2c347ec4bd959f9 (patch)
tree5e12b9a7655f03181605e02fd91b42e68ee92c2e /fs/btrfs/tree-log.c
parent7e6628544abad773222d8b177f738ac2db1859de (diff)
Btrfs: fix tree logs parallel sync
To improve performance, btrfs_sync_log merges tree log sync requests. But it wrongly merges sync requests for different tree logs. If multiple tree logs are synced at the same time, only one of them actually gets synced. This patch has the following changes to fix the bug: Move most tree log related fields in btrfs_fs_info to btrfs_root. This allows merging sync requests separately for each tree log. Don't insert root item into the log root tree immediately after log tree is allocated. Root item for log tree is inserted when the log tree gets synced for the first time. This allows syncing the log root tree without first syncing all log trees. At tree-log sync, btrfs_sync_log first syncs the log tree; then updates the corresponding root item in the log root tree; syncs the log root tree; then updates the super block. Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c350
1 file changed, 166 insertions, 184 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d81cda2e077c..4f26f3ed0c87 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -78,104 +78,6 @@ static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
78 */ 78 */
79 79
80/* 80/*
81 * btrfs_add_log_tree adds a new per-subvolume log tree into the
82 * tree of log tree roots. This must be called with a tree log transaction
83 * running (see start_log_trans).
84 */
85static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root)
87{
88 struct btrfs_key key;
89 struct btrfs_root_item root_item;
90 struct btrfs_inode_item *inode_item;
91 struct extent_buffer *leaf;
92 struct btrfs_root *new_root = root;
93 int ret;
94 u64 objectid = root->root_key.objectid;
95
96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
97 BTRFS_TREE_LOG_OBJECTID,
98 trans->transid, 0, 0, 0);
99 if (IS_ERR(leaf)) {
100 ret = PTR_ERR(leaf);
101 return ret;
102 }
103
104 btrfs_set_header_nritems(leaf, 0);
105 btrfs_set_header_level(leaf, 0);
106 btrfs_set_header_bytenr(leaf, leaf->start);
107 btrfs_set_header_generation(leaf, trans->transid);
108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
109
110 write_extent_buffer(leaf, root->fs_info->fsid,
111 (unsigned long)btrfs_header_fsid(leaf),
112 BTRFS_FSID_SIZE);
113 btrfs_mark_buffer_dirty(leaf);
114
115 inode_item = &root_item.inode;
116 memset(inode_item, 0, sizeof(*inode_item));
117 inode_item->generation = cpu_to_le64(1);
118 inode_item->size = cpu_to_le64(3);
119 inode_item->nlink = cpu_to_le32(1);
120 inode_item->nbytes = cpu_to_le64(root->leafsize);
121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
122
123 btrfs_set_root_bytenr(&root_item, leaf->start);
124 btrfs_set_root_generation(&root_item, trans->transid);
125 btrfs_set_root_level(&root_item, 0);
126 btrfs_set_root_refs(&root_item, 0);
127 btrfs_set_root_used(&root_item, 0);
128
129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
130 root_item.drop_level = 0;
131
132 btrfs_tree_unlock(leaf);
133 free_extent_buffer(leaf);
134 leaf = NULL;
135
136 btrfs_set_root_dirid(&root_item, 0);
137
138 key.objectid = BTRFS_TREE_LOG_OBJECTID;
139 key.offset = objectid;
140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
142 &root_item);
143 if (ret)
144 goto fail;
145
146 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
147 &key);
148 BUG_ON(!new_root);
149
150 WARN_ON(root->log_root);
151 root->log_root = new_root;
152
153 /*
154 * log trees do not get reference counted because they go away
155 * before a real commit is actually done. They do store pointers
156 * to file data extents, and those reference counts still get
157 * updated (along with back refs to the log tree).
158 */
159 new_root->ref_cows = 0;
160 new_root->last_trans = trans->transid;
161
162 /*
163 * we need to make sure the root block for this new tree
164 * is marked as dirty in the dirty_log_pages tree. This
165 * is how it gets flushed down to disk at tree log commit time.
166 *
167 * the tree logging mutex keeps others from coming in and changing
168 * the new_root->node, so we can safely access it here
169 */
170 set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
171 new_root->node->start + new_root->node->len - 1,
172 GFP_NOFS);
173
174fail:
175 return ret;
176}
177
178/*
179 * start a sub transaction and setup the log tree 81 * start a sub transaction and setup the log tree
180 * this increments the log tree writer count to make the people 82 * this increments the log tree writer count to make the people
181 * syncing the tree wait for us to finish 83 * syncing the tree wait for us to finish
@@ -184,6 +86,14 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root) 86 struct btrfs_root *root)
185{ 87{
186 int ret; 88 int ret;
89
90 mutex_lock(&root->log_mutex);
91 if (root->log_root) {
92 root->log_batch++;
93 atomic_inc(&root->log_writers);
94 mutex_unlock(&root->log_mutex);
95 return 0;
96 }
187 mutex_lock(&root->fs_info->tree_log_mutex); 97 mutex_lock(&root->fs_info->tree_log_mutex);
188 if (!root->fs_info->log_root_tree) { 98 if (!root->fs_info->log_root_tree) {
189 ret = btrfs_init_log_root_tree(trans, root->fs_info); 99 ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -193,9 +103,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
193 ret = btrfs_add_log_tree(trans, root); 103 ret = btrfs_add_log_tree(trans, root);
194 BUG_ON(ret); 104 BUG_ON(ret);
195 } 105 }
196 atomic_inc(&root->fs_info->tree_log_writers);
197 root->fs_info->tree_log_batch++;
198 mutex_unlock(&root->fs_info->tree_log_mutex); 106 mutex_unlock(&root->fs_info->tree_log_mutex);
107 root->log_batch++;
108 atomic_inc(&root->log_writers);
109 mutex_unlock(&root->log_mutex);
199 return 0; 110 return 0;
200} 111}
201 112
@@ -212,13 +123,12 @@ static int join_running_log_trans(struct btrfs_root *root)
212 if (!root->log_root) 123 if (!root->log_root)
213 return -ENOENT; 124 return -ENOENT;
214 125
215 mutex_lock(&root->fs_info->tree_log_mutex); 126 mutex_lock(&root->log_mutex);
216 if (root->log_root) { 127 if (root->log_root) {
217 ret = 0; 128 ret = 0;
218 atomic_inc(&root->fs_info->tree_log_writers); 129 atomic_inc(&root->log_writers);
219 root->fs_info->tree_log_batch++;
220 } 130 }
221 mutex_unlock(&root->fs_info->tree_log_mutex); 131 mutex_unlock(&root->log_mutex);
222 return ret; 132 return ret;
223} 133}
224 134
@@ -228,10 +138,11 @@ static int join_running_log_trans(struct btrfs_root *root)
228 */ 138 */
229static int end_log_trans(struct btrfs_root *root) 139static int end_log_trans(struct btrfs_root *root)
230{ 140{
231 atomic_dec(&root->fs_info->tree_log_writers); 141 if (atomic_dec_and_test(&root->log_writers)) {
232 smp_mb(); 142 smp_mb();
233 if (waitqueue_active(&root->fs_info->tree_log_wait)) 143 if (waitqueue_active(&root->log_writer_wait))
234 wake_up(&root->fs_info->tree_log_wait); 144 wake_up(&root->log_writer_wait);
145 }
235 return 0; 146 return 0;
236} 147}
237 148
@@ -1902,26 +1813,65 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1902 } 1813 }
1903 } 1814 }
1904 btrfs_free_path(path); 1815 btrfs_free_path(path);
1905 if (wc->free)
1906 free_extent_buffer(log->node);
1907 return ret; 1816 return ret;
1908} 1817}
1909 1818
1910static int wait_log_commit(struct btrfs_root *log) 1819/*
1820 * helper function to update the item for a given subvolumes log root
1821 * in the tree of log roots
1822 */
1823static int update_log_root(struct btrfs_trans_handle *trans,
1824 struct btrfs_root *log)
1825{
1826 int ret;
1827
1828 if (log->log_transid == 1) {
1829 /* insert root item on the first sync */
1830 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
1831 &log->root_key, &log->root_item);
1832 } else {
1833 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
1834 &log->root_key, &log->root_item);
1835 }
1836 return ret;
1837}
1838
1839static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1911{ 1840{
1912 DEFINE_WAIT(wait); 1841 DEFINE_WAIT(wait);
1913 u64 transid = log->fs_info->tree_log_transid; 1842 int index = transid % 2;
1914 1843
1844 /*
1845 * we only allow two pending log transactions at a time,
1846 * so we know that if ours is more than 2 older than the
1847 * current transaction, we're done
1848 */
1915 do { 1849 do {
1916 prepare_to_wait(&log->fs_info->tree_log_wait, &wait, 1850 prepare_to_wait(&root->log_commit_wait[index],
1917 TASK_UNINTERRUPTIBLE); 1851 &wait, TASK_UNINTERRUPTIBLE);
1918 mutex_unlock(&log->fs_info->tree_log_mutex); 1852 mutex_unlock(&root->log_mutex);
1919 if (atomic_read(&log->fs_info->tree_log_commit)) 1853 if (root->log_transid < transid + 2 &&
1854 atomic_read(&root->log_commit[index]))
1920 schedule(); 1855 schedule();
1921 finish_wait(&log->fs_info->tree_log_wait, &wait); 1856 finish_wait(&root->log_commit_wait[index], &wait);
1922 mutex_lock(&log->fs_info->tree_log_mutex); 1857 mutex_lock(&root->log_mutex);
1923 } while (transid == log->fs_info->tree_log_transid && 1858 } while (root->log_transid < transid + 2 &&
1924 atomic_read(&log->fs_info->tree_log_commit)); 1859 atomic_read(&root->log_commit[index]));
1860 return 0;
1861}
1862
1863static int wait_for_writer(struct btrfs_root *root)
1864{
1865 DEFINE_WAIT(wait);
1866 while (atomic_read(&root->log_writers)) {
1867 prepare_to_wait(&root->log_writer_wait,
1868 &wait, TASK_UNINTERRUPTIBLE);
1869 mutex_unlock(&root->log_mutex);
1870 if (atomic_read(&root->log_writers))
1871 schedule();
1872 mutex_lock(&root->log_mutex);
1873 finish_wait(&root->log_writer_wait, &wait);
1874 }
1925 return 0; 1875 return 0;
1926} 1876}
1927 1877
@@ -1933,57 +1883,114 @@ static int wait_log_commit(struct btrfs_root *log)
1933int btrfs_sync_log(struct btrfs_trans_handle *trans, 1883int btrfs_sync_log(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root) 1884 struct btrfs_root *root)
1935{ 1885{
1886 int index1;
1887 int index2;
1936 int ret; 1888 int ret;
1937 unsigned long batch;
1938 struct btrfs_root *log = root->log_root; 1889 struct btrfs_root *log = root->log_root;
1890 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
1939 1891
1940 mutex_lock(&log->fs_info->tree_log_mutex); 1892 mutex_lock(&root->log_mutex);
1941 if (atomic_read(&log->fs_info->tree_log_commit)) { 1893 index1 = root->log_transid % 2;
1942 wait_log_commit(log); 1894 if (atomic_read(&root->log_commit[index1])) {
1943 goto out; 1895 wait_log_commit(root, root->log_transid);
1896 mutex_unlock(&root->log_mutex);
1897 return 0;
1944 } 1898 }
1945 atomic_set(&log->fs_info->tree_log_commit, 1); 1899 atomic_set(&root->log_commit[index1], 1);
1900
1901 /* wait for previous tree log sync to complete */
1902 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1903 wait_log_commit(root, root->log_transid - 1);
1946 1904
1947 while (1) { 1905 while (1) {
1948 batch = log->fs_info->tree_log_batch; 1906 unsigned long batch = root->log_batch;
1949 mutex_unlock(&log->fs_info->tree_log_mutex); 1907 mutex_unlock(&root->log_mutex);
1950 schedule_timeout_uninterruptible(1); 1908 schedule_timeout_uninterruptible(1);
1951 mutex_lock(&log->fs_info->tree_log_mutex); 1909 mutex_lock(&root->log_mutex);
1952 1910 wait_for_writer(root);
1953 while (atomic_read(&log->fs_info->tree_log_writers)) { 1911 if (batch == root->log_batch)
1954 DEFINE_WAIT(wait);
1955 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1956 TASK_UNINTERRUPTIBLE);
1957 mutex_unlock(&log->fs_info->tree_log_mutex);
1958 if (atomic_read(&log->fs_info->tree_log_writers))
1959 schedule();
1960 mutex_lock(&log->fs_info->tree_log_mutex);
1961 finish_wait(&log->fs_info->tree_log_wait, &wait);
1962 }
1963 if (batch == log->fs_info->tree_log_batch)
1964 break; 1912 break;
1965 } 1913 }
1966 1914
1967 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 1915 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1968 BUG_ON(ret); 1916 BUG_ON(ret);
1969 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, 1917
1970 &root->fs_info->log_root_tree->dirty_log_pages); 1918 btrfs_set_root_bytenr(&log->root_item, log->node->start);
1919 btrfs_set_root_generation(&log->root_item, trans->transid);
1920 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
1921
1922 root->log_batch = 0;
1923 root->log_transid++;
1924 log->log_transid = root->log_transid;
1925 smp_mb();
1926 /*
1927 * log tree has been flushed to disk, new modifications of
1928 * the log will be written to new positions. so it's safe to
1929 * allow log writers to go in.
1930 */
1931 mutex_unlock(&root->log_mutex);
1932
1933 mutex_lock(&log_root_tree->log_mutex);
1934 log_root_tree->log_batch++;
1935 atomic_inc(&log_root_tree->log_writers);
1936 mutex_unlock(&log_root_tree->log_mutex);
1937
1938 ret = update_log_root(trans, log);
1939 BUG_ON(ret);
1940
1941 mutex_lock(&log_root_tree->log_mutex);
1942 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
1943 smp_mb();
1944 if (waitqueue_active(&log_root_tree->log_writer_wait))
1945 wake_up(&log_root_tree->log_writer_wait);
1946 }
1947
1948 index2 = log_root_tree->log_transid % 2;
1949 if (atomic_read(&log_root_tree->log_commit[index2])) {
1950 wait_log_commit(log_root_tree, log_root_tree->log_transid);
1951 mutex_unlock(&log_root_tree->log_mutex);
1952 goto out;
1953 }
1954 atomic_set(&log_root_tree->log_commit[index2], 1);
1955
1956 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
1957 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
1958
1959 wait_for_writer(log_root_tree);
1960
1961 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1962 &log_root_tree->dirty_log_pages);
1971 BUG_ON(ret); 1963 BUG_ON(ret);
1972 1964
1973 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 1965 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1974 log->fs_info->log_root_tree->node->start); 1966 log_root_tree->node->start);
1975 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 1967 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1976 btrfs_header_level(log->fs_info->log_root_tree->node)); 1968 btrfs_header_level(log_root_tree->node));
1969
1970 log_root_tree->log_batch = 0;
1971 log_root_tree->log_transid++;
1972 smp_mb();
1973
1974 mutex_unlock(&log_root_tree->log_mutex);
1975
1976 /*
1977 * nobody else is going to jump in and write the the ctree
1978 * super here because the log_commit atomic below is protecting
1979 * us. We must be called with a transaction handle pinning
1980 * the running transaction open, so a full commit can't hop
1981 * in and cause problems either.
1982 */
1983 write_ctree_super(trans, root->fs_info->tree_root, 2);
1977 1984
1978 write_ctree_super(trans, log->fs_info->tree_root, 2); 1985 atomic_set(&log_root_tree->log_commit[index2], 0);
1979 log->fs_info->tree_log_transid++;
1980 log->fs_info->tree_log_batch = 0;
1981 atomic_set(&log->fs_info->tree_log_commit, 0);
1982 smp_mb(); 1986 smp_mb();
1983 if (waitqueue_active(&log->fs_info->tree_log_wait)) 1987 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
1984 wake_up(&log->fs_info->tree_log_wait); 1988 wake_up(&log_root_tree->log_commit_wait[index2]);
1985out: 1989out:
1986 mutex_unlock(&log->fs_info->tree_log_mutex); 1990 atomic_set(&root->log_commit[index1], 0);
1991 smp_mb();
1992 if (waitqueue_active(&root->log_commit_wait[index1]))
1993 wake_up(&root->log_commit_wait[index1]);
1987 return 0; 1994 return 0;
1988} 1995}
1989 1996
@@ -2019,38 +2026,18 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2019 start, end, GFP_NOFS); 2026 start, end, GFP_NOFS);
2020 } 2027 }
2021 2028
2022 log = root->log_root; 2029 if (log->log_transid > 0) {
2023 ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2030 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2024 &log->root_key); 2031 &log->root_key);
2025 BUG_ON(ret); 2032 BUG_ON(ret);
2033 }
2026 root->log_root = NULL; 2034 root->log_root = NULL;
2027 kfree(root->log_root); 2035 free_extent_buffer(log->node);
2036 kfree(log);
2028 return 0; 2037 return 0;
2029} 2038}
2030 2039
2031/* 2040/*
2032 * helper function to update the item for a given subvolumes log root
2033 * in the tree of log roots
2034 */
2035static int update_log_root(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *log)
2037{
2038 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2039 int ret;
2040
2041 if (log->node->start == bytenr)
2042 return 0;
2043
2044 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2045 btrfs_set_root_generation(&log->root_item, trans->transid);
2046 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2047 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2048 &log->root_key, &log->root_item);
2049 BUG_ON(ret);
2050 return ret;
2051}
2052
2053/*
2054 * If both a file and directory are logged, and unlinks or renames are 2041 * If both a file and directory are logged, and unlinks or renames are
2055 * mixed in, we have a few interesting corners: 2042 * mixed in, we have a few interesting corners:
2056 * 2043 *
@@ -2711,11 +2698,6 @@ next_slot:
2711 2698
2712 btrfs_free_path(path); 2699 btrfs_free_path(path);
2713 btrfs_free_path(dst_path); 2700 btrfs_free_path(dst_path);
2714
2715 mutex_lock(&root->fs_info->tree_log_mutex);
2716 ret = update_log_root(trans, log);
2717 BUG_ON(ret);
2718 mutex_unlock(&root->fs_info->tree_log_mutex);
2719out: 2701out:
2720 return 0; 2702 return 0;
2721} 2703}