aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/ordered-data.c
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-07-17 12:53:50 -0400
committerChris Mason <chris.mason@oracle.com>2008-09-25 11:04:04 -0400
commite6dcd2dc9c489108648e2ed543315dd134d50a9a (patch)
treecddf6f588b65e28c5feb8bff89b22d8ff70f8a50 /fs/btrfs/ordered-data.c
parent77a41afb7d0dd0f27b6f2f1a5bc701929c7034de (diff)
Btrfs: New data=ordered implementation
The old data=ordered code would force commit to wait until all the data extents from the transaction were fully on disk. This introduced large latencies into the commit and stalled new writers in the transaction for a long time. The new code changes the way data allocations and extents work: * When delayed allocation is filled, data extents are reserved, and the extent bit EXTENT_ORDERED is set on the entire range of the extent. A struct btrfs_ordered_extent is allocated and inserted into a per-inode rbtree to track the pending extents. * As each page is written EXTENT_ORDERED is cleared on the bytes corresponding to that page. * When all of the bytes corresponding to a single struct btrfs_ordered_extent are written, the previously reserved extent is inserted into the FS btree and into the extent allocation trees. The checksums for the file data are also updated. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/ordered-data.c')
-rw-r--r--fs/btrfs/ordered-data.c455
1 files changed, 285 insertions, 170 deletions
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 254da8225664..6513270f054c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -22,48 +22,30 @@
22#include "ctree.h" 22#include "ctree.h"
23#include "transaction.h" 23#include "transaction.h"
24#include "btrfs_inode.h" 24#include "btrfs_inode.h"
25#include "extent_io.h"
25 26
26struct tree_entry {
27 u64 root_objectid;
28 u64 objectid;
29 struct inode *inode;
30 struct rb_node rb_node;
31};
32 27
33/* 28static u64 entry_end(struct btrfs_ordered_extent *entry)
34 * returns > 0 if entry passed (root, objectid) is > entry,
35 * < 0 if (root, objectid) < entry and zero if they are equal
36 */
37static int comp_entry(struct tree_entry *entry, u64 root_objectid,
38 u64 objectid)
39{ 29{
40 if (root_objectid < entry->root_objectid) 30 if (entry->file_offset + entry->len < entry->file_offset)
41 return -1; 31 return (u64)-1;
42 if (root_objectid > entry->root_objectid) 32 return entry->file_offset + entry->len;
43 return 1;
44 if (objectid < entry->objectid)
45 return -1;
46 if (objectid > entry->objectid)
47 return 1;
48 return 0;
49} 33}
50 34
51static struct rb_node *tree_insert(struct rb_root *root, u64 root_objectid, 35static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
52 u64 objectid, struct rb_node *node) 36 struct rb_node *node)
53{ 37{
54 struct rb_node ** p = &root->rb_node; 38 struct rb_node ** p = &root->rb_node;
55 struct rb_node * parent = NULL; 39 struct rb_node * parent = NULL;
56 struct tree_entry *entry; 40 struct btrfs_ordered_extent *entry;
57 int comp;
58 41
59 while(*p) { 42 while(*p) {
60 parent = *p; 43 parent = *p;
61 entry = rb_entry(parent, struct tree_entry, rb_node); 44 entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
62 45
63 comp = comp_entry(entry, root_objectid, objectid); 46 if (file_offset < entry->file_offset)
64 if (comp < 0)
65 p = &(*p)->rb_left; 47 p = &(*p)->rb_left;
66 else if (comp > 0) 48 else if (file_offset >= entry_end(entry))
67 p = &(*p)->rb_right; 49 p = &(*p)->rb_right;
68 else 50 else
69 return parent; 51 return parent;
@@ -74,24 +56,23 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 root_objectid,
74 return NULL; 56 return NULL;
75} 57}
76 58
77static struct rb_node *__tree_search(struct rb_root *root, u64 root_objectid, 59static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
78 u64 objectid, struct rb_node **prev_ret) 60 struct rb_node **prev_ret)
79{ 61{
80 struct rb_node * n = root->rb_node; 62 struct rb_node * n = root->rb_node;
81 struct rb_node *prev = NULL; 63 struct rb_node *prev = NULL;
82 struct tree_entry *entry; 64 struct rb_node *test;
83 struct tree_entry *prev_entry = NULL; 65 struct btrfs_ordered_extent *entry;
84 int comp; 66 struct btrfs_ordered_extent *prev_entry = NULL;
85 67
86 while(n) { 68 while(n) {
87 entry = rb_entry(n, struct tree_entry, rb_node); 69 entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
88 prev = n; 70 prev = n;
89 prev_entry = entry; 71 prev_entry = entry;
90 comp = comp_entry(entry, root_objectid, objectid);
91 72
92 if (comp < 0) 73 if (file_offset < entry->file_offset)
93 n = n->rb_left; 74 n = n->rb_left;
94 else if (comp > 0) 75 else if (file_offset >= entry_end(entry))
95 n = n->rb_right; 76 n = n->rb_right;
96 else 77 else
97 return n; 78 return n;
@@ -99,195 +80,329 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 root_objectid,
99 if (!prev_ret) 80 if (!prev_ret)
100 return NULL; 81 return NULL;
101 82
102 while(prev && comp_entry(prev_entry, root_objectid, objectid) >= 0) { 83 while(prev && file_offset >= entry_end(prev_entry)) {
103 prev = rb_next(prev); 84 test = rb_next(prev);
104 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 85 if (!test)
86 break;
87 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
88 rb_node);
89 if (file_offset < entry_end(prev_entry))
90 break;
91
92 prev = test;
93 }
94 if (prev)
95 prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
96 rb_node);
97 while(prev && file_offset < entry_end(prev_entry)) {
98 test = rb_prev(prev);
99 if (!test)
100 break;
101 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
102 rb_node);
103 prev = test;
105 } 104 }
106 *prev_ret = prev; 105 *prev_ret = prev;
107 return NULL; 106 return NULL;
108} 107}
109 108
110static inline struct rb_node *tree_search(struct rb_root *root, 109static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
111 u64 root_objectid, u64 objectid) 110{
111 if (file_offset < entry->file_offset ||
112 entry->file_offset + entry->len <= file_offset)
113 return 0;
114 return 1;
115}
116
117static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
118 u64 file_offset)
112{ 119{
120 struct rb_root *root = &tree->tree;
113 struct rb_node *prev; 121 struct rb_node *prev;
114 struct rb_node *ret; 122 struct rb_node *ret;
115 ret = __tree_search(root, root_objectid, objectid, &prev); 123 struct btrfs_ordered_extent *entry;
124
125 if (tree->last) {
126 entry = rb_entry(tree->last, struct btrfs_ordered_extent,
127 rb_node);
128 if (offset_in_entry(entry, file_offset))
129 return tree->last;
130 }
131 ret = __tree_search(root, file_offset, &prev);
116 if (!ret) 132 if (!ret)
117 return prev; 133 ret = prev;
134 if (ret)
135 tree->last = ret;
118 return ret; 136 return ret;
119} 137}
120 138
121int btrfs_add_ordered_inode(struct inode *inode) 139int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
140 u64 start, u64 len)
122{ 141{
123 struct btrfs_root *root = BTRFS_I(inode)->root;
124 u64 root_objectid = root->root_key.objectid;
125 u64 transid = root->fs_info->running_transaction->transid;
126 struct tree_entry *entry;
127 struct rb_node *node;
128 struct btrfs_ordered_inode_tree *tree; 142 struct btrfs_ordered_inode_tree *tree;
143 struct rb_node *node;
144 struct btrfs_ordered_extent *entry;
129 145
130 if (transid <= BTRFS_I(inode)->ordered_trans) 146 tree = &BTRFS_I(inode)->ordered_tree;
131 return 0; 147 entry = kzalloc(sizeof(*entry), GFP_NOFS);
132
133 tree = &root->fs_info->running_transaction->ordered_inode_tree;
134
135 read_lock(&tree->lock);
136 node = __tree_search(&tree->tree, root_objectid, inode->i_ino, NULL);
137 read_unlock(&tree->lock);
138 if (node) {
139 return 0;
140 }
141
142 entry = kmalloc(sizeof(*entry), GFP_NOFS);
143 if (!entry) 148 if (!entry)
144 return -ENOMEM; 149 return -ENOMEM;
145 150
146 write_lock(&tree->lock); 151 mutex_lock(&tree->mutex);
147 entry->objectid = inode->i_ino; 152 entry->file_offset = file_offset;
148 entry->root_objectid = root_objectid; 153 entry->start = start;
154 entry->len = len;
149 entry->inode = inode; 155 entry->inode = inode;
156 /* one ref for the tree */
157 atomic_set(&entry->refs, 1);
158 init_waitqueue_head(&entry->wait);
159 INIT_LIST_HEAD(&entry->list);
150 160
151 node = tree_insert(&tree->tree, root_objectid, 161 node = tree_insert(&tree->tree, file_offset,
152 inode->i_ino, &entry->rb_node); 162 &entry->rb_node);
153 163 if (node) {
154 BTRFS_I(inode)->ordered_trans = transid; 164 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
155 if (!node) 165 atomic_inc(&entry->refs);
156 igrab(inode); 166 }
157 167 set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
158 write_unlock(&tree->lock); 168 entry_end(entry) - 1, GFP_NOFS);
159 169
160 if (node) 170 set_bit(BTRFS_ORDERED_START, &entry->flags);
161 kfree(entry); 171 mutex_unlock(&tree->mutex);
172 BUG_ON(node);
162 return 0; 173 return 0;
163} 174}
164 175
165int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree, 176int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum)
166 u64 *root_objectid, u64 *objectid,
167 struct inode **inode)
168{ 177{
169 struct tree_entry *entry; 178 struct btrfs_ordered_inode_tree *tree;
170 struct rb_node *node; 179 struct rb_node *node;
180 struct btrfs_ordered_extent *entry;
171 181
172 write_lock(&tree->lock); 182 tree = &BTRFS_I(inode)->ordered_tree;
173 node = tree_search(&tree->tree, *root_objectid, *objectid); 183 mutex_lock(&tree->mutex);
184 node = tree_search(tree, sum->file_offset);
174 if (!node) { 185 if (!node) {
175 write_unlock(&tree->lock); 186search_fail:
176 return 0; 187printk("add ordered sum failed to find a node for inode %lu offset %Lu\n", inode->i_ino, sum->file_offset);
188 node = rb_first(&tree->tree);
189 while(node) {
190 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
191 printk("entry %Lu %Lu %Lu\n", entry->file_offset, entry->file_offset + entry->len, entry->start);
192 node = rb_next(node);
193 }
194 BUG();
177 } 195 }
178 entry = rb_entry(node, struct tree_entry, rb_node); 196 BUG_ON(!node);
179 197
180 while(comp_entry(entry, *root_objectid, *objectid) >= 0) { 198 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
181 node = rb_next(node); 199 if (!offset_in_entry(entry, sum->file_offset)) {
182 if (!node) 200 goto search_fail;
183 break;
184 entry = rb_entry(node, struct tree_entry, rb_node);
185 }
186 if (!node) {
187 write_unlock(&tree->lock);
188 return 0;
189 } 201 }
190 202
191 *root_objectid = entry->root_objectid; 203 list_add_tail(&sum->list, &entry->list);
192 *inode = entry->inode; 204 mutex_unlock(&tree->mutex);
193 atomic_inc(&entry->inode->i_count); 205 return 0;
194 *objectid = entry->objectid;
195 write_unlock(&tree->lock);
196 return 1;
197} 206}
198 207
199int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree, 208int btrfs_dec_test_ordered_pending(struct inode *inode,
200 u64 *root_objectid, u64 *objectid, 209 u64 file_offset, u64 io_size)
201 struct inode **inode)
202{ 210{
203 struct tree_entry *entry; 211 struct btrfs_ordered_inode_tree *tree;
204 struct rb_node *node; 212 struct rb_node *node;
205 213 struct btrfs_ordered_extent *entry;
206 write_lock(&tree->lock); 214 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
207 node = tree_search(&tree->tree, *root_objectid, *objectid); 215 int ret;
216
217 tree = &BTRFS_I(inode)->ordered_tree;
218 mutex_lock(&tree->mutex);
219 clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
220 GFP_NOFS);
221 node = tree_search(tree, file_offset);
208 if (!node) { 222 if (!node) {
209 write_unlock(&tree->lock); 223 ret = 1;
210 return 0; 224 goto out;
211 } 225 }
212 226
213 entry = rb_entry(node, struct tree_entry, rb_node); 227 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
214 while(comp_entry(entry, *root_objectid, *objectid) >= 0) { 228 if (!offset_in_entry(entry, file_offset)) {
215 node = rb_next(node); 229 ret = 1;
216 if (!node) 230 goto out;
217 break;
218 entry = rb_entry(node, struct tree_entry, rb_node);
219 } 231 }
220 if (!node) { 232
221 write_unlock(&tree->lock); 233 ret = test_range_bit(io_tree, entry->file_offset,
222 return 0; 234 entry->file_offset + entry->len - 1,
235 EXTENT_ORDERED, 0);
236 if (!test_bit(BTRFS_ORDERED_START, &entry->flags)) {
237printk("inode %lu not ready yet for extent %Lu %Lu\n", inode->i_ino, entry->file_offset, entry_end(entry));
223 } 238 }
239 if (ret == 0)
240 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
241out:
242 mutex_unlock(&tree->mutex);
243 return ret == 0;
244}
224 245
225 *root_objectid = entry->root_objectid; 246int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
226 *objectid = entry->objectid; 247{
227 *inode = entry->inode; 248 if (atomic_dec_and_test(&entry->refs))
228 atomic_inc(&entry->inode->i_count); 249 kfree(entry);
229 rb_erase(node, &tree->tree); 250 return 0;
230 write_unlock(&tree->lock);
231 kfree(entry);
232 return 1;
233} 251}
234 252
235static void __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree, 253int btrfs_remove_ordered_extent(struct inode *inode,
236 struct inode *inode, 254 struct btrfs_ordered_extent *entry)
237 u64 root_objectid, u64 objectid)
238{ 255{
239 struct tree_entry *entry; 256 struct btrfs_ordered_inode_tree *tree;
240 struct rb_node *node; 257 struct rb_node *node;
241 struct rb_node *prev;
242 258
243 write_lock(&tree->lock); 259 tree = &BTRFS_I(inode)->ordered_tree;
244 node = __tree_search(&tree->tree, root_objectid, objectid, &prev); 260 mutex_lock(&tree->mutex);
245 if (!node) { 261 node = &entry->rb_node;
246 write_unlock(&tree->lock);
247 return;
248 }
249 rb_erase(node, &tree->tree); 262 rb_erase(node, &tree->tree);
250 BTRFS_I(inode)->ordered_trans = 0; 263 tree->last = NULL;
251 write_unlock(&tree->lock); 264 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
252 atomic_dec(&inode->i_count); 265 mutex_unlock(&tree->mutex);
253 entry = rb_entry(node, struct tree_entry, rb_node); 266 wake_up(&entry->wait);
254 kfree(entry); 267 return 0;
255 return;
256} 268}
257 269
258void btrfs_del_ordered_inode(struct inode *inode, int force) 270void btrfs_wait_ordered_extent(struct inode *inode,
271 struct btrfs_ordered_extent *entry)
259{ 272{
260 struct btrfs_root *root = BTRFS_I(inode)->root; 273 u64 start = entry->file_offset;
261 u64 root_objectid = root->root_key.objectid; 274 u64 end = start + entry->len - 1;
275#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
276 do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
277#else
278 do_sync_mapping_range(inode->i_mapping, start, end,
279 SYNC_FILE_RANGE_WRITE);
280#endif
281 wait_event(entry->wait,
282 test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
283}
262 284
263 if (!BTRFS_I(inode)->ordered_trans) { 285static void btrfs_start_ordered_extent(struct inode *inode,
264 return; 286 struct btrfs_ordered_extent *entry, int wait)
265 } 287{
288 u64 start = entry->file_offset;
289 u64 end = start + entry->len - 1;
266 290
267 if (!force && (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || 291#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
268 mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) 292 do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
269 return; 293#else
294 do_sync_mapping_range(inode->i_mapping, start, end,
295 SYNC_FILE_RANGE_WRITE);
296#endif
297 if (wait)
298 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
299 &entry->flags));
300}
270 301
271 spin_lock(&root->fs_info->new_trans_lock); 302void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
272 if (root->fs_info->running_transaction) { 303{
273 struct btrfs_ordered_inode_tree *tree; 304 u64 end;
274 tree = &root->fs_info->running_transaction->ordered_inode_tree; 305 struct btrfs_ordered_extent *ordered;
275 __btrfs_del_ordered_inode(tree, inode, root_objectid, 306 int found;
276 inode->i_ino); 307 int should_wait = 0;
308
309again:
310 if (start + len < start)
311 end = (u64)-1;
312 else
313 end = start + len - 1;
314 found = 0;
315 while(1) {
316 ordered = btrfs_lookup_first_ordered_extent(inode, end);
317 if (!ordered) {
318 break;
319 }
320 if (ordered->file_offset >= start + len) {
321 btrfs_put_ordered_extent(ordered);
322 break;
323 }
324 if (ordered->file_offset + ordered->len < start) {
325 btrfs_put_ordered_extent(ordered);
326 break;
327 }
328 btrfs_start_ordered_extent(inode, ordered, should_wait);
329 found++;
330 end = ordered->file_offset;
331 btrfs_put_ordered_extent(ordered);
332 if (end == 0)
333 break;
334 end--;
335 }
336 if (should_wait && found) {
337 should_wait = 0;
338 goto again;
277 } 339 }
278 spin_unlock(&root->fs_info->new_trans_lock);
279} 340}
280 341
281int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode) 342int btrfs_add_ordered_pending(struct inode *inode,
343 struct btrfs_ordered_extent *ordered,
344 u64 start, u64 len)
282{ 345{
283 struct btrfs_transaction *cur = root->fs_info->running_transaction; 346 WARN_ON(1);
284 while(cur == root->fs_info->running_transaction &&
285 atomic_read(&BTRFS_I(inode)->ordered_writeback)) {
286#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
287 congestion_wait(WRITE, HZ/20);
288#else
289 blk_congestion_wait(WRITE, HZ/20);
290#endif
291 }
292 return 0; 347 return 0;
348#if 0
349 int ret;
350 struct btrfs_ordered_inode_tree *tree;
351 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
352
353 tree = &BTRFS_I(inode)->ordered_tree;
354 mutex_lock(&tree->mutex);
355 if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
356 ret = -EAGAIN;
357 goto out;
358 }
359 set_extent_ordered(io_tree, start, start + len - 1, GFP_NOFS);
360 ret = 0;
361out:
362 mutex_unlock(&tree->mutex);
363 return ret;
364#endif
365}
366
367struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
368 u64 file_offset)
369{
370 struct btrfs_ordered_inode_tree *tree;
371 struct rb_node *node;
372 struct btrfs_ordered_extent *entry = NULL;
373
374 tree = &BTRFS_I(inode)->ordered_tree;
375 mutex_lock(&tree->mutex);
376 node = tree_search(tree, file_offset);
377 if (!node)
378 goto out;
379
380 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
381 if (!offset_in_entry(entry, file_offset))
382 entry = NULL;
383 if (entry)
384 atomic_inc(&entry->refs);
385out:
386 mutex_unlock(&tree->mutex);
387 return entry;
388}
389
390struct btrfs_ordered_extent *
391btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset)
392{
393 struct btrfs_ordered_inode_tree *tree;
394 struct rb_node *node;
395 struct btrfs_ordered_extent *entry = NULL;
396
397 tree = &BTRFS_I(inode)->ordered_tree;
398 mutex_lock(&tree->mutex);
399 node = tree_search(tree, file_offset);
400 if (!node)
401 goto out;
402
403 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
404 atomic_inc(&entry->refs);
405out:
406 mutex_unlock(&tree->mutex);
407 return entry;
293} 408}