author    Chris Mason <chris.mason@oracle.com>  2009-03-13 10:10:06 -0400
committer Chris Mason <chris.mason@oracle.com>  2009-03-24 16:14:25 -0400
commit    56bec294dea971335d4466b30f2d959f28f6e36d (patch)
tree      fc0b5bbf4bb6ab35582a4c7f58f5ac88f71c38bf /fs/btrfs/delayed-ref.c
parent    9fa8cfe706f9c20067c042a064999d5825a35330 (diff)
Btrfs: do extent allocation and reference count updates in the background
The extent allocation tree maintains a reference count and full back
reference information for every extent allocated in the filesystem.
For subvolume and snapshot trees, every time a block goes through COW,
the new copy of the block adds a reference on every block it points
to.  If a btree node points to 150 leaves, then the COW code needs to
go and add backrefs on 150 different extents, which might be spread
all over the extent allocation tree.

These updates currently happen during btrfs_cow_block, and most COWs
happen during btrfs_search_slot.  btrfs_search_slot has locks held on
both the parent and the node we are COWing, and so we really want to
avoid IO during the COW if we can.

This commit adds an rbtree of pending reference count updates and
extent allocations.  The tree is ordered by byte number of the extent
and byte number of the parent for the back reference.  The tree allows
us to:

1) Modify back references in something close to disk order, reducing
   seeks
2) Significantly reduce the number of modifications made as block
   pointers are balanced around
3) Do all of the extent insertion and back reference modifications
   outside of the performance critical btrfs_search_slot code.

#3 has the added benefit of greatly reducing the btrfs stack footprint.
The extent allocation tree modifications are done without the deep
(and somewhat recursive) call chains used in the past.

These delayed back reference updates must be done before the
transaction commits, and so the rbtree is tied to the transaction.
Throttling is implemented to help keep the queue of backrefs at a
reasonable size.

Since there was a similar mechanism in place for the extent tree
extents, that is removed and replaced by the delayed reference tree.

Yan Zheng <yan.zheng@oracle.com> helped review and fixup this code.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
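As a purely illustrative sketch (the variable names here are hypothetical;
the real call sites are in the rest of this patch), a COW path can queue a
reference count update instead of modifying the extent allocation tree
inline:

	/*
	 * Instead of adding a backref for each child pointer during
	 * btrfs_cow_block, queue a delayed +1 ref; it is processed in
	 * something close to disk order before the transaction commits.
	 */
	ret = btrfs_add_delayed_ref(trans, child_bytenr, child_num_bytes,
				    parent_bytenr, root_objectid, generation,
				    owner_objectid, BTRFS_ADD_DELAYED_REF, 0);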
Diffstat (limited to 'fs/btrfs/delayed-ref.c')
-rw-r--r--  fs/btrfs/delayed-ref.c | 585
1 file changed, 585 insertions(+), 0 deletions(-)
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 000000000000..874565a1f634
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,585 @@
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/sched.h>
#include <linux/sort.h>
#include <linux/ftrace.h>
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"

/*
 * delayed back reference update tracking.  For subvolume trees
 * we queue up extent allocations and backref maintenance for
 * delayed processing.  This avoids deep call chains where we
 * add extents in the middle of btrfs_search_slot, and it allows
 * us to buffer up frequently modified backrefs in an rb tree instead
 * of hammering updates on the extent allocation tree.
 *
 * Right now this code is only used for reference counted trees, but
 * the long term goal is to get rid of the similar code for delayed
 * extent tree modifications.
 */
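/*
 * Illustrative layout of the rb tree described above (byte numbers
 * are made up): entries sort first by extent bytenr, then by parent
 * bytenr, and each extent's head node carries the impossible parent
 * (u64)-1, so it sorts after the refs for that extent:
 *
 *	(bytenr 4096, parent 8192) -> (4096, 12288) -> (4096, head)
 */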

/*
 * entries in the rb tree are ordered by the byte number of the extent
 * and by the byte number of the parent block.
 */
static int comp_entry(struct btrfs_delayed_ref_node *ref,
		      u64 bytenr, u64 parent)
{
	if (bytenr < ref->bytenr)
		return -1;
	if (bytenr > ref->bytenr)
		return 1;
	if (parent < ref->parent)
		return -1;
	if (parent > ref->parent)
		return 1;
	return 0;
}

/*
 * insert a new ref into the rbtree.  This returns any existing refs
 * for the same (bytenr,parent) tuple, or NULL if the new node was properly
 * inserted.
 */
static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
						  u64 bytenr, u64 parent,
						  struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_node *entry;
	int cmp;

	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
				 rb_node);

		cmp = comp_entry(entry, bytenr, parent);
		if (cmp < 0)
			p = &(*p)->rb_left;
		else if (cmp > 0)
			p = &(*p)->rb_right;
		else
			return entry;
	}

	entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
	rb_link_node(node, parent_node, p);
	rb_insert_color(node, root);
	return NULL;
}

/*
 * find an entry based on (bytenr,parent).  This returns the delayed
 * ref if it was able to find one, or NULL if nothing was in that spot
 */
static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
						  u64 bytenr, u64 parent)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int cmp;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		cmp = comp_entry(entry, bytenr, parent);
		if (cmp < 0)
			n = n->rb_left;
		else if (cmp > 0)
			n = n->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * Locking on delayed refs is done by taking a lock on the head node,
 * which has the (impossible) parent id of (u64)-1.  Once a lock is held
 * on the head node, you're allowed (and required) to process all the
 * delayed refs for a given byte number in the tree.
 *
 * This will walk forward in the rbtree until it finds a head node it
 * is able to lock.  It might not lock the delayed ref you asked for,
 * and so it will return the one it did lock in next_ret and return 0.
 *
 * If no locks are taken, next_ret is set to null and 1 is returned.  This
 * means there are no more unlocked head nodes in the rbtree.
 */
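/*
 * Illustrative caller pattern (a simplified, hypothetical sketch; the
 * real processing loop lives in the extent allocation code):
 *
 *	ret = btrfs_lock_delayed_ref(trans, ref, &head);
 *	if (ret == 0) {
 *		... process every delayed ref for head's bytenr ...
 *		mutex_unlock(&head->mutex);
 *	}
 */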
int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans,
			   struct btrfs_delayed_ref_node *ref,
			   struct btrfs_delayed_ref_head **next_ret)
{
	struct rb_node *node;
	struct btrfs_delayed_ref_head *head;
	int ret = 0;

	while (1) {
		if (btrfs_delayed_ref_is_head(ref)) {
			head = btrfs_delayed_node_to_head(ref);
			if (mutex_trylock(&head->mutex)) {
				*next_ret = head;
				ret = 0;
				break;
			}
		}
		node = rb_next(&ref->rb_node);
		if (!node) {
			ret = 1;
			*next_ret = NULL;
			break;
		}
		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
	}
	return ret;
}

/*
 * This checks to see if there are any delayed refs in the
 * btree for a given bytenr.  It returns one if it finds any
 * and zero otherwise.
 *
 * If it only finds a head node, it returns 0.
 *
 * The idea is to use this when deciding if you can safely delete an
 * extent from the extent allocation tree.  There may be a pending
 * ref in the rbtree that adds or removes references, so as long as this
 * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
 * allocation tree.
 */
int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
{
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct rb_node *prev_node;
	int ret = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
	if (ref) {
		prev_node = rb_prev(&ref->rb_node);
		if (!prev_node)
			goto out;
		ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
			       rb_node);
		if (ref->bytenr == bytenr)
			ret = 1;
	}
out:
	spin_unlock(&delayed_refs->lock);
	return ret;
}

/*
 * helper function to lookup reference count
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree.  This way you
 * can check to see what the reference count would be if all of the
 * delayed refs are processed.
 */
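/*
 * Illustrative example (made-up numbers): if the on-disk extent item
 * records 3 references and the head node's ref_mod is -1 (one queued
 * drop that has not run yet), the function below reports 2 in *refs.
 */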
int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, u64 bytenr,
			    u64 num_bytes, u32 *refs)
{
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;
	u32 num_refs;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;
	delayed_refs = &trans->transaction->delayed_refs;
again:
	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
				&key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret == 0) {
		leaf = path->nodes[0];
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_extent_item);
		num_refs = btrfs_extent_refs(leaf, ei);
	} else {
		num_refs = 0;
		ret = 0;
	}

	spin_lock(&delayed_refs->lock);
	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
	if (ref) {
		head = btrfs_delayed_node_to_head(ref);
		if (mutex_trylock(&head->mutex)) {
			num_refs += ref->ref_mod;
			mutex_unlock(&head->mutex);
			*refs = num_refs;
			goto out;
		}

		atomic_inc(&ref->refs);
		spin_unlock(&delayed_refs->lock);

		btrfs_release_path(root->fs_info->extent_root, path);

		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref(ref);
		goto again;
	} else {
		*refs = num_refs;
	}
out:
	spin_unlock(&delayed_refs->lock);
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to update an extent delayed ref in the
 * rbtree.  existing and update must both have the same
 * bytenr and parent
 *
 * This may free existing if the update cancels out whatever
 * operation it was doing.
 */
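/*
 * Illustrative example: a queued BTRFS_ADD_DELAYED_REF followed by a
 * BTRFS_DROP_DELAYED_REF for the same (bytenr,parent) cancels out;
 * ref_mod drops to zero and the node is deleted here without the
 * extent allocation tree ever being touched.
 */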
static noinline void
update_existing_ref(struct btrfs_trans_handle *trans,
		    struct btrfs_delayed_ref_root *delayed_refs,
		    struct btrfs_delayed_ref_node *existing,
		    struct btrfs_delayed_ref_node *update)
{
	struct btrfs_delayed_ref *existing_ref;
	struct btrfs_delayed_ref *ref;

	existing_ref = btrfs_delayed_node_to_ref(existing);
	ref = btrfs_delayed_node_to_ref(update);

	if (ref->pin)
		existing_ref->pin = 1;

	if (ref->action != existing_ref->action) {
		/*
		 * this is effectively undoing either an add or a
		 * drop.  We decrement the ref_mod, and if it goes
		 * down to zero we just delete the entry without
		 * ever changing the extent allocation tree.
		 */
		existing->ref_mod--;
		if (existing->ref_mod == 0) {
			rb_erase(&existing->rb_node,
				 &delayed_refs->root);
			existing->in_tree = 0;
			btrfs_put_delayed_ref(existing);
			delayed_refs->num_entries--;
			if (trans->delayed_ref_updates)
				trans->delayed_ref_updates--;
		}
	} else {
		if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
			/* if we're adding refs, make sure all the
			 * details match up.  The extent could
			 * have been totally freed and reallocated
			 * by a different owner before the delayed
			 * ref entries were removed.
			 */
			existing_ref->owner_objectid = ref->owner_objectid;
			existing_ref->generation = ref->generation;
			existing_ref->root = ref->root;
			existing->num_bytes = update->num_bytes;
		}
		/*
		 * the action on the existing ref matches
		 * the action on the ref we're trying to add.
		 * Bump the ref_mod by one so the backref that
		 * is eventually added/removed has the correct
		 * reference count
		 */
		existing->ref_mod += update->ref_mod;
	}
}

/*
 * helper function to update the accounting in the head ref
 * existing and update must have the same bytenr
 */
static noinline void
update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
			 struct btrfs_delayed_ref_node *update)
{
	struct btrfs_delayed_ref_head *existing_ref;
	struct btrfs_delayed_ref_head *ref;

	existing_ref = btrfs_delayed_node_to_head(existing);
	ref = btrfs_delayed_node_to_head(update);

	if (ref->must_insert_reserved) {
		/* if the extent was freed and then
		 * reallocated before the delayed ref
		 * entries were processed, we can end up
		 * with an existing head ref without
		 * the must_insert_reserved flag set.
		 * Set it again here
		 */
		existing_ref->must_insert_reserved = ref->must_insert_reserved;

		/*
		 * update the num_bytes so we make sure the accounting
		 * is done correctly
		 */
		existing->num_bytes = update->num_bytes;
	}

	/*
	 * update the reference mod on the head to reflect this new operation
	 */
	existing->ref_mod += update->ref_mod;
}

/*
 * helper function to actually insert a delayed ref into the rbtree.
 * this does all the dirty work in terms of maintaining the correct
 * overall modification count in the head node and properly dealing
 * with updating existing nodes as new modifications are queued.
 */
static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
			struct btrfs_delayed_ref_node *ref,
			u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
			u64 ref_generation, u64 owner_objectid, int action,
			int pin)
{
	struct btrfs_delayed_ref_node *existing;
	struct btrfs_delayed_ref *full_ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int count_mod = 1;
	int must_insert_reserved = 0;

	/*
	 * the head node stores the sum of all the mods, so dropping a ref
	 * should drop the sum in the head node by one.
	 */
	if (parent == (u64)-1 && action == BTRFS_DROP_DELAYED_REF)
		count_mod = -1;

	/*
	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
	 * the reserved accounting when the extent is finally added, or
	 * if a later modification deletes the delayed ref without ever
	 * inserting the extent into the extent allocation tree.
	 * ref->must_insert_reserved is the flag used to record
	 * that accounting mods are required.
	 *
	 * Once we record must_insert_reserved, switch the action to
	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
	 */
	if (action == BTRFS_ADD_DELAYED_EXTENT) {
		must_insert_reserved = 1;
		action = BTRFS_ADD_DELAYED_REF;
	} else {
		must_insert_reserved = 0;
	}

	delayed_refs = &trans->transaction->delayed_refs;

	/* first set the basic ref node struct up */
	atomic_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->parent = parent;
	ref->ref_mod = count_mod;
	ref->in_tree = 1;
	ref->num_bytes = num_bytes;

	if (btrfs_delayed_ref_is_head(ref)) {
		head_ref = btrfs_delayed_node_to_head(ref);
		head_ref->must_insert_reserved = must_insert_reserved;
		mutex_init(&head_ref->mutex);
	} else {
		full_ref = btrfs_delayed_node_to_ref(ref);
		full_ref->root = ref_root;
		full_ref->generation = ref_generation;
		full_ref->owner_objectid = owner_objectid;
		full_ref->pin = pin;
		full_ref->action = action;
	}

	existing = tree_insert(&delayed_refs->root, bytenr,
			       parent, &ref->rb_node);

	if (existing) {
		if (btrfs_delayed_ref_is_head(ref))
			update_existing_head_ref(existing, ref);
		else
			update_existing_ref(trans, delayed_refs, existing, ref);

		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kfree(ref);
	} else {
		delayed_refs->num_entries++;
		trans->delayed_ref_updates++;
	}
	return 0;
}

/*
 * add a delayed ref to the tree.  This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 */
int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
			  u64 ref_generation, u64 owner_objectid, int action,
			  int pin)
{
	struct btrfs_delayed_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	ref = kmalloc(sizeof(*ref), GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	/*
	 * the parent = 0 case comes from cases where we don't actually
	 * know the parent yet.  It will get updated later via an add/drop
	 * pair.
	 */
	if (parent == 0)
		parent = bytenr;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref) {
		kfree(ref);
		return -ENOMEM;
	}
	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
				      (u64)-1, 0, 0, 0, action, pin);
	BUG_ON(ret);

	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
				      parent, ref_root, ref_generation,
				      owner_objectid, action, pin);
	BUG_ON(ret);
	spin_unlock(&delayed_refs->lock);
	return 0;
}

/*
 * add a delayed ref to the tree.  This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 *
 * The main point of this call is to add and remove a backreference in a
 * single shot, taking the lock only once, and only searching for the head
 * node once.
 *
 * It is the same as doing a ref add and delete in two separate calls.
 */
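/*
 * Conceptually (an illustrative equivalence, not extra work done by
 * this function), the call below behaves like:
 *
 *	btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
 *			      ref_root, ref_generation, owner_objectid,
 *			      BTRFS_ADD_DELAYED_REF, 0);
 *	btrfs_add_delayed_ref(trans, bytenr, num_bytes, orig_parent,
 *			      orig_ref_root, orig_ref_generation,
 *			      owner_objectid, BTRFS_DROP_DELAYED_REF, pin);
 *
 * except the spin lock is taken once and the head node is set up once.
 */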
int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
			     u64 bytenr, u64 num_bytes, u64 orig_parent,
			     u64 parent, u64 orig_ref_root, u64 ref_root,
			     u64 orig_ref_generation, u64 ref_generation,
			     u64 owner_objectid, int pin)
{
	struct btrfs_delayed_ref *ref;
	struct btrfs_delayed_ref *old_ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	ref = kmalloc(sizeof(*ref), GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
	if (!old_ref) {
		kfree(ref);
		return -ENOMEM;
	}

	/*
	 * the parent = 0 case comes from cases where we don't actually
	 * know the parent yet.  It will get updated later via an add/drop
	 * pair.
	 */
	if (parent == 0)
		parent = bytenr;
	if (orig_parent == 0)
		orig_parent = bytenr;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref) {
		kfree(ref);
		kfree(old_ref);
		return -ENOMEM;
	}
	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
				      (u64)-1, 0, 0, 0,
				      BTRFS_ADD_DELAYED_REF, 0);
	BUG_ON(ret);

	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
				      parent, ref_root, ref_generation,
				      owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
	BUG_ON(ret);

	ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
				      orig_parent, orig_ref_root,
				      orig_ref_generation, owner_objectid,
				      BTRFS_DROP_DELAYED_REF, pin);
	BUG_ON(ret);
	spin_unlock(&delayed_refs->lock);
	return 0;
}